Skip to content

Srujanasri6/uni_course_data

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

3 Commits
 
 
 
 
 
 

Repository files navigation

uni_course_data

Just working on a CSV file using basic Python code.

import requests
from bs4 import BeautifulSoup
import pandas as pd
import uuid

# Seed table of the universities whose home pages are scraped for course
# headings. Each record becomes one row of the "Universities" sheet, and
# "university_id" is the key that course rows refer back to.
universities = [
    {
        "university_id": "UNI001",
        "university_name": "Massachusetts Institute of Technology",
        "country": "United States",
        "city": "Cambridge",
        "website": "https://www.mit.edu",
    },
    {
        "university_id": "UNI002",
        "university_name": "University of Oxford",
        "country": "United Kingdom",
        "city": "Oxford",
        "website": "https://www.ox.ac.uk",
    },
    {
        "university_id": "UNI003",
        "university_name": "Indian Institute of Technology Delhi",
        "country": "India",
        "city": "New Delhi",
        "website": "https://www.iitd.ac.in",
    },
]

def _course_row(university, course_name):
    """Build one course record for *university* with placeholder detail fields."""
    return {
        # Short random identifier: the first 8 characters of a UUID4.
        "course_id": str(uuid.uuid4())[:8],
        "university_id": university["university_id"],
        "course_name": course_name,
        "level": "Bachelor's",
        "discipline": "N/A",
        "duration": "N/A",
        "fees": "N/A",
        "eligibility": "N/A",
    }


def scrape_courses(university, max_courses=5):
    """Return course records for one university, scraped from its home page.

    The text of the first ``max_courses`` non-empty ``<h2>``/``<h3>`` headings
    on ``university["website"]`` is used as the course names. When the page
    cannot be fetched, answers with an HTTP error status, or has no usable
    headings, ``max_courses`` placeholder "Sample Course" records are returned
    instead, so one unreachable site does not abort the whole run.

    Args:
        university: Mapping with at least the keys ``"university_id"``,
            ``"university_name"`` and ``"website"``.
        max_courses: Upper bound on scraped headings, and the number of
            placeholder records produced when nothing could be scraped.

    Returns:
        A list of dicts with the keys ``course_id``, ``university_id``,
        ``course_name``, ``level``, ``discipline``, ``duration``, ``fees``
        and ``eligibility``.
    """
    import logging

    titles = []
    try:
        response = requests.get(university["website"], timeout=15)
        # Without this check the headings of a 403/404/500 error page would be
        # stored as if they were course names.
        response.raise_for_status()
    except requests.RequestException as exc:
        logging.getLogger(__name__).warning(
            "Could not fetch %s (%s); using placeholder courses",
            university["website"],
            exc,
        )
    else:
        soup = BeautifulSoup(response.text, "lxml")
        headings = (
            tag.get_text(strip=True) for tag in soup.find_all(["h2", "h3"])
        )
        titles = [text for text in headings if text][:max_courses]

    if not titles:
        titles = [
            f"{university['university_name']} Sample Course {number}"
            for number in range(1, max_courses + 1)
        ]

    return [_course_row(university, title) for title in titles]

def _strip_text_columns(frame):
    """Strip surrounding whitespace from every text column of *frame* and return it."""
    for column in frame.columns:
        # Comparing the dtype with "object" misses pandas' dedicated string
        # dtypes, which would silently skip the cleanup; let pandas decide
        # whether the column holds text.
        if pd.api.types.is_string_dtype(frame[column]):
            frame[column] = frame[column].str.strip()
    return frame


# Build the two tables: one row per university, plus the course rows scraped
# (or generated as placeholders) for each of them.
uni_df = pd.DataFrame(universities)
course_rows = []
for u in universities:
    course_rows.extend(scrape_courses(u))
courses_df = pd.DataFrame(course_rows)

# Fill gaps and normalise whitespace *before* de-duplicating, so rows that
# differ only by stray padding collapse into one.
uni_df = _strip_text_columns(uni_df.fillna("N/A"))
courses_df = _strip_text_columns(courses_df.fillna("N/A"))

uni_df = uni_df.drop_duplicates(subset=["website"]).reset_index(drop=True)
courses_df = courses_df.drop_duplicates(
    subset=["course_name", "university_id"]
).reset_index(drop=True)

# Keep only courses whose university survived the de-duplication above.
valid_uni_ids = set(uni_df["university_id"])
courses_df = courses_df[
    courses_df["university_id"].isin(valid_uni_ids)
].reset_index(drop=True)

# Both sheets must be written while the writer is still open, so both calls
# live inside the with-block.
with pd.ExcelWriter("universities_courses_scraped.xlsx", engine="openpyxl") as writer:
    uni_df.to_excel(writer, sheet_name="Universities", index=False)
    courses_df.to_excel(writer, sheet_name="Courses", index=False)

About

Just working on a CSV file using basic Python code.

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

 
 
 

Contributors