# Just working on a csv file using basic python code.
import logging
import uuid

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Seed table of universities to scrape. Each row is expanded into a dict keyed
# by _UNIVERSITY_FIELDS, in that column order.
_UNIVERSITY_FIELDS = ("university_id", "university_name", "country", "city", "website")

universities = [
    dict(zip(_UNIVERSITY_FIELDS, row))
    for row in (
        (
            "UNI001",
            "Massachusetts Institute of Technology",
            "United States",
            "Cambridge",
            "https://www.mit.edu",
        ),
        (
            "UNI002",
            "University of Oxford",
            "United Kingdom",
            "Oxford",
            "https://www.ox.ac.uk",
        ),
        (
            "UNI003",
            "Indian Institute of Technology Delhi",
            "India",
            "New Delhi",
            "https://www.iitd.ac.in",
        ),
    )
]
def scrape_courses(university):
    """Build up to five course records for one university.

    The university's home page is fetched and the text of its ``<h2>`` and
    ``<h3>`` headings is used as course names (first five non-empty ones).
    When the page cannot be fetched, returns an HTTP error status, or has no
    usable headings, five placeholder courses named
    ``"<university_name> Sample Course <n>"`` are produced instead.

    Args:
        university: Mapping with at least the keys ``"university_id"``,
            ``"university_name"`` and ``"website"``.

    Returns:
        A list of dicts, one per course, with the keys ``course_id``,
        ``university_id``, ``course_name``, ``level``, ``discipline``,
        ``duration``, ``fees`` and ``eligibility``. ``level`` is always
        ``"Bachelor's"`` and the last four fields are always ``"N/A"``.
    """
    try:
        response = requests.get(university["website"], timeout=15)
        # Without this check the headings of a 4xx/5xx error page would be
        # recorded as course names.
        response.raise_for_status()
    except requests.RequestException as exc:
        # One unreachable site should not abort the whole run; fall through
        # to the placeholder courses below.
        logging.getLogger(__name__).warning(
            "Could not fetch %s (%s); using placeholder courses",
            university["website"],
            exc,
        )
        titles = []
    else:
        soup = BeautifulSoup(response.text, "lxml")
        headings = (tag.get_text(strip=True) for tag in soup.find_all(["h2", "h3"]))
        titles = [text for text in headings if text][:5]

    if not titles:
        titles = [
            f"{university['university_name']} Sample Course {number}"
            for number in range(1, 6)
        ]

    return [
        {
            # First 8 hex digits of a random UUID: short, not guaranteed unique.
            "course_id": uuid.uuid4().hex[:8],
            "university_id": university["university_id"],
            "course_name": title,
            "level": "Bachelor's",
            "discipline": "N/A",
            "duration": "N/A",
            "fees": "N/A",
            "eligibility": "N/A",
        }
        for title in titles
    ]
# One table for the universities, one for every course scraped from them.
uni_df = pd.DataFrame(universities)
course_rows = [
    course
    for university in universities
    for course in scrape_courses(university)
]
courses_df = pd.DataFrame(course_rows)
def _strip_and_fill(frame):
    """Return a copy of *frame* with string cells stripped and missing cells set to "N/A".

    Stripping is done cell by cell so that non-string values are left
    untouched, whatever dtype pandas assigned to the column.
    """
    stripped = frame.apply(
        lambda column: column.map(
            lambda cell: cell.strip() if isinstance(cell, str) else cell
        )
    )
    return stripped.fillna("N/A")


# Normalise text BEFORE de-duplicating, so rows that differ only by
# surrounding whitespace collapse into one.
uni_df = _strip_and_fill(uni_df)
courses_df = _strip_and_fill(courses_df)

uni_df = uni_df.drop_duplicates(subset=["website"]).reset_index(drop=True)
courses_df = courses_df.drop_duplicates(
    subset=["course_name", "university_id"]
).reset_index(drop=True)

# Keep only courses whose university is still present after cleaning.
valid_uni_ids = set(uni_df["university_id"])
courses_df = courses_df[
    courses_df["university_id"].isin(valid_uni_ids)
].reset_index(drop=True)
# Export both tables as sheets of a single workbook. Both writes must happen
# inside the with block: the writer is closed when the block exits.
with pd.ExcelWriter("universities_courses_scraped.xlsx", engine="openpyxl") as writer:
    uni_df.to_excel(writer, sheet_name="Universities", index=False)
    courses_df.to_excel(writer, sheet_name="Courses", index=False)