From 68ee2ea88a05bd0df03e7fd61d7ab61657da6c0c Mon Sep 17 00:00:00 2001 From: RafaelJohn9 Date: Mon, 13 May 2024 22:57:47 +0300 Subject: [PATCH 1/3] added the Coursera Scrapper Signed-off-by: RafaelJohn9 --- Python/Coursera_Scrapper/.gitignore | 1 + Python/Coursera_Scrapper/README.md | 23 ++ Python/Coursera_Scrapper/coursera_scapper.py | 241 +++++++++++++++++++ Python/Coursera_Scrapper/requirements.txt | 2 + 4 files changed, 267 insertions(+) create mode 100644 Python/Coursera_Scrapper/.gitignore create mode 100644 Python/Coursera_Scrapper/README.md create mode 100755 Python/Coursera_Scrapper/coursera_scapper.py create mode 100644 Python/Coursera_Scrapper/requirements.txt diff --git a/Python/Coursera_Scrapper/.gitignore b/Python/Coursera_Scrapper/.gitignore new file mode 100644 index 000000000..afed0735d --- /dev/null +++ b/Python/Coursera_Scrapper/.gitignore @@ -0,0 +1 @@ +*.csv diff --git a/Python/Coursera_Scrapper/README.md b/Python/Coursera_Scrapper/README.md new file mode 100644 index 000000000..112654114 --- /dev/null +++ b/Python/Coursera_Scrapper/README.md @@ -0,0 +1,23 @@ +• This Python script is a web scraper for the Coursera website. It fetches information about courses based on a user-provided query and saves the data to a CSV file. Here's a breakdown of the script: + +• The script imports necessary libraries: *bs4* for parsing HTML, *csv* for writing to CSV files, *os* for interacting with the OS, and *requests* for making HTTP requests. + +• It defines constants for the base URL and search URL of Coursera. + +• The get_soup function makes a GET request to the provided URL and returns a BeautifulSoup object of the HTML content. + +• The get_courses_links function extracts the links of the courses from the search results page. + +• The get_title, get_course_ratings_reviews, get_start_date, get_course_duration, get_difficulty, and get_skills functions extract specific details about a course from its page. 
+ +• The append_to_csv function writes the scraped data to a CSV file. If the file doesn't exist, it creates one and writes the header and data; otherwise, it appends the data. + +• The full_query function is the main function that coordinates the scraping process. It calls the other functions to scrape the data, handles any None values, and writes the data to the CSV file. + +• In the if __name__ == '__main__': block, the script prompts the user for a course query and the limit of courses to scrape, then calls the full_query function with these inputs. + +# Note: This script uses CSS selectors to locate elements on the webpage. If Coursera changes its website structure, the script may stop working. + +# Note: The Errors that are being printed in the stdout means that that particular course does not have that specific attribute and they should be ignored + +![image for the working scrapper](https://imgur.com/a/RfDzZxw) diff --git a/Python/Coursera_Scrapper/coursera_scapper.py b/Python/Coursera_Scrapper/coursera_scapper.py new file mode 100755 index 000000000..8d45d8d25 --- /dev/null +++ b/Python/Coursera_Scrapper/coursera_scapper.py @@ -0,0 +1,241 @@ +#!/usr/bin/env python3 +""" +This module is used to scrape the Coursera website to get course data. +""" +import csv +import os +import requests +import bs4 + +BASE_URL = "https://www.coursera.org" +SEARCH_BASE_URL = "https://www.coursera.org/search?query=" + +# pylint: disable=W0718 +# pylint: disable=W0621 +def get_soup(url: str, query: str, page: int = None) -> bs4.BeautifulSoup: + """ + This function takes the query, URL to parse it into and returns the soup object. 
+ """ + try: + if page is None: + url = url + query + else: + url = url + query + f'&page={page}' + + response = requests.get(url, timeout=10) + response.raise_for_status() + soup = bs4.BeautifulSoup(response.text, 'html.parser') + return soup + except requests.exceptions.RequestException as e: + print(f"Error occurred while making the request: {e}") + return None + except Exception as e: + print(f"\nError occurred while parsing the HTML: {e}\tSkipping...\n") + return None + + +def get_course_links(soup: bs4.BeautifulSoup) -> list[str]: + """ + This function takes the soup object and returns the course links. + """ + try: + if soup.select_one('div[data-e2e="NumberOfResultsSection"] span') is not None: + result_text = soup.select_one('div[data-e2e="NumberOfResultsSection"] span').text + if result_text.startswith('No results found for'): + return None + + course_links = soup.select('a[id*=product-card-title]') + courses_links = [link.get('href') for link in course_links] + return courses_links + except Exception as e: + print(f"\nError occurred while getting course links: {e}\tSkipping...\n") + return None + + +def get_title(soup: bs4.BeautifulSoup) -> str: + """ + Given the soup object, this function returns the course title. + """ + try: + title = soup.select_one('h1[data-e2e=hero-title]').text + return title + except Exception as e: + print(f"\nError occurred while getting course title: {e}\tSkipping...\n") + return None + + +def get_course_ratings_reviews(soup: bs4.BeautifulSoup) -> str: + """ + Given the soup object, this function returns the course ratings and reviews. 
+ """ + ratings = None + try: + tags = soup.select('div.cds-119.cds-Typography-base.css-h1jogs.cds-121') + for tag in tags: + try: + rating = float(tag.text.strip()) + ratings = rating + break + except ValueError: + continue + except Exception as e: + print(f"\nError occurred while getting ratings: {e}\tSkipping...\n") + try: + reviews = soup.select_one('p.css-vac8rf:-soup-contains("review")').text + except Exception as e: + print(f"\nError occurred while getting reviews: {e}\tSkipping...\n") + reviews = None + + result = f"{ratings} {reviews}" + return result + + +def get_start_date(soup: bs4.BeautifulSoup) -> str: + """ + Given the soup object, this function returns the start date of the course. + """ + try: + start_date = soup.select_one('div.startdate').text + start_date = start_date.replace('Starts', '').strip() + return start_date + except Exception as e: + print(f"\nError occurred while getting start date: {e}\tSkipping...\n") + return None + + +def get_course_duration(soup: bs4.BeautifulSoup) -> str: + """ + Given the soup object, this function returns the course duration. + """ + try: + tags = soup.select('div.cds-119.cds-Typography-base.css-h1jogs.cds-121') + for tag in tags: + text = tag.text.strip() + if 'month' in text or 'week' in text or 'hours' in text or 'minutes' in text: + course_duration = text + break + return course_duration + except Exception as e: + print(f"\nError occurred while getting course duration: {e}\tSkipping...\n") + return None + + +def get_difficulty(soup: bs4.BeautifulSoup) -> str: + """ + Given the soup object, this function returns the course difficulty. 
+ """ + try: + tags = soup.select('div.cds-119.cds-Typography-base.css-h1jogs.cds-121') + for tag in tags: + text = tag.text.strip() + if 'level' in text and len(text.split()) < 4: + difficulty = text + break + return difficulty + except Exception as e: + print(f"\nError occurred while getting difficulty: {e}\tSkipping...\n") + return None + + +def get_skills(soup: bs4.BeautifulSoup) -> str: + """ + Given the soup object, this function returns the skills you'll gain. + """ + try: + skills_list = [] + ul_tag = soup.find('ul', class_='css-yk0mzy') + if ul_tag: + li_tags = ul_tag.find_all('li') + skills_list = [li.text for li in li_tags] + return '; '.join(skills_list) + except Exception as e: + print(f"\nError occurred while getting skills: {e}\tSkipping...\n") + return None + + +def append_to_csv(query: str, line: str) -> None: + """ + Append the line to the CSV file. + """ + filename = f"{query}.csv" + header = ["Index", + "Title", + "Course Link", + "Ratings & Reviews", + "Difficulty", + "Start Date", + "Course Duration", + "Skills Gained" + ] + data = [line.split(", ")] + + if os.path.isfile(filename): + with open(filename, 'a', newline='', encoding='utf-8') as file: + writer = csv.writer(file) + writer.writerows(data) + else: + with open(filename, 'w', newline='', encoding='utf-8') as file: + writer = csv.writer(file) + writer.writerow(header) + writer.writerows(data) + + +def full_query(query: str, limit: int = 50) -> None: + """ + Perform a full query to scrape course data. 
+ """ + page = 1 + soup = get_soup(SEARCH_BASE_URL, query, page) + courses_links = get_course_links(soup) + index = 1 + while courses_links is not None: + for course_link in courses_links: + if index > limit: + print("Courses successfully retrieved!") + return + # get course data + html_soup = get_soup(BASE_URL, course_link) + title = get_title(html_soup) + ratings_reviews = get_course_ratings_reviews(html_soup) + difficulty = get_difficulty(html_soup) + start_date = get_start_date(html_soup) + course_duration = get_course_duration(html_soup) + skills = get_skills(html_soup) + + # Handle None values + title = title.replace(',', '') if title else 'None' + ratings_reviews = ratings_reviews.replace(',', '') if ratings_reviews else 'None' + difficulty = difficulty.replace(',', '') if difficulty else 'None' + start_date = start_date.replace(',', '') if start_date else 'None' + course_duration = course_duration.replace(',', '') if course_duration else 'None' + skills = skills.replace(',', '') if skills else 'None' + + line = ( + f"{index}, " + f"{title.replace(',', '')}, " + f"{BASE_URL + course_link}, " + f"{ratings_reviews.replace(',', '')}, " + f"{difficulty.replace(',', '')}, " + f"{start_date}, " + f"{course_duration}, " + f"{skills}" + ) + + append_to_csv(query, line) + print(f"Data for {title} has been saved to {query}.csv") + index += 1 + page += 1 + soup = get_soup(SEARCH_BASE_URL, query, page) + courses_links = get_course_links(soup) + + +if __name__ == '__main__': + print("Welcome to Coursera Course Scraper!\n") + query = input("Please enter the course you would like to be scraped: ") + try: + LIMIT = int(input("Please enter the limit of the courses you want (an integer value): ")) + print("Limit value received. Beginning scrape...") + except ValueError: + print("Incorrect value entered. 
Falling back to default...") + LIMIT = 100 + full_query(query, LIMIT) diff --git a/Python/Coursera_Scrapper/requirements.txt b/Python/Coursera_Scrapper/requirements.txt new file mode 100644 index 000000000..b7a5722d0 --- /dev/null +++ b/Python/Coursera_Scrapper/requirements.txt @@ -0,0 +1,2 @@ +bs4==0.0.1 +requests==2.31.0 \ No newline at end of file From 456ddebcc897f00fe304096c96652572b39fb2dd Mon Sep 17 00:00:00 2001 From: JohnKagunda <125447154+RafaelJohn9@users.noreply.github.com> Date: Mon, 13 May 2024 23:14:35 +0300 Subject: [PATCH 2/3] Update README.md: Fixed problem with broken image link --- Python/Coursera_Scrapper/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Python/Coursera_Scrapper/README.md b/Python/Coursera_Scrapper/README.md index 112654114..84987acae 100644 --- a/Python/Coursera_Scrapper/README.md +++ b/Python/Coursera_Scrapper/README.md @@ -20,4 +20,4 @@ # Note: The Errors that are being printed in the stdout means that that particular course does not have that specific attribute and they should be ignored -![image for the working scrapper](https://imgur.com/a/RfDzZxw) +![image for the working scrapper](https://imgur.com/hNRi2cY) From d02939ed381f9e9dd24c46983f095122398a1e1d Mon Sep 17 00:00:00 2001 From: RafaelJohn9 Date: Tue, 14 May 2024 05:50:19 +0300 Subject: [PATCH 3/3] Fixed pylint and broken Image problem in imgur Signed-off-by: RafaelJohn9 --- Python/Coursera_Scrapper/README.md | 2 +- Python/Coursera_Scrapper/coursera_scapper.py | 51 +++++++++++--------- 2 files changed, 29 insertions(+), 24 deletions(-) diff --git a/Python/Coursera_Scrapper/README.md b/Python/Coursera_Scrapper/README.md index 84987acae..631dedcec 100644 --- a/Python/Coursera_Scrapper/README.md +++ b/Python/Coursera_Scrapper/README.md @@ -20,4 +20,4 @@ # Note: The Errors that are being printed in the stdout means that that particular course does not have that specific attribute and they should be ignored -![image for the working 
scrapper](https://imgur.com/hNRi2cY) +![image for the working scrapper](https://i.imgur.com/hNRi2cY.png) diff --git a/Python/Coursera_Scrapper/coursera_scapper.py b/Python/Coursera_Scrapper/coursera_scapper.py index 8d45d8d25..f1d0dc19e 100755 --- a/Python/Coursera_Scrapper/coursera_scapper.py +++ b/Python/Coursera_Scrapper/coursera_scapper.py @@ -10,7 +10,6 @@ BASE_URL = "https://www.coursera.org" SEARCH_BASE_URL = "https://www.coursera.org/search?query=" -# pylint: disable=W0718 # pylint: disable=W0621 def get_soup(url: str, query: str, page: int = None) -> bs4.BeautifulSoup: """ @@ -26,11 +25,11 @@ def get_soup(url: str, query: str, page: int = None) -> bs4.BeautifulSoup: response.raise_for_status() soup = bs4.BeautifulSoup(response.text, 'html.parser') return soup - except requests.exceptions.RequestException as e: - print(f"Error occurred while making the request: {e}") + except requests.exceptions.RequestException as error: + print(f"Error occurred while making the request: {error}") return None - except Exception as e: - print(f"\nError occurred while parsing the HTML: {e}\tSkipping...\n") + except bs4.FeatureNotFound as error: + print(f"\nError occurred while parsing the HTML: {error}\tSkipping...\n") return None @@ -47,8 +46,8 @@ def get_course_links(soup: bs4.BeautifulSoup) -> list[str]: course_links = soup.select('a[id*=product-card-title]') courses_links = [link.get('href') for link in course_links] return courses_links - except Exception as e: - print(f"\nError occurred while getting course links: {e}\tSkipping...\n") + except (AttributeError, ValueError, IndexError) as error: + print(f"\nError occurred while getting course links: {error}\tSkipping...\n") return None @@ -59,8 +58,8 @@ def get_title(soup: bs4.BeautifulSoup) -> str: try: title = soup.select_one('h1[data-e2e=hero-title]').text return title - except Exception as e: - print(f"\nError occurred while getting course title: {e}\tSkipping...\n") + except (AttributeError, ValueError) as error: + 
print(f"\nError occurred while getting course title: {error}\tSkipping...\n") return None @@ -78,12 +77,15 @@ def get_course_ratings_reviews(soup: bs4.BeautifulSoup) -> str: break except ValueError: continue - except Exception as e: - print(f"\nError occurred while getting ratings: {e}\tSkipping...\n") + except (AttributeError, ValueError) as error: + print(f"\nError occurred while getting ratings: {error}\tSkipping...\n") try: reviews = soup.select_one('p.css-vac8rf:-soup-contains("review")').text - except Exception as e: - print(f"\nError occurred while getting reviews: {e}\tSkipping...\n") + except requests.exceptions.RequestException as error: + print(f"\nError occurred while making the request: {error}\tSkipping...\n") + reviews = None + except (bs4.FeatureNotFound, AttributeError, ValueError) as error: + print(f"\nError occurred while parsing the HTML: {error}\tSkipping...\n") reviews = None result = f"{ratings} {reviews}" @@ -98,8 +100,11 @@ def get_start_date(soup: bs4.BeautifulSoup) -> str: start_date = soup.select_one('div.startdate').text start_date = start_date.replace('Starts', '').strip() return start_date - except Exception as e: - print(f"\nError occurred while getting start date: {e}\tSkipping...\n") + except requests.exceptions.RequestException as error: + print(f"\nError occurred while making the request: {error}\tSkipping...\n") + return None + except (AttributeError, ValueError, bs4.FeatureNotFound) as error: + print(f"\nError occurred while parsing the HTML: {error}\tSkipping...\n") return None @@ -115,8 +120,8 @@ def get_course_duration(soup: bs4.BeautifulSoup) -> str: course_duration = text break return course_duration - except Exception as e: - print(f"\nError occurred while getting course duration: {e}\tSkipping...\n") + except (AttributeError, ValueError, UnboundLocalError) as error: + print(f"\nError occurred while getting course duration: {error}\tSkipping...\n") return None @@ -132,8 +137,8 @@ def get_difficulty(soup: 
bs4.BeautifulSoup) -> str: difficulty = text break return difficulty - except Exception as e: - print(f"\nError occurred while getting difficulty: {e}\tSkipping...\n") + except (AttributeError, ValueError, UnboundLocalError) as error: + print(f"\nError occurred while getting difficulty: {error}\tSkipping...\n") return None @@ -148,8 +153,8 @@ def get_skills(soup: bs4.BeautifulSoup) -> str: li_tags = ul_tag.find_all('li') skills_list = [li.text for li in li_tags] return '; '.join(skills_list) - except Exception as e: - print(f"\nError occurred while getting skills: {e}\tSkipping...\n") + except (AttributeError, ValueError, UnboundLocalError) as error: + print(f"\nError occurred while getting skills: {error}\tSkipping...\n") return None @@ -161,8 +166,8 @@ def append_to_csv(query: str, line: str) -> None: header = ["Index", "Title", "Course Link", - "Ratings & Reviews", - "Difficulty", + "Ratings & Reviews", + "Difficulty", "Start Date", "Course Duration", "Skills Gained"