-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp.py
More file actions
142 lines (109 loc) · 5.2 KB
/
app.py
File metadata and controls
142 lines (109 loc) · 5.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# Import required libraries
import streamlit as st
import requests
from bs4 import BeautifulSoup
import pandas as pd
from itertools import chain
def scrape_data_from_single_page(url, selected_classes, timeout=10):
    """Download one page and collect the text of every element per class.

    Parameters:
        url (str): Page to download.
        selected_classes (iterable of str): HTML class names to extract.
        timeout (float): Seconds to wait for the HTTP response (default 10).
            Without a timeout, requests.get can block forever on an
            unresponsive host and freeze the whole Streamlit app.

    Returns:
        dict | None: Maps each class name to a list of stripped element
        texts; None when the request or parse fails (the error is shown
        to the user via st.error).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        data = {}
        for class_name in selected_classes:
            elements = soup.find_all(class_=class_name)
            data[class_name] = [elem.text.strip() for elem in elements]
        return data
    except Exception as e:
        # Broad catch is deliberate: this is the UI boundary, so any
        # network/HTTP/parse failure is surfaced as a banner, not a crash.
        st.error(f"Error: {e}")
        return None
# Function to scrape data from a given URL and extract specified classes from all pages
def scrape_data_from_all_pages(base_url, selected_classes, total_pages):
    """Scrape every page of a paginated site and merge the results.

    Pages are addressed as ``{base_url}?page={n}`` for n = 1..total_pages.
    Per-class text lists from each page are concatenated in page order;
    pages that fail to scrape are simply skipped.
    """
    merged = {}
    for page_number in range(1, total_pages + 1):
        page_url = f"{base_url}?page={page_number}"
        page_data = scrape_data_from_single_page(page_url, selected_classes)
        if not page_data:
            continue  # failed or empty page — nothing to merge
        for class_name, values in page_data.items():
            merged.setdefault(class_name, []).extend(values)
    return merged
# Function to get available classes from the provided URL
def get_available_classes(url, timeout=10):
    """Return the distinct CSS class names used anywhere on *url*.

    Parameters:
        url (str): Page to inspect. An empty/blank value short-circuits to
            an empty list — the app calls this on every rerun, so before
            the user has typed a URL there must be no request (and no
            spurious error banner).
        timeout (float): Seconds to wait for the HTTP response (default 10)
            so a stalled host cannot hang the app.

    Returns:
        list[str]: Unique class names, or [] on empty input or any failure
        (failures are reported via st.error).
    """
    if not url:
        return []
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # Elements may carry several classes; flatten them all into one set.
        classes = set(chain.from_iterable(
            elem.get("class") for elem in soup.find_all(class_=True) if elem.get("class")
        ))
        return list(classes)
    except Exception as e:
        st.error(f"Error fetching available classes: {e}")
        return []
# Introduction section
def introduction():
    """Render the page title and a short welcome blurb."""
    welcome_text = (
        "Welcome to the Web Scraping App! This app allows you to extract data from web pages "
        "by specifying the URL and selecting HTML classes to scrape. No coding skills required!"
    )
    st.title("Web Scraping App")
    st.write(welcome_text)
# About section
def about():
    """Render the 'About' section describing what web scraping is."""
    about_text = (
        "Web scraping is a technique used to extract data from websites. This app simplifies the process "
        "and lets you scrape data easily. It uses the BeautifulSoup library for parsing HTML content and "
        "provides a user-friendly interface powered by Streamlit."
    )
    st.header("About")
    st.write(about_text)
# How it works section
def how_it_works():
    """Render the numbered usage steps for the app."""
    steps = (
        "1. Enter the URL of the website you want to scrape data from in the text box above.",
        "2. The app will fetch the available classes from the provided URL and display them as options.",
        "3. Select the classes you want to scrape data from using the multi-select dropdown.",
        "4. Click the 'Scrape' button to extract the data from the selected classes.",
        "5. The scraped data will be displayed in a table below.",
        "6. You can download the scraped data as a CSV file using the 'Download CSV' button.",
    )
    st.header("How It Works")
    for step in steps:
        st.write(step)
# Tips section
def tips():
    """Render a checklist of responsible-scraping tips."""
    tip_lines = (
        "1. Make sure you have proper permissions to scrape data from the website.",
        "2. Respect the website's terms of service and robots.txt file.",
        "3. Be considerate of the website's resources and avoid aggressive scraping.",
        "4. Test the app with different websites to ensure it works as expected.",
        "5. Regularly check and update the scraping code, as website structures may change.",
    )
    st.header("Tips for Successful Scraping")
    for tip in tip_lines:
        st.write(tip)
def contact():
    """Render author contact details for data-science project inquiries."""
    contact_lines = (
        "✔ Email: zuhaibbutt3@gmail.com",
        "✔ Linkedin: https://www.linkedin.com/in/zuhaib-hussain-butt-6628141a4/",
        "✔ Whatsapp: +923167609216",
    )
    st.header("Contact for data science projects")
    for line in contact_lines:
        st.write(line)
# Streamlit web app
def main():
    """Compose the page: informational sections, then the scraping controls.

    Flow: the user enters a URL, the app lists the CSS classes found on
    that page, the user picks classes and a page count, and 'Scrape'
    collects matching element texts into a table with a CSV download.
    """
    introduction()
    st.write("----")
    about()
    st.write("----")
    how_it_works()
    st.write("----")
    tips()
    st.write("----")
    contact()
    st.write("----")
    # User input: URL and classes to scrape
    st.write("Enter the URL of the website and select the classes to scrape.")
    url = st.text_input("URL:")
    # Only probe the site once the user has typed something; probing an
    # empty URL fires a doomed request and shows an error banner on load.
    available_classes = get_available_classes(url) if url else []
    selected_classes = st.multiselect("Select classes to scrape:", available_classes)
    # User input: Number of pages to scrape
    total_pages = st.number_input("Total Pages", min_value=1, value=1)
    if st.button("Scrape"):
        if url and selected_classes:
            data = scrape_data_from_all_pages(url, selected_classes, int(total_pages))
            if data:
                # Different classes usually match different numbers of
                # elements; pd.DataFrame(dict-of-lists) raises ValueError on
                # unequal lengths, so pad shorter columns with NaN via Series.
                df = pd.DataFrame({name: pd.Series(values) for name, values in data.items()})
                st.dataframe(df)
                # Create and download CSV file
                csv_file = df.to_csv(index=False)
                st.download_button(label="Download CSV", data=csv_file, file_name="scraped_data.csv", mime="text/csv", key="csv-download")
            else:
                st.warning("No data scraped.")
        else:
            st.warning("Please enter a valid URL and select at least one class.")
# Script entry point: render the Streamlit app when executed directly
# (e.g. via `streamlit run app.py`); importing the module has no side effects.
if __name__ == "__main__":
    main()