main.py

import concurrent.futures
import io

import pandas as pd
import streamlit as st

from ai_handler import process_content_with_ai
from scrape import scrape_website, clean_body_content, split_dom_content
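
# NOTE (assumed interfaces; these local modules are not shown in this file):
# scrape_website(url) is expected to return the page's raw HTML (or a falsy
# value on failure), clean_body_content() to strip tags and boilerplate down
# to readable text, split_dom_content() to break that text into model-sized
# chunks, and process_content_with_ai(chunk, query) to return the model's
# answer as "Key: Value" lines, which convert_to_dataframe() parses below.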


def convert_to_dataframe(results):
    """Convert real estate data to a structured DataFrame."""
    data = []
    for idx, content in results:
        # Initialize default values
        row = {
            'Section': f'Part {idx}',
            'Location': '',
            'Price': '',
            'Property Type': '',
            'Size': '',
            'Developer': ''
        }
        # Parse the structured content line by line ("Key: Value")
        lines = content.split('\n')
        for line in lines:
            if ':' in line:
                key, value = line.split(':', 1)
                key = key.strip()
                value = value.strip()
                # Only keep keys we know about; ignore anything else
                if key in row:
                    row[key] = value
        data.append(row)
    return pd.DataFrame(data)
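
# Illustrative example only (hypothetical values): one AI chunk formatted as
# "Key: Value" lines becomes one row, with unrecognized keys dropped and
# missing fields left blank, e.g.
#   convert_to_dataframe([(1, "Location: Dubai Marina\nPrice: AED 1.2M\nNotes: x")])
# yields a single row with Section='Part 1', Location='Dubai Marina',
# Price='AED 1.2M', and empty Property Type / Size / Developer ('Notes' is ignored).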

# Streamlit UI
st.title("AI Web Scraper")

# Initialize session state for storing results
if 'results' not in st.session_state:
    st.session_state.results = []
if 'cleaned_content' not in st.session_state:
    st.session_state.cleaned_content = None

# Input URL
url = st.text_input("Enter Website URL")

if url:
    if st.button("Scrape Website"):
        with st.spinner("Scraping website..."):
            # Scrape website
            dom_content = scrape_website(url)
            if dom_content:
                # Clean the content
                st.session_state.cleaned_content = clean_body_content(dom_content)
                st.success("Website scraped successfully!")
            else:
                st.error("Failed to scrape the website. Please try again.")

# Create tabs for different views
if st.session_state.cleaned_content is not None:
    content_tab, analysis_tab = st.tabs(["Raw Content", "AI Analysis"])

    with content_tab:
        st.text_area("DOM Content", st.session_state.cleaned_content, height=300)

    with analysis_tab:
        # Get user query
        user_query = st.text_area("What would you like to know about this content?")

        if user_query and st.button("Analyze Content"):
            # Split content into chunks
            content_chunks = split_dom_content(st.session_state.cleaned_content)
            progress_bar = st.progress(0)
            status_text = st.empty()

            # Process chunks in parallel. Map each future back to its chunk's
            # original position so results are labeled and sorted in document
            # order rather than in completion order (as_completed yields
            # futures in whatever order they finish).
            with concurrent.futures.ThreadPoolExecutor() as executor:
                future_to_idx = {
                    executor.submit(process_content_with_ai, chunk, user_query): idx
                    for idx, chunk in enumerate(content_chunks, 1)
                }

                # Collect results as they complete
                st.session_state.results = []
                for done, future in enumerate(concurrent.futures.as_completed(future_to_idx), 1):
                    st.session_state.results.append((future_to_idx[future], future.result()))

                    # Update progress
                    progress_bar.progress(done / len(content_chunks))
                    status_text.text(f"Processing chunk {done}/{len(content_chunks)}")

            # Clear progress indicators
            progress_bar.empty()
            status_text.empty()
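
            # Caveat (standard Streamlit behavior): every widget interaction
            # reruns the script from the top, and st.button() returns True only
            # on the run in which it was clicked. Clicking a download button
            # below therefore triggers a rerun that skips this branch and the
            # results vanish from the page; the data itself survives in
            # st.session_state.results and could be re-rendered outside this branch.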

            # Display results
            st.success("Analysis complete!")

            # Create a DataFrame from results
            df = convert_to_dataframe(st.session_state.results)

            # Add export buttons
            col1, col2 = st.columns(2)

            # CSV export
            csv = df.to_csv(index=False).encode('utf-8')
            with col1:
                st.download_button(
                    label="Download CSV",
                    data=csv,
                    file_name="analysis_results.csv",
                    mime="text/csv"
                )

            # Excel export (requires the optional openpyxl dependency)
            buffer = io.BytesIO()
            with pd.ExcelWriter(buffer, engine='openpyxl') as writer:
                df.to_excel(writer, index=False, sheet_name='Analysis Results')
            with col2:
                st.download_button(
                    label="Download Excel",
                    data=buffer.getvalue(),
                    file_name="analysis_results.xlsx",
                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                )

            # Display results in expandable sections, sorted by chunk order
            for idx, result in sorted(st.session_state.results):
                with st.expander(f"Analysis Part {idx}"):
                    st.markdown(result)
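
# To run locally (assuming this repo's scrape.py and ai_handler.py are present):
#   pip install streamlit pandas openpyxl
#   streamlit run main.py
# openpyxl is only needed for the Excel export above.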