main.py

import concurrent.futures
import io

import pandas as pd
import streamlit as st

from ai_handler import process_content_with_ai
from scrape import scrape_website, clean_body_content, split_dom_content
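
# NOTE (assumed interfaces; these local modules are not shown in this file):
# scrape_website(url) is expected to return the page's raw HTML (or a falsy
# value on failure), clean_body_content() to strip tags and boilerplate down
# to readable text, split_dom_content() to break that text into model-sized
# chunks, and process_content_with_ai(chunk, query) to return the model's
# answer as "Key: Value" lines, which convert_to_dataframe() parses below.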


def convert_to_dataframe(results):
    """Convert real estate data to a structured DataFrame."""
    data = []
    for idx, content in results:
        # Initialize default values
        row = {
            'Section': f'Part {idx}',
            'Location': '',
            'Price': '',
            'Property Type': '',
            'Size': '',
            'Developer': ''
        }
        # Parse the structured content line by line ("Key: Value")
        lines = content.split('\n')
        for line in lines:
            if ':' in line:
                key, value = line.split(':', 1)
                key = key.strip()
                value = value.strip()
                # Only keep keys we know about; ignore anything else
                if key in row:
                    row[key] = value
        data.append(row)
    return pd.DataFrame(data)
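
# Illustrative example only (hypothetical values): one AI chunk formatted as
# "Key: Value" lines becomes one row, with unrecognized keys dropped and
# missing fields left blank, e.g.
#   convert_to_dataframe([(1, "Location: Dubai Marina\nPrice: AED 1.2M\nNotes: x")])
# yields a single row with Section='Part 1', Location='Dubai Marina',
# Price='AED 1.2M', and empty Property Type / Size / Developer ('Notes' is ignored).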

# Streamlit UI
st.title("AI Web Scraper")

# Initialize session state for storing results
if 'results' not in st.session_state:
    st.session_state.results = []
if 'cleaned_content' not in st.session_state:
    st.session_state.cleaned_content = None

# Input URL
url = st.text_input("Enter Website URL")

if url:
    if st.button("Scrape Website"):
        with st.spinner("Scraping website..."):
            # Scrape website
            dom_content = scrape_website(url)
            if dom_content:
                # Clean the content
                st.session_state.cleaned_content = clean_body_content(dom_content)
                st.success("Website scraped successfully!")
            else:
                st.error("Failed to scrape the website. Please try again.")

# Create tabs for different views
if st.session_state.cleaned_content is not None:
    content_tab, analysis_tab = st.tabs(["Raw Content", "AI Analysis"])

    with content_tab:
        st.text_area("DOM Content", st.session_state.cleaned_content, height=300)

    with analysis_tab:
        # Get user query
        user_query = st.text_area("What would you like to know about this content?")

        if user_query and st.button("Analyze Content"):
            # Split content into chunks
            content_chunks = split_dom_content(st.session_state.cleaned_content)
            progress_bar = st.progress(0)
            status_text = st.empty()

            # Process chunks in parallel. Map each future back to its chunk's
            # original position so results are labeled and sorted in document
            # order rather than in completion order (as_completed yields
            # futures in whatever order they finish).
            with concurrent.futures.ThreadPoolExecutor() as executor:
                future_to_idx = {
                    executor.submit(process_content_with_ai, chunk, user_query): idx
                    for idx, chunk in enumerate(content_chunks, 1)
                }

                # Collect results as they complete
                st.session_state.results = []
                for done, future in enumerate(concurrent.futures.as_completed(future_to_idx), 1):
                    st.session_state.results.append((future_to_idx[future], future.result()))

                    # Update progress
                    progress_bar.progress(done / len(content_chunks))
                    status_text.text(f"Processing chunk {done}/{len(content_chunks)}")

            # Clear progress indicators
            progress_bar.empty()
            status_text.empty()
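
            # Caveat (standard Streamlit behavior): every widget interaction
            # reruns the script from the top, and st.button() returns True only
            # on the run in which it was clicked. Clicking a download button
            # below therefore triggers a rerun that skips this branch and the
            # results vanish from the page; the data itself survives in
            # st.session_state.results and could be re-rendered outside this branch.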

            # Display results
            st.success("Analysis complete!")

            # Create a DataFrame from results
            df = convert_to_dataframe(st.session_state.results)

            # Add export buttons
            col1, col2 = st.columns(2)

            # CSV export
            csv = df.to_csv(index=False).encode('utf-8')
            with col1:
                st.download_button(
                    label="Download CSV",
                    data=csv,
                    file_name="analysis_results.csv",
                    mime="text/csv"
                )

            # Excel export (requires the optional openpyxl dependency)
            buffer = io.BytesIO()
            with pd.ExcelWriter(buffer, engine='openpyxl') as writer:
                df.to_excel(writer, index=False, sheet_name='Analysis Results')
            with col2:
                st.download_button(
                    label="Download Excel",
                    data=buffer.getvalue(),
                    file_name="analysis_results.xlsx",
                    mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                )

            # Display results in expandable sections, sorted by chunk order
            for idx, result in sorted(st.session_state.results):
                with st.expander(f"Analysis Part {idx}"):
                    st.markdown(result)
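
# To run locally (assuming this repo's scrape.py and ai_handler.py are present):
#   pip install streamlit pandas openpyxl
#   streamlit run main.py
# openpyxl is only needed for the Excel export above.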