-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
153 lines (113 loc) · 4.58 KB
/
Copy pathscraper.py
File metadata and controls
153 lines (113 loc) · 4.58 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import requests
import pandas as pd
from datetime import datetime
# IMPORTANT: The Remotive API returns JSON, not HTML!
def scrape_jobs(max_jobs=100):
    """
    Fetch remote job listings from the Remotive API.

    Sends a single GET request to the Remotive remote-jobs endpoint, reads
    the 'jobs' list from the JSON payload and flattens the first `max_jobs`
    entries into row dictionaries.

    Progress and errors are reported with print(). Network, HTTP and parsing
    failures are caught here instead of being raised, so the caller always
    gets back whatever was collected so far (possibly an empty list).

    Parameters:
    - max_jobs: Maximum number of jobs to process (default 100)

    Returns:
    - List of dictionaries (one dictionary per job)
    """
    all_jobs = []
    api_url = "https://remotive.com/api/remote-jobs"

    print("🔍 Scraping is going on...\n")

    try:
        print("Sending request to Remotive API...")
        response = requests.get(api_url, timeout=10)

        # Guard clause: anything other than 200 is reported and we bail out.
        if response.status_code != 200:
            print(f"❌ API Error! Status code: {response.status_code}")
            print(f"Response: {response.text[:200]}")
            return all_jobs

        print("API responded successfully!\n")

        # response.json() turns the JSON body into Python objects.
        data = response.json()

        # `or []` also covers an explicit null under 'jobs', which a plain
        # .get() default would pass through as None.
        jobs_data = data.get('jobs') or []
        print(f"Total jobs available: {len(jobs_data)}\n")

        jobs_to_process = jobs_data[:max_jobs]
        print(f"Processing {len(jobs_to_process)} jobs...\n")

        for idx, job in enumerate(jobs_to_process, 1):
            # Deliberate best effort: one malformed entry must not abort
            # the whole run, so it is reported and skipped.
            try:
                job_data = _build_job_record(job)
            except Exception as e:
                print(f"⚠️ Error in job {idx}: {e}")
                continue

            all_jobs.append(job_data)

            if idx % 10 == 0:
                print(
                    f" ✓ Processed {idx}/{len(jobs_to_process)} jobs...")

        print(f"\n Successfully scraped {len(all_jobs)} jobs!")

    except requests.exceptions.Timeout:
        print("❌ Request timeout - API too slow")
    except requests.exceptions.ConnectionError:
        print("❌ Connection error - Check your internet")
    except Exception as e:
        # Top-level boundary for this function: report and fall through to
        # return whatever was gathered.
        print(f"❌ Unexpected error: {e}")

    return all_jobs


def _build_job_record(job):
    """Flatten one job entry from the API payload into a row dictionary."""
    # `.get(key, default)` only applies the default when the key is missing;
    # `or` additionally covers keys that are present but null/empty, which
    # would otherwise break the slicing and join below.
    description = job.get('description') or 'N/A'
    if len(description) > 200:
        # Only mark the text as truncated when it actually was.
        description = description[:200] + '...'

    tags = job.get('tags') or []

    return {
        'Job Title': job.get('title', 'N/A'),
        'Company': job.get('company_name', 'N/A'),
        'Location': job.get('candidate_required_location', 'Worldwide'),
        'Job Type': job.get('job_type', 'N/A'),
        'Category': job.get('category', 'N/A'),
        # str() so a non-string tag cannot make join() raise.
        'Tags': ', '.join(str(tag) for tag in tags),
        'Salary': job.get('salary', 'Not specified'),
        'Description': description,
        'URL': job.get('url', 'N/A'),
        'Publication Date': job.get('publication_date', 'N/A'),
        'Scraped On': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    }
# STEP 2: Save the scraped data to a CSV file
def save_to_csv(jobs_list, filename='jobs_data.csv'):
    """
    Write the job dictionaries to a CSV file and return them as a DataFrame.

    Parameters:
    - jobs_list: List of job dictionaries
    - filename: Name/path of the output CSV file

    Returns:
    - The pandas DataFrame that was written, or None when jobs_list is
      empty (in which case no file is written).
    """
    # Nothing scraped: report it and skip writing a file.
    if not jobs_list:
        print("\n❌ NO Jobs found in scraping!")
        return None

    frame = pd.DataFrame(jobs_list)
    frame.to_csv(filename, index=False, encoding='utf-8')

    # Short confirmation banner listing the first five column names.
    divider = "=" * 70
    leading_columns = ', '.join(list(frame.columns)[:5])
    print("\n" + divider)
    print(f"{len(jobs_list)} jobs saved to {filename}")
    print(f"Columns: {leading_columns}...")
    print(divider)

    return frame
# MAIN EXECUTION
if __name__ == "__main__":
    # Script entry point: scrape jobs, save them to CSV, then print a short
    # preview and a few summary statistics.
    print("\n" + "=" * 70)
    print("JOB SCRAPER STARTING - REMOTIVE API")
    print("=" * 70 + "\n")

    # Scrape up to 50 jobs.
    jobs = scrape_jobs(max_jobs=50)

    # Save to CSV; returns None when nothing was scraped.
    df = save_to_csv(jobs)

    if df is not None:
        print("\n" + "=" * 70)
        print("PREVIEW (First 5 jobs):")
        print("=" * 70)
        print(df[['Job Title', 'Company', 'Category', 'Job Type']].head())

        # The columns can contain None/NaN when the API sends null fields;
        # str.join() would raise on those, so drop missing values and
        # coerce the rest to str before joining.
        categories = df['Category'].dropna().astype(str).unique()[:5]
        job_types = df['Job Type'].dropna().astype(str).unique()

        print("\n" + "=" * 70)
        print("QUICK STATS:")
        print("=" * 70)
        print(f"Total Jobs: {len(df)}")
        print(f"Unique Companies: {df['Company'].nunique()}")
        print(f"Categories: {', '.join(categories)}")
        print(f"Job Types: {', '.join(job_types)}")

        # Completion banner is shown only when data was actually saved.
        print("\n" + "=" * 70)
        print("DONE! Now run analyzer.py")
        print("=" * 70 + "\n")