from analize_module import analisis, elaborate, reset
from bs4 import BeautifulSoup
import requests
import re
import sys
# ===========================================================
# shared request headers: a desktop browser User-Agent, sent with every request
HEADERS = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36 Vivaldi/5.3.2679.70.'}
# ===========================================================
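# build_dict assumes list_of_keys.txt contains one keyword per line, e.g.
#   python
#   sql
#   pandas
# (illustrative contents; the real list ships alongside the module)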
def build_dict():
    reset()
    default_dict = {}
    with open("list_of_keys.txt", "r") as file:
        for line in file:
            default_dict[line.strip()] = 0
    return default_dict
# ===========================================================
def search(country, place, job_search, user_dict=None, page=1):
    # ids of listings already analysed, so duplicates across pages are skipped
    searched_ids = set()
    default_dict = build_dict()
    # if a user dict is given, merge it in (None default avoids a mutable default argument)
    if user_dict:
        default_dict.update(user_dict)
    jobList = extract_from_page(country=country, place=place, job_search=job_search, page=page)
    if not jobList:
        return "400 - Invalid search"
    else:
        # if the job id has not been seen yet, pull the description from its link
        for j in jobList:
            if j['id'] not in searched_ids:
                # add it to the set
                searched_ids.add(j['id'])
                # pull the listing for the offer
                page_soup = pull_listing_data(f'http://{country}.indeed.com' + j['job_link'])
                # call func to get description from the soup
                description = get_description(page_soup)
                # run analisis() on every job "id" page that has not yet been analyzed
                analisis(description, default_dict)
        # after parsing all listings, launch elaborate()
        elaborate()
# ===========================================================
# formats user input to match url specifications
def format_entry(entry):
    # strip leading/trailing whitespace, then collapse inner whitespace runs into '+'
    formatted = re.sub(r"\s+", '+', entry.strip())
    return formatted.lower()
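# example (hypothetical input): format_entry("  Data   Scientist ") -> "data+scientist"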
# ===========================================================
def extract_from_page(country, place, job_search, page):
    # empty job list to be filled with dicts
    jobList = []
    if page == 1:
        # indeed serves the first results page at start=0
        jobList = transform(extract(country=country, page=0, place=place, job_search=job_search))
    # if a page argument > 1 is given, loop over every page and extend jobList with all the job dictionaries
    elif page > 1:
        # indeed paginates by offsets of 10, so page n ends at offset (n-1)*10
        page = page * 10
        for p in range(0, page, 10):
            page_jobs = transform(extract(country=country, page=p, place=place, job_search=job_search))
            # transform() may return None when a page has no job cards
            if page_jobs:
                jobList.extend(page_jobs)
    if not jobList:
        return None
    else:
        return jobList
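# e.g. page=3 walks the offsets start=0, 10, 20, i.e. indeed's result pages 1-3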
# ===========================================================
# returns the parsed DOM of the results page
def extract(country, page, place, job_search):
    # page 1 starts at start=0, then the offset increments by 10
    url = f'http://{country}.indeed.com/jobs?q={job_search}&l={place}&start={page}&vjk=ab0f880e61368268'
    # url_usa = f'https://www.indeed.com/jobs?q={job_search}&l={place}&start={page}&vjk=ab0f880e61368268'
    r = requests.get(url, headers=HEADERS)
    if r.status_code != 200:
        sys.exit(f"Request returned <{r.status_code}>")
    else:
        # returns the DOM object
        soup = BeautifulSoup(r.content, 'html.parser')
        return soup
# ===========================================================
# gets all the job offer divs
def transform(soup):
    jobList = []
    # all the job card divs
    divs = soup.find_all('div', class_='job_seen_beacon')
    if not divs:
        return None
    else:
        for item in divs:
            # the first <a> tag holds the title text, the listing id and the link
            anchor = item.find('a')
            jobTitle = anchor.text.strip()
            companyName = item.find('span', class_='companyName').text.strip()
            # named job_id to avoid shadowing the id() builtin
            job_id = anchor.get('id')
            # get the link from the <a> href attribute
            job_link = anchor.get('href')
            # create a job dictionary
            job = {
                'id': job_id,
                'title': jobTitle,
                'company': companyName,
                'job_link': job_link
            }
            # every loop appends a dictionary to the list
            jobList.append(job)
        return jobList
# ===========================================================
# extracts the DOM from every job link page
def pull_listing_data(job_link):
    r = requests.get(job_link, headers=HEADERS)
    pageSoup = BeautifulSoup(r.content, 'html.parser')
    return pageSoup
# ===========================================================
# returns the text for the job offer description
def get_description(jobSoup):
    description = jobSoup.find('div', {'id': 'jobDescriptionText'}).text
    return description.strip().lower()
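# ===========================================================
# minimal usage sketch (assumed entry point; the query values below are
# illustrative, and analize_module must be importable for this to run):
if __name__ == "__main__":
    job = format_entry("data scientist")
    city = format_entry("london")
    # scrape the first two result pages and analyse every unique listing
    search(country="uk", place=city, job_search=job, page=2)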