from analize_module import analisis, elaborate, reset
from bs4 import BeautifulSoup
import requests
import re
import sys
# ===========================================================
# shared request headers: a desktop browser User-Agent, sent with every request
HEADERS = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36 Vivaldi/5.3.2679.70.'}
# ===========================================================
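# build_dict assumes list_of_keys.txt contains one keyword per line, e.g.
#   python
#   sql
#   pandas
# (illustrative contents; the real list ships alongside the module)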
def build_dict():
    reset()
    default_dict = {}
    with open("list_of_keys.txt", "r") as file:
        for line in file:
            default_dict[line.strip()] = 0
    return default_dict
# ===========================================================
def search(country, place, job_search, user_dict=None, page=1):
    # ids of listings already analysed, so duplicates across pages are skipped
    searched_ids = set()
    default_dict = build_dict()
    # if a user dict is given, merge it in (None default avoids a mutable default argument)
    if user_dict:
        default_dict.update(user_dict)
    jobList = extract_from_page(country=country, place=place, job_search=job_search, page=page)
    if not jobList:
        return "400 - Invalid search"
    else:
        # if the job id has not been seen yet, pull the description from its link
        for j in jobList:
            if j['id'] not in searched_ids:
                # add it to the set
                searched_ids.add(j['id'])
                # pull the listing for the offer
                page_soup = pull_listing_data(f'http://{country}.indeed.com' + j['job_link'])
                # call func to get description from the soup
                description = get_description(page_soup)
                # run analisis() on every job "id" page that has not yet been analyzed
                analisis(description, default_dict)
        # after parsing all listings, launch elaborate()
        elaborate()
# ===========================================================
# formats user input to match url specifications
def format_entry(entry):
    # strip leading/trailing whitespace, then collapse inner whitespace runs into '+'
    formatted = re.sub(r"\s+", '+', entry.strip())
    return formatted.lower()
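# example (hypothetical input): format_entry("  Data   Scientist ") -> "data+scientist"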
# ===========================================================
def extract_from_page(country, place, job_search, page):
    # empty job list to be filled with dicts
    jobList = []
    if page == 1:
        # indeed serves the first results page at start=0
        jobList = transform(extract(country=country, page=0, place=place, job_search=job_search))
    # if a page argument > 1 is given, loop over every page and extend jobList with all the job dictionaries
    elif page > 1:
        # indeed paginates by offsets of 10, so page n ends at offset (n-1)*10
        page = page * 10
        for p in range(0, page, 10):
            page_jobs = transform(extract(country=country, page=p, place=place, job_search=job_search))
            # transform() may return None when a page has no job cards
            if page_jobs:
                jobList.extend(page_jobs)
    if not jobList:
        return None
    else:
        return jobList
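# e.g. page=3 walks the offsets start=0, 10, 20, i.e. indeed's result pages 1-3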
# ===========================================================
# returns the parsed DOM of the results page
def extract(country, page, place, job_search):
    # page 1 starts at start=0, then the offset increments by 10
    url = f'http://{country}.indeed.com/jobs?q={job_search}&l={place}&start={page}&vjk=ab0f880e61368268'
    # url_usa = f'https://www.indeed.com/jobs?q={job_search}&l={place}&start={page}&vjk=ab0f880e61368268'
    r = requests.get(url, headers=HEADERS)
    if r.status_code != 200:
        sys.exit(f"Request returned <{r.status_code}>")
    else:
        # returns the DOM object
        soup = BeautifulSoup(r.content, 'html.parser')
        return soup
# ===========================================================
# gets all the job offer divs
def transform(soup):
    jobList = []
    # all the job card divs
    divs = soup.find_all('div', class_='job_seen_beacon')
    if not divs:
        return None
    else:
        for item in divs:
            # the first <a> tag holds the title text, the listing id and the link
            anchor = item.find('a')
            jobTitle = anchor.text.strip()
            companyName = item.find('span', class_='companyName').text.strip()
            # named job_id to avoid shadowing the id() builtin
            job_id = anchor.get('id')
            # get the link from the <a> href attribute
            job_link = anchor.get('href')
            # create a job dictionary
            job = {
                'id': job_id,
                'title': jobTitle,
                'company': companyName,
                'job_link': job_link
            }
            # every loop appends a dictionary to the list
            jobList.append(job)
        return jobList
# ===========================================================
# extracts the DOM from every job link page
def pull_listing_data(job_link):
    r = requests.get(job_link, headers=HEADERS)
    pageSoup = BeautifulSoup(r.content, 'html.parser')
    return pageSoup
# ===========================================================
# returns the text for the job offer description
def get_description(jobSoup):
    description = jobSoup.find('div', {'id': 'jobDescriptionText'}).text
    return description.strip().lower()
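# ===========================================================
# minimal usage sketch (assumed entry point; the query values below are
# illustrative, and analize_module must be importable for this to run):
if __name__ == "__main__":
    job = format_entry("data scientist")
    city = format_entry("london")
    # scrape the first two result pages and analyse every unique listing
    search(country="uk", place=city, job_search=job, page=2)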