Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 15 additions & 66 deletions legacy/LinkedinScrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,59 +4,42 @@
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support import expected_conditions as expect
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
import random
import re
import pandas as pd
import numpy as np
import os

from datetime import datetime
def get_env_variable(var_name):
    """Return the value of the environment variable *var_name*.

    Args:
        var_name: Name of the environment variable to read.

    Returns:
        The variable's (non-empty) string value.

    Raises:
        ValueError: If the variable is unset or set to an empty string.
    """
    value = os.getenv(var_name)
    if value:
        return value
    raise ValueError(f"Environment variable {var_name} is not set")

from selenium.common.exceptions import (
ElementNotVisibleException,
ElementClickInterceptedException,
WebDriverException,
TimeoutException,
)

from os.path import dirname, join


# Credentials come from the environment so they are never typed in the clear
# or left in shell history; the remaining run parameters stay interactive.
username = get_env_variable('LINKEDIN_EMAIL')
password = get_env_variable('LINKEDIN_PASSWORD')
file_name = input('Enter your file name : ')

search_query = input('Enter your search query : ')
# URL-encode spaces so the query can be embedded in the search URL below.
search_query = search_query.replace(" ", "%20")

place_name = input('Enter target place name : ')

options = webdriver.ChromeOptions()
# Suppress the noisy "DevTools listening" / USB logging on Windows consoles.
options.add_experimental_option('excludeSwitches', ['enable-logging'])

# webdriver-manager downloads a chromedriver matching the installed Chrome;
# version_main allows to specify your chrome version instead of following
# chrome global version.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.maximize_window()



def loging():
    """Log in to LinkedIn with the module-level ``username``/``password``.

    Opens the LinkedIn landing page, waits up to 10 s for the e-mail field
    of the inline sign-in form to become visible, fills in both credential
    fields and submits the form.  Relies on the module-level ``driver``.
    """
    driver.get('https://www.linkedin.com')
    # Wait for the sign-in form before typing; the page loads asynchronously.
    WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.XPATH, '//input[@id="session_key"]')))
    driver.find_element(By.XPATH, '//input[@id="session_key"]').send_keys(username)
    driver.find_element(By.XPATH, "//input[@id='session_password']").send_keys(password)
    driver.find_element(By.XPATH, "//button[contains(text(),'Sign in')]").click()



def scrap_available_profie():
Expand All @@ -67,11 +50,11 @@ def scrap_available_profie():
count = 0

driver.get('https://www.linkedin.com/search/results/PEOPLE/?keywords='+search_query)
WebDriverWait(driver, 10).until(expect.visibility_of_element_located((By.XPATH, '//button[text()="Locations"]')))
WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.XPATH, '//button[text()="Locations"]')))

driver.find_element(By.XPATH, '//button[text()="Locations"]').click()

WebDriverWait(driver, 10).until(expect.visibility_of_element_located((By.XPATH, "//input[@placeholder='Add a location']")))
WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.XPATH, "//input[@placeholder='Add a location']")))
driver.find_element(By.XPATH, "//input[@placeholder='Add a location']").send_keys(place_name)
time.sleep(2)
driver.find_element(By.XPATH, "//*[contains(@id, 'basic-result-')]").click()
Expand All @@ -81,44 +64,30 @@ def scrap_available_profie():
time.sleep(4)
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(2)
# html = driver.page_source
# soup = BeautifulSoup(html)
# with open("output4.html", "w", encoding = 'utf-8') as file:
# file.write(str(soup.prettify()))
# pegination = driver.find_element(By.CSS_SELECTOR, '.artdeco-pagination.artdeco-pagination--has-controls.ember-view.pv5.ph2')
# list = driver.find_element(By.CLASS_NAME, 'artdeco-pagination__indicator')

html = driver.page_source
soup = BeautifulSoup(html, features="html.parser")

for ultag in soup.find_all('ul'):
for litag in ultag.find_all('li'):
#print(litag.text)
li_text = litag.text
li_text = li_text.strip()
if(li_text.isnumeric()):
total_pages.append(litag.text)

for search_rslt_tag in soup.find_all("div", {"class": "ph0 pv2 artdeco-card mb2"}):
for a in search_rslt_tag.find_all('a', href=True):
#print ("Found the URL:", a['href'])
if (count % 2) == 0:
Linkedin_link.append(a['href'])

count = count + 1

for designation in search_rslt_tag.find_all('div', {"class":"entity-result__primary-subtitle t-14 t-black t-normal"}):
#print (designation.text)
Linked_in_designation.append(designation.text)


if(len(total_pages) == 0):
last_page = 1
else:
last_page = int(re.search(r'\d+', total_pages[len(total_pages)-1]).group())
#last_page = 2

print(driver.current_url)

for x in range(2, last_page+1):
time.sleep(10)
Expand All @@ -129,40 +98,29 @@ def scrap_available_profie():
time.sleep(4)
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(2)
# html = driver.page_source
# soup = BeautifulSoup(html)
# with open("output4.html", "w", encoding = 'utf-8') as file:
# file.write(str(soup.prettify()))
# pegination = driver.find_element(By.CSS_SELECTOR, '.artdeco-pagination.artdeco-pagination--has-controls.ember-view.pv5.ph2')
# list = driver.find_element(By.CLASS_NAME, 'artdeco-pagination__indicator')

html = driver.page_source
soup = BeautifulSoup(html, features="html.parser")

for ultag in soup.find_all('ul'):
for litag in ultag.find_all('li'):
#print(litag.text)
li_text = litag.text
li_text = li_text.strip()
if(li_text.isnumeric()):
total_pages.append(litag.text)

for search_rslt_tag in soup.find_all("div", {"class": "ph0 pv2 artdeco-card mb2"}):
for a in search_rslt_tag.find_all('a', href=True):
#print ("Found the URL:", a['href'])
if (count % 2) == 0:
Linkedin_link.append(a['href'])

count = count + 1

for designation in search_rslt_tag.find_all('div', {"class":"entity-result__primary-subtitle t-14 t-black t-normal"}):
#print (designation.text)
if (count2 % 2) == 0:
Linked_in_designation.append(designation.text)
else:
count2 = count2 + 1


Linkedin_link = list(dict.fromkeys(Linkedin_link))
Linked_in_designation = list(dict.fromkeys(Linked_in_designation))

Expand All @@ -171,10 +129,6 @@ def scrap_available_profie():
if 'headless?' not in L_l:
clean_linkedin_links.append(L_l)

print(len(clean_linkedin_links))
print(len(Linked_in_designation))


a = np.array(clean_linkedin_links)
b = np.array(Linked_in_designation)

Expand All @@ -184,11 +138,6 @@ def scrap_available_profie():
df2 = pd.DataFrame({"Designation" : b})
df2.to_csv(file_name+"_Designation.csv", index=False)





loging()
# Give LinkedIn time to finish the post-login redirect (and any security
# interstitial) before navigating to the search results.
time.sleep(15)
scrap_available_profie()