This repository was archived by the owner on Apr 19, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwebsite_parser.py
More file actions
41 lines (37 loc) · 2.01 KB
/
website_parser.py
File metadata and controls
41 lines (37 loc) · 2.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
from website_downloader import WebsiteDownloader
from bs4 import BeautifulSoup
from urllib.parse import urljoin
class WebsiteParser:
    """Download the DOM of a website and parse it for the website_generator.

    On construction the page at ``properties.source_url`` is downloaded,
    the tags named in ``properties.tags_to_download`` are collected into
    ``html_tags_dict`` and the URL-carrying attributes of those tags are
    rewritten as absolute URLs.

    Attributes:
        properties: Settings object; this class reads its ``source_url``
            and ``tags_to_download`` attributes.
        downloader: The ``WebsiteDownloader`` used to fetch the page.
        html: Whatever ``downloader.download`` returned for the source URL
            (handed to BeautifulSoup as the markup to parse).
        html_tags_dict: Maps each requested tag name to the result of
            ``soup.find_all`` for that tag.
    """

    # Attributes whose value is treated as a URL and made absolute.
    URL_ATTRIBUTES = ("href", "src", "data-src", "data-src-high")

    def __init__(self, properties):
        """Download ``properties.source_url`` and parse it immediately.

        Args:
            properties: Object exposing ``source_url`` and
                ``tags_to_download``.
        """
        self.properties = properties
        self.downloader = WebsiteDownloader()
        self.html = self.downloader.download(properties.source_url)
        self.html_tags_dict = {}
        self.parse()

    def parse(self):
        """Collect the requested tags and make their URLs absolute.

        Fills ``html_tags_dict`` with one entry per name in
        ``properties.tags_to_download`` and then runs
        ``complete_relative_url`` over everything that was collected.
        """
        soup = BeautifulSoup(self.html, "html.parser")
        for tag_name in self.properties.tags_to_download:
            # tag_name is an HTML tag name, e.g. "img"
            self.html_tags_dict[tag_name] = soup.find_all(tag_name)
        # One pass after collecting is enough: it visits every stored tag.
        self.complete_relative_url()

    def complete_relative_url(self):
        """Rewrite relative URLs of all collected elements in place.

        For every element in ``html_tags_dict`` each attribute listed in
        ``URL_ATTRIBUTES`` that is present and non-empty is replaced by
        ``urljoin(properties.source_url, value)``. Missing or empty
        attributes are left untouched.
        """
        base_url = self.properties.source_url
        for elements in self.html_tags_dict.values():
            for element in elements:
                for attribute in self.URL_ATTRIBUTES:
                    value = element.get(attribute)
                    if value is None or value == "":
                        continue
                    element[attribute] = urljoin(base_url, value)