-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcontentparser.py
More file actions
84 lines (68 loc) · 3.49 KB
/
contentparser.py
File metadata and controls
84 lines (68 loc) · 3.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
from bs4 import BeautifulSoup
import bcolors
import sys
class ContentParser():
    """Extract the title, preview metadata and visible text of an HTML page.

    Three families of ``<meta>`` tags are read:

    * Open Graph   -- ``<meta property="og:...">``
    * microdata    -- ``<meta itemprop="...">``
    * plain HTML   -- ``<meta name="...">``

    The values found by the most recent :meth:`content_parsing` call are kept
    on the instance; every field is an empty string when the page does not
    provide it.
    """

    # Elements whose text is markup or page chrome rather than page content.
    _NON_CONTENT_TAGS = ["script", "style", "nav", "form", "footer",
                         "noscript", "header"]

    def __init__(self):
        self._reset()

    def _reset(self):
        """Set every extracted field back to the empty string."""
        self.title = ''
        self.property_og_title = ''
        self.property_og_url = ''
        self.property_og_description = ''
        self.property_og_image = ''
        self.itemprop_name = ''
        self.itemprop_img = ''
        self.itemprop_description = ''
        self.name_img = ''
        self.name_description = ''
        self.name_author = ''
        self.body = ''

    @staticmethod
    def _meta_content(tag, default=''):
        """Return the ``content`` attribute of *tag*, or *default*.

        *default* is returned both when the tag was not found (``None``) and
        when the tag exists but carries no ``content`` attribute, so a
        malformed ``<meta>`` cannot abort the rest of the extraction.
        """
        if tag is None:
            return default
        value = tag.get("content")
        return default if value is None else value

    @staticmethod
    def _normalize_whitespace(text):
        """Collapse *text* into single-space separated, non-empty chunks."""
        # Strip leading and trailing space on each line.
        lines = (line.strip() for line in text.splitlines())
        # Break each line on spaces so runs of spaces yield empty chunks.
        chunks = (chunk.strip() for line in lines for chunk in line.split(" "))
        # Drop the empty chunks (blank lines, repeated spaces).
        return ' '.join(chunk for chunk in chunks if chunk)

    def _parse_metadata(self, soup):
        """Fill the title and ``<meta>`` derived attributes from *soup*."""
        # A page may have no <title>, or one whose .string is None (empty
        # element or nested markup); keep the '' default in both cases.
        title_tag = soup.title
        if title_tag is not None and title_tag.string is not None:
            self.title = title_tag.string

        def meta(attribute, value):
            return self._meta_content(
                soup.find("meta", attrs={attribute: value}))

        self.property_og_title = meta("property", "og:title")
        self.property_og_url = meta("property", "og:url")
        og_description = meta("property", "og:description")
        og_image = meta("property", "og:image")

        self.itemprop_name = meta("itemprop", "name")
        self.itemprop_description = meta("itemprop", "description")
        self.itemprop_img = meta("itemprop", "image")

        self.name_author = meta("name", "author")
        self.name_description = meta("name", "description")
        self.name_img = meta("name", "image")

        # The Open Graph value wins when present; otherwise fall back to the
        # microdata value and then the plain <meta name=...> value so pages
        # without Open Graph tags still report a description and an image.
        self.property_og_description = (og_description
                                        or self.itemprop_description
                                        or self.name_description)
        self.property_og_image = (og_image
                                  or self.itemprop_img
                                  or self.name_img)

    def content_parsing(self, context):
        """Parse HTML markup and return its metadata and visible text.

        Args:
            context: HTML markup, handed to ``BeautifulSoup`` with the
                ``lxml`` parser.

        Returns:
            A dict with the keys ``title``, ``property_og_title``,
            ``property_og_url``, ``property_og_description``,
            ``property_og_image``, ``itemprop_name``, ``name_author`` and
            ``body``. Fields missing from the page are empty strings.

        A progress line is written to stdout. Failures while reading the
        metadata are reported on stdout and leave ``title`` empty; the body
        text is still extracted.
        """
        bc = bcolors.bcolors()
        r = BeautifulSoup(context, 'lxml')
        sys.stdout.write("\033[K")
        sys.stdout.write('[*] Parsing Contents...\r')
        sys.stdout.flush()

        # Start from a clean slate so values from a previously parsed page
        # never leak into this result when the instance is reused.
        self._reset()
        try:
            self._parse_metadata(r)
        except Exception:
            # Best effort: metadata is optional, so report and carry on.
            print('%s[!] Content Parsing Error - Unknown Elements%s\r' %
                  (bc.OKBLUE, bc.ENDC))
            self.title = ''

        # Remove non-content elements before collecting the visible text.
        for element in r(self._NON_CONTENT_TAGS):
            element.decompose()
        self.body = self._normalize_whitespace(r.get_text())

        return {'title': self.title,
                'property_og_title': self.property_og_title,
                'property_og_url': self.property_og_url,
                'property_og_description': self.property_og_description,
                'property_og_image': self.property_og_image,
                'itemprop_name': self.itemprop_name,
                'name_author': self.name_author,
                'body': self.body}