Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ http://www.pip-installer.org/en/latest/
After installing them, you should be able to install the following packages: <br />
```bash
$ pip install nltk
$ pip install urllib
$ pip install lxml
```

Expand Down
52 changes: 26 additions & 26 deletions region.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,17 +73,17 @@ def find_node_text(self,node):
"""
node_text = "\n"
for des in node.iter():
try:
if des.text is not None and (des.tag not in ['script', 'style','h1','h2']\
and not isinstance(des, HtmlComment) and re.sub(r"\n|\r|\t| |,|;|\.","",des.text) != ""\
or (des.tag == "p")) and des.text.find(".") > -1 and node_text.find(des.text_content()) == -1:
node_text += re.sub(r"\n|\r|\t","",des.text_content())
if des.tag == 'p':
node_text += "\n"
elif des.tail is not None and des.tag in ['table']:
node_text += des.tail
except:
node_text += ""
try:
if des.text is not None and (des.tag not in ['script', 'style','h1','h2']\
and not isinstance(des, HtmlComment) and re.sub(r"\n|\r|\t| |,|;|\.","",des.text) != ""\
or (des.tag == "p")) and des.text.find(".") > -1 and node_text.find(des.text_content()) == -1:
node_text += re.sub(r"\n|\r|\t","",des.text_content())
if des.tag == 'p':
node_text += "\n"
elif des.tail is not None and des.tag in ['table']:
node_text += des.tail
except:
node_text += ""

node_text = node_text.replace("&gt",">")
node_text = re.sub(r" |,,|--|==|<!--(.|\s)*?-->|<!\[CDATA(.|\s)*?\]\]>","",node_text)
Expand Down Expand Up @@ -117,7 +117,7 @@ def calculate_id(self):
Returns the id attribute of the node if it exists, otherwise it returns
an empty string.
"""
if self.root_node.attrib.has_key('id'):
if 'id' in self.root_node.attrib:
return self.root_node.attrib['id']
else:
return ""
Expand All @@ -127,26 +127,26 @@ def calculate_class_name(self):
Returns the CSS class attribute of the node if it exists, otherwise it returns
an empty string.
"""
if self.root_node.attrib.has_key('class'):
if 'class' in self.root_node.attrib:
return self.root_node.attrib['class']
else:
return ""

def _print(self):
# Uncomment the following for debugging of the regions.
"""
print Tcolors.CYAN + "[x] Region:", self.root
print "-----------------------------------------------------------------------------"
print Tcolors.ENDC + Tcolors.WARNING + " Tag:", self.root_node.tag
print " Class:", self.class_name
print " Id:", self.id
print " Level:", self.distance_from_root
print " Parts:", self.parts
print " Density:", self.density
print " Distance from max:", self.distance_from_max
print " Has title on ancestors:", self.has_title_at_ancestors(None, self.root_node)
print " Full text: \n", self.full_text
print "\n" + Tcolors.ENDC
print(Tcolors.CYAN + "[x] Region:", self.root)
print("-----------------------------------------------------------------------------")
print(Tcolors.ENDC + Tcolors.WARNING + " Tag:", self.root_node.tag)
print(" Class:", self.class_name)
print(" Id:", self.id)
print(" Level:", self.distance_from_root)
print(" Parts:", self.parts)
print(" Density:", self.density)
print(" Distance from max:", self.distance_from_max)
print(" Has title on ancestors:", self.has_title_at_ancestors(None, self.root_node))
print(" Full text: \n", self.full_text)
print("\n" + Tcolors.ENDC)
"""
pass
pass

116 changes: 58 additions & 58 deletions sd_algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,18 +46,18 @@ def __init__(self):

def analyze_page(self):

print "[*] Create DOM tree..."
print("[*] Create DOM tree...")
tree = self.construct_page_tree()
node = tree.getroot()
self.cross_tree(node)
print "[*] Calculating initial groups..."
print "[*] Merging groups..."
print("[*] Calculating initial groups...")
print("[*] Merging groups...")
self.merge_groups(tree)
print "[*] Creating regions..."
print("[*] Creating regions...")
self.create_regions(tree)
print "[*] Calculating distances from max region..."
print("[*] Calculating distances from max region...")
self.calculate_distances_from_max(tree)
print "[*] Printing regions...\n"
print("[*] Printing regions...\n")
for region in self.regions:
region._print()

Expand All @@ -76,14 +76,14 @@ def construct_page_tree(self):
Downloads the HTML page given the URL and creates the DOM page tree.
Only the nodes that are useful for the segmentation are kept.
"""
page = urllib.urlopen(self.url)
html_body = page.read()
with urllib.request.urlopen(self.url) as response:
html_body = response.read()
doc = html.fromstring(html_body)
cleaner = Cleaner(**ARGS)
try:
doc = cleaner.clean_html(doc)
doc = cleaner.clean_html(doc)
except:
pass
pass
tree = doc.getroottree()
return tree

Expand All @@ -102,45 +102,45 @@ def classify_page(self):
if article_exists:
max_group = self.get_candidate_article(article, grouped_comments)

if grouped_comments.has_key(max_group):
if max_group in grouped_comments:
if grouped_comments != {}:
validated = self.candidate_group_level_validated(max_group, article, grouped_comments)

context_validated = self.candidate_context_validated(article, grouped_comments, max_group)
if self.big_areas_in_same_level(article, grouped_comments, max_group) and not validated:
print Tcolors.INFO + " Multiple similar regions detected!"
print "Class: "
print Tcolors.RES + " " + grouped_comments[max_group][0].class_name
print "Texts: "
print(Tcolors.INFO + " Multiple similar regions detected!")
print("Class: ")
print(Tcolors.RES + " " + grouped_comments[max_group][0].class_name)
print("Texts: " )
for reg in grouped_comments[max_group]:
print reg.full_text
print(reg.full_text)
return None, None, grouped_comments[max_group]
elif not context_validated:
print
print()
self.print_article(article)
print
print Tcolors.INFO + " No comments found."
print()
print(Tcolors.INFO + " No comments found.")
return article, None, None
elif context_validated:
print
print Tcolors.INFO + " Article with comments detected!"
print()
print(Tcolors.INFO + " Article with comments detected!")
self.print_article(article)
print
print "Comment class:"
print Tcolors.RES + " " + max_group
print "Comments:"
print()
print("Comment class:")
print(Tcolors.RES + " " + max_group)
print("Comments:")
for com in grouped_comments[max_group]:
print com.full_text
print(com.full_text)
return article, grouped_comments[max_group], None
else:
self.print_article(article)
return article, None, None
else:
print Tcolors.INFO + " Multiple similar regions detected!"
print Tcolors.RES
print "Texts: "
print(Tcolors.INFO + " Multiple similar regions detected!" )
print(Tcolors.RES)
print("Texts: ")
for reg in biggest_regions:
print reg.full_text
print(reg.full_text)
return None, None, biggest_regions

def group_regions(self):
Expand All @@ -161,11 +161,11 @@ def group_regions(self):
self.min_region_level = region.distance_from_root

pr_com = (len(region.tree.xpath(region.root)) > 0 and\
region.tree.xpath(region.root)[0].getparent().attrib.has_key('class') and \
region.tree.xpath(region.root)[0].getparent().attrib["class"].count('comment') > 0)
('class' in region.tree.xpath(region.root)[0].getparent().attrib) and \
region.tree.xpath(region.root)[0].getparent().attrib["class"].count('comment') > 0)
if region.distance_from_max != 0 and (region.class_name != "" or \
(region.class_name == "" and pr_com)):
if not grouped_comments.has_key(region.class_name):
if region.class_name not in grouped_comments:
grouped_comments[region.class_name] = [region]
else:
grouped_comments[region.class_name].append(region)
Expand Down Expand Up @@ -233,9 +233,9 @@ def get_candidate_article(self, article, grouped_comments):
max_group_density = 0

if article.root_node.getparent() is not None:
article_parent_path = self.get_path(article.root_node.getparent())
article_parent_path = self.get_path(article.root_node.getparent())
else:
article_parent_path = ""
article_parent_path = ""
max_group = None
groups_level = {}
groups_below_article_tags = []
Expand Down Expand Up @@ -317,8 +317,8 @@ def candidate_group_level_validated(self, max_group, article, grouped_comments):
comment_remaining_nodes = comment_path.split("/")

if len(article_remaining_nodes) > 1 and len(comment_remaining_nodes) > 1:
article_number = re.search("\d",article_remaining_nodes[0])
comment_number = re.search("\d",comment_remaining_nodes[0])
article_number = re.search(r"\d",article_remaining_nodes[0])
comment_number = re.search(r"\d",comment_remaining_nodes[0])
if article_number and comment_number:
article_number.start()
comment_number.start()
Expand Down Expand Up @@ -353,7 +353,7 @@ def big_areas_in_same_level(self, article, grouped_comments, max_group):
Check if the big regions (or areas) belong to the same level in the
HTML tree structure.
"""
if grouped_comments.has_key(max_group):
if max_group in grouped_comments:
first_candidate_comment = grouped_comments[max_group][0]
return article.distance_from_root == first_candidate_comment.distance_from_root\
and self.combined_region_level_exceeded(article)
Expand All @@ -366,7 +366,7 @@ def candidate_context_validated(self, article, grouped_comments, max_group):
Check whether the candidate comment regions validate as such based on
the keywords that are detected in their content.
"""
print Tcolors.ACT + " Validating candidate comment group based on its content..."
print(Tcolors.ACT + " Validating candidate comment group based on its content...")
COMMENT_TAGS = ['comment', 'reply', 'response', 'ident', 'said:', 'rate','user','inner','wrote:']
STRONG_COMMENT_TAGS = ['comment','reply','user','said:','wrote:']

Expand All @@ -379,9 +379,9 @@ def candidate_context_validated(self, article, grouped_comments, max_group):

for des in list(comment_parent.iterdescendants()) + [comment_parent]:
classname = id = ""
if des.attrib.has_key("class"):
if "class" in des.attrib:
classname = des.attrib['class']
if des.attrib.has_key("id"):
if "id" in des.attrib:
id = des.attrib['id']
for ctag in COMMENT_TAGS:
contents = (des.text_content() + classname + id).lower()
Expand Down Expand Up @@ -411,13 +411,13 @@ def print_article(self, article):
"""
Print the details of a detected article (class, title and text).
"""
print Tcolors.INFO + " Article detected!"
print "Article class: "
print Tcolors.RES + " " + repr(article.class_name)
print "Article title: "
print article.get_ancestor_title()
print "Article text: "
print article.full_text.replace("\n"," ")
print(Tcolors.INFO + " Article detected!" )
print("Article class: ")
print(Tcolors.RES + " " + repr(article.class_name))
print("Article title: ")
print(article.get_ancestor_title())
print("Article text: ")
print(article.full_text.replace("\n"," "))

def merge_groups(self, tree):
"""
Expand All @@ -429,7 +429,7 @@ def merge_groups(self, tree):
parent = node.getparent()
if parent is not None:
parent_path = self.get_path(parent)
if self.valid_nodes.has_key(parent_path):
if parent_path in self.valid_nodes:
self.valid_nodes[parent_path].append(group)
self.valid_nodes[parent_path].extend(self.valid_nodes[group])
del self.valid_nodes[group]
Expand Down Expand Up @@ -498,7 +498,7 @@ def calculate_distances_from_max(self, tree, fixed_regions=False):
if region.distance_from_max == 0 and region.parts == 1 \
and not fixed_regions and len(list(region.root_node.getchildren())) > 1\
and (self.content_appears_in_other_region(region)\
or self.close_diff_from_second_max(max_region)): #and
or self.close_diff_from_second_max(self.max_region)): #and
self.regions.remove(region)
self.recompute_max_density_region()
fixed_regions = True
Expand All @@ -510,17 +510,17 @@ def find_node_text(self, node):
"""
node_text = ""
try:
t = node.text
t = True
t = node.text
t = True
except:
t = False
t = False
if t and node.text is not None:
node_text = node.text
else:
try:
itertext = list(node.itertext())
itertext = list(node.itertext())
except:
itertext = []
itertext = []
itertexts = [text for text in itertext if text is not None and re.sub(r"\n|\r|\t| |,|\.","",text) != ""]
descendants = [des for des in list(node.iterdescendants())]
descendants_length = len(descendants)
Expand Down Expand Up @@ -556,7 +556,7 @@ def cross_tree(self, node, node_text=None, level=0):
if node_text is None:
node_text = self.find_node_text(node)

if node.attrib.has_key("class") and node.attrib["class"] == "wrappers":
if ("class" in node.attrib) and node.attrib["class"] == "wrappers":
dess = []
for d,des in enumerate(node.iterdescendants()):
if des.text is not None:
Expand Down Expand Up @@ -608,7 +608,7 @@ def get_style(self, node):
"""
Get the style attribute of the node if it exists.
"""
if node.attrib.has_key("style"):
if "style" in node.attrib:
style = node.attrib.get('style')
else:
style = ""
Expand All @@ -626,7 +626,7 @@ def group_node(self, node, node_text):

if parent_path not in ["/html","/html/body"] and node_text is not None\
and node.tag != 'body' and self.has_visible_parents(valid_parent):
if not self.valid_nodes.has_key(parent_path):
if parent_path not in self.valid_nodes:
self.valid_nodes[parent_path] = [node_path]
else:
if node_path not in self.valid_nodes[parent_path]:
Expand Down