nik0spapp · t-montes · Feb 4, 2024 · Feb 4, 2024 · Feb 4, 2024 · Feb 4, 2024
diff --git a/README.md b/README.md
@@ -27,7 +27,6 @@ http://www.pip-installer.org/en/latest/
 After installing them, you should be able to install the following packages: <br />
 ```bash
 $ pip install nltk  
-$ pip install urllib 
 $ pip install lxml 
 ```
 

diff --git a/region.py b/region.py
@@ -73,17 +73,17 @@ def find_node_text(self,node):
         """
         node_text = "\n" 
         for des in node.iter():
-			try:
-				if des.text is not None and (des.tag not in ['script', 'style','h1','h2']\
-		           and not isinstance(des, HtmlComment) and re.sub(r"\n|\r|\t| |,|;|\.","",des.text) != ""\
-		           or (des.tag == "p")) and des.text.find(".") > -1 and node_text.find(des.text_content()) == -1:
-					node_text += re.sub(r"\n|\r|\t","",des.text_content())
-					if des.tag == 'p':
-						node_text += "\n"   
-					elif des.tail is not None and des.tag in ['table']:
-						node_text += des.tail
-			except:
-				node_text += ""
+            try:
+                if des.text is not None and (des.tag not in ['script', 'style','h1','h2']\
+                   and not isinstance(des, HtmlComment) and re.sub(r"\n|\r|\t| |,|;|\.","",des.text) != ""\
+                   or (des.tag == "p")) and des.text.find(".") > -1 and node_text.find(des.text_content()) == -1:
+                    node_text += re.sub(r"\n|\r|\t","",des.text_content())
+                    if des.tag == 'p':
+                        node_text += "\n"   
+                    elif des.tail is not None and des.tag in ['table']:
+                        node_text += des.tail
+            except:
+                node_text += ""
 
         node_text = node_text.replace("&gt",">")
         node_text = re.sub(r"  |,,|--|==|<!--(.|\s)*?-->|<!\[CDATA(.|\s)*?\]\]>","",node_text)  
@@ -117,7 +117,7 @@ def calculate_id(self):
         Returns the id attribute of the node if it exists, otherwise it returns 
         an empty string.
         """
-        if self.root_node.attrib.has_key('id'):
+        if 'id' in self.root_node.attrib:
             return self.root_node.attrib['id']
         else:
             return ""
@@ -127,26 +127,26 @@ def calculate_class_name(self):
         Returns the CSS class attribute of the node if it exists, otherwise it returns 
         an empty string.
         """
-        if self.root_node.attrib.has_key('class'):
+        if 'class' in self.root_node.attrib:
             return self.root_node.attrib['class']
         else:
             return ""
 
     def _print(self): 
         # Uncomment the following for debugging of the regions.
         """
-        print Tcolors.CYAN  + "[x] Region:", self.root
-        print "-----------------------------------------------------------------------------"
-        print Tcolors.ENDC + Tcolors.WARNING + " Tag:", self.root_node.tag
-        print " Class:", self.class_name
-        print " Id:", self.id
-        print " Level:", self.distance_from_root
-        print " Parts:", self.parts
-        print " Density:", self.density
-        print " Distance from max:", self.distance_from_max
-        print " Has title on ancestors:", self.has_title_at_ancestors(None, self.root_node)
-        print " Full text: \n", self.full_text 
-        print "\n" + Tcolors.ENDC
+        print(Tcolors.CYAN  + "[x] Region:", self.root)
+        print("-----------------------------------------------------------------------------")
+        print(Tcolors.ENDC + Tcolors.WARNING + " Tag:", self.root_node.tag)
+        print(" Class:", self.class_name)
+        print(" Id:", self.id)
+        print(" Level:", self.distance_from_root)
+        print(" Parts:", self.parts)
+        print(" Density:", self.density)
+        print(" Distance from max:", self.distance_from_max)
+        print(" Has title on ancestors:", self.has_title_at_ancestors(None, self.root_node))
+        print(" Full text: \n", self.full_text)
+        print("\n" + Tcolors.ENDC)
         """
-    	pass 
+        pass 
 
diff --git a/sd_algorithm.py b/sd_algorithm.py
@@ -46,18 +46,18 @@ def __init__(self):
 
     def analyze_page(self):
 
-        print "[*] Create DOM tree..."
+        print("[*] Create DOM tree...")
         tree = self.construct_page_tree() 
         node = tree.getroot()
         self.cross_tree(node) 
-        print "[*] Calculating initial groups..."
-        print "[*] Merging groups..."
+        print("[*] Calculating initial groups...")
+        print("[*] Merging groups...")
         self.merge_groups(tree) 
-        print "[*] Creating regions..."
+        print("[*] Creating regions...")
         self.create_regions(tree) 
-        print "[*] Calculating distances from max region..."
+        print("[*] Calculating distances from max region...")
         self.calculate_distances_from_max(tree)  
-        print "[*] Printing regions...\n"
+        print("[*] Printing regions...\n")
         for region in self.regions:
             region._print()  
 
@@ -76,14 +76,14 @@ def construct_page_tree(self):
         Downloads the HTML page given the URL and creates the DOM page tree.
         Only the nodes that are useful for the segmentation are kept.
         """
-        page = urllib.urlopen(self.url)
-        html_body = page.read()  
+        with urllib.request.urlopen(self.url) as response:
+            html_body = response.read()  
         doc = html.fromstring(html_body)
         cleaner = Cleaner(**ARGS)
         try:
-        	doc = cleaner.clean_html(doc)
+            doc = cleaner.clean_html(doc)
         except:
-        	pass
+            pass
         tree = doc.getroottree() 
         return tree 
 
@@ -102,45 +102,45 @@ def classify_page(self):
         if article_exists: 
             max_group = self.get_candidate_article(article, grouped_comments)
 
-            if grouped_comments.has_key(max_group): 
+            if max_group in grouped_comments: 
                 if grouped_comments != {}:
                     validated = self.candidate_group_level_validated(max_group, article, grouped_comments)
 
                 context_validated =  self.candidate_context_validated(article, grouped_comments, max_group)            
                 if self.big_areas_in_same_level(article, grouped_comments, max_group) and not validated:
-                    print Tcolors.INFO + " Multiple similar regions detected!"
-                    print "Class: "
-                    print Tcolors.RES + " " + grouped_comments[max_group][0].class_name
-                    print "Texts: " 
+                    print(Tcolors.INFO + " Multiple similar regions detected!")
+                    print("Class: ")
+                    print(Tcolors.RES + " " + grouped_comments[max_group][0].class_name)
+                    print("Texts: " )
                     for reg in grouped_comments[max_group]:
-                        print reg.full_text
+                        print(reg.full_text)
                     return None, None, grouped_comments[max_group]
                 elif not context_validated: 
-                    print
+                    print()
                     self.print_article(article)
-                    print 
-                    print Tcolors.INFO + " No comments found."                
+                    print()
+                    print(Tcolors.INFO + " No comments found.")                
                     return article, None, None
                 elif context_validated:
-                    print 
-                    print Tcolors.INFO + " Article with comments detected!"
+                    print()
+                    print(Tcolors.INFO + " Article with comments detected!")
                     self.print_article(article)
-                    print 
-                    print "Comment class:"      
-                    print Tcolors.RES + " " + max_group 
-                    print "Comments:" 
+                    print()
+                    print("Comment class:")
+                    print(Tcolors.RES + " " + max_group) 
+                    print("Comments:")
                     for com in grouped_comments[max_group]:
-                        print com.full_text              
+                        print(com.full_text)
                     return article, grouped_comments[max_group], None
             else:
                 self.print_article(article)
                 return article, None, None
         else: 
-            print Tcolors.INFO + " Multiple similar regions detected!"  
-            print Tcolors.RES
-            print "Texts: " 
+            print(Tcolors.INFO + " Multiple similar regions detected!" ) 
+            print(Tcolors.RES)
+            print("Texts: ")
             for reg in biggest_regions:
-                print reg.full_text
+                print(reg.full_text)
             return None, None, biggest_regions
 
     def group_regions(self):
@@ -161,11 +161,11 @@ def group_regions(self):
                     self.min_region_level = region.distance_from_root
 
             pr_com = (len(region.tree.xpath(region.root)) > 0 and\
-            	      region.tree.xpath(region.root)[0].getparent().attrib.has_key('class') and \
-            	     region.tree.xpath(region.root)[0].getparent().attrib["class"].count('comment') > 0)
+                      ('class' in region.tree.xpath(region.root)[0].getparent().attrib) and \
+                     region.tree.xpath(region.root)[0].getparent().attrib["class"].count('comment') > 0)
             if region.distance_from_max != 0 and (region.class_name != "" or \
                (region.class_name == "" and pr_com)):
-                if not grouped_comments.has_key(region.class_name):
+                if region.class_name not in grouped_comments:
                     grouped_comments[region.class_name] = [region]
                 else:
                     grouped_comments[region.class_name].append(region)
@@ -233,9 +233,9 @@ def get_candidate_article(self, article, grouped_comments):
         max_group_density = 0
 
         if article.root_node.getparent() is not None:
-        	article_parent_path = self.get_path(article.root_node.getparent())
+            article_parent_path = self.get_path(article.root_node.getparent())
         else:
-        	article_parent_path = ""
+            article_parent_path = ""
         max_group = None
         groups_level = {}        
         groups_below_article_tags = []
@@ -317,8 +317,8 @@ def candidate_group_level_validated(self, max_group, article, grouped_comments):
         comment_remaining_nodes = comment_path.split("/")
 
         if len(article_remaining_nodes) > 1 and len(comment_remaining_nodes) > 1:
-            article_number = re.search("\d",article_remaining_nodes[0])
-            comment_number = re.search("\d",comment_remaining_nodes[0])
+            article_number = re.search(r"\d",article_remaining_nodes[0])
+            comment_number = re.search(r"\d",comment_remaining_nodes[0])
             if article_number and comment_number:
                 article_number.start()
                 comment_number.start()
@@ -353,7 +353,7 @@ def big_areas_in_same_level(self, article, grouped_comments, max_group):
         Check if the big regions (or areas) belong to the same level in the 
         HTML tree structure.
         """
-        if grouped_comments.has_key(max_group):
+        if max_group in grouped_comments:
             first_candidate_comment = grouped_comments[max_group][0] 
             return article.distance_from_root == first_candidate_comment.distance_from_root\
                    and self.combined_region_level_exceeded(article)
@@ -366,7 +366,7 @@ def candidate_context_validated(self, article, grouped_comments, max_group):
         Check whether the candidate comment regions validate as such based on
         the keywords that are detected in their content.
         """
-        print Tcolors.ACT + " Validating candidate comment group based on its content..."
+        print(Tcolors.ACT + " Validating candidate comment group based on its content...")
         COMMENT_TAGS = ['comment', 'reply', 'response', 'ident', 'said:', 'rate','user','inner','wrote:']
         STRONG_COMMENT_TAGS = ['comment','reply','user','said:','wrote:']
 
@@ -379,9 +379,9 @@ def candidate_context_validated(self, article, grouped_comments, max_group):
 
         for des in list(comment_parent.iterdescendants()) + [comment_parent]:
             classname = id = ""
-            if des.attrib.has_key("class"):
+            if "class" in des.attrib:
                 classname = des.attrib['class']
-            if des.attrib.has_key("id"):
+            if "id" in des.attrib:
                 id = des.attrib['id']
             for ctag in COMMENT_TAGS:
                 contents = (des.text_content() + classname + id).lower() 
@@ -411,13 +411,13 @@ def print_article(self, article):
         """
         Print the details of a detected article (class, title and text).
         """
-        print Tcolors.INFO + " Article detected!" 
-        print "Article class: "
-        print Tcolors.RES + " " + repr(article.class_name)
-        print "Article title: "
-        print article.get_ancestor_title() 
-        print "Article text: "
-        print article.full_text.replace("\n"," ") 
+        print(Tcolors.INFO + " Article detected!" )
+        print("Article class: ")
+        print(Tcolors.RES + " " + repr(article.class_name))
+        print("Article title: ")
+        print(article.get_ancestor_title()) 
+        print("Article text: ")
+        print(article.full_text.replace("\n"," ")) 
 
     def merge_groups(self, tree):
         """
@@ -429,7 +429,7 @@ def merge_groups(self, tree):
             parent = node.getparent()
             if parent is not None:
                 parent_path = self.get_path(parent)
-                if self.valid_nodes.has_key(parent_path):
+                if parent_path in self.valid_nodes:
                     self.valid_nodes[parent_path].append(group)
                     self.valid_nodes[parent_path].extend(self.valid_nodes[group])
                     del self.valid_nodes[group]
@@ -498,7 +498,7 @@ def calculate_distances_from_max(self, tree, fixed_regions=False):
             if region.distance_from_max == 0 and region.parts == 1 \
                 and not fixed_regions and len(list(region.root_node.getchildren())) > 1\
                 and (self.content_appears_in_other_region(region)\
-                or self.close_diff_from_second_max(max_region)): #and
+                or self.close_diff_from_second_max(self.max_region)): #and
                 self.regions.remove(region)
                 self.recompute_max_density_region()
                 fixed_regions = True
@@ -510,17 +510,17 @@ def find_node_text(self, node):
         """
         node_text = "" 
         try:
-        	t = node.text
-        	t = True
+            t = node.text
+            t = True
         except:
-        	t = False
+            t = False
         if t and node.text is not None: 
             node_text = node.text
         else:
             try: 
-        		itertext = list(node.itertext())
+                itertext = list(node.itertext())
             except: 
-        		itertext = []
+                itertext = []
             itertexts = [text for text in itertext if text is not None and re.sub(r"\n|\r|\t| |,|\.","",text) != ""]
             descendants = [des for des in list(node.iterdescendants())]
             descendants_length = len(descendants)
@@ -556,7 +556,7 @@ def cross_tree(self, node, node_text=None, level=0):
         if node_text is None:        
             node_text = self.find_node_text(node)
 
-        if node.attrib.has_key("class") and node.attrib["class"] == "wrappers":
+        if ("class" in node.attrib) and node.attrib["class"] == "wrappers":
             dess = []
             for d,des in enumerate(node.iterdescendants()):
                 if des.text is not None:
@@ -608,7 +608,7 @@ def get_style(self, node):
         """
         Get the style attribute of the node if it exists.
         """
-        if node.attrib.has_key("style"):
+        if "style" in node.attrib:
             style = node.attrib.get('style')
         else:
             style = ""
@@ -626,7 +626,7 @@ def group_node(self, node, node_text):
 
         if parent_path not in ["/html","/html/body"] and node_text is not None\
            and node.tag != 'body' and self.has_visible_parents(valid_parent):
-            if not self.valid_nodes.has_key(parent_path):
+            if parent_path not in self.valid_nodes:
                 self.valid_nodes[parent_path] = [node_path] 
             else:
                 if node_path not in self.valid_nodes[parent_path]: