diff --git a/README.md b/README.md index 1107978..4c2b02b 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,6 @@ http://www.pip-installer.org/en/latest/ After installing them, you should be able to install the following packages:
```bash $ pip install nltk -$ pip install urllib $ pip install lxml ``` diff --git a/region.py b/region.py index 65cfff6..6a6d50d 100644 --- a/region.py +++ b/region.py @@ -73,17 +73,17 @@ def find_node_text(self,node): """ node_text = "\n" for des in node.iter(): - try: - if des.text is not None and (des.tag not in ['script', 'style','h1','h2']\ - and not isinstance(des, HtmlComment) and re.sub(r"\n|\r|\t| |,|;|\.","",des.text) != ""\ - or (des.tag == "p")) and des.text.find(".") > -1 and node_text.find(des.text_content()) == -1: - node_text += re.sub(r"\n|\r|\t","",des.text_content()) - if des.tag == 'p': - node_text += "\n" - elif des.tail is not None and des.tag in ['table']: - node_text += des.tail - except: - node_text += "" + try: + if des.text is not None and (des.tag not in ['script', 'style','h1','h2']\ + and not isinstance(des, HtmlComment) and re.sub(r"\n|\r|\t| |,|;|\.","",des.text) != ""\ + or (des.tag == "p")) and des.text.find(".") > -1 and node_text.find(des.text_content()) == -1: + node_text += re.sub(r"\n|\r|\t","",des.text_content()) + if des.tag == 'p': + node_text += "\n" + elif des.tail is not None and des.tag in ['table']: + node_text += des.tail + except: + node_text += "" node_text = node_text.replace(">",">") node_text = re.sub(r" |,,|--|==||","",node_text) @@ -117,7 +117,7 @@ def calculate_id(self): Returns the id attribute of the node if it exists, otherwise it returns an empty string. """ - if self.root_node.attrib.has_key('id'): + if 'id' in self.root_node.attrib: return self.root_node.attrib['id'] else: return "" @@ -127,7 +127,7 @@ def calculate_class_name(self): Returns the CSS class attribute of the node if it exists, otherwise it returns an empty string. """ - if self.root_node.attrib.has_key('class'): + if 'class' in self.root_node.attrib: return self.root_node.attrib['class'] else: return "" @@ -135,18 +135,18 @@ def calculate_class_name(self): def _print(self): # Uncomment the following for debugging of the regions. """ - print Tcolors.CYAN + "[x] Region:", self.root - print "-----------------------------------------------------------------------------" - print Tcolors.ENDC + Tcolors.WARNING + " Tag:", self.root_node.tag - print " Class:", self.class_name - print " Id:", self.id - print " Level:", self.distance_from_root - print " Parts:", self.parts - print " Density:", self.density - print " Distance from max:", self.distance_from_max - print " Has title on ancestors:", self.has_title_at_ancestors(None, self.root_node) - print " Full text: \n", self.full_text - print "\n" + Tcolors.ENDC + print(Tcolors.CYAN + "[x] Region:", self.root) + print("-----------------------------------------------------------------------------") + print(Tcolors.ENDC + Tcolors.WARNING + " Tag:", self.root_node.tag) + print(" Class:", self.class_name) + print(" Id:", self.id) + print(" Level:", self.distance_from_root) + print(" Parts:", self.parts) + print(" Density:", self.density) + print(" Distance from max:", self.distance_from_max) + print(" Has title on ancestors:", self.has_title_at_ancestors(None, self.root_node)) + print(" Full text: \n", self.full_text) + print("\n" + Tcolors.ENDC) """ - pass + pass diff --git a/sd_algorithm.py b/sd_algorithm.py index 7462a2f..04e844e 100644 --- a/sd_algorithm.py +++ b/sd_algorithm.py @@ -46,18 +46,18 @@ def __init__(self): def analyze_page(self): - print "[*] Create DOM tree..." + print("[*] Create DOM tree...") tree = self.construct_page_tree() node = tree.getroot() self.cross_tree(node) - print "[*] Calculating initial groups..." - print "[*] Merging groups..." + print("[*] Calculating initial groups...") + print("[*] Merging groups...") self.merge_groups(tree) - print "[*] Creating regions..." + print("[*] Creating regions...") self.create_regions(tree) - print "[*] Calculating distances from max region..." + print("[*] Calculating distances from max region...") self.calculate_distances_from_max(tree) - print "[*] Printing regions...\n" + print("[*] Printing regions...\n") for region in self.regions: region._print() @@ -76,14 +76,14 @@ def construct_page_tree(self): Downloads the HTML page given the URL and creates the DOM page tree. Only the nodes that are useful for the segmentation are kept. """ - page = urllib.urlopen(self.url) - html_body = page.read() + with urllib.request.urlopen(self.url) as response: + html_body = response.read() doc = html.fromstring(html_body) cleaner = Cleaner(**ARGS) try: - doc = cleaner.clean_html(doc) + doc = cleaner.clean_html(doc) except: - pass + pass tree = doc.getroottree() return tree @@ -102,45 +102,45 @@ def classify_page(self): if article_exists: max_group = self.get_candidate_article(article, grouped_comments) - if grouped_comments.has_key(max_group): + if max_group in grouped_comments: if grouped_comments != {}: validated = self.candidate_group_level_validated(max_group, article, grouped_comments) context_validated = self.candidate_context_validated(article, grouped_comments, max_group) if self.big_areas_in_same_level(article, grouped_comments, max_group) and not validated: - print Tcolors.INFO + " Multiple similar regions detected!" - print "Class: " - print Tcolors.RES + " " + grouped_comments[max_group][0].class_name - print "Texts: " + print(Tcolors.INFO + " Multiple similar regions detected!") + print("Class: ") + print(Tcolors.RES + " " + grouped_comments[max_group][0].class_name) + print("Texts: " ) for reg in grouped_comments[max_group]: - print reg.full_text + print(reg.full_text) return None, None, grouped_comments[max_group] elif not context_validated: - print + print() self.print_article(article) - print - print Tcolors.INFO + " No comments found." + print() + print(Tcolors.INFO + " No comments found.") return article, None, None elif context_validated: - print - print Tcolors.INFO + " Article with comments detected!" + print() + print(Tcolors.INFO + " Article with comments detected!") self.print_article(article) - print - print "Comment class:" - print Tcolors.RES + " " + max_group - print "Comments:" + print() + print("Comment class:") + print(Tcolors.RES + " " + max_group) + print("Comments:") for com in grouped_comments[max_group]: - print com.full_text + print(com.full_text) return article, grouped_comments[max_group], None else: self.print_article(article) return article, None, None else: - print Tcolors.INFO + " Multiple similar regions detected!" - print Tcolors.RES - print "Texts: " + print(Tcolors.INFO + " Multiple similar regions detected!" ) + print(Tcolors.RES) + print("Texts: ") for reg in biggest_regions: - print reg.full_text + print(reg.full_text) return None, None, biggest_regions def group_regions(self): @@ -161,11 +161,11 @@ def group_regions(self): self.min_region_level = region.distance_from_root pr_com = (len(region.tree.xpath(region.root)) > 0 and\ - region.tree.xpath(region.root)[0].getparent().attrib.has_key('class') and \ - region.tree.xpath(region.root)[0].getparent().attrib["class"].count('comment') > 0) + ('class' in region.tree.xpath(region.root)[0].getparent().attrib) and \ + region.tree.xpath(region.root)[0].getparent().attrib["class"].count('comment') > 0) if region.distance_from_max != 0 and (region.class_name != "" or \ (region.class_name == "" and pr_com)): - if not grouped_comments.has_key(region.class_name): + if region.class_name not in grouped_comments: grouped_comments[region.class_name] = [region] else: grouped_comments[region.class_name].append(region) @@ -233,9 +233,9 @@ def get_candidate_article(self, article, grouped_comments): max_group_density = 0 if article.root_node.getparent() is not None: - article_parent_path = self.get_path(article.root_node.getparent()) + article_parent_path = self.get_path(article.root_node.getparent()) else: - article_parent_path = "" + article_parent_path = "" max_group = None groups_level = {} groups_below_article_tags = [] @@ -317,8 +317,8 @@ def candidate_group_level_validated(self, max_group, article, grouped_comments): comment_remaining_nodes = comment_path.split("/") if len(article_remaining_nodes) > 1 and len(comment_remaining_nodes) > 1: - article_number = re.search("\d",article_remaining_nodes[0]) - comment_number = re.search("\d",comment_remaining_nodes[0]) + article_number = re.search(r"\d",article_remaining_nodes[0]) + comment_number = re.search(r"\d",comment_remaining_nodes[0]) if article_number and comment_number: article_number.start() comment_number.start() @@ -353,7 +353,7 @@ def big_areas_in_same_level(self, article, grouped_comments, max_group): Check if the big regions (or areas) belong to the same level in the HTML tree structure. """ - if grouped_comments.has_key(max_group): + if max_group in grouped_comments: first_candidate_comment = grouped_comments[max_group][0] return article.distance_from_root == first_candidate_comment.distance_from_root\ and self.combined_region_level_exceeded(article) @@ -366,7 +366,7 @@ def candidate_context_validated(self, article, grouped_comments, max_group): Check whether the candidate comment regions validate as such based on the keywords that are detected in their content. """ - print Tcolors.ACT + " Validating candidate comment group based on its content..." + print(Tcolors.ACT + " Validating candidate comment group based on its content...") COMMENT_TAGS = ['comment', 'reply', 'response', 'ident', 'said:', 'rate','user','inner','wrote:'] STRONG_COMMENT_TAGS = ['comment','reply','user','said:','wrote:'] @@ -379,9 +379,9 @@ def candidate_context_validated(self, article, grouped_comments, max_group): for des in list(comment_parent.iterdescendants()) + [comment_parent]: classname = id = "" - if des.attrib.has_key("class"): + if "class" in des.attrib: classname = des.attrib['class'] - if des.attrib.has_key("id"): + if "id" in des.attrib: id = des.attrib['id'] for ctag in COMMENT_TAGS: contents = (des.text_content() + classname + id).lower() @@ -411,13 +411,13 @@ def print_article(self, article): """ Print the details of a detected article (class, title and text). """ - print Tcolors.INFO + " Article detected!" - print "Article class: " - print Tcolors.RES + " " + repr(article.class_name) - print "Article title: " - print article.get_ancestor_title() - print "Article text: " - print article.full_text.replace("\n"," ") + print(Tcolors.INFO + " Article detected!" ) + print("Article class: ") + print(Tcolors.RES + " " + repr(article.class_name)) + print("Article title: ") + print(article.get_ancestor_title()) + print("Article text: ") + print(article.full_text.replace("\n"," ")) def merge_groups(self, tree): """ @@ -429,7 +429,7 @@ def merge_groups(self, tree): parent = node.getparent() if parent is not None: parent_path = self.get_path(parent) - if self.valid_nodes.has_key(parent_path): + if parent_path in self.valid_nodes: self.valid_nodes[parent_path].append(group) self.valid_nodes[parent_path].extend(self.valid_nodes[group]) del self.valid_nodes[group] @@ -498,7 +498,7 @@ def calculate_distances_from_max(self, tree, fixed_regions=False): if region.distance_from_max == 0 and region.parts == 1 \ and not fixed_regions and len(list(region.root_node.getchildren())) > 1\ and (self.content_appears_in_other_region(region)\ - or self.close_diff_from_second_max(max_region)): #and + or self.close_diff_from_second_max(self.max_region)): #and self.regions.remove(region) self.recompute_max_density_region() fixed_regions = True @@ -510,17 +510,17 @@ def find_node_text(self, node): """ node_text = "" try: - t = node.text - t = True + t = node.text + t = True except: - t = False + t = False if t and node.text is not None: node_text = node.text else: try: - itertext = list(node.itertext()) + itertext = list(node.itertext()) except: - itertext = [] + itertext = [] itertexts = [text for text in itertext if text is not None and re.sub(r"\n|\r|\t| |,|\.","",text) != ""] descendants = [des for des in list(node.iterdescendants())] descendants_length = len(descendants) @@ -556,7 +556,7 @@ def cross_tree(self, node, node_text=None, level=0): if node_text is None: node_text = self.find_node_text(node) - if node.attrib.has_key("class") and node.attrib["class"] == "wrappers": + if ("class" in node.attrib) and node.attrib["class"] == "wrappers": dess = [] for d,des in enumerate(node.iterdescendants()): if des.text is not None: @@ -608,7 +608,7 @@ def get_style(self, node): """ Get the style attribute of the node if it exists. """ - if node.attrib.has_key("style"): + if "style" in node.attrib: style = node.attrib.get('style') else: style = "" @@ -626,7 +626,7 @@ def group_node(self, node, node_text): if parent_path not in ["/html","/html/body"] and node_text is not None\ and node.tag != 'body' and self.has_visible_parents(valid_parent): - if not self.valid_nodes.has_key(parent_path): + if parent_path not in self.valid_nodes: self.valid_nodes[parent_path] = [node_path] else: if node_path not in self.valid_nodes[parent_path]: