From 7856766ac2acb1db9dec0d0a763f07b03af98335 Mon Sep 17 00:00:00 2001 From: Tony Date: Sun, 4 Feb 2024 00:20:29 -0500 Subject: [PATCH 1/5] print(...) --- region.py | 48 +++++++++++++------------- sd_algorithm.py | 90 ++++++++++++++++++++++++------------------------- 2 files changed, 69 insertions(+), 69 deletions(-) diff --git a/region.py b/region.py index 65cfff6..146cd8c 100644 --- a/region.py +++ b/region.py @@ -73,17 +73,17 @@ def find_node_text(self,node): """ node_text = "\n" for des in node.iter(): - try: - if des.text is not None and (des.tag not in ['script', 'style','h1','h2']\ - and not isinstance(des, HtmlComment) and re.sub(r"\n|\r|\t| |,|;|\.","",des.text) != ""\ - or (des.tag == "p")) and des.text.find(".") > -1 and node_text.find(des.text_content()) == -1: - node_text += re.sub(r"\n|\r|\t","",des.text_content()) - if des.tag == 'p': - node_text += "\n" - elif des.tail is not None and des.tag in ['table']: - node_text += des.tail - except: - node_text += "" + try: + if des.text is not None and (des.tag not in ['script', 'style','h1','h2']\ + and not isinstance(des, HtmlComment) and re.sub(r"\n|\r|\t| |,|;|\.","",des.text) != ""\ + or (des.tag == "p")) and des.text.find(".") > -1 and node_text.find(des.text_content()) == -1: + node_text += re.sub(r"\n|\r|\t","",des.text_content()) + if des.tag == 'p': + node_text += "\n" + elif des.tail is not None and des.tag in ['table']: + node_text += des.tail + except: + node_text += "" node_text = node_text.replace(">",">") node_text = re.sub(r" |,,|--|==||","",node_text) @@ -135,18 +135,18 @@ def calculate_class_name(self): def _print(self): # Uncomment the following for debugging of the regions. """ - print Tcolors.CYAN + "[x] Region:", self.root - print "-----------------------------------------------------------------------------" - print Tcolors.ENDC + Tcolors.WARNING + " Tag:", self.root_node.tag - print " Class:", self.class_name - print " Id:", self.id - print " Level:", self.distance_from_root - print " Parts:", self.parts - print " Density:", self.density - print " Distance from max:", self.distance_from_max - print " Has title on ancestors:", self.has_title_at_ancestors(None, self.root_node) - print " Full text: \n", self.full_text - print "\n" + Tcolors.ENDC + print(Tcolors.CYAN + "[x] Region:", self.root) + print("-----------------------------------------------------------------------------") + print(Tcolors.ENDC + Tcolors.WARNING + " Tag:", self.root_node.tag) + print(" Class:", self.class_name) + print(" Id:", self.id) + print(" Level:", self.distance_from_root) + print(" Parts:", self.parts) + print(" Density:", self.density) + print(" Distance from max:", self.distance_from_max) + print(" Has title on ancestors:", self.has_title_at_ancestors(None, self.root_node)) + print(" Full text: \n", self.full_text) + print("\n" + Tcolors.ENDC) """ - pass + pass diff --git a/sd_algorithm.py b/sd_algorithm.py index 7462a2f..544f387 100644 --- a/sd_algorithm.py +++ b/sd_algorithm.py @@ -46,18 +46,18 @@ def __init__(self): def analyze_page(self): - print "[*] Create DOM tree..." + print("[*] Create DOM tree...") tree = self.construct_page_tree() node = tree.getroot() self.cross_tree(node) - print "[*] Calculating initial groups..." - print "[*] Merging groups..." + print("[*] Calculating initial groups...") + print("[*] Merging groups...") self.merge_groups(tree) - print "[*] Creating regions..." + print("[*] Creating regions...") self.create_regions(tree) - print "[*] Calculating distances from max region..." + print("[*] Calculating distances from max region...") self.calculate_distances_from_max(tree) - print "[*] Printing regions...\n" + print("[*] Printing regions...\n") for region in self.regions: region._print() @@ -81,9 +81,9 @@ def construct_page_tree(self): doc = html.fromstring(html_body) cleaner = Cleaner(**ARGS) try: - doc = cleaner.clean_html(doc) + doc = cleaner.clean_html(doc) except: - pass + pass tree = doc.getroottree() return tree @@ -108,39 +108,39 @@ def classify_page(self): context_validated = self.candidate_context_validated(article, grouped_comments, max_group) if self.big_areas_in_same_level(article, grouped_comments, max_group) and not validated: - print Tcolors.INFO + " Multiple similar regions detected!" - print "Class: " - print Tcolors.RES + " " + grouped_comments[max_group][0].class_name - print "Texts: " + print(Tcolors.INFO + " Multiple similar regions detected!") + print("Class: ") + print(Tcolors.RES + " " + grouped_comments[max_group][0].class_name) + print("Texts: " ) for reg in grouped_comments[max_group]: - print reg.full_text + print(reg.full_text) return None, None, grouped_comments[max_group] elif not context_validated: - print + print() self.print_article(article) - print - print Tcolors.INFO + " No comments found." + print() + print(Tcolors.INFO + " No comments found.") return article, None, None elif context_validated: - print - print Tcolors.INFO + " Article with comments detected!" + print() + print(Tcolors.INFO + " Article with comments detected!") self.print_article(article) - print - print "Comment class:" - print Tcolors.RES + " " + max_group - print "Comments:" + print() + print("Comment class:") + print(Tcolors.RES + " " + max_group) + print("Comments:") for com in grouped_comments[max_group]: - print com.full_text + print(com.full_text) return article, grouped_comments[max_group], None else: self.print_article(article) return article, None, None else: - print Tcolors.INFO + " Multiple similar regions detected!" - print Tcolors.RES - print "Texts: " + print(Tcolors.INFO + " Multiple similar regions detected!" ) + print(Tcolors.RES) + print("Texts: ") for reg in biggest_regions: - print reg.full_text + print(reg.full_text) return None, None, biggest_regions def group_regions(self): @@ -161,8 +161,8 @@ def group_regions(self): self.min_region_level = region.distance_from_root pr_com = (len(region.tree.xpath(region.root)) > 0 and\ - region.tree.xpath(region.root)[0].getparent().attrib.has_key('class') and \ - region.tree.xpath(region.root)[0].getparent().attrib["class"].count('comment') > 0) + region.tree.xpath(region.root)[0].getparent().attrib.has_key('class') and \ + region.tree.xpath(region.root)[0].getparent().attrib["class"].count('comment') > 0) if region.distance_from_max != 0 and (region.class_name != "" or \ (region.class_name == "" and pr_com)): if not grouped_comments.has_key(region.class_name): @@ -233,9 +233,9 @@ def get_candidate_article(self, article, grouped_comments): max_group_density = 0 if article.root_node.getparent() is not None: - article_parent_path = self.get_path(article.root_node.getparent()) + article_parent_path = self.get_path(article.root_node.getparent()) else: - article_parent_path = "" + article_parent_path = "" max_group = None groups_level = {} groups_below_article_tags = [] @@ -366,7 +366,7 @@ def candidate_context_validated(self, article, grouped_comments, max_group): Check whether the candidate comment regions validate as such based on the keywords that are detected in their content. """ - print Tcolors.ACT + " Validating candidate comment group based on its content..." + print(Tcolors.ACT + " Validating candidate comment group based on its content...") COMMENT_TAGS = ['comment', 'reply', 'response', 'ident', 'said:', 'rate','user','inner','wrote:'] STRONG_COMMENT_TAGS = ['comment','reply','user','said:','wrote:'] @@ -411,13 +411,13 @@ def print_article(self, article): """ Print the details of a detected article (class, title and text). """ - print Tcolors.INFO + " Article detected!" - print "Article class: " - print Tcolors.RES + " " + repr(article.class_name) - print "Article title: " - print article.get_ancestor_title() - print "Article text: " - print article.full_text.replace("\n"," ") + print(Tcolors.INFO + " Article detected!" ) + print("Article class: ") + print(Tcolors.RES + " " + repr(article.class_name)) + print("Article title: ") + print(article.get_ancestor_title()) + print("Article text: ") + print(article.full_text.replace("\n"," ")) def merge_groups(self, tree): """ @@ -498,7 +498,7 @@ def calculate_distances_from_max(self, tree, fixed_regions=False): if region.distance_from_max == 0 and region.parts == 1 \ and not fixed_regions and len(list(region.root_node.getchildren())) > 1\ and (self.content_appears_in_other_region(region)\ - or self.close_diff_from_second_max(max_region)): #and + or self.close_diff_from_second_max(self.max_region)): #and self.regions.remove(region) self.recompute_max_density_region() fixed_regions = True @@ -510,17 +510,17 @@ def find_node_text(self, node): """ node_text = "" try: - t = node.text - t = True + t = node.text + t = True except: - t = False + t = False if t and node.text is not None: node_text = node.text else: try: - itertext = list(node.itertext()) + itertext = list(node.itertext()) except: - itertext = [] + itertext = [] itertexts = [text for text in itertext if text is not None and re.sub(r"\n|\r|\t| |,|\.","",text) != ""] descendants = [des for des in list(node.iterdescendants())] descendants_length = len(descendants) From 83e5768f521826dcd79ef6fa6e9f4b4f75d408fd Mon Sep 17 00:00:00 2001 From: Tony Date: Sun, 4 Feb 2024 00:20:49 -0500 Subject: [PATCH 2/5] remove urllib from README --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 1107978..4c2b02b 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,6 @@ http://www.pip-installer.org/en/latest/ After installing them, you should be able to install the following packages:
```bash $ pip install nltk -$ pip install urllib $ pip install lxml ``` From 8114d439f2bac99bf24ef23461110030070fd6fe Mon Sep 17 00:00:00 2001 From: Tony Date: Sun, 4 Feb 2024 00:29:57 -0500 Subject: [PATCH 3/5] replace has_key for in operator --- __pycache__/region.cpython-312.pyc | Bin 0 -> 6838 bytes __pycache__/terminal_colors.cpython-312.pyc | Bin 0 -> 1530 bytes region.py | 4 ++-- sd_algorithm.py | 20 ++++++++++---------- 4 files changed, 12 insertions(+), 12 deletions(-) create mode 100644 __pycache__/region.cpython-312.pyc create mode 100644 __pycache__/terminal_colors.cpython-312.pyc diff --git a/__pycache__/region.cpython-312.pyc b/__pycache__/region.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fafd57db9293088c9d60e7695a814e5986dc67f9 GIT binary patch literal 6838 zcmd5>T}&I<6`mQ7@h^~I2V)$_V3L4YaF_fjo1djgAmpbB-2_?|;;iL(1~3_Hc4kPx za@s_SRJKaB(0zcYQo~AB7nGHfecG*5X}1rnZq+aN@1A@8$I41O1?i`ap9)9nDC)0R(Gy!JY`g}A1&XJ5TBL^Y z%ZT&@GtAIfHi@PQc9^9phB{60>>Y|XKVUS@Ve;b1*aY-1BCE`(iJvfr7@V_xl<2WfgMTN1Qq!a@+XfM>LalKgb z9XFR$3dI!jpuok`UT6a;I1WiE6V!|u@Hn0!L8&ZK=@OvdqPG%it6ob{<1W2i{5~iY zfsgDcR)Nw}q|i$xluo@3D1Cal_${HV(p!1dO|@PFz9zj~{OWvYA3(ib?;-1`(Q7kQ zz|<$%7jqGCjr!|}+3pxR5y z<yOR~h)XjZBGRSP`ZbwW_2uK9W$Ya6Ynrt2K`Z>J4Zm@-HE~MdjUfv+P}){_!^4 z_uli~-*s=-vODVyF29}ec0zwuzK(-WW!{6-&L=zS7VF--0!4ei$v?}kS{w8Jy*Yn( z#^0UwpTdS+`TE9O{r*h-{%rk0Ags>U)aGiMGd0cGnmtgrnkm){K(f3`MlzznJ9+_(Z7(8V?wup@9|=JuBU=|griWkj7rtj83 z?|(QCZ7wX8GVx}zZqwc0vs9d(seqj@K%zK5fMMpPP=|7W{6PU^`>B8>`4>C74W{lf zA$m>bZj1@am>_WxZZvjXh;s-h+*m~BB60ZBJe0c;Q^q*rrkoNP+h6^w%)dQP^kgu}HsMdl;G@J@Yl(4QBS+x-vAW{if zq@74_Lj{t7RF*)qnXFZ{$g!(pEIz8*e|X~j*`DygiPQb6nGni&l94^(IH>_gZi6h@ zP(Y#bL$?5lTn)>+MXfn0=eD%N_tdv5=WEaS+OxjkI#pBYo3rFyJJYV#yvMgNGe48_ zv}Qc5%bu*K^U=V^Lmv(04)$ga_I@^#J$Ui4=i--c@6$&A(t*XxbM^wCyK(8j(%y`_ zb?#i=Rljg_{^(L$#?_v0>&&%vW!k#3ZO7+&R^2W6R}bR1Bfpbd+Ozn*$L^M=4ZH7$ z?uCBw);h&*Z=XAr_w2+eES-Do3Fd?QbHU@8;PGtm>G zY3q4u7bqrL3S1Z&pbKn#NI;OMZ!#(R!+T$;=|`mP;-nHk?vl&6P>9>OHKg+sm)%%trQX;sKkDK-h=rL4?|g5-x4N==X@w};}Xkdz83 zDK6EXI+_XvB{XZ5o_t0Jzy2qrq*W(@+lyI=Tor|+Y450_(p|}Y9B{R#IyzFXzn(hQ z)X~uzOoik?+jl!Ux}YAq+}(3x;Dp{7x)Qq56>vxa9Dv1XN(gnTJtoKES`aIt=c$Y& zs3v*ps>+UlY9z2Zq>scz)glR#Vk9aASOUpX2X+x$)*?|8Xu^mf3E-+#D1(I3ge1A6 zfW6N*_ktjKzlOP1YKM2$mUnp;4$mKc@8|~QuyM29YmB{;%e!h9UYma{?P^)>e>k`@ zm_9a?=@@$A+p`q?@aU@V&~HzE+WVW{boXHT&EfRrw{n-Gnak1aF+M#u{`lB<-c!GD zWB$fc^Ur^@-2L#(%9)23RxW(foNe#=tmbih_ZPN5RsEqVJvf{j9L)@lW(Q;0Gvn#_ zH_{jT(*sws?zccE)pcoW-QU+7G^k>mck$AE^?SBwGD6*AkLi?+ z`P^nXGlC%}-qG{m2YskkB$Tq|0tgr{e+xnP<_<>+H+j|eDv0tjWi9TOwn z=6-kPotbxk{LYUTdQZQKGb;^^?*oXRP$O3%We)x(=Z*Ob%BD%V;cIX+d<};{9FY^LP9=lYg^#eOjmbdb(j&dUf!` zrM|O$r*WR#KxFi}8Ipi32)Faiviw{(?u|iQRQXoQv%EnT6rNocmfdBs9WdZHshrrYO=6|ErnVASk@#g|Ia*rmD9u2$BFwgO%Nl3b zscO^%s(dg~EBO|BFd`RXIfB)Hk~#j{2f&lw_#cEH+D z8Il>7dl}Kq%(|I5hWOf4Ex5IJt7B7j$W^-@#@oE(Q+3c)Ut{=I_54)reM#({>IA6c zuEO)4|BpK1X1sryB*}AxQ)j940u>LN?N-~eJK|z(W9R$^de0o$b8ur(x?UTCMuul!EPK`Wjo2U7U;@Ire%vq(U8l0Hn{D!{#dt*5U%qOzyowb8t)()EGME9N z+*2x)tGQ=1lrOtGpD)uuzWm6VW#vG%RJ7(86AI6A8Vx{aDOaF@T;Y%kpH?f37K==G zKn2!zOJd$}fS-{y&v=Xh>zWJfYI_Ub_Z$t?h3@W9O#(!(cgr4x{xz&5vUv`w6ec~v$2OEyEGB|LQwJTuj%+#}51)F6@W-6Jfz~%sKKqaq( z*Z))f>&8__xiwgEln>x*;IbH8b_Fa0H%juwVJ~&L?MT=AhmLe>P;{iVt2sxya|JB( z>xLs`yZI3x3MRcLWv{8FIqy|h+0#y6dK~`5YyU%Q4GL_KZPw|(0vyM^xp9+A{*3_K G1OEV$;(9>< literal 0 HcmV?d00001 diff --git a/region.py b/region.py index 146cd8c..6a6d50d 100644 --- a/region.py +++ b/region.py @@ -117,7 +117,7 @@ def calculate_id(self): Returns the id attribute of the node if it exists, otherwise it returns an empty string. """ - if self.root_node.attrib.has_key('id'): + if 'id' in self.root_node.attrib: return self.root_node.attrib['id'] else: return "" @@ -127,7 +127,7 @@ def calculate_class_name(self): Returns the CSS class attribute of the node if it exists, otherwise it returns an empty string. """ - if self.root_node.attrib.has_key('class'): + if 'class' in self.root_node.attrib: return self.root_node.attrib['class'] else: return "" diff --git a/sd_algorithm.py b/sd_algorithm.py index 544f387..1ba713a 100644 --- a/sd_algorithm.py +++ b/sd_algorithm.py @@ -102,7 +102,7 @@ def classify_page(self): if article_exists: max_group = self.get_candidate_article(article, grouped_comments) - if grouped_comments.has_key(max_group): + if max_group in grouped_comments: if grouped_comments != {}: validated = self.candidate_group_level_validated(max_group, article, grouped_comments) @@ -161,11 +161,11 @@ def group_regions(self): self.min_region_level = region.distance_from_root pr_com = (len(region.tree.xpath(region.root)) > 0 and\ - region.tree.xpath(region.root)[0].getparent().attrib.has_key('class') and \ + ('class' in region.tree.xpath(region.root)[0].getparent().attrib) and \ region.tree.xpath(region.root)[0].getparent().attrib["class"].count('comment') > 0) if region.distance_from_max != 0 and (region.class_name != "" or \ (region.class_name == "" and pr_com)): - if not grouped_comments.has_key(region.class_name): + if region.class_name not in grouped_comments: grouped_comments[region.class_name] = [region] else: grouped_comments[region.class_name].append(region) @@ -353,7 +353,7 @@ def big_areas_in_same_level(self, article, grouped_comments, max_group): Check if the big regions (or areas) belong to the same level in the HTML tree structure. """ - if grouped_comments.has_key(max_group): + if max_group in grouped_comments: first_candidate_comment = grouped_comments[max_group][0] return article.distance_from_root == first_candidate_comment.distance_from_root\ and self.combined_region_level_exceeded(article) @@ -379,9 +379,9 @@ def candidate_context_validated(self, article, grouped_comments, max_group): for des in list(comment_parent.iterdescendants()) + [comment_parent]: classname = id = "" - if des.attrib.has_key("class"): + if "class" in des.attrib: classname = des.attrib['class'] - if des.attrib.has_key("id"): + if "id" in des.attrib: id = des.attrib['id'] for ctag in COMMENT_TAGS: contents = (des.text_content() + classname + id).lower() @@ -429,7 +429,7 @@ def merge_groups(self, tree): parent = node.getparent() if parent is not None: parent_path = self.get_path(parent) - if self.valid_nodes.has_key(parent_path): + if parent_path in self.valid_nodes: self.valid_nodes[parent_path].append(group) self.valid_nodes[parent_path].extend(self.valid_nodes[group]) del self.valid_nodes[group] @@ -556,7 +556,7 @@ def cross_tree(self, node, node_text=None, level=0): if node_text is None: node_text = self.find_node_text(node) - if node.attrib.has_key("class") and node.attrib["class"] == "wrappers": + if ("class" in node.attrib) and node.attrib["class"] == "wrappers": dess = [] for d,des in enumerate(node.iterdescendants()): if des.text is not None: @@ -608,7 +608,7 @@ def get_style(self, node): """ Get the style attribute of the node if it exists. """ - if node.attrib.has_key("style"): + if "style" in node.attrib: style = node.attrib.get('style') else: style = "" @@ -626,7 +626,7 @@ def group_node(self, node, node_text): if parent_path not in ["/html","/html/body"] and node_text is not None\ and node.tag != 'body' and self.has_visible_parents(valid_parent): - if not self.valid_nodes.has_key(parent_path): + if parent_path not in self.valid_nodes: self.valid_nodes[parent_path] = [node_path] else: if node_path not in self.valid_nodes[parent_path]: From 5a9f4a4358eca69a3b8eb9c1b9c98f399aafbb46 Mon Sep 17 00:00:00 2001 From: Tony Date: Sun, 4 Feb 2024 00:30:14 -0500 Subject: [PATCH 4/5] urlopen new implementation --- sd_algorithm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sd_algorithm.py b/sd_algorithm.py index 1ba713a..9ffbd25 100644 --- a/sd_algorithm.py +++ b/sd_algorithm.py @@ -76,8 +76,8 @@ def construct_page_tree(self): Downloads the HTML page given the URL and creates the DOM page tree. Only the nodes that are useful for the segmentation are kept. """ - page = urllib.urlopen(self.url) - html_body = page.read() + with urllib.request.urlopen(self.url) as response: + html_body = response.read() doc = html.fromstring(html_body) cleaner = Cleaner(**ARGS) try: From 1055bd25be5551d783bf00c5d1a55ae4f223ff4b Mon Sep 17 00:00:00 2001 From: Tony Date: Sun, 4 Feb 2024 00:32:11 -0500 Subject: [PATCH 5/5] change re \d to r-string --- __pycache__/region.cpython-312.pyc | Bin 6838 -> 0 bytes __pycache__/terminal_colors.cpython-312.pyc | Bin 1530 -> 0 bytes sd_algorithm.py | 4 ++-- 3 files changed, 2 insertions(+), 2 deletions(-) delete mode 100644 __pycache__/region.cpython-312.pyc delete mode 100644 __pycache__/terminal_colors.cpython-312.pyc diff --git a/__pycache__/region.cpython-312.pyc b/__pycache__/region.cpython-312.pyc deleted file mode 100644 index fafd57db9293088c9d60e7695a814e5986dc67f9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6838 zcmd5>T}&I<6`mQ7@h^~I2V)$_V3L4YaF_fjo1djgAmpbB-2_?|;;iL(1~3_Hc4kPx za@s_SRJKaB(0zcYQo~AB7nGHfecG*5X}1rnZq+aN@1A@8$I41O1?i`ap9)9nDC)0R(Gy!JY`g}A1&XJ5TBL^Y z%ZT&@GtAIfHi@PQc9^9phB{60>>Y|XKVUS@Ve;b1*aY-1BCE`(iJvfr7@V_xl<2WfgMTN1Qq!a@+XfM>LalKgb z9XFR$3dI!jpuok`UT6a;I1WiE6V!|u@Hn0!L8&ZK=@OvdqPG%it6ob{<1W2i{5~iY zfsgDcR)Nw}q|i$xluo@3D1Cal_${HV(p!1dO|@PFz9zj~{OWvYA3(ib?;-1`(Q7kQ zz|<$%7jqGCjr!|}+3pxR5y z<yOR~h)XjZBGRSP`ZbwW_2uK9W$Ya6Ynrt2K`Z>J4Zm@-HE~MdjUfv+P}){_!^4 z_uli~-*s=-vODVyF29}ec0zwuzK(-WW!{6-&L=zS7VF--0!4ei$v?}kS{w8Jy*Yn( z#^0UwpTdS+`TE9O{r*h-{%rk0Ags>U)aGiMGd0cGnmtgrnkm){K(f3`MlzznJ9+_(Z7(8V?wup@9|=JuBU=|griWkj7rtj83 z?|(QCZ7wX8GVx}zZqwc0vs9d(seqj@K%zK5fMMpPP=|7W{6PU^`>B8>`4>C74W{lf zA$m>bZj1@am>_WxZZvjXh;s-h+*m~BB60ZBJe0c;Q^q*rrkoNP+h6^w%)dQP^kgu}HsMdl;G@J@Yl(4QBS+x-vAW{if zq@74_Lj{t7RF*)qnXFZ{$g!(pEIz8*e|X~j*`DygiPQb6nGni&l94^(IH>_gZi6h@ zP(Y#bL$?5lTn)>+MXfn0=eD%N_tdv5=WEaS+OxjkI#pBYo3rFyJJYV#yvMgNGe48_ zv}Qc5%bu*K^U=V^Lmv(04)$ga_I@^#J$Ui4=i--c@6$&A(t*XxbM^wCyK(8j(%y`_ zb?#i=Rljg_{^(L$#?_v0>&&%vW!k#3ZO7+&R^2W6R}bR1Bfpbd+Ozn*$L^M=4ZH7$ z?uCBw);h&*Z=XAr_w2+eES-Do3Fd?QbHU@8;PGtm>G zY3q4u7bqrL3S1Z&pbKn#NI;OMZ!#(R!+T$;=|`mP;-nHk?vl&6P>9>OHKg+sm)%%trQX;sKkDK-h=rL4?|g5-x4N==X@w};}Xkdz83 zDK6EXI+_XvB{XZ5o_t0Jzy2qrq*W(@+lyI=Tor|+Y450_(p|}Y9B{R#IyzFXzn(hQ z)X~uzOoik?+jl!Ux}YAq+}(3x;Dp{7x)Qq56>vxa9Dv1XN(gnTJtoKES`aIt=c$Y& zs3v*ps>+UlY9z2Zq>scz)glR#Vk9aASOUpX2X+x$)*?|8Xu^mf3E-+#D1(I3ge1A6 zfW6N*_ktjKzlOP1YKM2$mUnp;4$mKc@8|~QuyM29YmB{;%e!h9UYma{?P^)>e>k`@ zm_9a?=@@$A+p`q?@aU@V&~HzE+WVW{boXHT&EfRrw{n-Gnak1aF+M#u{`lB<-c!GD zWB$fc^Ur^@-2L#(%9)23RxW(foNe#=tmbih_ZPN5RsEqVJvf{j9L)@lW(Q;0Gvn#_ zH_{jT(*sws?zccE)pcoW-QU+7G^k>mck$AE^?SBwGD6*AkLi?+ z`P^nXGlC%}-qG{m2YskkB$Tq|0tgr{e+xnP<_<>+H+j|eDv0tjWi9TOwn z=6-kPotbxk{LYUTdQZQKGb;^^?*oXRP$O3%We)x(=Z*Ob%BD%V;cIX+d<};{9FY^LP9=lYg^#eOjmbdb(j&dUf!` zrM|O$r*WR#KxFi}8Ipi32)Faiviw{(?u|iQRQXoQv%EnT6rNocmfdBs9WdZHshrrYO=6|ErnVASk@#g|Ia*rmD9u2$BFwgO%Nl3b zscO^%s(dg~EBO|BFd`RXIfB)Hk~#j{2f&lw_#cEH+D z8Il>7dl}Kq%(|I5hWOf4Ex5IJt7B7j$W^-@#@oE(Q+3c)Ut{=I_54)reM#({>IA6c zuEO)4|BpK1X1sryB*}AxQ)j940u>LN?N-~eJK|z(W9R$^de0o$b8ur(x?UTCMuul!EPK`Wjo2U7U;@Ire%vq(U8l0Hn{D!{#dt*5U%qOzyowb8t)()EGME9N z+*2x)tGQ=1lrOtGpD)uuzWm6VW#vG%RJ7(86AI6A8Vx{aDOaF@T;Y%kpH?f37K==G zKn2!zOJd$}fS-{y&v=Xh>zWJfYI_Ub_Z$t?h3@W9O#(!(cgr4x{xz&5vUv`w6ec~v$2OEyEGB|LQwJTuj%+#}51)F6@W-6Jfz~%sKKqaq( z*Z))f>&8__xiwgEln>x*;IbH8b_Fa0H%juwVJ~&L?MT=AhmLe>P;{iVt2sxya|JB( z>xLs`yZI3x3MRcLWv{8FIqy|h+0#y6dK~`5YyU%Q4GL_KZPw|(0vyM^xp9+A{*3_K G1OEV$;(9>< diff --git a/sd_algorithm.py b/sd_algorithm.py index 9ffbd25..04e844e 100644 --- a/sd_algorithm.py +++ b/sd_algorithm.py @@ -317,8 +317,8 @@ def candidate_group_level_validated(self, max_group, article, grouped_comments): comment_remaining_nodes = comment_path.split("/") if len(article_remaining_nodes) > 1 and len(comment_remaining_nodes) > 1: - article_number = re.search("\d",article_remaining_nodes[0]) - comment_number = re.search("\d",comment_remaining_nodes[0]) + article_number = re.search(r"\d",article_remaining_nodes[0]) + comment_number = re.search(r"\d",comment_remaining_nodes[0]) if article_number and comment_number: article_number.start() comment_number.start()