From 7856766ac2acb1db9dec0d0a763f07b03af98335 Mon Sep 17 00:00:00 2001
From: Tony <santiago.montesb@gmail.com>
Date: Sun, 4 Feb 2024 00:20:29 -0500
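Subject: [PATCH 1/5] Convert print statements to print() calls

Also replace tab indentation with spaces and pass self.max_region
instead of the undefined name max_region to
close_diff_from_second_max() in calculate_distances_from_max().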

---
 region.py       | 48 +++++++++++++-------------
 sd_algorithm.py | 90 ++++++++++++++++++++++++-------------------------
 2 files changed, 69 insertions(+), 69 deletions(-)

diff --git a/region.py b/region.py
index 65cfff6..146cd8c 100644
--- a/region.py
+++ b/region.py
@@ -73,17 +73,17 @@ def find_node_text(self,node):
         """
         node_text = "\n" 
         for des in node.iter():
-			try:
-				if des.text is not None and (des.tag not in ['script', 'style','h1','h2']\
-		           and not isinstance(des, HtmlComment) and re.sub(r"\n|\r|\t| |,|;|\.","",des.text) != ""\
-		           or (des.tag == "p")) and des.text.find(".") > -1 and node_text.find(des.text_content()) == -1:
-					node_text += re.sub(r"\n|\r|\t","",des.text_content())
-					if des.tag == 'p':
-						node_text += "\n"   
-					elif des.tail is not None and des.tag in ['table']:
-						node_text += des.tail
-			except:
-				node_text += ""
+            try:
+                if des.text is not None and (des.tag not in ['script', 'style','h1','h2']\
+                   and not isinstance(des, HtmlComment) and re.sub(r"\n|\r|\t| |,|;|\.","",des.text) != ""\
+                   or (des.tag == "p")) and des.text.find(".") > -1 and node_text.find(des.text_content()) == -1:
+                    node_text += re.sub(r"\n|\r|\t","",des.text_content())
+                    if des.tag == 'p':
+                        node_text += "\n"   
+                    elif des.tail is not None and des.tag in ['table']:
+                        node_text += des.tail
+            except:
+                node_text += ""
                           
         node_text = node_text.replace("&gt",">")
         node_text = re.sub(r"  |,,|--|==|<!--(.|\s)*?-->|<!\[CDATA(.|\s)*?\]\]>","",node_text)  
@@ -135,18 +135,18 @@ def calculate_class_name(self):
     def _print(self): 
         # Uncomment the following for debugging of the regions.
         """
-        print Tcolors.CYAN  + "[x] Region:", self.root
-        print "-----------------------------------------------------------------------------"
-        print Tcolors.ENDC + Tcolors.WARNING + " Tag:", self.root_node.tag
-        print " Class:", self.class_name
-        print " Id:", self.id
-        print " Level:", self.distance_from_root
-        print " Parts:", self.parts
-        print " Density:", self.density
-        print " Distance from max:", self.distance_from_max
-        print " Has title on ancestors:", self.has_title_at_ancestors(None, self.root_node)
-        print " Full text: \n", self.full_text 
-        print "\n" + Tcolors.ENDC
+        print(Tcolors.CYAN  + "[x] Region:", self.root)
+        print("-----------------------------------------------------------------------------")
+        print(Tcolors.ENDC + Tcolors.WARNING + " Tag:", self.root_node.tag)
+        print(" Class:", self.class_name)
+        print(" Id:", self.id)
+        print(" Level:", self.distance_from_root)
+        print(" Parts:", self.parts)
+        print(" Density:", self.density)
+        print(" Distance from max:", self.distance_from_max)
+        print(" Has title on ancestors:", self.has_title_at_ancestors(None, self.root_node))
+        print(" Full text: \n", self.full_text)
+        print("\n" + Tcolors.ENDC)
         """
-    	pass 
+        pass 
 
diff --git a/sd_algorithm.py b/sd_algorithm.py
index 7462a2f..544f387 100644
--- a/sd_algorithm.py
+++ b/sd_algorithm.py
@@ -46,18 +46,18 @@ def __init__(self):
     
     def analyze_page(self):
                 
-        print "[*] Create DOM tree..."
+        print("[*] Create DOM tree...")
         tree = self.construct_page_tree() 
         node = tree.getroot()
         self.cross_tree(node) 
-        print "[*] Calculating initial groups..."
-        print "[*] Merging groups..."
+        print("[*] Calculating initial groups...")
+        print("[*] Merging groups...")
         self.merge_groups(tree) 
-        print "[*] Creating regions..."
+        print("[*] Creating regions...")
         self.create_regions(tree) 
-        print "[*] Calculating distances from max region..."
+        print("[*] Calculating distances from max region...")
         self.calculate_distances_from_max(tree)  
-        print "[*] Printing regions...\n"
+        print("[*] Printing regions...\n")
         for region in self.regions:
             region._print()  
             
@@ -81,9 +81,9 @@ def construct_page_tree(self):
         doc = html.fromstring(html_body)
         cleaner = Cleaner(**ARGS)
         try:
-        	doc = cleaner.clean_html(doc)
+            doc = cleaner.clean_html(doc)
         except:
-        	pass
+            pass
         tree = doc.getroottree() 
         return tree 
         
@@ -108,39 +108,39 @@ def classify_page(self):
                 
                 context_validated =  self.candidate_context_validated(article, grouped_comments, max_group)            
                 if self.big_areas_in_same_level(article, grouped_comments, max_group) and not validated:
-                    print Tcolors.INFO + " Multiple similar regions detected!"
-                    print "Class: "
-                    print Tcolors.RES + " " + grouped_comments[max_group][0].class_name
-                    print "Texts: " 
+                    print(Tcolors.INFO + " Multiple similar regions detected!")
+                    print("Class: ")
+                    print(Tcolors.RES + " " + grouped_comments[max_group][0].class_name)
+                    print("Texts: " )
                     for reg in grouped_comments[max_group]:
-                        print reg.full_text
+                        print(reg.full_text)
                     return None, None, grouped_comments[max_group]
                 elif not context_validated: 
-                    print
+                    print()
                     self.print_article(article)
-                    print 
-                    print Tcolors.INFO + " No comments found."                
+                    print()
+                    print(Tcolors.INFO + " No comments found.")                
                     return article, None, None
                 elif context_validated:
-                    print 
-                    print Tcolors.INFO + " Article with comments detected!"
+                    print()
+                    print(Tcolors.INFO + " Article with comments detected!")
                     self.print_article(article)
-                    print 
-                    print "Comment class:"      
-                    print Tcolors.RES + " " + max_group 
-                    print "Comments:" 
+                    print()
+                    print("Comment class:")
+                    print(Tcolors.RES + " " + max_group) 
+                    print("Comments:")
                     for com in grouped_comments[max_group]:
-                        print com.full_text              
+                        print(com.full_text)
                     return article, grouped_comments[max_group], None
             else:
                 self.print_article(article)
                 return article, None, None
         else: 
-            print Tcolors.INFO + " Multiple similar regions detected!"  
-            print Tcolors.RES
-            print "Texts: " 
+            print(Tcolors.INFO + " Multiple similar regions detected!" ) 
+            print(Tcolors.RES)
+            print("Texts: ")
             for reg in biggest_regions:
-                print reg.full_text
+                print(reg.full_text)
             return None, None, biggest_regions
     
     def group_regions(self):
@@ -161,8 +161,8 @@ def group_regions(self):
                     self.min_region_level = region.distance_from_root
                     
             pr_com = (len(region.tree.xpath(region.root)) > 0 and\
-            	      region.tree.xpath(region.root)[0].getparent().attrib.has_key('class') and \
-            	     region.tree.xpath(region.root)[0].getparent().attrib["class"].count('comment') > 0)
+                      region.tree.xpath(region.root)[0].getparent().attrib.has_key('class') and \
+                     region.tree.xpath(region.root)[0].getparent().attrib["class"].count('comment') > 0)
             if region.distance_from_max != 0 and (region.class_name != "" or \
                (region.class_name == "" and pr_com)):
                 if not grouped_comments.has_key(region.class_name):
@@ -233,9 +233,9 @@ def get_candidate_article(self, article, grouped_comments):
         max_group_density = 0
         
         if article.root_node.getparent() is not None:
-        	article_parent_path = self.get_path(article.root_node.getparent())
+            article_parent_path = self.get_path(article.root_node.getparent())
         else:
-        	article_parent_path = ""
+            article_parent_path = ""
         max_group = None
         groups_level = {}        
         groups_below_article_tags = []
@@ -366,7 +366,7 @@ def candidate_context_validated(self, article, grouped_comments, max_group):
         Check whether the candidate comment regions validate as such based on
         the keywords that are detected in their content.
         """
-        print Tcolors.ACT + " Validating candidate comment group based on its content..."
+        print(Tcolors.ACT + " Validating candidate comment group based on its content...")
         COMMENT_TAGS = ['comment', 'reply', 'response', 'ident', 'said:', 'rate','user','inner','wrote:']
         STRONG_COMMENT_TAGS = ['comment','reply','user','said:','wrote:']
         
@@ -411,13 +411,13 @@ def print_article(self, article):
         """
         Print the details of a detected article (class, title and text).
         """
-        print Tcolors.INFO + " Article detected!" 
-        print "Article class: "
-        print Tcolors.RES + " " + repr(article.class_name)
-        print "Article title: "
-        print article.get_ancestor_title() 
-        print "Article text: "
-        print article.full_text.replace("\n"," ") 
+        print(Tcolors.INFO + " Article detected!" )
+        print("Article class: ")
+        print(Tcolors.RES + " " + repr(article.class_name))
+        print("Article title: ")
+        print(article.get_ancestor_title()) 
+        print("Article text: ")
+        print(article.full_text.replace("\n"," ")) 
     
     def merge_groups(self, tree):
         """
@@ -498,7 +498,7 @@ def calculate_distances_from_max(self, tree, fixed_regions=False):
             if region.distance_from_max == 0 and region.parts == 1 \
                 and not fixed_regions and len(list(region.root_node.getchildren())) > 1\
                 and (self.content_appears_in_other_region(region)\
-                or self.close_diff_from_second_max(max_region)): #and
+                or self.close_diff_from_second_max(self.max_region)): #and
                 self.regions.remove(region)
                 self.recompute_max_density_region()
                 fixed_regions = True
@@ -510,17 +510,17 @@ def find_node_text(self, node):
         """
         node_text = "" 
         try:
-        	t = node.text
-        	t = True
+            t = node.text
+            t = True
         except:
-        	t = False
+            t = False
         if t and node.text is not None: 
             node_text = node.text
         else:
             try: 
-        		itertext = list(node.itertext())
+                itertext = list(node.itertext())
             except: 
-        		itertext = []
+                itertext = []
             itertexts = [text for text in itertext if text is not None and re.sub(r"\n|\r|\t| |,|\.","",text) != ""]
             descendants = [des for des in list(node.iterdescendants())]
             descendants_length = len(descendants)

From 83e5768f521826dcd79ef6fa6e9f4b4f75d408fd Mon Sep 17 00:00:00 2001
From: Tony <santiago.montesb@gmail.com>
Date: Sun, 4 Feb 2024 00:20:49 -0500
Subject: [PATCH 2/5] remove urllib from README

---
 README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.md b/README.md
index 1107978..4c2b02b 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,6 @@ http://www.pip-installer.org/en/latest/
 After installing them, you should be able to install the following packages: <br />
 ```bash
 $ pip install nltk  
-$ pip install urllib 
 $ pip install lxml 
 ```
 

From 8114d439f2bac99bf24ef23461110030070fd6fe Mon Sep 17 00:00:00 2001
From: Tony <santiago.montesb@gmail.com>
Date: Sun, 4 Feb 2024 00:29:57 -0500
Subject: [PATCH 3/5] Replace has_key() calls with the in operator

---
 __pycache__/region.cpython-312.pyc          | Bin 0 -> 6838 bytes
 __pycache__/terminal_colors.cpython-312.pyc | Bin 0 -> 1530 bytes
 region.py                                   |   4 ++--
 sd_algorithm.py                             |  20 ++++++++++----------
 4 files changed, 12 insertions(+), 12 deletions(-)
 create mode 100644 __pycache__/region.cpython-312.pyc
 create mode 100644 __pycache__/terminal_colors.cpython-312.pyc

diff --git a/__pycache__/region.cpython-312.pyc b/__pycache__/region.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fafd57db9293088c9d60e7695a814e5986dc67f9
GIT binary patch
literal 6838
zcmd5>T}&I<6`mQ7@h^~I2V)$_V3L4YaF_fjo1djgAmpbB-2_?|;;iL(1~3_Hc4kPx
za@s_SRJKaB(0zcYQo~AB7nGHfecG*5X}1rnZq<j{F_pcetdy!x`$pkm+q|^r-0_SZ
zaI%%!O1*+R_vf5*?>+aN@1A@8$I41O1?i`ap9)9nDC)0R(Gy!JY`g}A1&XJ5TBL^Y
z%ZT&@GtAIfHi@PQc9^9phB{60>>Y|XKVUS@Ve<?fsQ4NyeF0jv3`7%RLXx5AIHOF6
z-HC|_A+AI<Vl6LlQX4%`U!VkPnC7WrhNtgP!zSLrGk54=mN!AojAIQ~@Mg#@d<EoI
z-U7Lew?b~`ZIC-=>;b1*aY-1BCE`(iJvfr7@V_xl<2WfgMTN1Qq!a@+XfM>LalKgb
z9XFR$3dI!jpuok`UT6a;I1WiE6V!|u@Hn0!L8&ZK=@OvdqPG%it6ob{<1W2i{5~iY
zfsgDcR)Nw}q|i$xluo@3D1Cal_${HV(p!1dO|@PFz9zj~{OWvYA3(ib?;-1`(Q7kQ
zz|<$<hO1P-rLu}72r4Tj5{hb`o{T7CstqgQc!C#HYcvs8z*uC}!V7UZrp%}&QHZPN
z$%q7v+xeKRMB-5)JR&6~!V{5cb!RjpMyJGxB7~psBqP+6X>%7jqGCjr!|}+3pxR5y
z&LTyOR~h)XjZBGRSP`ZbwW_2uK9W$Ya6Yn<kitq#5e3!#d_61**99@as;n%CBa#jD
zAmPqQXbdu13J2-g+3q8uAIO3vhvZ0HiG_Lu`I?fL3{6C2MUX<ga6L3BCB}uQB8N`n
ztS<uL#H1(y10NAbLlQCm;N*;I4Tod#m=X>rt2K`Z>J4Zm@-HE~MdjUfv+P}){_!^4
z_uli~-*s=-vODVyF29}ec0zwuzK(-WW!{6-&L=zS7VF--0!4ei$v?}kS{w8Jy*Yn(
z#^0UwpTdS+`TE9O{r*h-{%rk0Ags>U)aGiMGd0cGnmtgr<h?%lROPMCoYj}H`c|zC
z`9>nkm){K(f3`MlzznJ9+_(Z7(8V?wup@9|=JuB<X54UHq}P&el0DRw&nfCAo1$+Y
z2IJM?Mkj-BS;uCJZuxKX&^phxR@hYl%lKWM=9y9ErWr=BNSS%lD4m4ehB`ZrE7_D%
z=IfN?hd#88O-Wx5xR(JV1Y!(1Jex9&Fi{w{p8^?%%N7OcGC(H>U=|griWkj7rtj83
z?|(QCZ7wX8GVx}zZqwc0vs9d(seqj@K%zK5fMMpPP=|7W{6PU^`>B8>`4>C74W{lf
zA$m>bZj1@am>_WxZZvjXh;s-h+*m~BB60ZBJe0c;Q^q*rrkoNP<yyx&+sF0=3{jCd
z&q;zZCB=b_yP8Od0&riCh*JVLl8`uI8sJ9&VIYy`U^tFAHy4Y;@UiHaAt<JBH(+R9
zP{3gYKFFO_xQHkxIK&A-Caef1Tw+8kBXW=_2Mrl2Gqw-@JK=x-z}IN&y#c%Gi~$@(
z$6_Kc32}lZs%2DA2#%UyO{x>+h6^w%)dQP^kgu}HsMdl;G@J@Yl(4QBS+x-vAW{if
zq@74_Lj{t7RF*)qnXFZ{$g!(pEIz8*e|X~j*`DygiPQb6nGni&l94^(IH>_gZi6h@
zP(Y#bL$?5lTn)>+MXfn0=eD%N_tdv5=WEaS+OxjkI#pBYo3rFyJJYV#yvMgNGe48_
zv}Qc5%bu*K^U=V^Lmv(04)$ga_I@^#J$Ui4=i--c@6$&A(t*XxbM^wCyK(8j(%y`_
zb?#i=Rljg_{^(L$#?_v0>&&%vW!k#3ZO7+&R^2W6R}bR1Bfpbd+Ozn*$L^M=4ZH7$
z?uCBw);h&*Z=XAr_w2+eES-Do3Fd?QbHU@8;PGtm<RZIT_e%c2Vf=RHx!p^`k^@@S
zY{0c<r?%HFZlAZ$oBy^4mb2DMxxM*nSFXA(Q{A?F;F0^|#*Z4aohLrq^F`C|0@>>G
zY3q4u7bqrL3S1Z&pbKn#NI;OMZ!#(R<sdjmUx7WjX|jPcmIJig-$1(&t$^%8;+gBT
zbU-m8laxs@;u1aTlXoi)oqpViF!UA*B1VL!a--^w%(K@H;pV?!4rT(Vzb-GM9MJI<
z4+=*DZ@Jb3<~Iw#QU(w~Vi_3HVX{O90{Tdb`IdEB$vR&q5qAv|0Ym1ctF1^u46M<`
z13+m+QJMtIZ#TR^f(?$5Y5>!+T$;=|`mP;-nHk?vl&6<?JMYl<_U5;(*$HcA^nC<7
zWcsOQiU!F%tBo{7!C1$qO+(ZTI#8J$F<`AX7UvO^i%3h5nxJ88Y=i^oBrq!nxC8-o
zWehMFG6O7mL5>P>9>OHKg+sm)%%trQX;sKkDK-h=rL4?|g5-x4N==X@w};}Xkdz83
zDK6EXI+_XvB{XZ5o_t0Jzy2qrq*W(@+lyI=Tor|+Y450_(p|}Y9B{R#IyzFXzn(hQ
z)X~uzOoik?+jl!Ux}YAq+}(3x;Dp{7x)Qq56>vxa9Dv1XN(gnTJtoKES`aIt=c$Y&
zs3v*ps>+UlY9z2Zq>scz)glR#Vk9aASOUpX2X+x$)*?|8Xu^mf3E-+#D1(I3ge1A6
zfW6N*_ktjKzlOP1YKM2$mUnp;4$mKc@8|~QuyM29YmB{;%e!h9UYma{?P^)>e>k`@
zm_9a?=@@$A+p`q?@aU@V&~HzE+WVW{boXHT&EfRrw{n-Gnak1aF+M#u{`lB<-c!GD
zWB$fc^Ur^@-2L#(%9)23RxW(foNe#=tmbih_ZPN5RsEqVJvf{j9L)@lW(Q;0Gvn#_
z<l{4wd3VFFYCfoazjoR2=;R|gcj$EH(CO@<v)QI|8TYxlvn7)AbxlhHnYzF|Ti(0#
ze%-ygw726?^T#b8wR|#^+1HER{U5yf{+pS)jz!x))?Bc-wR&ntW6r%N<KD9*EO%$!
zug=<@xN8@B=X=w>H_{jT(*sws?zccE)pcoW-QU+7G^k>mck$AE^?SBwGD6*AkLi?+
z`P^nX<uXSLm;=oeg*!?x`Vtf%P^HS@Gv1HN;$i{q!H=kNs0@J*lMET21o!I6DFvd5
zQfI=TL<?{Vp@n|NbFF=exWH*1#wP?h-cs<iK+%8t0w!{1OSni9Jgm^f_GDb7cBrd0
z5Gxp=gBDCAy#+hl+7$vuc{gNmzFR3<WzO21u{JODEbo38TnVmP50it?pQ-kzt$yv$
zD^(~elPL5S6yR}!zKO~!rGR!!FHmWL5+Mp=XyZ67IoisqT0aDU2e*`@875##e!r-G
z{objCb9g}{Xk^5Og$t}6<7r{3q+QbtDdb|@7cfgbFovdfc%K0v#|&@dup=0ck+Otc
zGO7cjEG@#q_XoqvpTC(#kr*O=P4h1ItOag+Id@CO-LlL+tXiqcx(|Vsd*SkRx9W+f
ze(}Klu6tcsPunc}Gv`ZoLVG3xeS_%DX&fiFDe=Y)i5T!1W9^p;p*GsAeoxo6rRj;o
zYd(pP@ixCHKz>GlC%}-qG{m2YskkB$Tq|0tgr{e+xnP<_<>+H+j|eDv0tjWi9TOwn
z=6-kPotbxk{LYUT<!p6x+S>dQZQKGb;^^?*oXR<QxqD|(Lz4FedNnM!IEME4)K3w+
zl*?*0#&Z!xkz!W?Cv?q{<F@pmZPyOk8!;K+K`|~shCn!eg^5XJ1}-lW1pY;|kWgdf
z1YI!^(^V_9(Nqh@`qzY+Ejv;R97)%f9f|QWbm+!vhikyQUGH?|99+i1<s7XUN9(eC
z)e*!~s%yS0=W5Nkpe5r9rk%m3t{wmX+mkf3S!(p%{rwzycKToL5;zCG7m<CEF5r%B
zb`v*lif_xN6d%NHe%t1FRvn$sZ;mJ9>P$O3%We)x(=Z*Ob%BD%V<xf$25#tR66{QA
zEj%L!yN6Enaoj7M|MK(|PMhTse>;cIX+d<};{9FY^LP9=lYg^#eOjmbdb(j&dUf!`
zrM|O$r*WR#KxFi}8Ipi32)Faiviw{(?u|iQRQXoQv%EnT<F|I7$7e=^Vk^XL<#+?1
zR1FUDz`B*FM|*`e$O?yA8N$NLtOnWeGFxQX{ARnA`3yXk6rz^mYfW)(<zEg+KvQA|
zM9YD#!h7|T`3Ps%?Z$Y{-9m2nmuxETagm~LWsC2q6=7m@$;yKJXw{#f!{F^1&JzxT
z9s*GC+Xy-B;in3CqQyt&a9Fj6!xIU9O2o1=9DaK$B5FNSBd}}1SPzy?V}_?aIn^Wt
zQVAVb8o;a{Gkh`7q5^y!lrY14p8P6gcc^u?%Di`jYcOA+*Sp+i?|Qr0+`L|8F~73z
zX3e|TomO-0x*M_$_cpUOV<I9@A-xU))M|J?nux_CB0MhP&pQ$x^{P#ro)Cj$@G}p&
y%@B@Fvw3DcrY*5c8U_|TpJfbb)=V@_Z}=(N{uR~y71i_&dy=6m*D1`&TK)wfFBJ0t

literal 0
HcmV?d00001

diff --git a/__pycache__/terminal_colors.cpython-312.pyc b/__pycache__/terminal_colors.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6b0ef2ab28fd62d685516f386a2106d2e5204484
GIT binary patch
literal 1530
zcma)4&u<z>6rNocmfdBs9WdZHshrrYO=6|ErnVASk@#g|Ia*rmD9u2$BFwgO%Nl3b
zscO^%s(dg~EBO|B<LIiF{s;X7dfFSBQ>Fd`RXIfB)Hk~<s#GQFNPP3X_rCdNnfHEI
zRT;the(^Wsrw~GavcO-$gp5)^ULzir5kcpOaKG?g1*0=Q9ccP`wofW(^nzn&41_s?
zFi$ufAUqaG00)VHLnMerL&RYc!V(d&Ou|?p5{?iVt3<(362UV>#j{2f&lw_#cEH+D
z8Il>7dl}Kq%(|I5hWOf4Ex5IJt7B7j$W^-@#@oE(Q+3c)Ut{=I_54)reM#({>IA6c
zuEO)4|BpK1X1sryB*}AxQ)j940u>LN?N-~eJK|z(W9R<IW+%8<yPq{D@kc({oRH0J
zvm-O<=JrlD<MYf;*3V~wZ+I@Sd9FZ&#TuyHaC6JgXUs8IquiY__k7IVrQElj;GVWy
zdC=i&OO&tdclf2}O_+q~V)GGz-{^CI*Dy=pA$?kOqp8}B{CV<}?4-Ngnm?ajE$G+I
zP@2~$N{49ZQ7Km}RcWZQpMU(MM8(Q}xmqe|RD7DNYI|Cl3cI<z$5be3#X_2+f^D?E
zu-OMJjjM$R^(VGr*>$^de0o$b8ur(x?UTCMuul!EPK<BrCsz9_qj_rA%dLiOe+EMH
zq-DSX(rA5I2bFnr+-T`ud+wc_QIQ<kjTbGWGc)Pgy-DJIxYAw+=%QgFbs_wq_*-%T
zqR4Qv#S;)oL$>`Wjo2U7U;@Ire%vq(U8l0Hn{D!{#dt*5U%qOzyowb8t)()EGME9N
z+*2x)tGQ=1lrOtGpD)uuzWm6VW#vG%RJ7(86AI6A8Vx{aDOaF@T;Y%kpH?f37K==G
zKn2!zOJd$}fS-{y&v=Xh>zWJfYI_Ub_Z$t?h3@W9O#(!(cgr4x<K5g)iFAuY^#&BA
zR5w4A7rKR^G7mX9!B`xalJvuHq?`LgRxX;o)yw7H-OFb0!~TjRzt?~1$V-EmBd-jA
zt+9iV3%j>{xz&5vUv`w6ec~v$2OEyEGB|LQwJTuj%+#}51)F6@W-6Jfz~%sKKqaq(
z*Z))f>&8__xiwgEln>x*;IbH8b_Fa0H%juwVJ~&L?MT=AhmLe>P;{iVt2sxya|JB(
z>xLs`yZI3x3MRcLWv{8FIqy|h+0#y6dK~`5YyU%Q4GL_KZPw|(0vyM^xp9+A{*3_K
G1OEV$;(9><

literal 0
HcmV?d00001

diff --git a/region.py b/region.py
index 146cd8c..6a6d50d 100644
--- a/region.py
+++ b/region.py
@@ -117,7 +117,7 @@ def calculate_id(self):
         Returns the id attribute of the node if it exists, otherwise it returns 
         an empty string.
         """
-        if self.root_node.attrib.has_key('id'):
+        if 'id' in self.root_node.attrib:
             return self.root_node.attrib['id']
         else:
             return ""
@@ -127,7 +127,7 @@ def calculate_class_name(self):
         Returns the CSS class attribute of the node if it exists, otherwise it returns 
         an empty string.
         """
-        if self.root_node.attrib.has_key('class'):
+        if 'class' in self.root_node.attrib:
             return self.root_node.attrib['class']
         else:
             return ""
diff --git a/sd_algorithm.py b/sd_algorithm.py
index 544f387..1ba713a 100644
--- a/sd_algorithm.py
+++ b/sd_algorithm.py
@@ -102,7 +102,7 @@ def classify_page(self):
         if article_exists: 
             max_group = self.get_candidate_article(article, grouped_comments)
 
-            if grouped_comments.has_key(max_group): 
+            if max_group in grouped_comments: 
                 if grouped_comments != {}:
                     validated = self.candidate_group_level_validated(max_group, article, grouped_comments)
                 
@@ -161,11 +161,11 @@ def group_regions(self):
                     self.min_region_level = region.distance_from_root
                     
             pr_com = (len(region.tree.xpath(region.root)) > 0 and\
-                      region.tree.xpath(region.root)[0].getparent().attrib.has_key('class') and \
+                      ('class' in region.tree.xpath(region.root)[0].getparent().attrib) and \
                      region.tree.xpath(region.root)[0].getparent().attrib["class"].count('comment') > 0)
             if region.distance_from_max != 0 and (region.class_name != "" or \
                (region.class_name == "" and pr_com)):
-                if not grouped_comments.has_key(region.class_name):
+                if region.class_name not in grouped_comments:
                     grouped_comments[region.class_name] = [region]
                 else:
                     grouped_comments[region.class_name].append(region)
@@ -353,7 +353,7 @@ def big_areas_in_same_level(self, article, grouped_comments, max_group):
         Check if the big regions (or areas) belong to the same level in the 
         HTML tree structure.
         """
-        if grouped_comments.has_key(max_group):
+        if max_group in grouped_comments:
             first_candidate_comment = grouped_comments[max_group][0] 
             return article.distance_from_root == first_candidate_comment.distance_from_root\
                    and self.combined_region_level_exceeded(article)
@@ -379,9 +379,9 @@ def candidate_context_validated(self, article, grouped_comments, max_group):
         
         for des in list(comment_parent.iterdescendants()) + [comment_parent]:
             classname = id = ""
-            if des.attrib.has_key("class"):
+            if "class" in des.attrib:
                 classname = des.attrib['class']
-            if des.attrib.has_key("id"):
+            if "id" in des.attrib:
                 id = des.attrib['id']
             for ctag in COMMENT_TAGS:
                 contents = (des.text_content() + classname + id).lower() 
@@ -429,7 +429,7 @@ def merge_groups(self, tree):
             parent = node.getparent()
             if parent is not None:
                 parent_path = self.get_path(parent)
-                if self.valid_nodes.has_key(parent_path):
+                if parent_path in self.valid_nodes:
                     self.valid_nodes[parent_path].append(group)
                     self.valid_nodes[parent_path].extend(self.valid_nodes[group])
                     del self.valid_nodes[group]
@@ -556,7 +556,7 @@ def cross_tree(self, node, node_text=None, level=0):
         if node_text is None:        
             node_text = self.find_node_text(node)
             
-        if node.attrib.has_key("class") and node.attrib["class"] == "wrappers":
+        if ("class" in node.attrib) and node.attrib["class"] == "wrappers":
             dess = []
             for d,des in enumerate(node.iterdescendants()):
                 if des.text is not None:
@@ -608,7 +608,7 @@ def get_style(self, node):
         """
         Get the style attribute of the node if it exists.
         """
-        if node.attrib.has_key("style"):
+        if "style" in node.attrib:
             style = node.attrib.get('style')
         else:
             style = ""
@@ -626,7 +626,7 @@ def group_node(self, node, node_text):
             
         if parent_path not in ["/html","/html/body"] and node_text is not None\
            and node.tag != 'body' and self.has_visible_parents(valid_parent):
-            if not self.valid_nodes.has_key(parent_path):
+            if parent_path not in self.valid_nodes:
                 self.valid_nodes[parent_path] = [node_path] 
             else:
                 if node_path not in self.valid_nodes[parent_path]:

From 5a9f4a4358eca69a3b8eb9c1b9c98f399aafbb46 Mon Sep 17 00:00:00 2001
From: Tony <santiago.montesb@gmail.com>
Date: Sun, 4 Feb 2024 00:30:14 -0500
Subject: [PATCH 4/5] Use urllib.request.urlopen in a with block

---
 sd_algorithm.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sd_algorithm.py b/sd_algorithm.py
index 1ba713a..9ffbd25 100644
--- a/sd_algorithm.py
+++ b/sd_algorithm.py
@@ -76,8 +76,8 @@ def construct_page_tree(self):
         Downloads the HTML page given the URL and creates the DOM page tree.
         Only the nodes that are useful for the segmentation are kept.
         """
-        page = urllib.urlopen(self.url)
-        html_body = page.read()  
+        with urllib.request.urlopen(self.url) as response:
+            html_body = response.read()  
         doc = html.fromstring(html_body)
         cleaner = Cleaner(**ARGS)
         try:

From 1055bd25be5551d783bf00c5d1a55ae4f223ff4b Mon Sep 17 00:00:00 2001
From: Tony <santiago.montesb@gmail.com>
Date: Sun, 4 Feb 2024 00:32:11 -0500
Subject: [PATCH 5/5] Use raw strings for the \d regex patterns

---
 __pycache__/region.cpython-312.pyc          | Bin 6838 -> 0 bytes
 __pycache__/terminal_colors.cpython-312.pyc | Bin 1530 -> 0 bytes
 sd_algorithm.py                             |   4 ++--
 3 files changed, 2 insertions(+), 2 deletions(-)
 delete mode 100644 __pycache__/region.cpython-312.pyc
 delete mode 100644 __pycache__/terminal_colors.cpython-312.pyc

diff --git a/__pycache__/region.cpython-312.pyc b/__pycache__/region.cpython-312.pyc
deleted file mode 100644
index fafd57db9293088c9d60e7695a814e5986dc67f9..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 6838
zcmd5>T}&I<6`mQ7@h^~I2V)$_V3L4YaF_fjo1djgAmpbB-2_?|;;iL(1~3_Hc4kPx
za@s_SRJKaB(0zcYQo~AB7nGHfecG*5X}1rnZq<j{F_pcetdy!x`$pkm+q|^r-0_SZ
zaI%%!O1*+R_vf5*?>+aN@1A@8$I41O1?i`ap9)9nDC)0R(Gy!JY`g}A1&XJ5TBL^Y
z%ZT&@GtAIfHi@PQc9^9phB{60>>Y|XKVUS@Ve<?fsQ4NyeF0jv3`7%RLXx5AIHOF6
z-HC|_A+AI<Vl6LlQX4%`U!VkPnC7WrhNtgP!zSLrGk54=mN!AojAIQ~@Mg#@d<EoI
z-U7Lew?b~`ZIC-=>;b1*aY-1BCE`(iJvfr7@V_xl<2WfgMTN1Qq!a@+XfM>LalKgb
z9XFR$3dI!jpuok`UT6a;I1WiE6V!|u@Hn0!L8&ZK=@OvdqPG%it6ob{<1W2i{5~iY
zfsgDcR)Nw}q|i$xluo@3D1Cal_${HV(p!1dO|@PFz9zj~{OWvYA3(ib?;-1`(Q7kQ
zz|<$<hO1P-rLu}72r4Tj5{hb`o{T7CstqgQc!C#HYcvs8z*uC}!V7UZrp%}&QHZPN
z$%q7v+xeKRMB-5)JR&6~!V{5cb!RjpMyJGxB7~psBqP+6X>%7jqGCjr!|}+3pxR5y
z&LTyOR~h)XjZBGRSP`ZbwW_2uK9W$Ya6Yn<kitq#5e3!#d_61**99@as;n%CBa#jD
zAmPqQXbdu13J2-g+3q8uAIO3vhvZ0HiG_Lu`I?fL3{6C2MUX<ga6L3BCB}uQB8N`n
ztS<uL#H1(y10NAbLlQCm;N*;I4Tod#m=X>rt2K`Z>J4Zm@-HE~MdjUfv+P}){_!^4
z_uli~-*s=-vODVyF29}ec0zwuzK(-WW!{6-&L=zS7VF--0!4ei$v?}kS{w8Jy*Yn(
z#^0UwpTdS+`TE9O{r*h-{%rk0Ags>U)aGiMGd0cGnmtgr<h?%lROPMCoYj}H`c|zC
z`9>nkm){K(f3`MlzznJ9+_(Z7(8V?wup@9|=JuB<X54UHq}P&el0DRw&nfCAo1$+Y
z2IJM?Mkj-BS;uCJZuxKX&^phxR@hYl%lKWM=9y9ErWr=BNSS%lD4m4ehB`ZrE7_D%
z=IfN?hd#88O-Wx5xR(JV1Y!(1Jex9&Fi{w{p8^?%%N7OcGC(H>U=|griWkj7rtj83
z?|(QCZ7wX8GVx}zZqwc0vs9d(seqj@K%zK5fMMpPP=|7W{6PU^`>B8>`4>C74W{lf
zA$m>bZj1@am>_WxZZvjXh;s-h+*m~BB60ZBJe0c;Q^q*rrkoNP<yyx&+sF0=3{jCd
z&q;zZCB=b_yP8Od0&riCh*JVLl8`uI8sJ9&VIYy`U^tFAHy4Y;@UiHaAt<JBH(+R9
zP{3gYKFFO_xQHkxIK&A-Caef1Tw+8kBXW=_2Mrl2Gqw-@JK=x-z}IN&y#c%Gi~$@(
z$6_Kc32}lZs%2DA2#%UyO{x>+h6^w%)dQP^kgu}HsMdl;G@J@Yl(4QBS+x-vAW{if
zq@74_Lj{t7RF*)qnXFZ{$g!(pEIz8*e|X~j*`DygiPQb6nGni&l94^(IH>_gZi6h@
zP(Y#bL$?5lTn)>+MXfn0=eD%N_tdv5=WEaS+OxjkI#pBYo3rFyJJYV#yvMgNGe48_
zv}Qc5%bu*K^U=V^Lmv(04)$ga_I@^#J$Ui4=i--c@6$&A(t*XxbM^wCyK(8j(%y`_
zb?#i=Rljg_{^(L$#?_v0>&&%vW!k#3ZO7+&R^2W6R}bR1Bfpbd+Ozn*$L^M=4ZH7$
z?uCBw);h&*Z=XAr_w2+eES-Do3Fd?QbHU@8;PGtm<RZIT_e%c2Vf=RHx!p^`k^@@S
zY{0c<r?%HFZlAZ$oBy^4mb2DMxxM*nSFXA(Q{A?F;F0^|#*Z4aohLrq^F`C|0@>>G
zY3q4u7bqrL3S1Z&pbKn#NI;OMZ!#(R<sdjmUx7WjX|jPcmIJig-$1(&t$^%8;+gBT
zbU-m8laxs@;u1aTlXoi)oqpViF!UA*B1VL!a--^w%(K@H;pV?!4rT(Vzb-GM9MJI<
z4+=*DZ@Jb3<~Iw#QU(w~Vi_3HVX{O90{Tdb`IdEB$vR&q5qAv|0Ym1ctF1^u46M<`
z13+m+QJMtIZ#TR^f(?$5Y5>!+T$;=|`mP;-nHk?vl&6<?JMYl<_U5;(*$HcA^nC<7
zWcsOQiU!F%tBo{7!C1$qO+(ZTI#8J$F<`AX7UvO^i%3h5nxJ88Y=i^oBrq!nxC8-o
zWehMFG6O7mL5>P>9>OHKg+sm)%%trQX;sKkDK-h=rL4?|g5-x4N==X@w};}Xkdz83
zDK6EXI+_XvB{XZ5o_t0Jzy2qrq*W(@+lyI=Tor|+Y450_(p|}Y9B{R#IyzFXzn(hQ
z)X~uzOoik?+jl!Ux}YAq+}(3x;Dp{7x)Qq56>vxa9Dv1XN(gnTJtoKES`aIt=c$Y&
zs3v*ps>+UlY9z2Zq>scz)glR#Vk9aASOUpX2X+x$)*?|8Xu^mf3E-+#D1(I3ge1A6
zfW6N*_ktjKzlOP1YKM2$mUnp;4$mKc@8|~QuyM29YmB{;%e!h9UYma{?P^)>e>k`@
zm_9a?=@@$A+p`q?@aU@V&~HzE+WVW{boXHT&EfRrw{n-Gnak1aF+M#u{`lB<-c!GD
zWB$fc^Ur^@-2L#(%9)23RxW(foNe#=tmbih_ZPN5RsEqVJvf{j9L)@lW(Q;0Gvn#_
z<l{4wd3VFFYCfoazjoR2=;R|gcj$EH(CO@<v)QI|8TYxlvn7)AbxlhHnYzF|Ti(0#
ze%-ygw726?^T#b8wR|#^+1HER{U5yf{+pS)jz!x))?Bc-wR&ntW6r%N<KD9*EO%$!
zug=<@xN8@B=X=w>H_{jT(*sws?zccE)pcoW-QU+7G^k>mck$AE^?SBwGD6*AkLi?+
z`P^nX<uXSLm;=oeg*!?x`Vtf%P^HS@Gv1HN;$i{q!H=kNs0@J*lMET21o!I6DFvd5
zQfI=TL<?{Vp@n|NbFF=exWH*1#wP?h-cs<iK+%8t0w!{1OSni9Jgm^f_GDb7cBrd0
z5Gxp=gBDCAy#+hl+7$vuc{gNmzFR3<WzO21u{JODEbo38TnVmP50it?pQ-kzt$yv$
zD^(~elPL5S6yR}!zKO~!rGR!!FHmWL5+Mp=XyZ67IoisqT0aDU2e*`@875##e!r-G
z{objCb9g}{Xk^5Og$t}6<7r{3q+QbtDdb|@7cfgbFovdfc%K0v#|&@dup=0ck+Otc
zGO7cjEG@#q_XoqvpTC(#kr*O=P4h1ItOag+Id@CO-LlL+tXiqcx(|Vsd*SkRx9W+f
ze(}Klu6tcsPunc}Gv`ZoLVG3xeS_%DX&fiFDe=Y)i5T!1W9^p;p*GsAeoxo6rRj;o
zYd(pP@ixCHKz>GlC%}-qG{m2YskkB$Tq|0tgr{e+xnP<_<>+H+j|eDv0tjWi9TOwn
z=6-kPotbxk{LYUT<!p6x+S>dQZQKGb;^^?*oXR<QxqD|(Lz4FedNnM!IEME4)K3w+
zl*?*0#&Z!xkz!W?Cv?q{<F@pmZPyOk8!;K+K`|~shCn!eg^5XJ1}-lW1pY;|kWgdf
z1YI!^(^V_9(Nqh@`qzY+Ejv;R97)%f9f|QWbm+!vhikyQUGH?|99+i1<s7XUN9(eC
z)e*!~s%yS0=W5Nkpe5r9rk%m3t{wmX+mkf3S!(p%{rwzycKToL5;zCG7m<CEF5r%B
zb`v*lif_xN6d%NHe%t1FRvn$sZ;mJ9>P$O3%We)x(=Z*Ob%BD%V<xf$25#tR66{QA
zEj%L!yN6Enaoj7M|MK(|PMhTse>;cIX+d<};{9FY^LP9=lYg^#eOjmbdb(j&dUf!`
zrM|O$r*WR#KxFi}8Ipi32)Faiviw{(?u|iQRQXoQv%EnT<F|I7$7e=^Vk^XL<#+?1
zR1FUDz`B*FM|*`e$O?yA8N$NLtOnWeGFxQX{ARnA`3yXk6rz^mYfW)(<zEg+KvQA|
zM9YD#!h7|T`3Ps%?Z$Y{-9m2nmuxETagm~LWsC2q6=7m@$;yKJXw{#f!{F^1&JzxT
z9s*GC+Xy-B;in3CqQyt&a9Fj6!xIU9O2o1=9DaK$B5FNSBd}}1SPzy?V}_?aIn^Wt
zQVAVb8o;a{Gkh`7q5^y!lrY14p8P6gcc^u?%Di`jYcOA+*Sp+i?|Qr0+`L|8F~73z
zX3e|TomO-0x*M_$_cpUOV<I9@A-xU))M|J?nux_CB0MhP&pQ$x^{P#ro)Cj$@G}p&
y%@B@Fvw3DcrY*5c8U_|TpJfbb)=V@_Z}=(N{uR~y71i_&dy=6m*D1`&TK)wfFBJ0t

diff --git a/__pycache__/terminal_colors.cpython-312.pyc b/__pycache__/terminal_colors.cpython-312.pyc
deleted file mode 100644
index 6b0ef2ab28fd62d685516f386a2106d2e5204484..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1530
zcma)4&u<z>6rNocmfdBs9WdZHshrrYO=6|ErnVASk@#g|Ia*rmD9u2$BFwgO%Nl3b
zscO^%s(dg~EBO|B<LIiF{s;X7dfFSBQ>Fd`RXIfB)Hk~<s#GQFNPP3X_rCdNnfHEI
zRT;the(^Wsrw~GavcO-$gp5)^ULzir5kcpOaKG?g1*0=Q9ccP`wofW(^nzn&41_s?
zFi$ufAUqaG00)VHLnMerL&RYc!V(d&Ou|?p5{?iVt3<(362UV>#j{2f&lw_#cEH+D
z8Il>7dl}Kq%(|I5hWOf4Ex5IJt7B7j$W^-@#@oE(Q+3c)Ut{=I_54)reM#({>IA6c
zuEO)4|BpK1X1sryB*}AxQ)j940u>LN?N-~eJK|z(W9R<IW+%8<yPq{D@kc({oRH0J
zvm-O<=JrlD<MYf;*3V~wZ+I@Sd9FZ&#TuyHaC6JgXUs8IquiY__k7IVrQElj;GVWy
zdC=i&OO&tdclf2}O_+q~V)GGz-{^CI*Dy=pA$?kOqp8}B{CV<}?4-Ngnm?ajE$G+I
zP@2~$N{49ZQ7Km}RcWZQpMU(MM8(Q}xmqe|RD7DNYI|Cl3cI<z$5be3#X_2+f^D?E
zu-OMJjjM$R^(VGr*>$^de0o$b8ur(x?UTCMuul!EPK<BrCsz9_qj_rA%dLiOe+EMH
zq-DSX(rA5I2bFnr+-T`ud+wc_QIQ<kjTbGWGc)Pgy-DJIxYAw+=%QgFbs_wq_*-%T
zqR4Qv#S;)oL$>`Wjo2U7U;@Ire%vq(U8l0Hn{D!{#dt*5U%qOzyowb8t)()EGME9N
z+*2x)tGQ=1lrOtGpD)uuzWm6VW#vG%RJ7(86AI6A8Vx{aDOaF@T;Y%kpH?f37K==G
zKn2!zOJd$}fS-{y&v=Xh>zWJfYI_Ub_Z$t?h3@W9O#(!(cgr4x<K5g)iFAuY^#&BA
zR5w4A7rKR^G7mX9!B`xalJvuHq?`LgRxX;o)yw7H-OFb0!~TjRzt?~1$V-EmBd-jA
zt+9iV3%j>{xz&5vUv`w6ec~v$2OEyEGB|LQwJTuj%+#}51)F6@W-6Jfz~%sKKqaq(
z*Z))f>&8__xiwgEln>x*;IbH8b_Fa0H%juwVJ~&L?MT=AhmLe>P;{iVt2sxya|JB(
z>xLs`yZI3x3MRcLWv{8FIqy|h+0#y6dK~`5YyU%Q4GL_KZPw|(0vyM^xp9+A{*3_K
G1OEV$;(9><

diff --git a/sd_algorithm.py b/sd_algorithm.py
index 9ffbd25..04e844e 100644
--- a/sd_algorithm.py
+++ b/sd_algorithm.py
@@ -317,8 +317,8 @@ def candidate_group_level_validated(self, max_group, article, grouped_comments):
         comment_remaining_nodes = comment_path.split("/")
         
         if len(article_remaining_nodes) > 1 and len(comment_remaining_nodes) > 1:
-            article_number = re.search("\d",article_remaining_nodes[0])
-            comment_number = re.search("\d",comment_remaining_nodes[0])
+            article_number = re.search(r"\d",article_remaining_nodes[0])
+            comment_number = re.search(r"\d",comment_remaining_nodes[0])
             if article_number and comment_number:
                 article_number.start()
                 comment_number.start()