From 9841761ab3be8aab01e65e20e9af3bc096096bea Mon Sep 17 00:00:00 2001 From: Michael Walsh Date: Fri, 18 Mar 2022 13:49:09 -0400 Subject: [PATCH 1/3] Updated output to show better progress --- abdul.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/abdul.py b/abdul.py index 2362ad1..836350f 100644 --- a/abdul.py +++ b/abdul.py @@ -1,5 +1,4 @@ import re -import sys import requests import argparse @@ -42,12 +41,13 @@ def addURLs(hrefs): newURL = node.url + newURL # Build URL if it is refrencing a local resource if rootDomain in newURL and node.depth < args.d and newURL not in webListingsStrings and not re.match(r'\.(css|zip|gz|bz2|png|gif|jpg|jpeg|bmp|mpg|mpeg|avi|wmv|mov|rm|ram|swf|flv|ogg|webm|mp4|mp3|wav|acc|wma|mid|midi)$',newURL): if args.v: - print("Adding URL: " + newURL) + print("Adding URL: (" + str(len(webListings)) + ") " + newURL) webListings.append(webNode(newURL,node.depth+1)) webListingsStrings.append(newURL) def dedupeFile(): + print("Starting dedupe") outWords = [] outFile = open(args.o,"a", encoding="utf-8") with open(tempFile, encoding="utf-8") as inFile: @@ -69,10 +69,17 @@ def dedupeFile(): } # Loops through each node dynamicaly + +if not args.v: + print("You have chonse no verbose output. This proccess will take a while. You might want to kill this and add \'-v\' to track progress.") +else: + currentNode = 1 + for node in webListings: if args.v: - print("Working on: " + node.url + ":" + str(node.depth)) + print("Working on: (" + str(currentNode) + " of " + str(len(webListings)) + ") " + node.url + ":" + str(node.depth)) + currentNode+=1 # Try website connection try: From 35f6eaf248d9379a2958d62980c9701919440861 Mon Sep 17 00:00:00 2001 From: Michael Walsh Date: Fri, 18 Mar 2022 14:07:33 -0400 Subject: [PATCH 2/3] Added max URL variable --- abdul.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/abdul.py b/abdul.py index 836350f..3be382d 100644 --- a/abdul.py +++ b/abdul.py @@ -8,6 +8,7 @@ parser.add_argument("URL", help="URL to start from") parser.add_argument("-o", default="word.list", help="Location to save file (default=word.list") parser.add_argument("-d", type=int, default=2, help="Depth to spider (default=2)") +parser.add_argument("-m", type=int, default=500, help="Max number of pages to scrape (default=500)") parser.add_argument("-l", type=int, default=4, help="Minimum length of word to search for (default=4)") parser.add_argument("-v", help="Display verbose output", action="store_true") @@ -39,7 +40,7 @@ def addURLs(hrefs): newURL = urlItem.replace("href=",'').replace('"','',2) if newURL[0] == "/": newURL = node.url + newURL # Build URL if it is refrencing a local resource - if rootDomain in newURL and node.depth < args.d and newURL not in webListingsStrings and not re.match(r'\.(css|zip|gz|bz2|png|gif|jpg|jpeg|bmp|mpg|mpeg|avi|wmv|mov|rm|ram|swf|flv|ogg|webm|mp4|mp3|wav|acc|wma|mid|midi)$',newURL): + if len(webListings) < args.m and rootDomain in newURL and node.depth < args.d and newURL not in webListingsStrings and not re.match(r'\.(css|zip|gz|bz2|png|gif|jpg|jpeg|bmp|mpg|mpeg|avi|wmv|mov|rm|ram|swf|flv|ogg|webm|mp4|mp3|wav|acc|wma|mid|midi)$',newURL): if args.v: print("Adding URL: (" + str(len(webListings)) + ") " + newURL) webListings.append(webNode(newURL,node.depth+1)) @@ -87,7 +88,8 @@ def dedupeFile(): except: print("Host unreachable: " + node.url) continue - addURLs(re.findall('href=".+?"',data.text)) + if len(webListings) < args.m: + addURLs(re.findall('href=".+?"',data.text)) searchContent(data.text) From 88faac963b0d137318a382ea677c6c155db854a2 Mon Sep 17 00:00:00 2001 From: Michael Walsh Date: Fri, 18 Mar 2022 14:11:31 -0400 Subject: [PATCH 3/3] Updated README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 65fdb6e..1ea7aa9 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,7 @@ options: -h, --help Show help message and exit -o O Location to save file (default=word.list -d D Depth to spider (default=2) + -m M Max number of pages to scrape (default=500) -l L Minimum length of word to search for (default=4) -v Display verbose output \ No newline at end of file