-
Notifications
You must be signed in to change notification settings - Fork 1
Reverse Pull Request #19
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: start
Are you sure you want to change the base?
Changes from all commits
b07858c
abab21a
a45897e
2f38983
b1bfafe
33c4475
f280303
f78badb
cb6e408
b628bf5
9e9ffce
4d0ca7f
3a417e8
585f556
6ff1db0
88699bd
b9e7cd2
46abaaa
0608403
78ae703
ea9c874
40469e5
10ea207
d5cbbd4
7a74f0d
6b92f7d
6d24954
f7f318f
b962897
d428941
efa0db8
f3bd076
2885950
f222ed2
056bb02
e8989fa
b610135
dee832f
76e4f16
a3cc087
422886f
c43b759
21adc30
91b4576
0c58c1d
c9d824d
f3d85a4
b9c8eba
e26a8f8
926fae8
8418133
4b63031
ef939e8
91ee55d
3aae7df
f532175
7bf6280
2bcf6a0
969ed09
eeb8289
5c0c537
9752398
69568bb
bea0b72
3df7d79
b96b34a
5c990b0
a0f90f9
bfa74d2
24da5b6
bfbf7dc
13ecd6a
eb19b89
b1d7fd2
a6c430d
8a262da
9f07997
07437e3
2eb74f5
330eb25
5e9b645
b134862
d5c9954
7dd52dd
c312802
7122f58
2f67d0c
2f98ff8
40454f7
438fbe7
9b1d027
fff6526
edf4525
496725d
e943aa0
3d09fbc
cfa67ae
cde2f5f
1efcc5b
28ff691
90806f3
d4872a7
a126bfa
b82a076
762cb24
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| web: ./run |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,33 @@ | ||
| # PYSEARCH | ||
|
|
||
| **Authors:** Marc Fieser, Sera Smith, and Ben Shields | ||
|
|
||
| **Live URL:** https://vascodagama.herokuapp.com/ | ||
|
|
||
| ## Getting Started: | ||
| ``` | ||
| - cd <directory containing this file> | ||
|
|
||
| - $VENV/bin/pip install -e . | ||
|
|
||
| - $VENV/bin/initialize_db development.ini | ||
|
|
||
| - $VENV/bin/pserve development.ini | ||
| ``` | ||
|
|
||
| ## Global Variables: | ||
|
|
||
| harvester.py: | ||
| NUM_OF_OCCURANCES -- number of times a word must appear before the harvester adds it to the keyword table in the database. | ||
|
|
||
| crawler.py: | ||
| CRAWL_COUNT -- maximum number of pages to crawl (passed to Scrapy as CLOSESPIDER_PAGECOUNT). | ||
| DEPTH_LEVEL -- intended maximum link depth for the crawl. | ||
|
|
||
| pipelines.py: | ||
| MINIMUM_MATCHES -- number of keyword matches a URL must exceed before its matches are saved to the matches table in the database. | ||
|
|
||
| ## License: | ||
| MIT | ||
|
|
||
| **Copyright 2017** |
This file was deleted.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,56 @@ | ||
| # -*- coding: utf-8 -*- | ||
|
|
||
| # Define here the models for your spider middleware | ||
| # | ||
| # See documentation in: | ||
| # http://doc.scrapy.org/en/latest/topics/spider-middleware.html | ||
|
|
||
| from scrapy import signals | ||
|
|
||
|
|
||
class HarvesterSpiderMiddleware(object):
    """Pass-through spider middleware for the harvester project.

    Not all hooks need to be defined: if one is missing, Scrapy acts as if
    the middleware does not modify the passed objects. Every hook below
    forwards its input unchanged.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and connect it to the ``spider_opened`` signal.

        Scrapy calls this to build the middleware instance.
        """
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_opened, signal=signals.spider_opened
        )
        return middleware

    # The hooks are invoked on the middleware *instance*, so each one must
    # accept ``self``; without it every bound call fails with a TypeError.

    def process_spider_input(self, response, spider):
        """Handle a response on its way into the spider.

        Should return None or raise an exception; this implementation
        always returns None.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Yield every element of *result* unchanged.

        Called with the results returned from the spider after it has
        processed *response*. Must produce an iterable of Request, dict or
        Item objects.
        """
        for element in result:
            yield element

    def process_spider_exception(self, response, exception, spider):
        """Handle an exception raised by a spider or another middleware.

        Should return None or an iterable of Response, dict or Item
        objects; this implementation returns None.
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """Yield every start request unchanged.

        Works like ``process_spider_output`` but has no associated
        response. Must produce only requests (not items).
        """
        for request in start_requests:
            yield request

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        spider.logger.info('Spider opened: %s' % spider.name)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,80 @@ | ||
| """pipelines.py.""" | ||
| # -*- coding: utf-8 -*- | ||
|
|
||
| # Define your item pipelines here | ||
| # | ||
| # Don't forget to add your pipeline to the ITEM_PIPELINES setting | ||
| # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html | ||
|
|
||
| from sqlalchemy.orm import sessionmaker | ||
| from pysearch.models import Keyword, Match | ||
|
|
||
|
|
||
| from pysearch.models.meta import Base | ||
| from sqlalchemy.engine.url import URL | ||
| from sqlalchemy import create_engine | ||
|
|
||
# Connection parameters expanded into sqlalchemy.engine.url.URL by db_connect().
DATABASE = {
    # 'postgresql' is the canonical dialect name. The old 'postgres' alias
    # was deprecated and then removed in SQLAlchemy 1.4, while 'postgresql'
    # is accepted by every version.
    'drivername': 'postgresql',
    'host': 'localhost',
    'port': '5432',
    'database': 'pysearch',
}

# Threshold compared against the number of keyword matches found for a page
# in CrawlerPipeline.process_item.
MINIMUM_MATCHES = 5
|
|
||
|
|
||
def db_connect():
    """Return a SQLAlchemy engine built from the module-level DATABASE mapping."""
    connection_url = URL(**DATABASE)
    return create_engine(connection_url)
|
|
||
|
|
||
def create_keyword_table(engine):
    """Create all tables registered on ``Base.metadata`` using *engine*."""
    metadata = Base.metadata
    metadata.create_all(engine)
|
|
||
|
|
||
class CrawlerPipeline(object):
    """Item pipeline that stores keyword matches for pages scraped by the crawler."""

    def __init__(self):
        """Connect to the database, create the tables and build a session factory."""
        engine = db_connect()
        create_keyword_table(engine)
        self.Session = sessionmaker(bind=engine)

    def process_item(self, item, spider):
        """Save the item's keyword matches in the database.

        Items coming from any spider other than the one named 'crawler'
        are passed through untouched.

        Args:
            item: scraped item exposing ``item['words']`` (word -> count)
                and ``item['url']``.
            spider: the spider that produced *item*.

        Returns:
            The same *item*, so later pipeline components can use it.

        Raises:
            Any exception raised while querying or committing; the session
            is rolled back before the exception is re-raised.
        """
        # Compare by value: an identity test (`is`) against a string literal
        # only succeeds when the interpreter happens to intern both strings.
        if spider.name != 'crawler':
            return item

        session = self.Session()
        try:
            matches = self._find_matches(session, item)
            # NOTE(review): the strict `>` means MINIMUM_MATCHES + 1 matches
            # are required before anything is stored -- confirm that this is
            # the intended meaning of "minimum".
            if len(matches) > MINIMUM_MATCHES:
                session.add_all(self._build_rows(matches))
                session.commit()
                print('Pysearch Database Updated...')
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()
        return item

    @staticmethod
    def _find_matches(session, item):
        """Return one match dict per stored keyword that occurs in the item's words."""
        page_words = item['words']
        return [
            {
                'word': row.keyword,
                'key_weight': row.keyword_weight,
                'count': page_words[row.keyword],
                'url': item['url'],
            }
            for row in session.query(Keyword).all()
            if row.keyword in page_words
        ]

    @staticmethod
    def _build_rows(matches):
        """Convert match dicts into Match model instances ready to be added."""
        return [
            Match(
                keyword=match['word'],
                keyword_weight=match['key_weight'],
                page_url=match['url'],
                count=match['count'],
            )
            for match in matches
        ]
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,90 @@ | ||
| # -*- coding: utf-8 -*- | ||
|
|
||
| # Scrapy settings for harvester project | ||
| # | ||
| # For simplicity, this file contains only settings considered important or | ||
| # commonly used. You can find more settings consulting the documentation: | ||
| # | ||
| # http://doc.scrapy.org/en/latest/topics/settings.html | ||
| # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html | ||
| # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html | ||
|
|
||
| BOT_NAME = 'harvester' | ||
|
|
||
| SPIDER_MODULES = ['harvester.spiders'] | ||
| NEWSPIDER_MODULE = 'harvester.spiders' | ||
|
|
||
| # Crawl responsibly by identifying yourself (and your website) on the user-agent | ||
| #USER_AGENT = 'harvester (+http://www.yourdomain.com)' | ||
|
|
||
| # Obey robots.txt rules | ||
| ROBOTSTXT_OBEY = True | ||
|
|
||
| # Configure maximum concurrent requests performed by Scrapy (default: 16) | ||
| #CONCURRENT_REQUESTS = 32 | ||
|
|
||
| # Configure a delay for requests for the same website (default: 0) | ||
| # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay | ||
| # See also autothrottle settings and docs | ||
| #DOWNLOAD_DELAY = 3 | ||
| # The download delay setting will honor only one of: | ||
| #CONCURRENT_REQUESTS_PER_DOMAIN = 16 | ||
| #CONCURRENT_REQUESTS_PER_IP = 16 | ||
|
|
||
| # Disable cookies (enabled by default) | ||
| #COOKIES_ENABLED = False | ||
|
|
||
| # Disable Telnet Console (enabled by default) | ||
| #TELNETCONSOLE_ENABLED = False | ||
|
|
||
| # Override the default request headers: | ||
| #DEFAULT_REQUEST_HEADERS = { | ||
| # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', | ||
| # 'Accept-Language': 'en', | ||
| #} | ||
|
|
||
| # Enable or disable spider middlewares | ||
| # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html | ||
| #SPIDER_MIDDLEWARES = { | ||
| # 'harvester.middlewares.HarvesterSpiderMiddleware': 543, | ||
| #} | ||
|
|
||
| # Enable or disable downloader middlewares | ||
| # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html | ||
| #DOWNLOADER_MIDDLEWARES = { | ||
| # 'harvester.middlewares.MyCustomDownloaderMiddleware': 543, | ||
| #} | ||
|
|
||
| # Enable or disable extensions | ||
| # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html | ||
| #EXTENSIONS = { | ||
| # 'scrapy.extensions.telnet.TelnetConsole': None, | ||
| #} | ||
|
|
||
| # Configure item pipelines | ||
| # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html | ||
| ITEM_PIPELINES = { | ||
| 'harvester.pipelines.HarvesterPipeline': 300, | ||
| # 'harvester.pipelines.CrawlerPipeline': 300, | ||
| } | ||
|
|
||
| # Enable and configure the AutoThrottle extension (disabled by default) | ||
| # See http://doc.scrapy.org/en/latest/topics/autothrottle.html | ||
| #AUTOTHROTTLE_ENABLED = True | ||
| # The initial download delay | ||
| #AUTOTHROTTLE_START_DELAY = 5 | ||
| # The maximum download delay to be set in case of high latencies | ||
| #AUTOTHROTTLE_MAX_DELAY = 60 | ||
| # The average number of requests Scrapy should be sending in parallel to | ||
| # each remote server | ||
| #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 | ||
| # Enable showing throttling stats for every response received: | ||
| #AUTOTHROTTLE_DEBUG = False | ||
|
|
||
| # Enable and configure HTTP caching (disabled by default) | ||
| # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings | ||
| #HTTPCACHE_ENABLED = True | ||
| #HTTPCACHE_EXPIRATION_SECS = 0 | ||
| #HTTPCACHE_DIR = 'httpcache' | ||
| #HTTPCACHE_IGNORE_HTTP_CODES = [] | ||
| #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You could probably remove all of the commented lines since they're not being used anyway. |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,4 @@ | ||
| # This package will contain the spiders of your Scrapy project | ||
| # | ||
| # Please refer to the documentation for information on how to create and manage | ||
| # your spiders. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,80 @@ | ||
| """Spider for crawling.""" | ||
| import collections | ||
| from stop_words import get_stop_words | ||
| from scrapy.crawler import CrawlerProcess | ||
| from scrapy.utils.project import get_project_settings | ||
| from scrapy.linkextractors import LinkExtractor | ||
| from scrapy.spiders import Rule, CrawlSpider | ||
| from scrapy.item import Item, Field | ||
|
|
||
| CRAWL_COUNT = 10 | ||
| DEPTH_LEVEL = 10 | ||
|
|
||
|
|
||
class MyItem(Item):
    """Container for the data scraped from a single page."""

    # Address of the page (filled from ``response.url`` by the spider).
    url = Field()
    # Word counts for the page (filled with a ``collections.Counter``).
    words = Field()
|
|
||
|
|
||
class CrawlingSpider(CrawlSpider):
    """Spider that follows links and yields per-page word counts."""

    name = "crawler"
    custom_settings = {
        'ITEM_PIPELINES': {
            'pysearch.harvester.pipelines.CrawlerPipeline': 300,
        }
    }

    # Every extracted link is handed to parse_items and followed further.
    # NOTE(review): the empty "allow" pattern presumably matches every URL.
    rules = (Rule(LinkExtractor(allow=("", ),), callback="parse_items", follow=True),)

    def __init__(self, url=None, *args, **kwargs):
        """Initialize the spider.

        *url* is accepted but not used by this method; the remaining
        arguments are forwarded to ``CrawlSpider.__init__``.
        """
        super(CrawlingSpider, self).__init__(*args, **kwargs)

    def parse_items(self, response):
        """Yield one MyItem holding the page URL and its word counts.

        Words are taken from the text of ``<p>`` elements, split on
        whitespace, lowercased, and counted; English stop words are left
        out of the counts.
        """
        # A set makes each stop-word test O(1) instead of scanning a list.
        stop_words = set(get_stop_words('english'))

        words = []
        for paragraph in response.css('p::text').extract():
            words.extend(paragraph.split())

        # Filter while counting rather than counting everything and then
        # deleting the stop words afterwards.
        word_count = collections.Counter(
            word for word in lower_list(words) if word not in stop_words
        )

        item = MyItem()
        item['words'] = word_count
        item['url'] = response.url
        yield item
|
|
||
|
|
||
def crawl(url):
    """Run a crawl that starts at *url*.

    Loads the project settings, caps the crawl at CRAWL_COUNT pages and
    DEPTH_LEVEL link depth, then runs a CrawlingSpider subclass whose only
    start URL is *url*.
    """
    settings = get_project_settings()
    # NOTE(review): nothing visible in this file reads ``settings.url``;
    # kept in case other code relies on it -- candidate for removal.
    settings.url = url
    settings["CLOSESPIDER_PAGECOUNT"] = CRAWL_COUNT
    # Scrapy's depth cap is the DEPTH_LIMIT setting. The previous key,
    # "DEPTH_LEVEL", is not a setting Scrapy reads, so no limit was applied.
    settings["DEPTH_LIMIT"] = DEPTH_LEVEL
    process = CrawlerProcess(settings)

    class ThisSpider(CrawlingSpider):
        """CrawlingSpider bound to the requested start URL."""

        start_urls = [url]

    process.crawl(ThisSpider)
    process.start()
|
|
||
|
|
||
def lower_list(list_in):
    """Return a new list with every string in *list_in* lowercased.

    The input is not modified; an empty input yields an empty list.
    """
    return [word.lower() for word in list_in]
|
|
||
if __name__ == '__main__':
    import sys

    # Fail with a usage message instead of a bare IndexError when the start
    # URL argument is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: {} <start-url>'.format(sys.argv[0]))
    crawl(sys.argv[1])
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
remove corpse code