Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
105 commits
Select commit Hold shift + click to select a range
b07858c
layout and result templates. routes.
Jan 9, 2017
abab21a
Merge pull request #1 from Pysearch/templates
iamrobinhood12345 Jan 9, 2017
a45897e
temp commit to switch to new branch.
Jan 9, 2017
2f38983
home view template added. modifed layout template.
Jan 9, 2017
b1bfafe
Merge branch 'master' of https://github.com/Pysearch/Pysearch2.0 into…
Jan 9, 2017
33c4475
Merge branch 'views' of https://github.com/Pysearch/Pysearch2.0 into …
Jan 9, 2017
f280303
created dummy db in default, tested basic app. links/routes working
Jan 10, 2017
f78badb
spiders integrated into app
midfies Jan 10, 2017
cb6e408
rge branch 'harvester' of https://github.com/Pysearch/Pysearch2.0 int…
Jan 10, 2017
b628bf5
posgres
Jan 10, 2017
9e9ffce
scrapy pipeline updated to update db
midfies Jan 10, 2017
4d0ca7f
adding file that tests harvester from scripts dir
midfies Jan 10, 2017
3a417e8
Merge pull request #4 from Pysearch/harvester
iamrobinhood12345 Jan 10, 2017
585f556
Merge branch 'development' of https://github.com/Pysearch/Pysearch2.0…
Jan 10, 2017
6ff1db0
base.metadata.drop_all(engine) in create_keyword_table in pipelines.py.
Jan 10, 2017
88699bd
psycopg2 and stop-words added to requires in setup.py.
Jan 10, 2017
b9e7cd2
moved test harv functionality into homeview.
Jan 10, 2017
46abaaa
added route for computing results, added view for computing results, …
Jan 10, 2017
0608403
templates and dummy data updated
Jan 10, 2017
78ae703
merge conflict resolved, views/default.py, setup.py.
Jan 10, 2017
ea9c874
update dummy data for pserve
Jan 10, 2017
40469e5
resolve stop_words to stop-words in requires in setup.py.
Jan 10, 2017
10ea207
basic crawler spider
midfies Jan 10, 2017
d5cbbd4
commiting to pull in changes from crawler.
Jan 10, 2017
7a74f0d
edited template for results, edited views results list of dicts, adde…
Jan 10, 2017
6b92f7d
adjusted development.ini for my postgres username (ben), modified ini…
Jan 10, 2017
6d24954
working on crawler spider and multiple pipelines
midfies Jan 10, 2017
f7f318f
pushing converted keyword model data to results.jinja2 in results_vie…
Jan 10, 2017
b962897
modified home.jinja2 search to have form with method post.
Jan 10, 2017
d428941
moved results test param and edited home_view print statement in defa…
Jan 10, 2017
efa0db8
crawling pipeline added, FIRST WORKING SEARCH ENGINE
midfies Jan 11, 2017
f3bd076
removing corpse code and print statements
midfies Jan 11, 2017
2885950
added passing url through harvest function
midfies Jan 11, 2017
f222ed2
edited harvester process to take HarvestSpider as arg (instead of har…
Jan 11, 2017
056bb02
merge conflict resolved.
Jan 11, 2017
e8989fa
debugging critical: unhandled error in deferred.
Jan 11, 2017
b610135
shortened time delay to five seconds in home_view.
Jan 11, 2017
dee832f
debugging views.
Jan 11, 2017
76e4f16
placed class HarvestSpider within def harvest(url) in harvester.py. s…
Jan 11, 2017
a3cc087
commented out print statements.
Jan 11, 2017
422886f
added drop all tables for running harvester
midfies Jan 11, 2017
c43b759
debugging critical error db population.
Jan 11, 2017
21adc30
working on integration
midfies Jan 11, 2017
91b4576
moved pipeline functionality to spider
midfies Jan 11, 2017
0c58c1d
merging views to dev-branch
midfies Jan 11, 2017
c9d824d
first functioning harvester/crawler
midfies Jan 11, 2017
f3d85a4
removed beautifulsoup from setup, added requirements.txt
midfies Jan 11, 2017
b9c8eba
added Procfile
midfies Jan 11, 2017
e26a8f8
added run
midfies Jan 11, 2017
926fae8
added runapp.py
midfies Jan 11, 2017
8418133
changed crawl to 10 pages for debugging purposes. added route for loa…
Jan 12, 2017
4b63031
restructured model. ranking algorithm in progress.
Jan 12, 2017
ef939e8
setup dbsession fixtures. initial unittest.
Jan 12, 2017
91ee55d
added an ajax call to preventdefault harvesting and crawling operatio…
Jan 12, 2017
3aae7df
Merge pull request #7 from Pysearch/dev-branch
iamrobinhood12345 Jan 12, 2017
f532175
Merge pull request #8 from Pysearch/loading-view
iamrobinhood12345 Jan 12, 2017
7bf6280
Merge branch 'dev-branch' into ranking-sera
midfies Jan 12, 2017
2bcf6a0
Merge pull request #9 from Pysearch/ranking-sera
midfies Jan 12, 2017
969ed09
Merge branch 'dev-branch' of https://github.com/Pysearch/Pysearch2.0 …
midfies Jan 12, 2017
eeb8289
reworking spiders to match new model
midfies Jan 12, 2017
5c0c537
adjusted models to work with spiders, removed punctuation
midfies Jan 12, 2017
9752398
testing
Jan 12, 2017
69568bb
ranking now taking in results data, scored it, and put it in format r…
Jan 12, 2017
bea0b72
Merge pull request #10 from Pysearch/integration-thursday
midfies Jan 12, 2017
3df7d79
Merge pull request #11 from Pysearch/dev-branch
midfies Jan 12, 2017
b96b34a
changed model name from Keyword to Match.
Jan 12, 2017
5c990b0
Merge branch 'ranking-sera' of https://github.com/Pysearch/Pysearch2.…
Jan 12, 2017
a0f90f9
took out corpose code and pdb
Jan 12, 2017
bfa74d2
Merge branch 'dev-branch' into ranking-sera
iamrobinhood12345 Jan 12, 2017
24da5b6
Merge pull request #13 from Pysearch/ranking-sera
iamrobinhood12345 Jan 12, 2017
bfbf7dc
working on bootstrap
midfies Jan 12, 2017
13ecd6a
Merge branch 'dev-branch' of https://github.com/Pysearch/Pysearch2.0 …
midfies Jan 12, 2017
eb19b89
FIRST START TO FINISH SEARCH
midfies Jan 12, 2017
b1d7fd2
Merge branch 'dev-branch' of https://github.com/Pysearch/Pysearch2.0 …
midfies Jan 12, 2017
a6c430d
some front end magic
midfies Jan 13, 2017
8a262da
added about me route and view
midfies Jan 13, 2017
9f07997
adjusting for no results
midfies Jan 13, 2017
07437e3
Merge pull request #14 from Pysearch/integration-thursday
midfies Jan 13, 2017
2eb74f5
add about me template
midfies Jan 13, 2017
330eb25
Merge pull request #15 from Pysearch/integration-thursday
midfies Jan 13, 2017
5e9b645
Merge pull request #16 from Pysearch/dev-branch
midfies Jan 13, 2017
b134862
role midfies management.
Jan 13, 2017
d5c9954
adding new tests
midfies Jan 13, 2017
7dd52dd
adding tox
midfies Jan 13, 2017
c312802
about me
Jan 13, 2017
7122f58
Merge pull request #18 from Pysearch/final-testing
iamrobinhood12345 Jan 13, 2017
2f67d0c
prepping master for presentation.
Jan 13, 2017
2f98ff8
cleaned up test page, corpse code and docstrings.
Jan 13, 2017
40454f7
docstrings and readme in harvester directory.
Jan 13, 2017
438fbe7
model docstrings.
Jan 13, 2017
9b1d027
cleaning css, removed files, cleaned initializedb.py, renamed css files.
Jan 13, 2017
fff6526
modifying templates, deleting unused templates, updating about me, py…
Jan 13, 2017
edf4525
debugged results page seed page search.
Jan 13, 2017
496725d
Update .gitignore
iamrobinhood12345 Jan 23, 2017
e943aa0
Delete .DS_Store
iamrobinhood12345 Jan 23, 2017
3d09fbc
Update .gitignore
iamrobinhood12345 Jan 23, 2017
cfa67ae
Update and rename README.txt to README.md
iamrobinhood12345 Jan 23, 2017
cde2f5f
Update README.md
iamrobinhood12345 Jan 23, 2017
1efcc5b
Update README.md
iamrobinhood12345 Jan 23, 2017
28ff691
Update README.md
iamrobinhood12345 Jan 23, 2017
90806f3
Update README.md
iamrobinhood12345 Jan 23, 2017
d4872a7
Update README.md
iamrobinhood12345 Jan 23, 2017
a126bfa
Update README.md
iamrobinhood12345 Jan 23, 2017
b82a076
Update README.md
iamrobinhood12345 Jan 23, 2017
762cb24
Update README.md
iamrobinhood12345 Jan 23, 2017
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ var/
*.egg-info/
.installed.cfg
*.egg
.DS_Store

# PyInstaller
# Usually these files are written by a python script from a template
Expand Down Expand Up @@ -96,4 +97,4 @@ pip-selfcheck.json
share

#misc
*.sqlite
*.sqlite
1 change: 1 addition & 0 deletions Procfile
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
web: ./run
33 changes: 33 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# PYSEARCH

**Authors:** Marc Fieser, Sera Smith, and Ben Shields

**Live URL:** https://vascodagama.herokuapp.com/

## Getting Started:
```
- cd <directory containing this file>

- $VENV/bin/pip install -e .

- $VENV/bin/initialize_db development.ini

- $VENV/bin/pserve development.ini
```

## Global Variables:

harvester.py:
NUM_OF_OCCURANCES -- number of times a word must appear in order for harvester to add word to database keyword table.

crawler.py:
CRAWL_COUNT -- maximum number of pages to crawl (passed to Scrapy as CLOSESPIDER_PAGECOUNT)
DEPTH_LEVEL -- intended maximum link depth for the crawl

pipelines.py:
MINIMUM_MATCHES -- a crawled url must match more than this many keywords before its matches are sent to the database matches table.

## License:
MIT

**Copyright 2017**
14 changes: 0 additions & 14 deletions README.txt

This file was deleted.

3 changes: 2 additions & 1 deletion development.ini
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ pyramid.default_locale_name = en
pyramid.includes =
pyramid_debugtoolbar

sqlalchemy.url = sqlite:///%(here)s/pysearch.sqlite
sqlalchemy.url = postgres://@localhost:5432/pysearch


# By default, the toolbar only appears for clients from IP addresses
# '127.0.0.1' and '::1'.
Expand Down
Binary file added pysearch/.DS_Store
Binary file not shown.
2 changes: 1 addition & 1 deletion pysearch/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
def main(global_config, **settings):
""" This function returns a Pyramid WSGI application.
"""
settings["sqlachemy.url"] = os.environ["DATABASE_URL"]
# settings["sqlachemy.url"] = os.environ["DATABASE_URL"]
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

remove corpse code

config = Configurator(settings=settings)
config.include('pyramid_jinja2')
config.include('.models')
Expand Down
File renamed without changes.
56 changes: 56 additions & 0 deletions pysearch/harvester/middlewares.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class HarvesterSpiderMiddleware(object):
    """Pass-through spider middleware for the harvester project.

    Every hook forwards its input unchanged. Not all methods need to be
    defined: if a method is not defined, Scrapy acts as if the spider
    middleware does not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to ``spider_opened``.

        Scrapy calls this factory to build the middleware instance.
        """
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    # The hooks below are instance methods, so each one takes ``self``.
    # Without it, calling them on the instance raises TypeError because the
    # bound instance is passed as an extra positional argument.

    def process_spider_input(self, response, spider):
        """Handle a response on its way into the spider.

        Called for each response that goes through the spider middleware
        and into the spider. Should return None or raise an exception.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Yield the spider's results unchanged.

        Called with the results returned from the spider after it has
        processed the response. Must return an iterable of Request, dict
        or Item objects.
        """
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        """Handle an exception raised by the spider or an earlier middleware.

        Called when a spider or process_spider_input() method (from other
        spider middleware) raises an exception. Should return either None
        or an iterable of Response, dict or Item objects; this
        implementation returns None.
        """
        pass

    def process_start_requests(self, start_requests, spider):
        """Yield the spider's start requests unchanged.

        Works similarly to process_spider_output(), except that it doesn't
        have a response associated. Must return only requests (not items).
        """
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        """Log the name of the spider that was just opened."""
        spider.logger.info('Spider opened: %s' % spider.name)
80 changes: 80 additions & 0 deletions pysearch/harvester/pipelines.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""pipelines.py."""
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

from sqlalchemy.orm import sessionmaker
from pysearch.models import Keyword, Match


from pysearch.models.meta import Base
from sqlalchemy.engine.url import URL
from sqlalchemy import create_engine

# Connection parameters that db_connect() expands into a SQLAlchemy URL.
# 'postgresql' is SQLAlchemy's canonical dialect name; the legacy 'postgres'
# alias is deprecated and is rejected by newer SQLAlchemy releases.
DATABASE = {
    'drivername': 'postgresql',
    'host': 'localhost',
    'port': '5432',
    'database': 'pysearch',
}

# A crawled page must produce more than this many keyword matches before
# its matches are written to the database.
MINIMUM_MATCHES = 5


def db_connect():
    """Build a SQLAlchemy engine from the module-level ``DATABASE`` mapping.

    Returns:
        The engine produced by ``create_engine`` for the URL assembled
        from ``DATABASE``.
    """
    connection_url = URL(**DATABASE)
    return create_engine(connection_url)


def create_keyword_table(engine):
    """Create every table registered on ``Base.metadata`` using ``engine``."""
    metadata = Base.metadata
    metadata.create_all(engine)


class CrawlerPipeline(object):
    """Item pipeline that stores keyword matches found on crawled pages."""

    def __init__(self):
        """Connect to the database, create tables and set up a session factory."""
        engine = db_connect()
        create_keyword_table(engine)
        self.Session = sessionmaker(bind=engine)

    @staticmethod
    def _find_matches(db_words, item):
        """Return one match dict per stored keyword present in ``item['words']``."""
        return [
            {
                'word': word.keyword,
                'key_weight': word.keyword_weight,
                'count': item['words'][word.keyword],
                'url': item['url'],
            }
            for word in db_words
            if word.keyword in item['words']
        ]

    @staticmethod
    def _build_rows(matches):
        """Turn match dicts into ``Match`` model instances ready to be added."""
        return [
            Match(
                keyword=match['word'],
                keyword_weight=match['key_weight'],
                page_url=match['url'],
                count=match['count'],
            )
            for match in matches
        ]

    def process_item(self, item, spider):
        """Save the item's keyword matches in the database.

        This method is called for every item pipeline component. Items
        from spiders other than ``crawler`` are passed through untouched.
        For crawler items, every ``Keyword`` row is compared with the
        item's word counts, and the matches are stored only when there are
        more than ``MINIMUM_MATCHES`` of them.

        Args:
            item: mapping with a ``'words'`` entry (word -> count) and a
                ``'url'`` entry.
            spider: the spider that produced the item; only its ``name``
                is read.

        Returns:
            The item, unchanged, so later pipeline components receive it.

        Raises:
            Any exception raised while querying or committing; the session
            is rolled back before the exception propagates.
        """
        # Compare by value: ``is`` tests object identity, which only works
        # for equal strings when the interpreter happens to intern them.
        if spider.name != 'crawler':
            return item

        session = self.Session()
        try:
            db_words = session.query(Keyword).all()
            match_words = self._find_matches(db_words, item)
            if len(match_words) > MINIMUM_MATCHES:
                to_add = self._build_rows(match_words)
                print('Pysearch Database Updated...')
                session.add_all(to_add)
                session.commit()
        except BaseException:
            # Undo the partial transaction for any failure, then re-raise.
            session.rollback()
            raise
        finally:
            session.close()
        return item
90 changes: 90 additions & 0 deletions pysearch/harvester/settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# -*- coding: utf-8 -*-

# Scrapy settings for harvester project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

# Name of this Scrapy project/bot.
BOT_NAME = 'harvester'

# Module paths for this project's spiders.
# NOTE(review): these paths start at 'harvester', while crawler.py refers to
# the pipeline as 'pysearch.harvester.pipelines...' -- confirm which import
# root is in effect when these settings are loaded.
SPIDER_MODULES = ['harvester.spiders']
NEWSPIDER_MODULE = 'harvester.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'harvester (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'harvester.middlewares.HarvesterSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'harvester.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# NOTE(review): no HarvesterPipeline class is visible in pipelines.py (only
# CrawlerPipeline) -- confirm this entry still resolves, or remove it.
# CrawlingSpider sets its own ITEM_PIPELINES through custom_settings.
ITEM_PIPELINES = {
    'harvester.pipelines.HarvesterPipeline': 300,
    # 'harvester.pipelines.CrawlerPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could probably remove all of the commented lines since they're not being used anyway.

4 changes: 4 additions & 0 deletions pysearch/harvester/spiders/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
80 changes: 80 additions & 0 deletions pysearch/harvester/spiders/crawler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""Spider for crawling."""
import collections
from stop_words import get_stop_words
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule, CrawlSpider
from scrapy.item import Item, Field

# Page-count value that crawl() stores in the CLOSESPIDER_PAGECOUNT setting.
CRAWL_COUNT = 10
# Crawl-depth value that crawl() copies into the Scrapy settings.
DEPTH_LEVEL = 10


class MyItem(Item):
    """Scraped-page record handed from the spider to the item pipelines."""

    # Address of the page the words were taken from.
    url = Field()
    # Word -> occurrence count for the page's paragraph text.
    words = Field()


class CrawlingSpider(CrawlSpider):
    """Spider that follows links and counts the words on each page it visits."""

    name = "crawler"
    custom_settings = {
        'ITEM_PIPELINES': {
            'pysearch.harvester.pipelines.CrawlerPipeline': 300,
        }
    }

    # The empty ``allow`` pattern places no restriction on which extracted
    # links are followed; each followed page is handed to parse_items().
    rules = (Rule(LinkExtractor(allow=("", ),), callback="parse_items", follow=True),)

    def __init__(self, url=None, *args, **kwargs):
        """Initialize the crawling spider.

        ``url`` is accepted for callers that pass it, but this initializer
        does not use it; the remaining arguments go to ``CrawlSpider``.
        """
        super(CrawlingSpider, self).__init__(*args, **kwargs)

    def parse_items(self, response):
        """Yield one ``MyItem`` holding the page URL and its word counts.

        Words come from the text of ``<p>`` elements, are split on
        whitespace and lowercased; English stop words are left out.
        """
        # A set gives O(1) membership tests instead of scanning the
        # stop-word list once for every distinct word on the page.
        stop_words = frozenset(get_stop_words('english'))
        word_count = collections.Counter()
        for paragraph in response.css('p::text').extract():
            for word in paragraph.split():
                word = word.lower()
                if word not in stop_words:
                    word_count[word] += 1
        item = MyItem()
        item['words'] = word_count
        item['url'] = response.url
        yield item


def crawl(url):
    """Run a crawl that starts at ``url``.

    Loads the project settings, applies the page-count and depth limits
    from ``CRAWL_COUNT`` and ``DEPTH_LEVEL``, then starts a crawler process
    with a ``CrawlingSpider`` subclass whose only start URL is ``url``.

    Args:
        url: address of the seed page.
    """
    settings = get_project_settings()
    # NOTE(review): plain attribute on the settings object; nothing visible
    # in this module reads it back -- confirm whether it is still needed.
    settings.url = url
    settings["CLOSESPIDER_PAGECOUNT"] = CRAWL_COUNT
    # Scrapy's depth cap is the DEPTH_LIMIT setting. "DEPTH_LEVEL" is not a
    # setting Scrapy reads, so storing the value under that key never
    # limited the crawl depth.
    settings["DEPTH_LIMIT"] = DEPTH_LEVEL
    process = CrawlerProcess(settings)

    class ThisSpider(CrawlingSpider):
        """Create a spider to crawl with."""

        # Bound per call so each crawl starts from the requested page.
        start_urls = [url]

    process.crawl(ThisSpider)
    process.start()


def lower_list(list_in):
    """Return a new list with every word of ``list_in`` lowercased.

    Args:
        list_in: iterable of strings; it is not modified.

    Returns:
        A list of the lowercased strings, in the same order.
    """
    return [word.lower() for word in list_in]

if __name__ == '__main__':
    import sys

    # Exit with a usage message rather than an IndexError traceback when
    # the seed URL argument is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: {} <url>'.format(sys.argv[0]))
    crawl(sys.argv[1])
Loading