# selTOscrapy.py
import time

import pandas as pd
import scrapy
from scrapy.selector import Selector
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By


class SeltoscrapySpider(scrapy.Spider):
"""
The purpose of this code is to scrape a personal Audible library.
The first part uses selenium to control the pages.
The initial page is accessed and the logon screen is chosen.
There is an intentionally long pause to allow the input of credentials.
Thereafter the handover from selenium to scrapy occurs and the pages are scraped.
The small piece of code required for pagination is done with selenium.
"""
name = 'selTOscrapy'
allowed_domains = ['toscrape.com']
start_urls = ['http://audible.com/']
web = 'https://www.audible.com/library/titles'
path = 'D:/webScrappingAndrade/chromedriver/chromedriver.exe'
    def combineAuthors(self, originaltext, authors, auth=True):
        """
        Combine the several authors or narrators a book may have into a
        single string per book and append it to the result list.

        :param originaltext: the raw text fragments extracted from the page
        :param authors: the result list (authorsDic for authors,
            narratorsDic for narrators)
        :param auth: True for authors (the default), False for narrators
        """
        bytoken = 'By:'
        aORn = "Author"
        if not auth:
            bytoken = 'Narrated by:'
            aORn = "Narrator"
        authorname = False
        author = ""
        for textitem in originaltext:
            textitem = textitem.strip()
            # An empty fragment marks the end of one book's name list.
            if textitem == "" and author != "":
                authors.append(author)
                authorname = False
                author = ""
            if authorname:
                if textitem == ',':
                    textitem = ', '
                author = author + textitem
            # The 'By:' / 'Narrated by:' label starts a new name list.
            if textitem == bytoken:
                authorname = True
        print(f'{aORn} size: {len(authors)}')
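
    # A hypothetical example of the combining behaviour (names invented for
    # illustration): for the fragments ['By:', 'Jane Doe', ',', 'John Roe', '']
    # the method appends the single string 'Jane Doe, John Roe' to `authors`.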

    # Use a lightweight dummy website to kick off the Scrapy request; the
    # real page content comes from Selenium in parse_books().
    def start_requests(self):
        url = "http://quotes.toscrape.com"
        yield scrapy.Request(url=url, callback=self.parse_books)
    def parse_books(self, response):
        driver = webdriver.Chrome(service=Service(self.path))
        driver.get(self.web)
        driver.maximize_window()
        # Intentionally long pause (5 minutes) so credentials can be typed in.
        time.sleep(5 * 60)
        booksDic = []
        authorsDic = []
        narratorsDic = []
        # Pagination: locate the paging bar, list each page shown in it, and
        # read the last page number from the second-to-last element
        # (negative indexing starts from where the array ends).
        pagination = driver.find_element(By.XPATH, '//ul[contains(@class, "pagingElements")]')
        pages = pagination.find_elements(By.TAG_NAME, 'li')
        last_page = int(pages[-2].text)
        print(f'pages: {pages} last page:{last_page}')
        current_page = 1  # the page the bot starts scraping from
        while current_page <= last_page:
            # Hand-off between Selenium and Scrapy happens here: re-read the
            # page source on every iteration so the freshly loaded page is
            # the one being scraped.
            sel = Selector(text=driver.page_source)
            books = sel.xpath('//span[contains(@class, "bc-size-headline3")]/text()').getall()
            for book in books:
                booksDic.append(book)
            authors = sel.xpath('.//li[contains(@class, "authorLabel")]//span/text()').getall()
            print(f'authors size: {len(authors)}')
            self.combineAuthors(authors, authorsDic)
            print(f'authorsDic: length: {len(authorsDic)}')
            narrators = sel.xpath('.//li[contains(@class, "narratorLabel")]//span/text()').getall()
            self.combineAuthors(narrators, narratorsDic, False)
            current_page = current_page + 1  # increment after the data is extracted
            # Locate the next-page button and click it; if it cannot be
            # found or clicked, stop paginating.
            try:
                next_page = driver.find_element(By.XPATH, '//span[contains(@class , "nextButton")]')
                next_page.click()
                time.sleep(5)  # give the next page a moment to load
            except Exception:
                print("** an exception has been raised about next page **")
                break
        driver.close()
"""
As a particular book has no narrator, it bis located and "no narrator" added
this is to en sure that all dictionals have tghe same length.
"""
if last_page > 10:
for index in range(118, 210):
if narratorsDic[index] == 'Luis Moreno':
bk = booksDic[index + 1]
print(f'at index {index + 1} {bk}: has no narrator')
narratorsDic.insert(index + 1, "no narration")
break
        # Storing the data into a DataFrame and exporting to a csv file
        print(f'books: {len(booksDic)} authors: {len(authorsDic)} narrators: {len(narratorsDic)}')
        df_books = pd.DataFrame({'title': booksDic, 'author': authorsDic, 'narrator': narratorsDic})
        df_books.to_csv('books2.csv', index=False)
        df_books.to_json('books.json')
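

# A minimal sketch of one way to run the spider directly as a script
# (alternatively, run `scrapy runspider selTOscrapy.py` from the command line):
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess()
    process.crawl(SeltoscrapySpider)
    process.start()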