-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathIreadSpider.py
More file actions
120 lines (85 loc) · 3.39 KB
/
Copy pathIreadSpider.py
File metadata and controls
120 lines (85 loc) · 3.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import requests
from lxml import etree
import csv
import re
import pymysql
import random
import time
from fake_useragent import UserAgent
# Module-level output sinks shared by the functions below.
# newline='' hands line-ending control to the csv module.
csvFile = open('book.csv', 'w', encoding='utf-8', newline='')
csv_writer = csv.writer(csvFile)

# PyMySQL 1.0+ made connect() keyword-only, so the host must be named
# explicitly. charset is pinned to utf8mb4 because the scraped fields are
# expected to contain Chinese text (see the labels used in getChildInfo).
conn = pymysql.connect(
    host='127.0.0.1',
    user='root',
    password='',
    database='book',
    charset='utf8mb4',
)
print(conn)
print(type(conn))
cursor = conn.cursor()
print(cursor)
def getFatherIntfo():
    """Fetch the site's front page and return the newest book's id digits.

    Prints the list of book names found on the page and the title/link of
    the entry treated as the newest book.

    Returns:
        str: all digits contained in the href of the third ``<a>`` of the
        front-page list, concatenated. Empty string if that link is missing.

    Raises:
        requests.RequestException: on network failure or timeout.
    """
    # A timeout keeps the crawler from hanging forever on a dead connection.
    x = requests.get("http://www.ireadweek.com/", timeout=10)
    html = etree.HTML(x.text)
    print(html.xpath('//ul[@class="hanghang-list"]/a/li/div[@class="hanghang-list-name"]/text()'))
    # NOTE(review): the third <a> is taken as the newest book, presumably
    # because the first two entries are not books -- confirm against the page.
    NewestBook = html.xpath('/html/body/div/div/ul/a[3]/li/div[1]/text()')
    NewestBookLink = html.xpath('/html/body/div/div/ul/a[3]/@href')
    print('最新的书是'+ str(NewestBook) +',它的地址是'+ str(NewestBookLink))
    # Raw string: "\D" in a plain literal is an invalid escape sequence.
    # Strip non-digits from the href text itself rather than from the list repr.
    NewestBookId = re.sub(r"\D", "", "".join(NewestBookLink))
    return NewestBookId
def avoid():
    """Sleep for a random interval and return randomised request headers.

    Picks a random User-Agent string via ``fake_useragent`` and blocks for a
    random whole number of seconds between 0 and 10 (inclusive).

    Returns:
        dict: ``{'User-Agent': <random UA string>}``, suitable for the
        ``headers=`` argument of ``requests.get``. The headers used to be
        built and thrown away; returning them lets callers actually send them.
    """
    ua = UserAgent()
    headers = {'User-Agent': ua.random}
    sleepTime = random.randint(0, 10)
    time.sleep(sleepTime)
    return headers
def _xpath_text(html, path):
    """Return every string result of XPath *path* on *html*, concatenated."""
    return ''.join(html.xpath(path))


def getChildInfo(start_id=14):
    """Scrape each book detail page and store it in the CSV file and MySQL.

    For every id in ``range(start_id, newest_id)`` the detail page is
    downloaded, eight fields are extracted, printed, written as one row to
    ``csv_writer`` and inserted into the ``book`` table. Each row is
    committed immediately so that an interrupted crawl keeps its progress.

    Args:
        start_id: first article id to fetch. Defaults to 14, the value that
            was previously hard-coded.

    Raises:
        ValueError: if ``getFatherIntfo()`` does not return a digit string.
        requests.RequestException: on network failure or timeout.
    """
    url1 = 'http://ireadweek.com/sdfesfwsf.php?m=article&a=index&id='
    insertSql = "insert into book values(%s,%s,%s,%s,%s,%s,%s,%s)"

    # (label, xpath) for each scraped column, in table-column order after id.
    field_paths = (
        ('cover', '/html/body/div[1]/div/div[1]/div[2]/div[1]/img/@src'),
        ('book name', '//html/body/div/div/div[1]/div[2]/div[2]/p[1]/text()'),
        ('author', '//html/body/div/div/div[1]/div[2]/div[2]/p[2]/text()'),
        ('classification', '//html/body/div/div/div[1]/div[2]/div[2]/p[3]/text()'),
        ('douban score', '//html/body/div/div/div[1]/div[2]/div[2]/p[4]/text()'),
        ('introduction', '//html/body/div/div/div[1]/div[2]/div[2]/p[6]/text()'),
        ('download link', '/html/body/div/div/div[1]/div[3]/div[1]/a/@href'),
    )

    # int() rather than eval(): the id string originates from scraped HTML
    # and must never be executed as Python code.
    newest_id = int(getFatherIntfo())

    # NOTE(review): range() excludes newest_id itself, so the newest book is
    # not fetched -- kept as in the original; confirm whether that is intended.
    for book_id in range(start_id, newest_id):
        # Throttle before every request (this used to run only once, before
        # the loop). Whatever avoid() returns is forwarded as the request
        # headers; None simply means requests' default headers.
        headers = avoid()
        x = requests.get(url1 + str(book_id), headers=headers, timeout=10)
        html = etree.HTML(x.text)

        print(book_id)
        row = [book_id]
        for _label, path in field_paths:
            value = _xpath_text(html, path)
            print(value)
            row.append(value)

        csv_writer.writerow(row)
        cursor.execute(insertSql, tuple(row))
        conn.commit()
def CreateTable():
    """Drop the ``book`` table if it exists and create it afresh.

    Uses the module-level ``cursor``. Any existing rows are discarded.
    """
    cursor.execute("drop table if exists book")
    # `introduction` is plain TEXT: TEXT(255) makes MySQL choose the smallest
    # TEXT type that holds 255 bytes (TINYTEXT), which is too short for book
    # introductions. utf8mb4 is declared so Chinese text is stored correctly
    # regardless of the server's default charset.
    CreateTableSql = """CREATE TABLE book (
        id int(10) NOT NULL PRIMARY KEY auto_increment,
        bookPic varchar(255),
        bookName varchar(50),
        author varchar(50),
        classification varchar(50),
        score varchar(50),
        introduction text,
        link varchar(255)
    ) DEFAULT CHARSET=utf8mb4;"""
    cursor.execute(CreateTableSql)
if __name__ == "__main__":
    # Script entry point: rebuild the table, then crawl every book page.
    try:
        CreateTable()
        getChildInfo()
    finally:
        # Release the file and database handles even if the crawl fails
        # part-way, so buffered CSV rows are flushed to disk.
        csvFile.close()
        cursor.close()
        conn.close()