-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathcrawlerMain.py
More file actions
62 lines (55 loc) · 2.08 KB
/
crawlerMain.py
File metadata and controls
62 lines (55 loc) · 2.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
##导入库
import os
import time
import requests
import lxml
from lxml import etree
## 发送请求
def send_request(url, headers, timeout=10):
    """Send an HTTP GET request to *url* and return the response.

    The HTTP status code is printed for diagnostics. The status is not
    checked here, so the response is returned whatever its status code.

    Args:
        url: Address of the page to request.
        headers: Mapping of HTTP request headers to send.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely, so one stalled
            connection would hang the whole crawler.

    Returns:
        The ``requests.Response`` object for the request.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            *timeout* seconds.
    """
    r = requests.get(url, headers=headers, timeout=timeout)
    print('状态码:', r.status_code)
    return r
## 提取数据
def fetch(domain, resp, headers):
    """Extract chapter titles and chapter URLs from the index page.

    Args:
        domain: Prefix prepended to every extracted ``href`` value.
        resp: Response object whose ``text`` attribute holds the page HTML.
        headers: Not used by this function; kept so the signature stays
            unchanged for existing callers.

    Returns:
        A ``(titles, urls)`` tuple: the text of every
        ``<a class="x-wiki-index-item">`` link, and ``domain`` joined with
        each of those links' ``href`` values.
    """
    tree = lxml.etree.HTML(resp.text)  # parse the HTML into an element tree
    # Link text of each index entry, i.e. the chapter title.
    titles = tree.xpath('//a[@class="x-wiki-index-item"]/text()')
    # The href of each index entry (the path part of the chapter URL).
    hrefs = tree.xpath('//a[@class="x-wiki-index-item"]/@href')
    # Prefix every href with the domain and return both lists together.
    urls = [domain + href for href in hrefs]
    return titles, urls
## 下载页面
def download(titles, urls, timeout=10):
    """Download every chapter page and save its main content as HTML.

    Files are written to the ``廖雪峰Java教程HTML`` directory under the
    current working directory; the directory is created when missing.
    Each file is named ``<position><title>.html``, where ``position`` is
    the chapter's position in the ``titles``/``urls`` pairing.

    The request headers are read from the module-level ``headers``
    variable, which must be defined before this function is called.

    Args:
        titles: Chapter titles, used to build the file names.
        urls: Chapter URLs, paired with *titles* by position.
        timeout: Seconds to wait for each response before giving up, so a
            stalled connection cannot hang the crawl indefinitely.

    Raises:
        requests.exceptions.Timeout: If a page does not answer within
            *timeout* seconds.
    """
    # Set up the download directory.
    out_dir = os.path.join(os.getcwd(), '廖雪峰Java教程HTML')
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(out_dir, exist_ok=True)

    # enumerate() supplies the position directly. titles.index(title) would
    # return the FIRST match, so chapters sharing a title would get the same
    # file name and overwrite each other (and it rescans the list each time).
    for index, (title, page_url) in enumerate(zip(titles, urls)):
        resp = requests.get(page_url, headers=headers, timeout=timeout)
        selector = lxml.etree.HTML(resp.text)
        # Element holding the main body of the chapter page.
        content = selector.xpath('//div[@class="uk-flex-item-1"]')
        if content:
            # Serialize the element back to HTML bytes.
            html = etree.tostring(content[0], pretty_print=True, method="html")
            filename = os.path.join(out_dir, "%d%s.html" % (index, title))
            with open(filename, 'wb') as f:
                f.write(html)
        else:
            # A page without the expected container is skipped instead of
            # aborting the whole crawl with an IndexError.
            print('未找到正文,跳过:', page_url)
        # Short delay between requests so the site is not hit too fast.
        time.sleep(0.3)
    print("下载完成")
## 主函数
# Browser-like User-Agent sent with every request. download() reads this
# module-level name directly, so it must keep the name `headers`.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chro'
                  'me/53.0.2785.104 Safari/537.36 Core/1.53.2372.400 QQBrowser/9.5.10548.400'
}
# Site root that fetch() prepends to every extracted href. It uses https so
# the chapter URLs have the same scheme as `url` below (it was http before).
domain = 'https://www.liaoxuefeng.com'
url = "https://www.liaoxuefeng.com/wiki/1252599548343744"  # URL of the Java tutorial
# Request the Java tutorial index page.
resp = send_request(url, headers)
# Extract all chapter titles and chapter URLs from it.
titles, urls = fetch(domain, resp, headers)
# Download every chapter.
download(titles, urls)