python/chsi_info.py at master · arnojin/python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/python3
# -*- coding: utf-8 -*-
#
"""爬取学信网(http://www.chsi.com.cn/)中“高校招生信息服务”板块下的“院校库”、“往年录取分数”
作者：金鹏
时间：2018-06-14
用途：爬取学信网(http://www.chsi.com.cn/)中“高校招生信息服务”板块下的“院校库”、“往年录取分数”
安装：pip install lxml
      pip install bs4
步骤：1、爬取 院校库（http://gaokao.chsi.com.cn/sch/search--ss-on,option-qg,searchType-1,start-0.dhtml）
          获取 院校名称，院校所在地，院校隶属，院校类型，学历层次，院校特性，研究生院，满意度
      2、爬取 往年录取分数（http://gaokao.chsi.com.cn/lqfs/）
          根据 考生来源、院校名称，参考年份，科类
          获取 录取批次，录取线差，省市分数线，高校平均分，专业平均分（门类、专业名称、平均分），专业录取线差
      3、爬取 各省历年分数线
          http://gaokao.chsi.com.cn/z/gkbmfslq2014/pcx.jsp
          http://gaokao.chsi.com.cn/z/gkbmfslq2015/pcx.jsp
          http://gaokao.chsi.com.cn/z/gkbmfslq2016/pcx.jsp
          http://gaokao.chsi.com.cn/z/gkbmfslq2017/pcx.jsp
"""

import logging
import time

from urllib import request
from bs4 import BeautifulSoup


# Module logger with its own console handler. Records logged through `log`
# are emitted by this handler and, via default propagation, also reach the
# root logger configured by basicConfig() below.
log = logging.getLogger('chsi_info')

formatter = logging.Formatter('[%(asctime)s] [%(name)s] [%(levelname)s] [%(funcName)s] %(message)s')

sh = logging.StreamHandler()
sh.setFormatter(formatter)
sh.setLevel(logging.DEBUG)
log.addHandler(sh)

# Root logger: write DEBUG and above to ./log.log. Unlike the console
# formatter above, the file output uses an explicit timestamp format.
logging.basicConfig(
    level=logging.DEBUG,
    filename='./log.log',
    datefmt='%Y-%m-%d %H:%M:%S',
    format='[%(asctime)s] [%(name)s] [%(levelname)s] [%(funcName)s] %(message)s',
)


def docstring():
    """Return the module docstring with ``%(name)s`` placeholders filled in.

    Placeholders are resolved against the module globals. If the module has
    no docstring (for example when run with ``python -OO``, which strips
    docstrings), an empty string is returned instead of raising TypeError.

    Note: a literal ``%`` in the module docstring must be written as ``%%``,
    otherwise the ``%`` formatting below raises.
    """
    doc = __doc__
    if not doc:
        return ''
    return doc % globals()


def get_html(url):
    """Fetch a page and return its body as text.

    Args:
        url: Address to request; passed unchanged to ``urllib.request.urlopen``.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    log.info("开始 ...")

    # The context manager closes the HTTP response (and its socket) even if
    # reading fails; the original left it open.
    with request.urlopen(url) as response:
        response_result = response.read()
    html = response_result.decode('utf-8')

    log.info("结束 。")
    return html


def get_soup(url):
    """Download ``url`` and return it parsed as a BeautifulSoup document.

    The page text comes from ``get_html`` and is parsed with the ``lxml``
    parser.
    """
    log.info("开始 ...")

    parsed = BeautifulSoup(get_html(url), 'lxml')

    log.info("结束 。")
    return parsed


def _yxk_cell_fields(td):
    """Return the CSV fields contributed by one ``<td>`` cell.

    The fields are, in order: every non-blank text fragment of the cell, then
    for each link whose href starts with ``/sch/``: the school id, the
    absolute school URL and one admission-score URL per year 2014-2017.
    """
    fields = [t for t in (text.strip() for text in td.strings) if t]

    for a in td.find_all('a'):
        href = a.get('href')
        # Anchors without an href (or pointing elsewhere) carry no school id.
        if not href or not href.startswith('/sch/'):
            continue

        sch_id = href.replace('/sch/schoolInfo--schId-', '').replace('.dhtml', '')
        fields.append(sch_id)
        fields.append('http://gaokao.chsi.com.cn' + href)
        # NOTE(review): ssdm=44 and kldm=5 are hard-coded query values carried
        # over unchanged; their meaning is not established in this file.
        for year in (2014, 2015, 2016, 2017):
            fields.append('http://gaokao.chsi.com.cn/sch/schoolInfoMain.do?schId=' + sch_id
                          + '&ssdm=44&lqfsyear=' + str(year) + '&kldm=5#lqfs')

    return fields


def get_yxk(url_begin=0, url_end=40, url_step=20):
    """Crawl the school-library listing pages and return them as CSV text.

    Listing pages are addressed by a ``start-N`` offset in the URL. According
    to the notes kept from the original code, each page holds 20 schools, the
    first page is ``start-0`` and the last one is ``start-2720`` (so a full
    crawl would use ``url_end=2740``). The defaults reproduce the original
    hard-coded range, which only fetches the first two pages.

    Args:
        url_begin: First ``start-`` offset to request.
        url_end: Offset at which to stop (exclusive).
        url_step: Increment between consecutive offsets.

    Returns:
        One ``\\r\\n``-terminated line per table row that contains ``<td>``
        cells. Each line is ``url_index,tr_index`` followed by the fields of
        every cell, all joined with commas. Values are not quoted or escaped.
    """
    log.info("开始 ...")

    lines = []
    for url_index in range(url_begin, url_end, url_step):
        url = 'http://gaokao.chsi.com.cn/sch/search--ss-on,searchType-1,option-qg,start-' + str(url_index) + '.dhtml'

        soup = get_soup(url)

        table = soup.table
        if table is None:
            # A page without a table (error page, layout change) used to
            # abort the whole crawl with AttributeError; skip it instead.
            log.warning('no <table> found, skipping page: %s', url)
            trs = []
        else:
            trs = table.find_all('tr')

        for tr_index, tr in enumerate(trs):
            # Only data cells are collected; header rows (<th>) have no <td>
            # and are therefore skipped.
            tds = tr.find_all('td')
            if not tds:
                continue

            row = [str(url_index), str(tr_index)]
            for td in tds:
                row.extend(_yxk_cell_fields(td))
            lines.append(','.join(row) + "\r\n")

        log.info('url_index = %s, trs_len = %s', url_index, len(trs))
        # Pause between requests, as the original did.
        time.sleep(1)

    log.info("结束 。")
    return ''.join(lines)


def main():
    """Crawl the school library and save it as CSV in two encodings.

    Writes ``yxk-utf8.csv`` (UTF-8) and ``yxk-gb18030.csv`` (GB18030) to the
    current working directory, overwriting existing files.
    """
    log.info("开始 ...")

    # Step 1 of the plan in the module docstring: crawl the school library.
    yxk = get_yxk()

    # Files are opened in binary mode so the "\r\n" line endings produced by
    # get_yxk() are written unchanged; `with` guarantees they are closed.
    for path, encoding in (('yxk-utf8.csv', 'UTF-8'), ('yxk-gb18030.csv', 'GB18030')):
        with open(path, 'wb') as csv_file:
            csv_file.write(yxk.encode(encoding))

    log.info("结束 。")
    return


# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()