学了 Python 爬虫已经有一段时间了, 有些概念开始模糊了, 所以打算写一些小的爬虫来加深一下印象, 免得学了就忘, 到用的时候又全程懵逼
# Python3
# 使用 pip install 库名称 来安装
requests
lxml
每请求下一页, start 参数值就增加20; 通过 xpath 先匹配出所有的列, 再用 xpath 获取我们要获取的内容, 最后用 python 内置的 csv 模块将内容写入 csv 文件中
import requests
from lxml import etree
import time
import csv
# Scrape the university listing from gaokao.chsi.com.cn page by page and
# append one CSV row per university to data.csv.
start_time = time.time()
url = 'https://gaokao.chsi.com.cn/sch/search.do?searchType=1&start='
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
}

# The `start` query parameter advances by PAGE_STEP per request, up to and
# including LAST_START.
PAGE_STEP = 20
LAST_START = 2720
# Without a timeout, requests.get() can block forever on a stalled connection.
REQUEST_TIMEOUT = 10


def _cell_text(element):
    """Return the element's own leading text, stripped; '' when it has none."""
    # lxml sets .text to None when an element starts with a child tag or is
    # empty, so guard before calling .strip().
    return (element.text or '').strip()


def _parse_row(row):
    """Convert one table row element into a list of CSV field values.

    Args:
        row: an lxml element for a ``<tr>`` that contains ``<td>`` cells.

    Returns:
        A list ``[name, address, agency, type, level, characteristic,
        has_grad_school, satisfaction]``, or ``None`` when the row does not
        have the expected seven cells and at least one link.
    """
    cells = row.xpath('./td')
    links = row.xpath('./td/a')
    if len(cells) < 7 or not links:
        return None

    # len() of an lxml element is its number of child elements; the script
    # maps one child to '211' and two children to '985 211'.
    marker_count = len(cells[5])
    if marker_count == 1:
        characteristic = '211'
    elif marker_count == 2:
        characteristic = '985 211'
    else:
        characteristic = 'none'

    # Exactly one child element in the seventh cell is treated as "yes".
    has_grad_school = len(cells[6]) == 1

    # The second link in the row, when present, carries the satisfaction value.
    satisfaction = _cell_text(links[1]) if len(links) >= 2 else '--'

    return [
        _cell_text(links[0]),
        _cell_text(cells[1]),
        _cell_text(cells[2]),
        _cell_text(cells[3]),
        _cell_text(cells[4]),
        characteristic,
        has_grad_school,
        satisfaction,
    ]


count = 0
# Open the output once for the whole run; newline='' is required by the csv
# module so that no blank lines appear between rows on Windows.
with open('data.csv', 'a', encoding='utf-8', newline='') as f:
    csv_writer = csv.writer(f)
    # In append mode the position starts at the end of the file, so 0 means
    # the file is empty; only then write the header, so that re-running the
    # script does not insert a second header row in the middle of the data.
    if f.tell() == 0:
        csv_writer.writerow(['院校名称', '院校所在地', '院校隶属', '院校类型', '学历层次', '院校特性','研究生院','满意度'])

    for start in range(0, LAST_START + 1, PAGE_STEP):
        response = requests.get(url + str(start), headers=headers, timeout=REQUEST_TIMEOUT)
        # Fail loudly on HTTP errors instead of parsing an error page.
        response.raise_for_status()
        html = etree.HTML(response.text)

        for university in html.xpath('//tr[td]'):
            university_list = _parse_row(university)
            if university_list is None:
                continue
            print(university_list)
            csv_writer.writerow(university_list)
            count += 1

        # Pause between requests to avoid hammering the server.
        time.sleep(1)

end_time = time.time()
print("共花费 {} S, 爬取 {} 座高校".format(end_time-start_time, count))