今天来学习利用正则表达式爬取高考网的高校信息
人生苦短,我用python
今天要爬取的是高考网广东地区的高校
链接:http://college.gaokao.com/schlist/a14/p(末尾的 p 后面拼接页码,例如 p1 表示第 1 页)
def parse_one_page(html):
    """Extract one record per school from a college-list page.

    Args:
        html: Page markup as a string. ``None`` or an empty string (what a
            failed download yields) produces no records instead of raising.

    Yields:
        dict: One mapping per match with the keys ``web``, ``image``,
        ``position``, ``style``, ``Belong``, ``education`` and
        ``schoolweb``, filled in that order from the pattern's seven
        capture groups.
    """
    if not html:
        return

    # NOTE(review): the pattern as published had lost its HTML tags, leaving
    # six capture groups for seven fields (item[6] raised IndexError). The
    # anchors below are reconstructed from the field names and the sample
    # record; they assume each school is a <dt> holding the detail link and
    # logo, followed by <li> entries where the second <li> is skipped.
    # TODO confirm against the live page markup.
    pattern = re.compile(
        r'<dt>.*?href="(.*?)".*?<img src="(.*?)".*?<li>(.*?)</li>'
        r'.*?<li>.*?</li>.*?<li>(.*?)</li>'
        r'.*?<li>(.*?)</li>.*?<li>(.*?)</li>.*?<li>(.*?)</li>',
        re.S,
    )
    keys = (
        'web',
        'image',
        'position',
        'style',
        'Belong',
        'education',
        'schoolweb',
    )
    for fields in pattern.findall(html):
        yield dict(zip(keys, fields))
步骤三:根据页数循环爬取信息
def main(offset):
    """Fetch list page ``offset``, then print and save every parsed record.

    Args:
        offset: Page number appended to the list URL.
    """
    url = 'http://college.gaokao.com/schlist/a14/p' + str(offset)
    html = get_one_page(url)
    if not html:
        # get_one_page returns None when the download fails; handing that to
        # the parser would raise inside the regex search, so skip the page.
        print('Failed to fetch ' + url)
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)
if __name__ == '__main__':
    # Crawl list pages 1 through 4, pausing one second after each page.
    for page in range(1, 5):
        main(offset=page)
        time.sleep(1)
完整代码如下
import requests
import time
import json
import re
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
def get_one_page(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Returns:
        The response text when the server answers with status 200; ``None``
        for any other status or when ``requests`` raises a
        ``RequestException`` (which includes timeouts).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
    }
    # Keep the try body to the one call that can raise a request error.
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_one_page(html):
    """Extract one record per school from a college-list page.

    Args:
        html: Page markup as a string. ``None`` or an empty string (what a
            failed download yields) produces no records instead of raising.

    Yields:
        dict: One mapping per match with the keys ``web``, ``image``,
        ``position``, ``style``, ``Belong``, ``education`` and
        ``schoolweb``, filled in that order from the pattern's seven
        capture groups.
    """
    if not html:
        return

    # NOTE(review): the pattern as published had lost its HTML tags, leaving
    # six capture groups for seven fields (item[6] raised IndexError). The
    # anchors below are reconstructed from the field names and the sample
    # record; they assume each school is a <dt> holding the detail link and
    # logo, followed by <li> entries where the second <li> is skipped.
    # TODO confirm against the live page markup.
    pattern = re.compile(
        r'<dt>.*?href="(.*?)".*?<img src="(.*?)".*?<li>(.*?)</li>'
        r'.*?<li>.*?</li>.*?<li>(.*?)</li>'
        r'.*?<li>(.*?)</li>.*?<li>(.*?)</li>.*?<li>(.*?)</li>',
        re.S,
    )
    keys = (
        'web',
        'image',
        'position',
        'style',
        'Belong',
        'education',
        'schoolweb',
    )
    for fields in pattern.findall(html):
        yield dict(zip(keys, fields))
def write_to_file(content):
    """Append ``content`` to ``gaoxiao.txt`` as a single JSON line.

    Args:
        content: JSON-serialisable object; non-ASCII characters are written
            as-is rather than escaped.
    """
    with open('gaoxiao.txt', mode='a', encoding='utf-8') as out:
        record = json.dumps(content, ensure_ascii=False)
        out.write(record + '\n')
def main(offset):
    """Fetch list page ``offset``, then print and save every parsed record.

    Args:
        offset: Page number appended to the list URL.
    """
    url = 'http://college.gaokao.com/schlist/a14/p' + str(offset)
    html = get_one_page(url)
    if not html:
        # get_one_page returns None when the download fails; handing that to
        # the parser would raise inside the regex search, so skip the page.
        print('Failed to fetch ' + url)
        return
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)
if __name__ == '__main__':
    # Crawl list pages 1 through 4, pausing one second after each page.
    for page in range(1, 5):
        main(offset=page)
        time.sleep(1)
最后爬取实现效果如下(以华南理工大学为例)
{"web": "http://college.gaokao.com/school/30/", "image": "http://college.gaokao.com/style/college/images/icon/30.png", "position": "高校所在地:广东", "style": "高校类型:工科", "Belong": "高校隶属:教育部", "education": "高校性质:本科", "schoolweb": "学校网址:www.scut.edu.cn"}