面试我的导师是做图像处理的,面试后让我写一个检索近几年顶会中图像去雾论文的程序,并把结果输出成表格,于是我选了 https://dblp.uni-trier.de/ 和 http://openaccess.thecvf.com/ 两个网站进行爬取。
import requests
import re
import csv
from requests.exceptions import RequestException
# Names of the conferences or journals to search. Each venue is listed once:
# a repeated entry would make the per-venue search loop query it twice.
huiyi = ['CVPR', 'ECCV', 'ICIP', 'ICCV']
keyword = 'dehazing'  # search keyword
year = 2014  # starting year for the search
def Get_html(url, timeout=10):
    """Fetch a web page with a GET request and return the response.

    Args:
        url: Address of the page to download.
        timeout: Seconds passed to ``requests.get`` as its timeout.
            Defaults to 10, the value that used to be hard-coded.

    Returns:
        The ``requests`` response object, with its encoding set to UTF-8,
        when the status code is 200. ``None`` when the status code is
        anything else or when the request raises ``RequestException``;
        in both cases a diagnostic line is printed.
    """
    # Browser-like User-Agent so the request is not rejected as a crawler.
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        response = requests.get(url=url, headers=headers, timeout=timeout)
    except RequestException as err:
        # Report the failure instead of swallowing it silently, so a missing
        # page can be told apart from a page that simply has no matches.
        print('request to {} failed: {}'.format(url, err))
        return None
    if response.status_code != 200:
        # Unexpected status: print it and signal failure to the caller.
        print('response.status_code =={}'.format(response.status_code))
        return None
    response.encoding = 'utf-8'
    return response
#获取每个文章对应的url链接
def Get_urllist(huiyi, keyword):
huiyi = huiyi
keyword = keyword
list = []
num = []
for i in range(len(huiyi)): #遍历每个会议,获取每个会议检索到的文章数
url = 'https://dblp.uni-trier.de/search?q=' + keyword + '%20venue%3A' + huiyi[i] + '%3A'
r = Get_html(url)
num1 = re.findall('data-matches="(.*?)">