ACM (Association for Computing Machinery) is the world's oldest and largest society for computing education and research. The ACM database indexes a large number of journal and conference papers on information technology education, research, and applications, making it an important resource in the computing and information technology fields.
The platform's search does have some shortcomings, however. For example, when searching for journal papers from 2019-2021 on the topic "Trajectory data" and sorting by citation count, the top results are often highly cited but weakly relevant papers, which defeats the purpose of the search.
An alternative is to sort by relevance first, and then pick the most-cited papers among the top 100 by relevance. This ensures both high citation counts and high relevance.
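Once the crawler below has collected citation counts for the top 100 papers by relevance, the re-ranking step itself is simple. A minimal sketch, assuming each record is a [link, title, citation, author] list as produced by getData() later in this post (the sample records here are made up):
# Hypothetical sample records in the [link, title, citation, author] format
datalist = [
    ["https://dlnext.acm.org/doi/10.1145/0000001", "Paper A", "12", "Alice"],
    ["https://dlnext.acm.org/doi/10.1145/0000002", "Paper B", "47", "Bob"],
    ["https://dlnext.acm.org/doi/10.1145/0000003", "Paper C", "", "Carol"],
]
# Re-rank the relevance-sorted records by citation count, descending;
# an empty citation string counts as zero
top_cited = sorted(datalist, key=lambda d: int(d[2] or 0), reverse=True)
for link, title, citation, author in top_cited:
    print(citation or "0", title)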
A Python crawler can quickly fetch, in batch, the citation count of each paper as ranked by relevance: first download the page source, then use Python regular expressions to extract each paper's title, authors, citation count, and so on.
The same idea works for platforms such as Springer, except that the regular expressions must be adapted, and when looping over the results you need to check whether the citation count is displayed directly on the search results page. If it is not, you have to open each paper's detail page (with a browser-like request) to obtain it.
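For that second case, the extra round trip might look like the sketch below, reusing the askURL() helper defined later in this post. The pattern finddetailcitation is purely hypothetical and would have to be rewritten against the target site's actual markup:
import re

# Hypothetical pattern for a citation count that only appears on a detail page
finddetailcitation = re.compile(r'Citations[^0-9]*([0-9]+)')

def getCitationFromDetail(link):
    # Fetch the paper's detail page and extract the citation count from it
    detail_html = askURL(link)  # askURL() is defined later in this post
    found = re.findall(finddetailcitation, detail_html)
    return found[0] if found else ''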
Matching essentially means locating where the target information appears in the HTML file. This information is usually surrounded by fixed markup, and that fixed context is what makes the match possible.
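As a toy illustration (the snippet below is made up, not real ACM markup), the fixed tags around the value act as anchors, and the non-greedy group (.*?) captures whatever sits between them:
import re

snippet = '<span class="citation"><span>42</span></span>'
# The fixed surrounding tags anchor the match; (.*?) captures the value
print(re.findall(r'<span class="citation"><span>(.*?)</span>', snippet))  # ['42']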
# NOTE: the HTML tags inside these patterns were lost when the original post
# was rendered; the tags below are a plausible reconstruction of the ACM DL
# search-result markup, not necessarily the author's exact patterns.
# Match the paper link
findlink = re.compile(r'<a href="(/doi/.*?)"')
# Match the paper title
findtitle = re.compile(r'<span class="hlFld-Title">(.*?)</span>')
# Match the paper citation count
findcitation = re.compile(r'<span class="citation">.*?<span>(.*?)</span>')
# Match the paper authors
findauthor = re.compile(r'<a href="/profile/.*?" title="(.*?)"')
def getData():
    # The page source can be saved locally, or fetched with askURL()
    # html = askURL("https://dlnext.acm.org/action/doSearch?fillQuickSearch=false&target=advanced&expand=dl&field1=AllField&text1=Trajectory+Data+&AfterYear=2019&BeforeYear=2021&startPage=0&pageSize=50")
    with open("ACMTop100.html", 'r', encoding='utf-8') as f:
        html = f.read()
    # Parse the page
    soup = BeautifulSoup(html, 'html.parser')
    # print(soup)
    # For each paper entry in the result list
    for item in soup.find_all(attrs={'class': 'search__item issue-item-container'}):
        item = str(item)
        # Regular-expression matching
        link = re.findall(findlink, item)[0]
        link = "https://dlnext.acm.org" + link
        title = re.findall(findtitle, item)
        # Special handling for missing information
        if len(title) > 0:
            title = str(title[0])
            # Strip the search-term highlight tags (reconstructed, see NOTE above)
            title = title.replace('<mark>', '')
            title = title.replace('</mark>', '')
        else:
            title = ''
        citation = re.findall(findcitation, item)
        citation = citation[0] if citation else ''
        # findall returns a list of author names; join them into one cell
        author = '; '.join(re.findall(findauthor, item))
        # print(link)
        # print(title)
        # print(citation)
        # print(author)
        # Save the record
        data = [link, title, citation, author]
        datalist.append(data)
def saveData(datalist, savepath):
    print("save...")
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet("ACMTop100", cell_overwrite_ok=True)
    # Column headers
    col = ("link", "title", "citation", "author")
    for i in range(0, 4):
        sheet.write(0, i, col[i])
    # The top 100 records by relevance (or fewer, if fewer were scraped)
    for i in range(0, min(100, len(datalist))):
        if i % 9 == 0:
            print("---------record {}-----------".format(i + 1))
        data = datalist[i]
        for j in range(0, 4):
            sheet.write(i + 1, j, data[j])
    book.save(savepath)
The complete process.py:
import re
from bs4 import BeautifulSoup
import urllib.request
import urllib.error
import xlwt

# Reconstructed patterns (see the NOTE above)
findlink = re.compile(r'<a href="(/doi/.*?)"')
findtitle = re.compile(r'<span class="hlFld-Title">(.*?)</span>')
findcitation = re.compile(r'<span class="citation">.*?<span>(.*?)</span>')
findauthor = re.compile(r'<a href="/profile/.*?" title="(.*?)"')
datalist = []
def askURL(URL):
    head = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Edg/94.0.992.38"}
    request = urllib.request.Request(URL, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
        # print(html)
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
def getData():
    # html = askURL("https://dlnext.acm.org/action/doSearch?fillQuickSearch=false&target=advanced&expand=dl&field1=AllField&text1=Trajectory+Data+&AfterYear=2019&BeforeYear=2021&startPage=0&pageSize=50")
    with open("ACMTop100.html", 'r', encoding='utf-8') as f:
        html = f.read()
    soup = BeautifulSoup(html, 'html.parser')
    # print(soup)
    for item in soup.find_all(attrs={'class': 'search__item issue-item-container'}):
        item = str(item)
        link = re.findall(findlink, item)[0]
        link = "https://dlnext.acm.org" + link
        title = re.findall(findtitle, item)
        if len(title) > 0:
            title = str(title[0])
            title = title.replace('<mark>', '')
            title = title.replace('</mark>', '')
        else:
            title = ''
        citation = re.findall(findcitation, item)
        citation = citation[0] if citation else ''
        author = '; '.join(re.findall(findauthor, item))
        print(link)
        print(title)
        print(citation)
        print(author)
        data = [link, title, citation, author]
        datalist.append(data)
def saveData(datalist, savepath):
    print("save...")
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet("ACMTop100", cell_overwrite_ok=True)
    col = ("link", "title", "citation", "author")
    for i in range(0, 4):
        sheet.write(0, i, col[i])
    for i in range(0, min(100, len(datalist))):
        if i % 9 == 0:
            print("---------record {}-----------".format(i + 1))
        data = datalist[i]
        for j in range(0, 4):
            sheet.write(i + 1, j, data[j])
    book.save(savepath)
if __name__ == "__main__":
    getData()
    saveData(datalist, "ACMPaperTop100.xls")
Attachments: the ACM search-result page source (html) used in this article, and the resulting scraped data (xls).