前段时间导师有个小目标:把实验室发表的文章汇总一下,看看都被谁引用过、其中哪些是大牛,把这些亮点挑出来给实验室"贴金",于是催生了这样一个需求。
P.S.:目前已经查询到一万三千多行数据,中间中断过一次。
import requests,time,random
from lxml import etree
from urllib import parse
# --- Module-level globals ---
# Accumulates one dict per search-result record (filled in __main__).
search_records_list = []
# Web of Science author-profile API root, used for h-index lookups.
author_homepage='https://app.webofknowledge.com/api/wosnx/rrc/author/'
# Prefix prepended to the relative hrefs scraped from result pages.
url_base = 'https://apps.webofknowledge.com'
# Browser-like User-Agent so the site serves normal HTML.
url_head = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57'
}
# One log file per run, named after the start time (logHH_MM_SS.txt).
log_file_name=f"log{time.localtime().tm_hour}_{time.localtime().tm_min}_{time.localtime().tm_sec}.txt"
def log(text):
    """Append *text* to the run's log file and echo it to stdout.

    A string consisting solely of tab characters is treated as an
    indentation prefix: no newline is appended, so the next log() call
    continues on the same line. Any other string terminates the line.

    Args:
        text: The string to record.
    """
    # Context manager guarantees the handle is closed even if a write fails
    # (the original opened/closed manually and leaked on exceptions).
    with open(log_file_name, mode='a+', encoding='utf-8') as logfile:
        logfile.write(text)
        print(text, end='')
        if set(text) != {'\t'}:
            logfile.write('\n')
            print('\n', end='')
def extract_frame(url, record_list):
    """Walk a Web of Science result listing page by page, collecting records.

    Each record div becomes a dict with keys ``num_index``, ``title_paper``,
    ``title_url``, ``cited_number`` and ``cited_url`` (the latter two are
    0/None for uncited papers). Pagination is followed *iteratively* rather
    than recursively: the original recursed once per page, which overflows
    Python's recursion limit on listings with ~1000+ pages.

    Args:
        url: URL of the first results page (fetched via the module-level
            ``session`` with the global ``url_head`` headers).
        record_list: List the record dicts are appended to (mutated in place).

    Returns:
        The same ``record_list``, for convenient chaining.
    """
    while True:
        time.sleep(0.5 + random.random())  # polite delay between requests
        response = session.get(url, headers=url_head)
        response_html = etree.HTML(response.text)
        records = response_html.xpath('//div[contains(@id,"RECORD_")]')
        for record in records:
            record_dict = {
                'num_index': record.xpath('.//div[@class="search-results-number-align"]/text()')[0],
                'title_paper': record.xpath('.//div[@class="search-results-content"]/div[1]/div[1]/descendant::value/text()')[0],
                'title_url': url_base + record.xpath('.//div[@class="search-results-content"]/div[1]/div[1]/descendant::a[@class="smallV110 snowplow-full-record"]/@href')[0],
            }
            timescited = record.xpath('.//div[@class="search-results-data-cite"]/text()')[0]
            # Zero citations (site renders Chinese or English label): there is
            # no citing-articles link to follow, so store 0/None directly.
            if timescited == '被引频次: 0' or timescited == 'Times Cited: 0':
                record_dict['cited_number'] = 0
                record_dict['cited_url'] = None
            else:
                record_dict['cited_number'] = record.xpath('.//a[@class="snowplow-times-cited-link"]/text()')[0]
                record_dict['cited_url'] = url_base + record.xpath('.//a[@class="snowplow-times-cited-link"]/@href')[0]
            print(f"{record_dict['num_index']} 《{record_dict['title_paper']}》 Has Been Built Successfully")
            record_list.append(record_dict)
        # Follow the "next page" anchor until it is absent (original would
        # raise IndexError here) or degenerates into the JS no-op placeholder
        # that marks the last page.
        nextpage = response_html.xpath('//a[contains(@class,"paginationNext")]/@href')
        if not nextpage or nextpage[0] == 'javascript: void(\'paginationNext\')':
            return record_list
        url = nextpage[0]
def find_author_url(url):
    """Fetch a record page and map each author's name to their profile URL.

    Scrapes every "Find more records by this author" anchor from the page at
    *url* and pairs the anchor text (author name) with its href.

    Args:
        url: Full-record page URL (fetched via the module-level ``session``).

    Returns:
        dict mapping author name -> relative profile URL.
    """
    time.sleep(0.5 + random.random())
    page = etree.HTML(session.get(url, headers=url_head).text)
    name_list = page.xpath('//a[@title="Find more records by this author"]/text()')
    href_list = page.xpath('//a[@title="Find more records by this author"]/@href')
    # Pair names with their links; later duplicates overwrite earlier ones,
    # matching dict(zip(...)) semantics.
    author_map = {name: href for name, href in zip(name_list, href_list)}
    print(f"\tAuthors: {name_list} Have Been Found")
    return author_map
def find_author_hindex(url):
    """Return the h-index of the author behind a profile link, or 0.

    Extracts the ``daisIds`` and ``SID`` query parameters from *url* and
    queries the author-profile API. Some authors have no profile page
    (no ``daisIds`` parameter); they are reported as h-index 0.

    Args:
        url: Author link scraped by find_author_url() (query string carries
            ``SID`` and, when a profile exists, ``daisIds``).

    Returns:
        int: The author's h-index, or 0 when no profile exists.
    """
    url_parts = parse.urlparse(url)
    url_params = parse.parse_qs(url_parts.query)
    # Early return before touching SID: authors without a profile page have
    # no daisIds parameter and need no API call.
    if 'daisIds' not in url_params:
        return 0
    author_head = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
        'x-1p-wos-sid': url_params['SID'][0]
    }
    author_detail_url = author_homepage + url_params['daisIds'][0] + '?coAuthor=true'
    time.sleep(0.5 + random.random())
    author_detail_response = session.get(author_detail_url, headers=author_head)
    # The API returns JSON. The original used eval() on the raw response
    # text, which both executes untrusted remote content and chokes on JSON
    # literals (true/false/null); Response.json() parses it safely.
    author_detail_dict = author_detail_response.json()
    return author_detail_dict['hIndex']
if __name__=="__main__":
    # Seed search URL — redacted by the author before publishing
    # ('手动打码' = manually censored).
    url_test='***手动打码***'
    session=requests.session()
    # Phase 1: collect every paper from the search listing (all pages).
    search_records_list=extract_frame(url_test,search_records_list)
    for search_record in search_records_list:
        log(search_record['num_index']+search_record['title_paper'])
        num_index_float=float(search_record['num_index'])
        # Resume point after an unexpected crash: skip records already
        # processed in a previous run (hard-coded index, adjust per run).
        if num_index_float<115:
            continue
        if search_record['cited_number']==0:
            log('\tNo Cited Record Found')
            continue
        # Phase 2: for each cited paper, collect all citing articles.
        cited_records_list=[]
        cited_records_list=extract_frame(search_record['cited_url'],cited_records_list)
        for cited_record in cited_records_list:
            # Tab-only strings keep log() on the same line (indentation).
            log('\t')
            log(cited_record['num_index']+cited_record['title_paper'])
            log('\t')
            log(cited_record['title_url'])
            # Phase 3: look up each citing author's h-index.
            authors_dict=find_author_url(cited_record['title_url'])
            for key in authors_dict.keys():
                author_hindex=find_author_hindex(authors_dict[key])
                log('\t\t')
                log('Hindex:\t'+str(author_hindex)+'\tauthor:\t'+key)
    print('end')