日常学习0104

论文抓取

dblp
import requests
import csv
from lxml import etree

class Publication(object):
    """One publication record extracted from a result node.

    The node only needs an ``xpath(expr)`` method returning a list
    (for example an lxml element). After construction ``title`` and
    ``date`` are always strings (``''`` when the node has no match) and
    ``author_list`` is a list of author-name strings.
    """

    # XPath expressions, all evaluated relative to the node being parsed.
    authors_list_xpath = './span[@itemprop="author"]'
    author_xpath = './/span[@itemprop="name"]/text()'
    title_xpath = './span[@class="title"]/text()'
    date_xpath = './/span[@itemprop="datePublished"]/text()'
    source_xpath = ''  # not read anywhere in this class

    def __init__(self, node, source):
        """Parse *node* and remember *source* (the venue label for the row)."""
        self.author_list = []
        self.date = ""
        self.source = source
        self.title = ""
        self._parse(node)

    @staticmethod
    def _first_text(values):
        """Return the first entry of *values* stripped, or '' if it is empty."""
        return values[0].strip() if values else ""

    def _parse(self, node):
        """Fill ``author_list``, ``title`` and ``date`` from *node*."""
        for author in node.xpath(self.authors_list_xpath):
            names = author.xpath(self.author_xpath)
            # An author span without a name node is skipped instead of
            # raising IndexError and aborting the whole record.
            if names:
                self.author_list.append(names[0])
        # Missing title/date become '' so the attributes are always
        # strings: hashable as dict keys and clean when formatted.
        self.title = self._first_text(node.xpath(self.title_xpath))
        self.date = self._first_text(node.xpath(self.date_xpath))

    def as_list(self):
        """Return the record as a five-column row.

        Columns: title, "<source> <date>", the fixed string ' ',
        the author names joined by single spaces, and the fixed string ''.
        """
        return [
            self.title,
            '{0} {1}'.format(self.source, self.date),
            ' ',
            ' '.join(self.author_list),
            '',
        ]


class DBLP(object):
    """Run keyword searches against dblp and collect the hits into a CSV file.

    Results are accumulated in ``self.titles`` (keyed by title, so a title
    found by several searches is kept once) and written out by ``flush()``.
    """

    dblp_url = 'https://dblp.uni-trier.de/search'
    publication_xpath = '//cite[@class="data"]'
    # Seconds to wait on the server; without a timeout a stalled
    # connection would block the script forever.
    request_timeout = 30

    def __init__(self, csv_file, headers, max_limit=4):
        """Open *csv_file* for writing and emit the *headers* row.

        ``max_limit`` is stored but not otherwise used by this class.
        """
        # Defined before anything that can raise, so close()/__del__ stay
        # safe when construction fails half-way.
        self._csv_file = None
        self.session = requests.session()
        self.max_limit = max_limit
        self.b = 1          # incremented once per search(); never sent to the server
        self.s = 'ydvpc'    # not read anywhere in this class
        self.h = 100        # sent as the 'h' query parameter
        self._csv_file = open(csv_file, 'w', encoding='utf-8', newline='')
        self._csv_writer = csv.writer(self._csv_file)
        self._csv_writer.writerow(headers)
        self.titles = {}

    def search(self, q, year, source):
        """Query dblp for "<q> <source> <year>" and record every titled hit.

        A failed or empty response is reported and contributes no rows;
        it does not raise. Network-level errors raised by the session
        (including timeouts) still propagate to the caller.
        """
        key = '{0} {1} {2}'.format(q, source, year)
        response = self.session.get(
            self.dblp_url,
            params={'q': key, 'h': self.h},
            timeout=self.request_timeout,
        )
        items = []
        if response.ok and response.text.strip():
            html = etree.HTML(response.text)
            # Guard against a parse result of None, which has no xpath().
            if html is not None:
                items = html.xpath(self.publication_xpath)
        else:
            print('no usable response for "{0}" (HTTP {1})'.format(
                key, response.status_code))
        for item in items:
            publication = Publication(item, source)
            title = publication.title
            # Entries without a usable string title cannot serve as a
            # dict key and carry no information worth a row; skip them.
            if not isinstance(title, str) or not title:
                continue
            self.titles[title] = publication.as_list()
        self.b += 1
        print('run')

    def flush(self):
        """Write every collected row to the CSV file and flush it to disk."""
        self._csv_writer.writerows(self.titles.values())
        self._csv_file.flush()

    def close(self):
        """Close the CSV file; safe to call repeatedly or on a half-built object."""
        csv_file = getattr(self, '_csv_file', None)
        if csv_file is not None and not csv_file.closed:
            csv_file.close()

    def __del__(self):
        self.close()
    
   

if __name__ == '__main__':
    # Every query term is searched once per year, restricted to ECCV.
    querys = ['3D Model', '3D Object', '3D Shape', 'voxel', 'Point Cloud', 'mesh']
    years = ['2019', '2018', '2017']
    csv_file = "ECCV.csv"
    # Column titles: title, year and venue, abstract, authors, code link.
    headers = ['题目', '年份及刊物', '摘要', '作者', '代码连接']

    searcher = DBLP(csv_file, headers)
    try:
        for year in years:
            for query in querys:
                searcher.search(query, year, 'ECCV')
    finally:
        # Persist whatever has been collected even when a request fails
        # part-way, instead of losing all earlier results.
        searcher.flush()
日常学习0104

论文抓取

你可能感兴趣的:(日常学习0104)