多线程爪巴虫抓取 ICML 2020 补充材料链接

多线程爪巴虫抓取 ICML 2020 补充材料链接_第1张图片


import csv
import os
import re
import threading
from queue import Empty, Queue
from urllib import request

import requests
from bs4 import BeautifulSoup
from lxml import etree
 

# Index page that links to every ICML 2020 paper page.
root = 'https://proceedings.icml.cc/paper/2020'
# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops us from parsing an HTTP error page as if it
# were the index.
root_response = requests.get(root, timeout=30)
root_response.raise_for_status()
root_page = root_response.text
# Name the parser explicitly: without it BeautifulSoup guesses one, emits a
# warning, and may parse differently from machine to machine. lxml is already
# a dependency of this script (it is imported at the top of the file).
bs0 = BeautifulSoup(root_page, 'lxml')

# Paper links are the anchors inside the first <div class="col">; anchors
# without an href are skipped because they cannot be turned into a URL.
papers = bs0.find('div', {'class': 'col'}).find_all('a', href=True)
# Prefix each href with the site origin to build the URL that gets fetched.
urls = ['https://proceedings.icml.cc' + p.get('href') for p in papers]


def get_supplemental(url, _pattern=re.compile('Supplemental.*')):
    """Fetch one paper page and return its title and supplemental link.

    Args:
        url: URL of a single paper page.
        _pattern: Pre-compiled regex used to recognise the supplemental
            anchor by its text. Internal; bound once at definition time so
            the pattern is not rebuilt on every call.

    Returns:
        A ``(title, link)`` tuple. ``title`` is the text of the first
        ``<h4>`` inside the page's first ``<div class="col">``. ``link`` is
        the site origin plus the href of the first anchor in that div whose
        text matches ``Supplemental``, or ``''`` when there is no such anchor.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
        AttributeError: If the page has no ``<div class="col">`` or the div
            has no ``<h4>``.
    """
    # Bounded wait plus a status check: a stalled or failed request should
    # surface as an exception instead of hanging or yielding a bogus title.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # Explicit parser avoids BeautifulSoup's "no parser specified" guess.
    bs = BeautifulSoup(response.text, 'lxml')
    col = bs.find('div', {'class': 'col'})
    title = col.find('h4').text
    links = col.find_all('a', string=_pattern)
    if links:
        return (title, 'https://proceedings.icml.cc' + links[0].get('href'))
    return (title, '')



class Producer(threading.Thread):
    """Worker thread that scrapes paper pages taken from a shared queue.

    Each worker repeatedly takes a URL from ``page_queue``, passes it to
    ``get_supplemental`` and puts the returned tuple on ``result_queue``.
    The thread ends once ``page_queue`` has no more items.
    """

    # Browser-like request headers. No method of this class reads them.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
    }

    def __init__(self, page_queue, result_queue, *args, **kwargs):
        """Store the work and result queues.

        Args:
            page_queue: Queue of page URLs still to be scraped.
            result_queue: Queue that receives one result per scraped page.
            *args, **kwargs: Forwarded to ``threading.Thread``.
        """
        super().__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.result_queue = result_queue

    def run(self):
        """Process URLs until the page queue is exhausted."""
        while True:
            # get_nowait() makes "is there work?" and "take it" one atomic
            # step. Checking empty() first and then calling a blocking get()
            # lets another worker steal the last item in between, leaving
            # this thread blocked forever.
            try:
                url = self.page_queue.get_nowait()
            except Empty:
                print('bye')
                break
            # Report this worker's own queue, not a module-level global.
            print('剩余页数:', self.page_queue.qsize())
            self.parse_page(url)

    def parse_page(self, u):
        """Scrape one page and enqueue its result; failures are reported.

        Args:
            u: URL of the page to scrape.
        """
        try:
            res = get_supplemental(u)
            self.result_queue.put(res)
            print(res)
        except Exception as e:
            # Best effort: one bad page must not kill the worker, but the
            # failure is reported so missing rows can be traced to a URL.
            print('failed:', u, repr(e))


# Size the queues from the number of URLs actually collected. A fixed
# capacity would make page_queue.put() block forever if the index listed more
# papers than that, because no worker is running yet while the queue is
# filled. Queue(0) is unbounded, so an empty URL list is also fine.
N = len(urls)
N_threads = 100
page_queue = Queue(N)
result_queue = Queue(N)

for u in urls:
    page_queue.put(u)

workers = [Producer(page_queue, result_queue) for _ in range(N_threads)]
for t in workers:
    t.start()
# Wait for every worker to finish before reading the results; otherwise the
# queue is snapshotted while pages are still being fetched and most rows are
# missing from the output file.
for t in workers:
    t.join()

result = list(result_queue.queue)
result.sort()

# csv.writer quotes fields that contain commas or quotes, so such titles no
# longer corrupt the column layout. newline='' is what the csv module asks
# for on the file object; lineterminator keeps the '\n' row endings.
with open('supplemental.csv', 'w', encoding='utf_8_sig', newline='') as file:
    writer = csv.writer(file, lineterminator='\n')
    for k, v in result:
        writer.writerow([k.strip('{}'), v])

你可能感兴趣的:(#,爪巴虫技术,ICML,supplemental,补充材料,2020)