python 多线程爬虫下载中图分类号

目标网站

python 多线程爬虫下载中图分类号_第1张图片
爬这个网站需要访问 45836 个网页,
在这里插入图片描述
一个一个访问是很慢的,还好网站没有做反爬

单线程爪巴虫

import requests
from bs4 import BeautifulSoup
import traceback
import time

# Shared mapping filled by visit(): bracket-stripped <span> text -> <a> text of each list item.
dic = dict()
# Base address; visit() appends the page id to it to form the page URL.
url = 'http://www.ztflh.com/?c='
def visit(i):
    """Download page ``i`` and store its list entries in the global ``dic``.

    The page address is ``url + str(i)``. Every ``<li>`` inside the
    ``<ul id="list">`` element contributes one entry: the ``<span>`` text with
    surrounding square brackets stripped is the key and the ``<a>`` text is
    the value.

    Failures (network errors, pages without the expected list) are reported
    on stdout and otherwise ignored so that one bad page does not stop the
    crawl.

    Args:
        i: Page id appended to the base ``url``.
    """
    # Build the address before the ``try`` so the failure message can always use it.
    u = url + str(i)
    print(i, end='')
    try:
        # A timeout keeps a single stalled connection from hanging the whole crawl.
        html = requests.get(u, timeout=10)
        html.encoding = 'utf8'
        # Naming the parser silences bs4's "no parser specified" warning and
        # makes the parse independent of which optional parsers are installed.
        bs = BeautifulSoup(html.text, 'html.parser')
        items = bs.find('ul', {'id': 'list'}).find_all('li')
        # A distinct loop name avoids shadowing the page id parameter ``i``.
        for item in items:
            dic[item.span.text.strip('[]')] = item.a.text
        print()
    except Exception as e:
        # Best effort per page: say which page failed and why, then carry on.
        print('failed', u, repr(e))

# Visit page ids 0 through 45836 in order, one request at a time.
for i in range(0, 45837):
    visit(i)

# Show everything gathered in the shared mapping.
print(dic)

可是这么一个一个来预计要爪巴几个小时

多线程爪巴虫

python 多线程爬虫下载中图分类号_第2张图片

开启 100 个线程,不到 10 分钟完成所有网页的抓取

import threading
import requests
from lxml import etree
from urllib import request
import os
import re
from queue import Queue
 
class Producer(threading.Thread):
    """Worker thread that downloads pages and queues the entries found on them.

    Each worker repeatedly takes a URL from ``page_queue``, parses the
    ``<ul id="list">`` element of that page and puts one ``(key, name)`` tuple
    per ``<li>`` into ``result_queue``. The worker exits once ``page_queue``
    has no more URLs.

    Args:
        page_queue: Queue of page URLs still to be fetched.
        result_queue: Queue receiving ``(key, name)`` tuples.
        *args, **kwargs: Forwarded to ``threading.Thread``.
    """

    # Browser-like User-Agent sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
    }

    def __init__(self, page_queue, result_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.result_queue = result_queue

    def run(self):
        """Consume URLs until the page queue is exhausted."""
        from queue import Empty

        while True:
            # Take the URL without blocking. Checking empty() and then calling
            # a blocking get() lets another worker grab the last item in
            # between, which would leave this thread waiting forever.
            try:
                page_url = self.page_queue.get_nowait()
            except Empty:
                print('bye')
                break
            # Use this worker's own queue rather than a module-level global.
            print('剩余页数:', self.page_queue.qsize())
            self.parse_page(page_url)

    def parse_page(self, u):
        """Fetch ``u`` and queue one ``(key, name)`` tuple per list item.

        Failures are reported on stdout and the page is skipped; nothing is
        queued for a page that could not be parsed completely.

        Args:
            u: Address of the page to download.
        """
        try:
            # Send the class-level headers and bound the wait so a stalled
            # server cannot pin a worker indefinitely.
            html = requests.get(u, headers=self.headers, timeout=10)
            html.encoding = 'utf8'
            # An explicit parser avoids bs4's "no parser specified" warning.
            bs = BeautifulSoup(html.text, 'html.parser')
            items = bs.find('ul', {'id': 'list'}).find_all('li')
            # Parse the whole page first so a malformed item cannot leave a
            # half-reported page in the result queue.
            pairs = [(item.span.text.strip('[]'), item.a.text) for item in items]
            for pair in pairs:
                self.result_queue.put(pair)
            print(u, dict(pairs))
        except Exception as e:
            # Best effort per page, but never silent: a swallowed error would
            # make missing rows in the output impossible to diagnose.
            print('failed', u, repr(e))


import csv

N = 45836
N_threads = 100
page_queue = Queue(N)
# Unbounded on purpose: every <li> on a page becomes one result, so the total
# can exceed the number of pages and a cap of N could make put() block forever.
result_queue = Queue()

# Queue the address of every page id in range(N).
for i in range(N):
    u = url + str(i)
    page_queue.put(u)

threads = []
for x in range(N_threads):
    t = Producer(page_queue, result_queue)
    t.start()
    threads.append(t)

# Wait for every worker to finish; reading the results right after start()
# would capture only whatever happened to be queued at that instant.
for t in threads:
    t.join()

# All workers are done, so a snapshot of the queue contents is complete.
result = sorted(result_queue.queue)

# csv.writer quotes names that contain commas or quotes, which plain string
# concatenation would turn into malformed rows. newline='' is what the csv
# module requires of the file object it writes to.
with open('中图分类号.csv', 'w', encoding='utf_8_sig', newline='') as file:
    writer = csv.writer(file)
    for k, v in result:
        writer.writerow([k.strip('{}'), v])

抓取结果

python 多线程爬虫下载中图分类号_第3张图片
⋮
python 多线程爬虫下载中图分类号_第4张图片

大家先不要再爬啦,毕竟对网站不好,可以先试试我已经爬好的
https://download.csdn.net/download/itnerd/12836734

中图分类号的本地检索方式

虽然这个文件已经很全了,但仍有一些带小数位的分类号无法直接匹配,例如

在这里插入图片描述
而官网上只给出了 F426

那就采用如下匹配方式:

如果没有直接匹配上,就把分类号的最后一位去掉,继续匹配,直到匹配上为止:

import pandas as pd

# Load the crawled table: column 'k' holds the keys, column 'v' the names.
# - encoding matches the 'utf_8_sig' used when the file was written, so the
#   BOM cannot end up inside the first key.
# - dtype=str keeps every key a string, so membership tests against string
#   queries behave the same for every row.
kv = pd.read_csv(
    '中图分类号.csv',
    names=['k', 'v'],
    delimiter=',',
    dtype=str,
    encoding='utf_8_sig',
)
# Index by key so lookups can use `key in kv.index` and `kv.loc[key, 'v']`.
kv = kv.set_index('k')

def search_class_name(k, table=None):
    """Return the name of the longest prefix of ``k`` present in the table.

    If ``k`` itself is not an index entry, its last character is dropped and
    the lookup is retried, until a match is found or nothing is left.

    Args:
        k: Key to look up. Anything that is not a ``str`` yields ``''``.
        table: DataFrame indexed by key with a ``'v'`` column. Defaults to
            the module-level ``kv`` table.

    Returns:
        The ``'v'`` value of the matching row, or ``''`` when no prefix of
        ``k`` is in the index.
    """
    if table is None:
        table = kv
    # isinstance also accepts str subclasses, which `type(k) is str` rejected.
    if not isinstance(k, str):
        return ''
    # Shorten from the right until the remaining prefix is a known key.
    while k and k not in table.index:
        k = k[:-1]
    return table.loc[k, 'v'] if k else ''

你可能感兴趣的:(#,爪巴虫技术,#,编程语言,中图分类号,多线程,爪巴虫,下载)