Python requests + BeautifulSoup: extracting search results from the GUET (桂电) graduate employment site

The script below queries search.jsp on https://www.guet.edu.cn/jy/ with a base64-encoded keyword, then uses a small pool of worker threads to walk the result pages and print the URL and title of every announcement published on 2022-10-21.

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import requests
from queue import Queue, Empty
import threading
from bs4 import BeautifulSoup as bs
import re
import base64

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, compress',
    'Accept-Language': 'en-us;q=0.5,en;q=0.3',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'
    }

class BaiduSpider(threading.Thread):
    """docstring for ClassName"""
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self._queue = queue

    def run(self):
        while True:
            try:
                # Non-blocking get so a worker stops cleanly if another thread drains the queue first
                url = self._queue.get_nowait()
            except Empty:
                break
            try:
                self.spider(url)
            except Exception as e:
                print(e)

    def spider(self, url):
        #   Add your implementation here   #
        # ********** Begin *********#
        # Fetch one search-result page and collect announcement links (hrefs beginning with "info/")
        res = requests.get(url, headers=headers)
        soup = bs(res.content, 'lxml')
        news = soup.find_all(name='a', attrs={'href': re.compile(r'^info/')})
        for new in news:
            # Keep only announcements published on 2022-10-21, then print each one's URL and title
            if new.select('font')[0].text == '2022年10月21日':
                url1 = "https://www.guet.edu.cn/jy/" + new['href']
                res1 = requests.get(url1, headers=headers)
                print(url1)
                print(bs(res1.content, 'lxml').select('div[class="title"]')[0].text)
        # ********** End **********#



def Evidence(keyword):
    """Search the GUET employment site for `keyword` and crawl every result page."""
    queue = Queue()

    #   Add your implementation here   #
    # ********** Begin *********#
    # The search form sends the keyword base64-encoded in the newskeycode2 parameter
    key = str(base64.b64encode(keyword.encode('utf-8')), 'utf-8')
    # ********** End **********#

    # Number of result pages to crawl; adjust as needed
    for i in range(1, 200):
        #   Add your implementation here   #
        # ********** Begin *********#
        # currentnum is the result-page index, newskeycode2 the base64-encoded keyword
        queue.put("https://www.guet.edu.cn/jy/search.jsp?wbtreeid=1001&searchScope=0&currentnum={id}&newskeycode2={key}".format(id=i, key=key))
        # ********** End **********#

    # Multi-threaded crawling with a small pool of worker threads
    threads = []
    thread_code = 5
    #   Add your implementation here   #
    # ********** Begin *********#
    for i in range(thread_code):
        t = BaiduSpider(queue)
        threads.append(t)
    
    for i in range(thread_code):
        threads[i].start()
        
    for i in range(thread_code):
        threads[i].join()
    # ********** End **********#
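
The post never calls Evidence(); a minimal entry point might look like the following, where the keyword '宣讲会' is purely a hypothetical example:

if __name__ == '__main__':
    # Hypothetical example keyword; pass whatever term you want to search the site for
    Evidence('宣讲会')

With the defaults above this queues 199 result pages, and join() returns once the five worker threads have drained the queue.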
