#!/usr/bin/env python
# -*- coding: utf-8 -*-
import base64
import re
import threading
from queue import Empty, Queue

import requests
from bs4 import BeautifulSoup as bs
# Browser-like request headers sent with every HTTP request so the target
# site serves normal HTML rather than rejecting the script as a bot.
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, compress',
'Accept-Language': 'en-us;q=0.5,en;q=0.3',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'
}
class BaiduSpider(threading.Thread):
    """Worker thread that drains a queue of search-result URLs and scrapes them.

    Each worker repeatedly pulls a URL from the shared queue and hands it to
    :meth:`spider` until the queue is exhausted.
    """

    def __init__(self, queue):
        threading.Thread.__init__(self)
        self._queue = queue  # shared Queue of search-result page URLs

    def run(self):
        # get_nowait() + Empty fixes the check-then-get race in the original
        # `while not empty(): get()` pattern, which could leave a worker
        # blocked forever once several threads drained the same queue.
        while True:
            try:
                url = self._queue.get_nowait()
            except Empty:
                break
            try:
                self.spider(url)
            except Exception as e:
                # Best-effort crawl: report the failure and continue with
                # the remaining URLs instead of killing the worker.
                print(e)

    def spider(self, url):
        """Fetch one search-result page and print details of news dated 2022-10-21."""
        res = requests.get(url, headers=headers)
        soup = bs(res.content, 'lxml')
        # Result links on the search page are relative and start with "info/".
        news = soup.find_all(name='a', attrs={'href': re.compile(r'^info/')})
        for item in news:
            fonts = item.select('font')
            # Guard against result rows without a <font> date element,
            # which previously raised IndexError.
            if fonts and fonts[0].text == '2022年10月21日':
                detail_url = "https://www.guet.edu.cn/jy/" + item['href']
                detail_res = requests.get(detail_url, headers=headers)
                print(detail_url)
                print(bs(detail_res.content, 'lxml').select('div[class="title"]')[0].text)
# ********** End **********#
def Evidence(keyword):
    """Search www.guet.edu.cn/jy for *keyword* and scrape matching result pages.

    The site expects the search keyword base64-encoded in the
    ``newskeycode2`` query parameter.  Up to 199 result pages are queued
    and crawled concurrently by a small pool of worker threads.
    """
    queue = Queue()
    # The search endpoint wants the keyword as UTF-8 text, base64-encoded.
    key = base64.b64encode(keyword.encode('utf-8')).decode('utf-8')
    # Adjust the upper bound to crawl more or fewer result pages.
    for page in range(1, 200):
        # Bug fix: "&currentnum" had been mangled into "¤tnum" by an
        # HTML-entity round-trip (&curren; renders as ¤), which dropped the
        # paging parameter from the query string entirely.
        queue.put(
            "https://www.guet.edu.cn/jy/search.jsp"
            "?wbtreeid=1001&searchScope=0&currentnum={id}&newskeycode2={key}".format(
                id=page, key=key
            )
        )
    # Fan out over a small pool of worker threads, then wait for them all.
    thread_count = 5
    threads = [BaiduSpider(queue) for _ in range(thread_count)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()