Study notes:
1. requests mind map, recording its basic usage
2. Python multithreading with the threading module
3. The Queue class from the queue module
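Before the full crawler, here is a minimal sketch of the building blocks listed above: a thread-safe Queue shared by several threading.Thread workers that drain it. The task numbers and worker names are purely illustrative; the crawler below applies the same pattern twice, once for page numbers and once for the downloaded HTML.

# Minimal sketch: workers drain a shared Queue until it raises queue.Empty.
import threading
from queue import Queue, Empty

def worker(task_queue, name):
    while True:
        try:
            task = task_queue.get(block=False)
        except Empty:
            break  # queue drained, worker exits
        print("{} handled task {}".format(name, task))

if __name__ == '__main__':
    task_queue = Queue()
    for i in range(5):  # illustrative task numbers
        task_queue.put(i)
    threads = [threading.Thread(target=worker, args=(task_queue, 'worker-{}'.format(n)))
               for n in range(3)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()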
# -*- coding=utf-8 -*-
import time
import requests
import threading
from queue import Queue  # thread-safe queue from the standard library's queue module
from lxml import etree

class CrawlPage(threading.Thread):
    def __init__(self, page_queue, data_queue, thread_name):
        super(CrawlPage, self).__init__()
        self.page_queue = page_queue
        self.data_queue = data_queue
        self.thread_name = thread_name
        # Default request headers
        self.header = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Host": "search.51job.com",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3610.2 Safari/537.36",
        }

    def run(self):
        print("Current crawler thread: {}".format(self.thread_name))
        # 1. Keep taking page numbers from page_queue until it is empty.
        # 2. Fetch each page with requests and put the response text into data_queue.
        while not page_flag:
            try:
                page = self.page_queue.get(block=False)
                page_url = 'https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,{}.html'.format(page)
                response = requests.get(url=page_url, headers=self.header)
                print("Crawling URL: {}, response status code: {}".format(page_url, response.status_code))
                # 51job pages are GBK-encoded; set the encoding before reading response.text
                response.encoding = 'gbk'
                self.data_queue.put(response.text)
            except Exception:
                # page_queue is momentarily empty (or the request failed); keep polling until page_flag is set
                pass

class CrawlData(threading.Thread):
    def __init__(self, thread_name, data_queue):
        super(CrawlData, self).__init__()
        self.thread_name = thread_name
        self.data_queue = data_queue

    def run(self):
        print("Current parsing thread: {}".format(self.thread_name))
        while not data_flag:
            try:
                print("Items left in data_queue: {}".format(self.data_queue.qsize()))
                text = self.data_queue.get(block=False)
                html = etree.HTML(text)
                all_div = html.xpath("//div[@id='resultList']//div[@class='el']")
                info_list = []
                for item in all_div:
                    info = {}
                    # xpath() returns a list, so take the element at index 0
                    info['job_name'] = item.xpath("./p/span/a/@title")[0]
                    info['company_name'] = item.xpath(".//span[@class='t2']/a/@title")[0]
                    info['company_address'] = item.xpath(".//span[@class='t3']/text()")[0]
                    # The salary field may be empty, so guard it with try-except
                    try:
                        info['money'] = item.xpath(".//span[@class='t4']/text()")[0]
                    except IndexError:
                        info['money'] = 'no data'
                    info['date'] = item.xpath(".//span[@class='t5']/text()")[0]
                    info_list.append(info)
                print("Thread {} parsed: {}".format(self.thread_name, info_list))
                time.sleep(1)
            except Exception:
                # data_queue is momentarily empty; keep polling until data_flag is set
                pass

# Two global flags; they are set to True to end the while loops once the queues are drained
page_flag = False
data_flag = False

def main():
    # Build the queues that hold page numbers and raw page text
    page_queue = Queue()
    data_queue = Queue()
    # Enqueue the page numbers
    for page in range(1, 10):
        page_queue.put(page)
    print("Number of pages currently in the queue: {}".format(page_queue.qsize()))
    # Start the page-crawling threads
    global page_flag
    page_thread_name = ['PageCrawler-1', 'PageCrawler-2', 'PageCrawler-3']
    page_crawl_list = []
    for page_thread in page_thread_name:
        page_crawl = CrawlPage(page_queue, data_queue, page_thread)
        page_crawl.start()
        page_crawl_list.append(page_crawl)
    # Block the main thread until every page number has been taken from page_queue
    while not page_queue.empty():
        pass
    # Once page_queue is drained, set the flag to True to end the while loop in CrawlPage.run()
    page_flag = True
    for page_crawl_join in page_crawl_list:
        page_crawl_join.join()
        print(page_crawl_join.thread_name + " finished crawling pages!")
    print("Total items currently in data_queue: {}".format(data_queue.qsize()))
    # Start three text-parsing threads
    crawl_thread_name = ["DataParser-1", "DataParser-2", "DataParser-3"]
    crawl_data_list = []
    for crawl_data_name in crawl_thread_name:
        crawl_data = CrawlData(crawl_data_name, data_queue)
        crawl_data.start()
        crawl_data_list.append(crawl_data)
    # Block the main thread until data_queue is completely drained
    while not data_queue.empty():
        pass
    global data_flag
    # All data has been taken, so let the parsing threads exit
    data_flag = True
    for crawl_data_join in crawl_data_list:
        crawl_data_join.join()
        print("Thread {} finished".format(crawl_data_join.thread_name))
    # Queue size is 0: parsing of the HTML text is done
    print("Items in data_queue: {}".format(data_queue.qsize()))


if __name__ == '__main__':
    main()
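One design note: the main thread above busy-waits in empty while loops until each queue drains, which spins the CPU. The queue module also supports a blocking hand-off via Queue.join() and task_done(); below is a minimal sketch of that alternative, where worker_func and its loop body are hypothetical placeholders rather than part of the crawler above.

# Alternative sketch (not the original code): block with Queue.join()/task_done()
# instead of busy-waiting; worker_func is a hypothetical placeholder.
import threading
from queue import Queue

def worker_func(q):
    while True:
        item = q.get()  # blocks until an item is available
        try:
            print("processing", item)  # real downloading/parsing would go here
        finally:
            q.task_done()  # tell the queue this item is fully handled

q = Queue()
for n in range(9):
    q.put(n)
for _ in range(3):
    threading.Thread(target=worker_func, args=(q,), daemon=True).start()
q.join()  # returns once task_done() has been called for every item put on the queue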
I wrote the code myself, referring to another tutorial site.