", "/", welfare)
welfare = re.sub("//", ",", welfare)
welfare = re.sub("/", "", welfare)
welfare = re.sub("\r", "", welfare)
# 将爬取内容存储到数据的队列
self.jobinfo_queue.put((jobname, companyname, companytype, companysize, companycity, companysalary,
edulevel, workingExp, welfare, needperson, createdata))
# Custom thread -- consumer side of the scraper.
class Consumer(threading.Thread):
    """Consumer thread: drains scraped job records from ``jobinfo_queue``
    and appends each one to ``qcwy.txt`` as a single '\\001'-separated line.

    Fields per record (in order): jobname, companyname, companytype,
    companysize, companycity, companysalary, edulevel, workingExp,
    welfare, needperson, createdata.
    """

    def __init__(self, pageurl_queue, jobinfo_queue, *args, **kwargs):
        """Store the shared URL queue and the scraped-data queue."""
        super(Consumer, self).__init__(*args, **kwargs)
        self.pageurl_queue = pageurl_queue
        self.jobinfo_queue = jobinfo_queue

    def run(self):
        """Write records until both queues are empty, then exit."""
        while True:
            # Stop once producers have no URLs left and all data is written.
            # NOTE(review): if jobinfo_queue is empty while pageurl_queue is
            # not, get() below blocks until a producer enqueues a record.
            if self.jobinfo_queue.empty() and self.pageurl_queue.empty():
                break
            values = self.jobinfo_queue.get()
            # Append one record per line; '\001'.join replaces the original
            # error-prone 11-way manual concatenation (same output).
            with open("qcwy.txt", "a+", encoding="utf-8", newline="") as f:
                f.write("\001".join(values) + "\n")
            print("完成")
# Return the number of result pages for a search URL.
def return_pages(url):
    """Fetch a 51job search-result page and return its total page count.

    The page renders pagination as "<current> / <total>"; the regex
    captures the total.  51job serves GBK-encoded HTML, so the response
    encoding is forced before reading ``resp.text``.

    :param url: first-page search URL for one region code
    :return: total page count as a stripped string
    """
    resp = requests.get(url, headers=HEADERS)
    resp.encoding = "gbk"
    text = resp.text
    # NOTE(review): the original pattern literal was broken across two
    # source lines (a syntax error); reconstructed here as one raw string.
    page = re.findall(r'.*? / (.*?)<', text, re.DOTALL)[0]
    return page.strip()
def main():
    """Build the queue of result-page URLs, then launch producer and
    consumer threads to scrape 51job listings."""
    # Shared queues: URLs to fetch, and scraped records to persist.
    pageurl_queue = Queue(200000)
    jobinfo_queue = Queue(200000)
    start_url = "https://search.51job.com/list/{},000000,0000,00,9,99,%2520,2,1.html"
    info_url = "https://search.51job.com/list/{},000000,0000,00,9,99,%2520,2,{}.html"
    # Province/region codes recognized by 51job's search URL scheme.
    city_code = ['010000', '020000', '030000', '050000', '060000', '070000',
                 '080000', '090000', '100000', '110000', '120000', '130000',
                 '140000', '150000', '160000', '170000', '180000', '190000',
                 '200000', '210000', '220000', '230000', '240000', '250000',
                 '260000', '270000', '280000', '290000', '300000', '310000',
                 '320000', '110200', '030200', '040000', '080200', '180200',
                 '200200', '070200', '090200', '030800', '230300', '230200',
                 '080300', '170200', '070300', '250200', '190200', '150200',
                 '120300', '120200', '220200', '240200']
    # Enqueue every result-page URL for every region.
    for code in city_code:
        total = int(return_pages(start_url.format(code)))
        for page in range(1, total + 1):
            pageurl_queue.put(info_url.format(code, page))
    # Start 100 producer threads to fetch and parse the queued pages.
    for _ in range(100):
        Procuder(pageurl_queue, jobinfo_queue).start()
    # Give producers a head start before consumers begin draining.
    time.sleep(8)
    # Start 100 consumer threads to write records out.
    for _ in range(100):
        Consumer(pageurl_queue, jobinfo_queue).start()
# Script entry point: kick off the whole scrape.
if __name__ == '__main__':
    main()