import requests
import re
import json
import pymysql  # unused in this excerpt; presumably kept for database storage elsewhere
import time     # unused in this excerpt
headers = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "Connection": "keep-alive",
    # "Content-Length" is omitted: requests computes it from the request body
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Cookie": "huxiu_analyzer_wcy_id=30fp6hg4w12618l89nwim; is_test_hot_article=8; gr_user_id=0253cc61-ff02-4d44-859c-293130bda733; b6a739d69e7ea5b6_gr_last_sent_cs1=0; grwng_uid=c2b2432b-d86d-4d45-a954-f8c0bb7cb46e; _ga=GA1.2.662040524.1545295769; screen=%7B%22w%22%3A1366%2C%22h%22%3A768%2C%22d%22%3A1%7D; _gid=GA1.2.557408362.1545385659; aliyungf_tc=AQAAAI6uSmbh2AgACHQe2uBnaJ/j00yf; SERVERID=f60b85c1ffd425d843469f623dc2b612|1545386015|1545386014; Hm_lvt_324368ef52596457d064ca5db8c6618e=1545295769,1545385659,1545386016; Hm_lpvt_324368ef52596457d064ca5db8c6618e=1545386016; b6a739d69e7ea5b6_gr_session_id_5a84b241-11ed-4ce0-8b07-0440e8ddb3a7=false; b6a739d69e7ea5b6_gr_session_id=5a84b241-11ed-4ce0-8b07-0440e8ddb3a7; b6a739d69e7ea5b6_gr_last_sent_sid_with_cs1=5a84b241-11ed-4ce0-8b07-0440e8ddb3a7; b6a739d69e7ea5b6_gr_cs1=0",
    "Host": "www.huxiu.com",
    "Origin": "https://www.huxiu.com",
    "Referer": "https://www.huxiu.com/",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"
}
url = 'https://www.huxiu.com/v2_action/article_list'
def list_page(queue):
    """Producer: walk the paginated listing endpoint and queue up article URLs."""
    for num in range(50000):
        try:
            post_data = {  # form data for the article-list AJAX endpoint
                "huxiu_hash_code": "c9588583b446f4307ff26bf78e9d298d",
                "page": num,
                "last_dateline": 1545178080
            }
            response = requests.post(url, data=post_data, headers=headers)
            text = json.loads(response.text)
            data = text["data"]
            # "data" is an HTML fragment; pull every href out of it
            url_info_list = re.findall(r'''href="(.*?)"''', data)
            queue.put(url_info_list)
        except Exception as e:
            print("error", e)
def info_url_page(url_info_list):
    """Consumer: fetch each article page and parse fields out of the HTML."""
    for url_info in url_info_list:
        try:
            start_url = "http://www.huxiu.com/"
            urls = start_url + url_info
            data_info = {
                "clientToken": "48ab7b2e183d455baf31073851a1a462",
                "version": 1.0,
                "itemSetId": 232
            }
            response_info = requests.post(urls, data=data_info, headers=headers)
            text_info = response_info.text
            # title: the original regex was truncated in the source; the
            # <title> pattern below is a stand-in reconstruction, not the
            # author's original expression
            try:
                title = str(re.findall(r'<title>(.*?)</title>', text_info)[0])
                print(title)
            except Exception as e:
                print("title parse error", e)
        except Exception as e:
            print("error", e)
if __name__ == '__main__':
    import multiprocessing
    # create a message queue holding the article URLs scraped from each listing page
    queue = multiprocessing.Queue()
    print('message queue created')
    # one producer process fetches all of the listing pages
    p = multiprocessing.Process(target=list_page, args=(queue,))
    p.start()
    print('producer process started ' * 10)
    # pool of 20 worker processes; the same 20 processes are recycled across tasks
    pool = multiprocessing.Pool(20)
    print('process pool created')
    while True:
        # pull a batch of URLs from the queue; blocks until one is available
        manages_url = queue.get()
        # log what was received
        print(manages_url, '*' * 50)
        pool.apply_async(info_url_page, (manages_url,))
    # NOTE: as written, the loop above never exits (queue.get() blocks forever),
    # so the cleanup below is unreachable; a sentinel value on the queue would
    # be needed to break out and let these run
    pool.close()  # stop accepting new tasks
    pool.join()   # wait for the pool workers to finish
    p.join()      # wait for the producer process to finish