import requests
from urllib.parse import urlencode
from pyquery import PyQuery as pq
import os
from hashlib import md5
from multiprocessing.pool import Pool
# Pagination window: pages GROUP_START..GROUP_END, each page holds 20 items.
GROUP_START = 1
GROUP_END = 20

# Minimal browser-like headers so the endpoint serves normal responses.
headers = {
    'User-Agent': ('Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:59.0) '
                   'Gecko/20100101 Firefox/59.0'),
}
def get_page(offset):
    """Fetch one page of Toutiao search results as parsed JSON.

    Args:
        offset: pagination offset passed to the search endpoint
            (multiples of 20).

    Returns:
        The decoded JSON dict on HTTP 200, otherwise ``None`` (including
        on connection errors and timeouts).
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': '美女',
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
    }
    url = 'https://www.toutiao.com/search_content/'
    try:
        # Let requests encode the query string; timeout prevents an
        # indefinite hang on a stalled connection.
        response = requests.get(url, params=params, headers=headers,
                                timeout=10)
        if response.status_code == 200:
            return response.json()
        return None  # explicit: non-200 responses yield no data
    except requests.RequestException:
        # Superset of ConnectionError: also covers timeouts etc.
        return None
def get_images(json):
    """Yield image records from one page of search results.

    Args:
        json: decoded response dict from ``get_page()``, or ``None``
            (``get_page`` returns ``None`` on failure, so guard for it
            instead of raising AttributeError).

    Yields:
        Dicts of the form ``{'image': <url>, 'title': <item title>}``,
        one per entry in each item's ``image_list``.
    """
    if not json:  # get_page() may have returned None on failure
        return
    data = json.get('data')
    if data:
        for item in data:
            image_list = item.get('image_list')
            title = item.get('title')
            if image_list:
                for image in image_list:
                    yield {
                        'image': image.get('url'),
                        'title': title,
                    }
def save_image(item):
if not os.path.exists(item.get('title')):
os.mkdir(item.get('title'))
try:
local_image_url = item.get('image')
new_image_url = local_image_url.replace('list','large')
response = requests.get('http:' + new_image_url)
if response.status_code == 200:
file_path = '{0}/{1}.{2}'.format(item.get('title'),md5(response.content).hexdigest(),'jpg')
if not os.path.exists(file_path):
with open(file_path,'wb') as f:
f.write(response.content)
else:
print('Already Downloaded',file_path)
except request.ConnectionError:
print('Failed to save image')
def main(offset):
    """Fetch one result page at *offset* and download every image it lists.

    Args:
        offset: pagination offset forwarded to ``get_page()``.
    """
    page_json = get_page(offset)
    if not page_json:  # network failure / non-200: nothing to iterate
        return
    for item in get_images(page_json):
        print(item)
        save_image(item)
if __name__ == '__main__':
    # Fan the page offsets out across worker processes; each step is one
    # page of 20 items.
    offsets = [step * 20 for step in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    pool.map(main, offsets)
    pool.close()
    pool.join()
# NOTE: When I first crawled this site following the book, its markup had
# changed: the 'image_detail' field was gone, replaced by 'image_list'.
# For reference only.