python3爬取头条美女

import requests

from urllib.parse import urlencode

from pyquery import PyQuery as pq

import os

from hashlib import md5

from multiprocessing.pool import Pool

# Inclusive range of page indices to crawl; the entry point multiplies each
# index by 20 to obtain the ``offset`` passed to the search request.
GROUP_START = 1
GROUP_END = 20

# HTTP headers sent with the search request (a desktop Firefox User-Agent).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:59.0) '
        'Gecko/20100101 Firefox/59.0'
    ),
}

def get_page(offset, timeout=10):
    """Fetch one page of Toutiao search results for the keyword '美女'.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block a worker forever.

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, any network failure, or a
        body that is not valid JSON).
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': '美女',
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        response = requests.get(url=url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Covers connection errors as well as timeouts.
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError:
        # The server answered 200 but the body was not JSON.
        return None

def get_images(json):
    """Yield one record per image found in a search-result page.

    Args:
        json: Decoded result page (a dict with a ``'data'`` list), or
            ``None`` / an empty dict when the page could not be fetched.

    Yields:
        ``{'image': <value of the image's 'url' key>, 'title': <item title>}``
        for every entry of every item's ``'image_list'``. Items without an
        ``'image_list'`` are skipped.
    """
    if not json:
        # Nothing was fetched; produce no records instead of crashing.
        return
    for item in json.get('data') or []:
        title = item.get('title')
        for image in item.get('image_list') or []:
            yield {
                'image': image.get('url'),
                'title': title,
            }

def save_image(item):
    """Download one image and store it in a directory named after its title.

    Args:
        item: Dict with ``'title'`` (used as the directory name) and
            ``'image'`` (the image URL, possibly protocol-relative).

    The URL has ``'list'`` replaced by ``'large'`` before downloading. The
    file is written as ``<title>/<md5 of content>.jpg``; if that file
    already exists nothing is rewritten. Network failures and incomplete
    items are reported on stdout rather than raised.
    """
    title = item.get('title')
    image_url = item.get('image')
    if not title or not image_url:
        print('Failed to save image')
        return

    # exist_ok avoids a crash when two pool workers create the same
    # directory at the same time.
    os.makedirs(title, exist_ok=True)

    large_url = image_url.replace('list', 'large')
    if not large_url.startswith(('http://', 'https://')):
        # Protocol-relative URL ("//host/path"): add a scheme.
        large_url = 'http:' + large_url

    try:
        response = requests.get(large_url, timeout=10)
    except requests.RequestException:
        print('Failed to save image')
        return
    if response.status_code != 200:
        return

    content = response.content
    file_path = '{0}/{1}.{2}'.format(title, md5(content).hexdigest(), 'jpg')
    if os.path.exists(file_path):
        print('Already Downloaded', file_path)
        return
    with open(file_path, 'wb') as f:
        f.write(content)

def main(offset):
    """Fetch the result page at ``offset`` and save every image it lists.

    Args:
        offset: Offset passed through to ``get_page``.

    Does nothing when the page could not be fetched.
    """
    page = get_page(offset)
    if not page:
        # get_page returns None on failure; there is nothing to download.
        return
    for item in get_images(page):
        print(item)
        save_image(item)

if __name__ == '__main__':
    # One offset per page index in [GROUP_START, GROUP_END], 20 apart.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    try:
        pool.map(main, groups)
    finally:
        # Always release the worker processes, even if a task raised.
        pool.close()
        pool.join()


python3爬取头条美女_第1张图片
python3爬取头条美女_第2张图片
python3爬取头条美女_第3张图片

刚开始按照书上的方法去爬取时,发现网站的源码已经有些变化:image_detail 字段已经没有了,图片地址改为放在 image_list 中。


仅供参考

你可能感兴趣的:(python3爬取头条美女)