Python 抓取今日头条街拍图片并下载到本地

基于 Python 3.6.2(macOS),废话不多说,直接上代码~

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from urllib.parse import urlencode
import urllib.request
import chardet
import urllib
import json
import re
from bs4 import BeautifulSoup
import sys
import requests
import os

# Module-level counter; downloaded images are named after its current value.
imageIndex = 0


def modifyConstant():
    """Advance the global ``imageIndex`` counter by one."""
    global imageIndex
    imageIndex = imageIndex + 1

# chardet guesses the page's charset so that pages which are not UTF-8 do not
# fail with "UnicodeDecodeError: 'utf-8' codec can't decode byte ...".
def getHtml(url):
    """Fetch ``url`` and return its body re-encoded as bytes.

    The raw response is decoded with the charset chardet detects and then
    encoded with ``sys.getfilesystemencoding()``.

    Args:
        url: Address passed to ``urllib.request.urlopen``.

    Returns:
        bytes: The page content in the filesystem encoding.
    """
    # The context manager closes the connection even if read() raises.
    with urllib.request.urlopen(url) as page:
        raw = page.read()

    # chardet reports encoding=None when it cannot make a guess (e.g. an
    # empty body); decode(None) would raise TypeError, so fall back to UTF-8.
    detected = chardet.detect(raw)["encoding"] or "utf-8"
    text = raw.decode(detected)

    return text.encode(sys.getfilesystemencoding())


# Extract the image list from an article page and download each image.
def parse_page_detail(html):
    """Find the ``sub_images`` JSON array in a detail page and fetch its images.

    Each entry's ``url`` value is handed to ``getImg``; entries without one
    are skipped. Nothing happens when the page has no ``sub_images`` section.

    Args:
        html: Page content, either UTF-8 bytes or an already-decoded ``str``.

    Raises:
        ValueError: If the extracted text is not valid JSON.
    """
    if isinstance(html, bytes):
        html = html.decode('utf8')

    images_pattern = re.compile('sub_images(.*?)max_img_width', re.S)
    result = images_pattern.search(html)
    if not result:
        return

    # Assumes the match has the form  sub_images":[...],"max_img_width
    # so cutting 12 leading and 15 trailing characters leaves the JSON array.
    images_json = result.group(0)[12:-15]
    for image in json.loads(images_json):
        image_url = image.get('url')
        # An entry with no URL cannot be downloaded; skip it instead of
        # passing None on to getImg.
        if image_url:
            getImg(image_url)



# 下载图片到本地
# def getImg(imgurl):
#     path = "/Users/luoxiaohui/Desktop/test/"
#     if not os.path.isdir(path):
#         os.makedirs(path)
#     paths = path + ""
#     urllib.request.urlretrieve(imgurl, ''.format(paths))
# Download an image to local disk (uses the requests library).
def getImg(imgUrl, path="/Users/luoxiaohui/Desktop/test/"):
    """Download ``imgUrl`` and save it as ``<imageIndex>.jpg`` inside ``path``.

    The global image counter is advanced on every call, including failed
    downloads, so file numbers may have gaps.

    Args:
        imgUrl: Address of the image to download.
        path: Destination directory, created if it does not exist. Defaults
            to the directory that was previously hard-coded.
    """
    modifyConstant()
    os.makedirs(path, exist_ok=True)

    try:
        pic = requests.get(imgUrl, timeout=10)
        # Treat HTTP error statuses as failures rather than saving the
        # error page body as a .jpg file.
        pic.raise_for_status()
    except requests.exceptions.RequestException:
        # Covers connection errors, timeouts and bad HTTP statuses. Return
        # here: there is no response body to write.
        print('【错误】当前图片无法下载')
        return

    with open(os.path.join(path, str(imageIndex) + '.jpg'), 'wb') as fp:
        fp.write(pic.content)
    # Report success only once the file has actually been written.
    print('第' + str(imageIndex) + '张图片下载完成-->' + imgUrl)


# Fetch one page of search results (the index page).
def get_page_index(offset, keyword):
    """Request the search endpoint and return whatever ``getHtml`` returns.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Value sent as the ``keyword`` query parameter.
    """
    query = urlencode((
        ('offset', offset),
        ('format', 'json'),
        ('keyword', keyword),
        ('autoload', 'true'),
        ('count', '20'),
        ('cur_tab', 3),
    ))
    return getHtml('http://www.toutiao.com/search_content/?' + query)


# Parse the index-page JSON and yield the article URLs it lists.
def parse_page_index(html):
    """Yield each non-empty ``article_url`` from the JSON's ``data`` list.

    Entries without an ``article_url`` are skipped, and a missing or null
    ``data`` field (or a top-level value that is not an object) yields
    nothing.

    Args:
        html: JSON document as ``str`` or ``bytes``.

    Yields:
        The ``article_url`` value of each entry that has one.

    Raises:
        ValueError: If ``html`` is not valid JSON.
    """
    data = json.loads(html)
    if not isinstance(data, dict):
        return
    # "data" may be absent or null; treat both as an empty result list.
    for item in data.get('data') or ():
        url = item.get('article_url') if isinstance(item, dict) else None
        if url:
            yield url


# Script body: fetch the first result page for the keyword, then visit each
# listed article and download its images.
html = get_page_index(0, '街拍')
for url in parse_page_index(html):
    # Guard against a missing URL before fetching; getHtml cannot handle None.
    if not url:
        continue
    html = getHtml(url)
    if html:
        parse_page_detail(html)

你可能感兴趣的:(python)