# Based on Python 3.6.2 on macOS. Without further ado, here is the code:
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from urllib.parse import urlencode
import urllib.request
import chardet
import urllib
import json
import re
from bs4 import BeautifulSoup
import sys
import requests
import os
# Global counter used to generate sequential file names for downloaded images
imageIndex = 0
def modifyConstant():
    """Increment the global image counter and return its new value.

    Returning the updated index (instead of None) lets callers use it
    directly; existing callers that ignore the return value are
    unaffected, so the change is backward-compatible.
    """
    global imageIndex
    imageIndex += 1
    return imageIndex
# chardet (a universal charset detector) guesses the page encoding, avoiding
# errors like: UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc9 in
# position 1278: invalid continuation byte — on pages that are not UTF-8
def getHtml(url):
    """Fetch *url* and return its body as bytes, re-encoded in the local
    filesystem encoding.

    chardet guesses the page's real charset so non-UTF-8 pages (e.g. GBK)
    decode cleanly instead of raising UnicodeDecodeError.

    NOTE(review): callers later call .decode('utf8') on this result, so the
    return type must stay bytes; on macOS the filesystem encoding is UTF-8,
    which keeps the round trip consistent.
    """
    page = urllib.request.urlopen(url)
    try:
        html = page.read()
    finally:
        # Fix: the response object was never closed (socket/fd leak).
        page.close()
    guess = chardet.detect(html)
    # chardet may report encoding=None for undecidable input; fall back to
    # UTF-8 rather than crashing inside bytes.decode(None).
    encoding = guess["encoding"] or "utf-8"
    decoded = html.decode(encoding)
    return decoded.encode(sys.getfilesystemencoding())
# Extract the detailed info (image URLs) from each article page
def parse_page_detail(html):
    """Extract the embedded 'sub_images' JSON fragment from an article
    page and download every image URL found in it.

    html -- raw page bytes (decoded here as UTF-8). Returns None; images
    are downloaded as a side effect via getImg().
    """
    # Fix: the original built an unused BeautifulSoup object here (dead
    # work parsing the raw bytes); removed.
    # The page embeds a JSON blob between 'sub_images' and 'max_img_width'.
    images_pattern = re.compile('sub_images(.*?)max_img_width', re.S)
    text = html.decode('utf8')
    result = re.search(images_pattern, text)
    if result:
        # Slice off the 'sub_images":' prefix and the trailing
        # ',"max_img_width' boilerplate so only the JSON array remains.
        # Renamed from 'str' — the original shadowed the builtin.
        json_str = result.group(0)[12:-15]
        data = json.loads(json_str)
        # Iterate directly instead of indexing with range(len(...)).
        for item in data:
            getImg(item.get('url'))
# Download an image to local disk (original urllib version, kept for reference)
# def getImg(imgurl):
# path = "/Users/luoxiaohui/Desktop/test/"
# if not os.path.isdir(path):
# os.makedirs(path)
# paths = path + ""
# urllib.request.urlretrieve(imgurl, ''.format(paths))
# Download an image to local disk — switched to the requests library
def getImg(imgUrl):
    """Download one image to ~/Desktop/test/, named by the global counter.

    On a download error the function logs and returns without writing.
    Fixes in this version:
    - the original only caught ConnectionError, so a Timeout (despite
      timeout=10) or HTTP error still propagated;
    - on a caught error, 'pic' was undefined yet fp.write(pic.content)
      still executed, raising NameError;
    - the file handle was not guaranteed to close if the write failed.
    """
    modifyConstant()
    path = "/Users/luoxiaohui/Desktop/test/"
    if not os.path.isdir(path):
        os.makedirs(path)
    try:
        pic = requests.get(imgUrl, timeout=10)
        print('第' + str(imageIndex) + '张图片下载完成-->' + imgUrl)
    except requests.exceptions.RequestException:
        # RequestException is the base of ConnectionError, Timeout, etc.
        print('【错误】当前图片无法下载')
        return
    # 'with' guarantees the handle is closed even if the write raises.
    with open(path + str(imageIndex) + '.jpg', 'wb') as fp:
        fp.write(pic.content)
# Fetch the search index page
def get_page_index(offset, keyword):
    """Fetch one page of toutiao search results for *keyword*.

    offset -- paging offset passed through to the search API.
    keyword -- search term (URL-encoded here).
    Returns the raw page bytes as produced by getHtml().
    """
    query = urlencode({
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 3,
    })
    return getHtml('http://www.toutiao.com/search_content/?' + query)
# Parse the article URLs of each image set out of the index page JSON
def parse_page_index(html):
    """Yield each 'article_url' from the search-result JSON payload.

    Yields nothing when the payload is empty or has no 'data' key.
    """
    payload = json.loads(html)
    # Guard clause instead of wrapping the loop in an if-block.
    if not payload or 'data' not in payload:
        return
    for entry in payload['data']:
        yield entry.get('article_url')
# Script entry point: crawl the first results page for the keyword '街拍'
# ("street snap") and download every image from each linked article.
html = get_page_index(0, '街拍')
for url in parse_page_index(html):
    # Re-uses the 'html' name for each article page body.
    html = getHtml(url)
    if html:
        parse_page_detail(html)