分析Ajax抓取今日头条街拍美图(二)

1043-Python-小组：李凯旋

抓取美图流程

获得索引页
解析索引页
获得详情页
解析详情页
存储、并下载

获得索引页

import requests
from requests.exceptions import RequestException
from urllib.parse import urlencode
def get_page_index(offset, keyword, timeout=10):
    """Fetch one page of Toutiao search results as raw JSON text.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Search keyword; ``urlencode`` percent-encodes non-ASCII text.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as ``str`` when the server answers 200, otherwise
        ``None`` (non-200 status, timeout, or any other request failure).
    """
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 3
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, timeout=timeout)
    except RequestException:
        print("请求索引页有错误")
        return None
    if response.status_code == 200:
        return response.text
    return None

def main():
    """Fetch the first index page for the keyword and print the raw response."""
    print(get_page_index(0, '街拍'))


if __name__ == '__main__':
    main()

输出的部分结果

{"count": 20, "action_label": "click_search", "return_count": 20, "no_outsite_res": 0, "has_more":

补充：

其中字典data是get请求参数
当url地址含有中文，或者参数有中文的时候，以及把url作为参数传递的时候（最常见的是callback），需要把一些中文甚至'/'做一下编码转换。
urllib库里面有个urlencode函数，可以把key-value这样的键值对转换成我们想要的格式，返回的是a=1&b=2这样的字符串，如下：

import urllib
from urllib.parse import urlencode
# Sample GET parameters; urlencode turns them into a "k1=v1&k2=v2" query
# string and percent-encodes the non-ASCII keyword.
data = {
    'offset': 0,
    'format': 'json',
    'keyword': '街拍',
    'autoload': 'true',
    'count': '20',
    'cur_tab': 3,
}
query_string = urlencode(data)
print(query_string)

输出结果
offset=0&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=3

status_code==200，表示服务器已成功处理了请求，通常即是服务器提供了请求的网页

解析索引页

从上面的请求索引页，可知输出的结果是json格式，但是不是觉得python的字典和json格式差不多呢？

其实是不同的,json 就是一个根据某种约定格式编写的纯字符串，不具备任何数据结构的特征。而字典本身是一个完整的数据结构，实现了一切自身该有的算法。dict存在于内存中，可以被序列化成 json 格式的数据（string），之后这些数据就可以传输或者存储了。

import requests
import json
from requests.exceptions import RequestException
from urllib.parse import urlencode
def get_page_index(offset, keyword, timeout=10):
    """Fetch one page of Toutiao search results as raw JSON text.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Search keyword; ``urlencode`` percent-encodes non-ASCII text.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as ``str`` when the server answers 200, otherwise
        ``None`` (non-200 status, timeout, or any other request failure).
    """
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 3
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, timeout=timeout)
    except RequestException:
        print("请求有错误")
        return None
    if response.status_code == 200:
        return response.text
    return None

def parse_index_page(html):
    """Yield the article URL of every entry in an index-page JSON response.

    Args:
        html: JSON text returned by the search request, or ``None``/empty.

    Yields:
        Each entry's ``article_url`` value. Entries without one are skipped,
        and nothing is yielded when ``html`` is empty, is not valid JSON, or
        carries no ``data`` list.
    """
    if not html:
        return
    try:
        payload = json.loads(html)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return
    if not isinstance(payload, dict):
        return
    # ``data`` may be missing or null; both mean "no results".
    for item in payload.get('data') or []:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        if article_url:
            yield article_url
                
def main():
    """Print every article URL found on the first index page."""
    index_json = get_page_index(0, '街拍')
    for article_url in parse_index_page(index_json):
        print(article_url)


# Entry point: without this guard the script defines main() but never runs it.
if __name__ == '__main__':
    main()

输出的部分结果：
http://toutiao.com/group/6450374750065164557/
https://temai.snssdk.com/article/feed/index/?id=11649061&source_type=12&content_type=2&adid=__AID__
https://temai.snssdk.com/article/feed/index/?id=11641922&source_type=12&content_type=2&adid=__AID__
http://toutiao.com/group/6450386354081186061/
http://toutiao.com/group/6450444860133589262/
http://toutiao.com/group/6424016000043778305/
http://toutiao.com/group/6450568221539385614/
http://toutiao.com/group/6422192886574317825/

补充：

json下的两个重要函数

编码：把一个Python对象编码转换成Json字符串 , json.dumps()
解码：把Json格式字符串解码转换成Python对象, json.loads()

生成器：generator，一边循环一边计算的机制。在parse_index_page(html) 函数中，把yield 换成print之后也能得出我们想要的结果，但是会把所有结果一下输出来，占用内存；而用yield时函数就变成了一个generator，可以用next()一个个地调用，但是这样写太繁琐了，所以用for循环来迭代。其实，for循环中会自动调用next()。

获得详情页

import requests
import json
from requests.exceptions import RequestException
from urllib.parse import urlencode
def get_page_index(offset, keyword, timeout=10):
    """Fetch one page of Toutiao search results as raw JSON text.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Search keyword; ``urlencode`` percent-encodes non-ASCII text.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as ``str`` when the server answers 200, otherwise
        ``None`` (non-200 status, timeout, or any other request failure).
    """
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 3
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, timeout=timeout)
    except RequestException:
        print("请求有错误")
        return None
    if response.status_code == 200:
        return response.text
    return None

def parse_index_page(html):
    """Yield the article URL of every entry in an index-page JSON response.

    Args:
        html: JSON text returned by the search request, or ``None``/empty.

    Yields:
        Each entry's ``article_url`` value. Entries without one are skipped,
        and nothing is yielded when ``html`` is empty, is not valid JSON, or
        carries no ``data`` list.
    """
    if not html:
        return
    try:
        payload = json.loads(html)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return
    if not isinstance(payload, dict):
        return
    # ``data`` may be missing or null; both mean "no results".
    for item in payload.get('data') or []:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        if article_url:
            yield article_url

def get_page_detail(url, timeout=10):
    """Fetch the HTML of one article detail page.

    Args:
        url: Address of the detail page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page source as ``str`` on a 200 response, otherwise ``None``
        (non-200 status, timeout, or any other request failure).
    """
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, timeout=timeout)
    except RequestException:
        print("请求详情页有错误")
        return None
    if response.status_code == 200:
        return response.text
    return None


def main():
    """Fetch each article linked from the first index page and print its HTML."""
    index_json = get_page_index(0, '街拍')
    for article_url in parse_index_page(index_json):
        print(get_page_detail(article_url))


if __name__ == '__main__':
    main()


街拍时尚：包臀裙姐姐，很漂亮！

解析详情页

import requests
from multiprocessing import Pool
import re
import json
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from urllib.parse import urlencode
def get_page_index(offset, keyword, timeout=10):
    """Fetch one page of Toutiao search results as raw JSON text.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Search keyword; ``urlencode`` percent-encodes non-ASCII text.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as ``str`` when the server answers 200, otherwise
        ``None`` (non-200 status, timeout, or any other request failure).
    """
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 3
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, timeout=timeout)
    except RequestException:
        print("请求有错误")
        return None
    if response.status_code == 200:
        return response.text
    return None

def parse_index_page(html):
    """Yield the article URL of every entry in an index-page JSON response.

    Args:
        html: JSON text returned by the search request, or ``None``/empty.

    Yields:
        Each entry's ``article_url`` value. Entries without one are skipped,
        and nothing is yielded when ``html`` is empty, is not valid JSON, or
        carries no ``data`` list.
    """
    if not html:
        return
    try:
        payload = json.loads(html)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return
    if not isinstance(payload, dict):
        return
    # ``data`` may be missing or null; both mean "no results".
    for item in payload.get('data') or []:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        if article_url:
            yield article_url

def get_page_detail(url, timeout=10):
    """Fetch the HTML of one article detail page.

    Args:
        url: Address of the detail page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page source as ``str`` on a 200 response, otherwise ``None``
        (non-200 status, timeout, or any other request failure).
    """
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, timeout=timeout)
    except RequestException:
        print("请求详情页有错误")
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_page_detail(html, url):
    """Extract the gallery title and image URLs from an article detail page.

    Args:
        html: HTML source of the detail page.
        url: Address the page was fetched from; copied into the result.

    Returns:
        ``{'title': ..., 'url': ..., 'images': [...]}`` when the page embeds a
        ``gallery: {...}`` JSON object containing ``sub_images``; otherwise
        ``None`` (no gallery block, unparsable JSON, or no ``sub_images``).
    """
    soup = BeautifulSoup(html, 'lxml')
    # select() returns a list; it is empty when the page has no <title>.
    title_tags = soup.select('title')
    title = title_tags[0].get_text() if title_tags else ''
    print(title)  # show the title of each gallery as it is processed

    # Capture the {...} object that follows "gallery: " and precedes
    # "siblingList". re.S makes "." match newlines too, so the object may
    # span several lines of the page source.
    images_pattern = re.compile('gallery: ({.*}),.*?siblingList.*', re.S)
    match = images_pattern.search(html)
    if not match:
        return None
    try:
        data = json.loads(match.group(1))
    except ValueError:  # the captured text was not valid JSON
        return None
    if not isinstance(data, dict):
        return None
    sub_images = data.get('sub_images')
    if sub_images is None:
        return None
    images = [image.get('url') for image in sub_images]
    return {
        'title': title,
        'url': url,
        'images': images
    }

def main():
    """Parse every article linked from the first index page and print the result."""
    index_json = get_page_index(0, '街拍')
    for article_url in parse_index_page(index_json):
        detail_html = get_page_detail(article_url)
        if not detail_html:
            continue
        print(parse_page_detail(detail_html, article_url))


if __name__ == '__main__':
    main()



输出部分结果：

路人街拍，重庆女孩：喜欢自己，喜好打扮
{'title': '路人街拍，重庆女孩：喜欢自己，喜好打扮', 'url': 'http://toutiao.com/group/6450820090748715277/', 'images': ['http://p1.pstatp.com/origin/31e80004a4eb49d5de7f', 'http://p1.pstatp.com/origin/31eb0001eb51ed6d3e53', 'http://p3.pstatp.com/origin/31e60001eaf1524e93d9', 'http://p3.pstatp.com/origin/31ec0004a72484e5a392', 'http://p3.pstatp.com/origin/31e80004a4f48d0a9d34', 'http://p1.pstatp.com/origin/31e60001eaf464b90d3c']}
精品路人街拍，搭配不仅仅是为了好看，更是对自己内心世界的表达
{'title': '精品路人街拍，搭配不仅仅是为了好看，更是对自己内心世界的表达', 'url': 'http://toutiao.com/group/6450651352926830862/', 'images': ['http://p1.pstatp.com/origin/31e300031a955fc4e275', 'http://p3.pstatp.com/origin/31f2000065fc3827f7cb', 'http://p9.pstatp.com/origin/31e1000494b3a7dee66f', 'http://p3.pstatp.com/origin/31e300031a9869323e23', 'http://p1.pstatp.com/origin/31e40004943b9acebee2']}
笑迎八方之街拍-最好的闺蜜就是拍照时同时露出微笑
{'title': '笑迎八方之街拍-最好的闺蜜就是拍照时同时露出微笑', 'url': 'http://toutiao.com/group/6450828058220314894/', 'images': ['http://p3.pstatp.com/origin/31e30004ba3aeb2ba3bd', 'http://p3.pstatp.com/origin/31e30004ba7d884ac0b2', 'http://p3.pstatp.com/origin/31ef0001b8a84508b359', 'http://p9.pstatp.com/origin/31ec0004aef40d585463', 'http://p3.pstatp.com/origin/31e80004ace8a772287d', 'http://p1.pstatp.com/origin/31ef0001b8f1958f7f3d', 'http://p3.pstatp.com/origin/31e30004baa2c37a007e', 'http://p3.pstatp.com/origin/31e60001f327bc050163', 'http://p1.pstatp.com/origin/31eb0001f3881551573f', 'http://p1.pstatp.com/origin/31ef0001b911b735e9b3', 'http://p3.pstatp.com/origin/31e30004baac323a2259', 'http://p1.pstatp.com/origin/31e9000111fe0b96a0e6']}
精品路人街拍，气质不一定是与生俱来，多半是后期修饰
{'title': '精品路人街拍，气质不一定是与生俱来，多半是后期修饰', 'url': 'http://toutiao.com/group/6450658567229456653/', 'images': ['http://p9.pstatp.com/origin/31f200009502cdde31e2', 'http://p1.pstatp.com/origin/31f2000095087aaa8a5c', 'http://p3.pstatp.com/origin/31e40004c36b46c5688c', 'http://p3.pstatp.com/origin/31ec00033eb51c46c78e', 'http://p1.pstatp.com/origin/31e10004c386f44ec13c']}
路人街拍，街拍女孩儿的牛仔裤特辑
{'title': '路人街拍，街拍女孩儿的牛仔裤特辑', 'url': 'http://toutiao.com/group/6424016000043778305/', 'images': ['http://p3.pstatp.com/origin/22d1000035dd9b85559d', 'http://p3.pstatp.com/origin/21390004987482da8767', 'http://p3.pstatp.com/origin/22d1000035e68f0a3c27', 'http://p3.pstatp.com/origin/22ce0001063d416bddf1', 'http://p9.pstatp.com/origin/22ce000106429ed3ab7a', 'http://p3.pstatp.com/origin/21390004987828322e31', 'http://p3.pstatp.com/origin/22d1000035f149ebd635']}
街拍时尚：我家有女初长成！
{'title': '街拍时尚：我家有女初长成！', 'url': 'http://toutiao.com/group/6450444860133589262/', 'images': ['http://p3.pstatp.com/origin/31d300014dffe5812a02', 'http://p3.pstatp.com/origin/31d300014e015a0d46f1', 'http://p9.pstatp.com/origin/31d50003a3b19274824a', 'http://p3.pstatp.com/origin/31d80001410924bb5bba', 'http://p3.pstatp.com/origin/31d300014e04229086b6', 'http://p3.pstatp.com/origin/31d70003bc1b11ee95f1']}
路人街拍，重庆街头的520街拍
{'title': '路人街拍，重庆街头的520街拍', 'url': 'http://toutiao.com/group/6422192886574317825/', 'images': ['http://p9.pstatp.com/origin/213700063df1f07745bf', 'http://p3.pstatp.com/origin/213c000289748f052e65', 'http://p3.pstatp.com/origin/213c0002897552a7b9d1', 'http://p3.pstatp.com/origin/2135000659c345b2f769', 'http://p3.pstatp.com/origin/22c70001bf2ad5638169', 'http://p1.pstatp.com/origin/213700063dfba30e5ba9', 'http://p1.pstatp.com/origin/213c0002897f3b343808']}
成都步行街行人街拍，夏日街头的一道靓丽风景

存储、下载图片

#config.py
# toutiao.py requests the search offsets GROUP_START*20, ..., GROUP_END*20
# (one request per group, 20 results each).
MONGO_URL = 'localhost'    # MongoDB host; no port is given, so the default port is used
MONGO_DB = 'toutiao1'  # database name (comparable to one Excel file)
MONGO_TABLE = 'toutiao1' # collection name (comparable to a sheet inside that Excel file)

GROUP_START = 0
GROUP_END = 20
KEYWORD='街拍'


#toutiao.py

import json
import os
import re
from hashlib import md5
from multiprocessing import Pool
from urllib.parse import urlencode

import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

from config import *
# Module-level MongoDB handle shared by save_to_mongodb.
# NOTE(review): connect=False presumably defers the actual connection until the
# first operation, which matters because this client is created before Pool()
# starts worker processes - confirm against the PyMongo MongoClient docs.
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]
def get_page_index(offset, keyword, timeout=10):
    """Fetch one page of Toutiao search results as raw JSON text.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Search keyword; ``urlencode`` percent-encodes non-ASCII text.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as ``str`` when the server answers 200, otherwise
        ``None`` (non-200 status, timeout, or any other request failure).
    """
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 3
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        # Without a timeout a stalled connection would block its worker
        # process forever.
        response = requests.get(url, timeout=timeout)
    except RequestException:
        print("请求有错误")
        return None
    if response.status_code == 200:
        return response.text
    return None

def parse_index_page(html):
    """Yield the article URL of every entry in an index-page JSON response.

    Args:
        html: JSON text returned by the search request, or ``None``/empty.

    Yields:
        Each entry's ``article_url`` value. Entries without one are skipped,
        and nothing is yielded when ``html`` is empty, is not valid JSON, or
        carries no ``data`` list.
    """
    if not html:
        return
    try:
        payload = json.loads(html)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return
    if not isinstance(payload, dict):
        return
    # ``data`` may be missing or null; both mean "no results".
    for item in payload.get('data') or []:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        if article_url:
            yield article_url

def get_page_detail(url, timeout=10):
    """Fetch the HTML of one article detail page.

    Args:
        url: Address of the detail page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page source as ``str`` on a 200 response, otherwise ``None``
        (non-200 status, timeout, or any other request failure).
    """
    try:
        # Without a timeout a stalled connection would block its worker
        # process forever.
        response = requests.get(url, timeout=timeout)
    except RequestException:
        print("请求详情页有错误")
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_page_detail(html, url):
    """Extract a gallery's title and image URLs, downloading each image.

    Args:
        html: HTML source of the detail page.
        url: Address the page was fetched from; copied into the result.

    Returns:
        ``{'title': ..., 'url': ..., 'images': [...]}`` when the page embeds a
        ``gallery: {...}`` JSON object containing ``sub_images``; otherwise
        ``None`` (no gallery block, unparsable JSON, or no ``sub_images``).
    """
    soup = BeautifulSoup(html, 'lxml')
    # select() returns a list; it is empty when the page has no <title>.
    title_tags = soup.select('title')
    title = title_tags[0].get_text() if title_tags else ''
    print(title)  # show the title of each gallery as it is processed

    # Capture the {...} object that follows "gallery: " and precedes
    # "siblingList". re.S makes "." match newlines too, so the object may
    # span several lines of the page source.
    images_pattern = re.compile('gallery: ({.*}),.*?siblingList.*', re.S)
    match = images_pattern.search(html)
    if not match:
        return None
    try:
        data = json.loads(match.group(1))
    except ValueError:  # the captured text was not valid JSON
        return None
    if not isinstance(data, dict):
        return None
    sub_images = data.get('sub_images')
    if sub_images is None:
        return None
    images = [image.get('url') for image in sub_images]
    # Side effect: every image of the gallery is fetched and written to disk.
    for image in images:
        download_image(image)
    return {
        'title': title,
        'url': url,
        'images': images
    }
def save_to_mongodb(result):
    """Insert one parsed gallery record into the configured collection.

    Args:
        result: Dict describing one gallery (title, url, images).

    Returns:
        ``True`` when the insert reports an id for the new document,
        ``False`` otherwise.
    """
    # Collection.insert() was deprecated in PyMongo 3 and removed in
    # PyMongo 4; insert_one() is the replacement for a single document.
    if db[MONGO_TABLE].insert_one(result).inserted_id:
        print('存储到MongoDB成功', result)
        return True
    return False

def download_image(url, timeout=10):
    """Download one image and pass its bytes to ``save_image``.

    Args:
        url: Address of the image.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Always ``None``; a failed or non-200 request is reported or skipped
        without raising.
    """
    print("正在下载", url)
    try:
        # Without a timeout a stalled connection would block its worker
        # process forever.
        response = requests.get(url, timeout=timeout)
    except RequestException:
        print("请求图片有错误")
        return None
    if response.status_code == 200:
        # .content is the raw body as bytes; .text would be decoded text,
        # which is wrong for image data.
        save_image(response.content)
    return None

def save_image(content):
    """Write image bytes to ``<cwd>/<md5-of-content>.jpg`` unless it already exists.

    Args:
        content: Raw image bytes.
    """
    # Naming the file after the MD5 digest of its bytes gives identical images
    # the same path, so a repeated download is detected and skipped below.
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    print(file_path)
    if not os.path.exists(file_path):
        # 'wb' = binary write mode; the with-block closes the file itself.
        with open(file_path, 'wb') as f:
            f.write(content)

def main(offest):
    """Crawl one index page at the given offset and store every gallery found."""
    index_json = get_page_index(offest, KEYWORD)
    for article_url in parse_index_page(index_json):
        detail_html = get_page_detail(article_url)
        if not detail_html:
            continue
        gallery = parse_page_detail(detail_html, article_url)
        if gallery:
            save_to_mongodb(gallery)


if __name__ == '__main__':
    # One offset per group: GROUP_START*20, ..., GROUP_END*20.
    offsets = list(range(GROUP_START * 20, (GROUP_END + 1) * 20, 20))
    workers = Pool()
    workers.map(main, offsets)
    workers.close()
    workers.join()

输出部分结果
图虫街拍摄影：随遇而安
正在下载 http://p3.pstatp.com/origin/149c00000b252927bf3c
图虫风光摄影：城市 街拍
正在下载 http://p3.pstatp.com/origin/127a00016c7fde875c6e
杨幂玩转休闲风大自然街拍，经典条纹T搭宽松牛仔，洋气有气气质
连体裤火得一塌糊涂，景甜出席活动就穿它，杨幂街拍也穿上了
C:\Users\lkx941013\Documents\Tencent Files\893579569\FileRecv/4311c5d6951af29b374a08c6ee716182.jpg
正在下载 http://p3.pstatp.com/origin/127a00016c89cb530b4f
C:\Users\lkx941013\Documents\Tencent Files\893579569\FileRecv/38312546d6b84d2cd7ed2f58ebd9d5ac.jpg
正在下载 http://p3.pstatp.com/origin/15350006352f6b9cd6cf

注意
由于这几天的今日头条街拍图集详情页改变，所以正则表达式就变成了：

# Capture the {...} object that follows "gallery: " and precedes "siblingList";
# re.S lets "." match newlines, so the object may span several lines.
images_pattern = re.compile('gallery: ({.*}),.*?siblingList.*', re.S)
# search() returns a match object, or None when the page has no such block.
jieguo = re.search(images_pattern, html)

第一次写这么长的编程类文章，确实收获了不少，但还有好多细节问题没有处理好，希望大家多多指教。其中也要感谢崔庆才老师的课程，升哥的帮助，少华哥的指点。