爬虫python学习代码记录3-抓取某电商网站的商品数据

Python3 爬虫、数据清洗与可视化实战

第4章 大型爬虫案例:抓取某电商网站的商品数据

目录

抓取某电商网站的商品数据

封装代码,提高可读性

写一个程序test.py定时监控运行结果

 爬虫效率优化


抓取某电商网站的商品数据

# 第4章 大型爬虫案例:抓取某电商网站的商品数据
import requests
import urllib
import time
import pymongo

# Connect to the MongoDB server on localhost:27017.
client = pymongo.MongoClient('localhost',27017)
# Database named "travel".
book_travel = client['travel']
# Collection named "travel_all" inside the "travel" database.
sheet_travel = book_travel['travel_all']

# Seconds to wait on the server before a request is abandoned; without a
# timeout a stalled connection would block the crawl indefinitely.
request_timeout = 10
# Number of products requested per page. The same value is used as the stride
# between page offsets so that consecutive requests leave no gaps (stepping the
# offset by 20 while requesting only 8 items would skip 12 products per page).
# NOTE(review): assumes the endpoint honours a count of 20 -- confirm.
page_size = 20
# Product-list URL template; the placeholders are, in order:
# dep, query, originalquery, page offset, page size.
list_url = ('https://touch.dujia.qunar.com/list?modules=list%2CbookingInfo%2CactivityDetail&'
            'dep={}&query={}&dappDealTrace=true&mobFunction=%E6%89%A9%E5%B1%95%E8%87%AA%E7%94%B1%E8%A1%8C'
            '&cfrom=zyx&it=dujia_hy_destination&date=&needNoResult=true&'
            'originalquery={}&width=480&height=320&quality=90&limit={},{}&includeAD=true&qsact=search'
            '&filterTagPlatform=mobile_touch')

url = 'https://touch.dujia.qunar.com/depCities.qunar'
strhtml = requests.get(url, timeout=request_timeout)
dep_dict = strhtml.json()  # departure cities
for dep_item in dep_dict['data']:   # each first-letter group
    for dep in dep_dict['data'][dep_item]: # each city listed under that letter
        a = []              # de-duplicated destinations for this departure city
        print(dep)
        # urllib.request.quote() percent-encodes characters that are not plain
        # ASCII (e.g. Chinese city names) so they can be placed in a URL.
        dep_quoted = urllib.request.quote(dep)
        url = 'https://touch.dujia.qunar.com/golfz/domestic/domesticDest?dep={}' \
              '&exclude=&extensionImg=255,175'.format(dep_quoted)
        time.sleep(1)
        strhtml = requests.get(url, timeout=request_timeout)
        arrive_dict = strhtml.json()   # destinations offered from this city
        # Collect each destination query once, keeping first-seen order.
        for section in ('originalData', 'ossData'):
            for arr_item in arrive_dict['data'][section]:
                for arr_item_1 in arr_item['subModules']:
                    for arr_item_2 in arr_item_1['items']:
                        if arr_item_2['query'] not in a:
                            a.append(arr_item_2['query'])
        # For every (departure, destination) pair, download all product pages.
        for item in a:
            item_quoted = urllib.request.quote(item)
            # Probe request: only its routeCount (number of products) is used.
            url = list_url.format(dep_quoted, item_quoted, item_quoted, 0, 8)
            time.sleep(1)
            strhtml = requests.get(url, timeout=request_timeout)
            routeCount = int(strhtml.json()['data']['limit']['routeCount'])
            for limit in range(0, routeCount, page_size):
                url = list_url.format(dep_quoted, item_quoted, item_quoted, limit, page_size)
                time.sleep(1)
                strhtml = requests.get(url, timeout=request_timeout)
                # Store the raw page together with the keys that identify it.
                result = {
                    'date': time.strftime('%Y-%m-%d'),
                    'dep': dep,
                    'arrive': item,
                    'limit': limit,
                    'result': strhtml.json()
                }
                sheet_travel.insert_one(result)

效果

爬虫python学习代码记录3-抓取某电商网站的商品数据_第1张图片

封装代码,提高可读性

import requests
import urllib
import time
import pymongo

# Open a client for the MongoDB server listening on localhost:27017.
client = pymongo.MongoClient(host='localhost', port=27017)
# Handle to the database named "travel".
book_travel = client.travel
# Handle to the "travel_all" collection inside the "travel" database.
sheet_travel = book_travel.travel_all

# Download and store every product-list page for one departure/destination pair.
def get_list(dep, item, page_size=20):
    """Fetch all product pages for ``dep`` -> ``item`` and insert them into MongoDB.

    A first request reads ``data.limit.routeCount`` (the number of products);
    the result set is then walked page by page. Each page is stored in
    ``sheet_travel`` as one document holding the crawl date, the departure,
    the destination, the page offset and the raw JSON response.

    Args:
        dep: Departure city name (percent-encoded before use in the URL).
        item: Destination query string (percent-encoded before use in the URL).
        page_size: Products requested per page. It is also the stride between
            page offsets, so consecutive pages leave no gaps. Must be positive.
            NOTE(review): assumes the endpoint honours a count of 20 -- confirm.

    Raises:
        KeyError: If the first response lacks ``data.limit.routeCount``.
    """
    # Placeholders, in order: dep, query, originalquery, page offset, page size.
    list_url = ('https://touch.dujia.qunar.com/list?modules=list%2CbookingInfo%2CactivityDetail&'
                'dep={}&query={}&dappDealTrace=true&mobFunction=%E6%89%A9%E5%B1%95%E8%87%AA%E7%94%B1%E8%A1%8C'
                '&cfrom=zyx&it=dujia_hy_destination&date=&needNoResult=true&'
                'originalquery={}&width=480&height=320&quality=90&limit={},{}&includeAD=true&qsact=search'
                '&filterTagPlatform=mobile_touch')
    dep_quoted = urllib.request.quote(dep)
    item_quoted = urllib.request.quote(item)

    # Probe request: only its routeCount is used.
    strhtml = get_json(list_url.format(dep_quoted, item_quoted, item_quoted, 0, 8))
    routeCount = int(strhtml['data']['limit']['routeCount'])  # number of products

    for limit in range(0, routeCount, page_size):
        strhtml = get_json(list_url.format(dep_quoted, item_quoted, item_quoted, limit, page_size))
        # Store the raw page together with the keys that identify it.
        result = {
            'date': time.strftime('%Y-%m-%d'),
            'dep': dep,
            'arrive': item,
            'limit': limit,
            'result': strhtml
        }
        sheet_travel.insert_one(result)
        print(limit)

# Fetch a URL, pause for one second, and return the decoded JSON body.
def get_json(url, timeout=10):
    """Return the JSON-decoded response of a GET request to ``url``.

    The function sleeps for one second after the request so that successive
    calls are throttled.

    Args:
        url: Address to request.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
        ValueError: If the response body is not valid JSON.
    """
    strhtml = requests.get(url, timeout=timeout)
    time.sleep(1)
    return strhtml.json()

if __name__ == "__main__":
    # Download the departure-city index, then crawl every city it lists.
    dep_dict = get_json('https://touch.dujia.qunar.com/depCities.qunar')
    for cities in dep_dict['data'].values():   # one group per first letter
        for dep in cities:                     # each city in that group
            # urllib.request.quote() percent-encodes characters that are not
            # plain ASCII (e.g. Chinese city names) for use inside a URL.
            dest_url = ('https://touch.dujia.qunar.com/golfz/domestic/domesticDest?dep={}'
                        '&exclude=&extensionImg=255,175').format(urllib.request.quote(dep))
            arrive_dict = get_json(dest_url)   # destinations offered from this city

            # Gather each destination query once, in first-seen order.
            destinations = []
            seen = set()
            for section in ('originalData', 'ossData'):
                for module in arrive_dict['data'][section]:
                    for sub_module in module['subModules']:
                        for entry in sub_module['items']:
                            query = entry['query']
                            if query in seen:
                                continue
                            seen.add(query)
                            destinations.append(query)

            # Crawl the product list of every departure/destination pair.
            for item in destinations:
                get_list(dep,item)

写一个程序test.py定时监控运行结果

from main import sheet_travel
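# Note: only names defined at the top level of main.py can be imported; objects created inside a function body are not visible to other modules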
import time

# Poll forever: print the collection's estimated document count every 10 seconds.
while True:
    record_count = sheet_travel.estimated_document_count()
    print(record_count)
    time.sleep(10)

同时运行两个程序

效果:

 爬虫效率优化

将爬虫优化成多进程爬虫,以提高效率

主代码main.py

# 第4章 大型爬虫案例:抓取某电商网站的商品数据
import requests
import urllib
import time
import pymongo

# Open a client for the MongoDB server listening on localhost:27017.
# NOTE(review): the client is created at import time, and this module is
# imported by a script that starts a multiprocessing pool; confirm against the
# PyMongo fork-safety guidance whether each worker should create its own client.
client = pymongo.MongoClient(host='localhost', port=27017)
# Handle to the database named "travel".
book_travel = client.travel
# Handle to the "travel_all" collection inside the "travel" database.
sheet_travel = book_travel.travel_all

# Download and store every product-list page for one departure/destination pair.
def get_list(dep, item, page_size=20):
    """Fetch all product pages for ``dep`` -> ``item`` and insert them into MongoDB.

    A first request reads ``data.limit.routeCount`` (the number of products);
    if that value is missing or unusable the pair is skipped. Otherwise the
    result set is walked page by page and each page is stored in
    ``sheet_travel`` as one document holding the crawl date, the departure,
    the destination, the page offset and the raw JSON response.

    Args:
        dep: Departure city name (percent-encoded before use in the URL).
        item: Destination query string (percent-encoded before use in the URL).
        page_size: Products requested per page. It is also the stride between
            page offsets, so consecutive pages leave no gaps. Must be positive.
            NOTE(review): assumes the endpoint honours a count of 20 -- confirm.
    """
    # Placeholders, in order: dep, query, originalquery, page offset, page size.
    list_url = ('https://touch.dujia.qunar.com/list?modules=list%2CbookingInfo%2CactivityDetail&'
                'dep={}&query={}&dappDealTrace=true&mobFunction=%E6%89%A9%E5%B1%95%E8%87%AA%E7%94%B1%E8%A1%8C'
                '&cfrom=zyx&it=dujia_hy_destination&date=&needNoResult=true&'
                'originalquery={}&width=480&height=320&quality=90&limit={},{}&includeAD=true&qsact=search'
                '&filterTagPlatform=mobile_touch')
    dep_quoted = urllib.request.quote(dep)
    item_quoted = urllib.request.quote(item)

    # Probe request: only its routeCount is used.
    strhtml = get_json(list_url.format(dep_quoted, item_quoted, item_quoted, 0, 8))
    # A departure/destination pair may have no products, in which case the
    # expected keys are absent or not numeric; skip the pair in that case.
    # Only lookup/conversion errors are caught so that unrelated failures
    # (including KeyboardInterrupt) are not silently swallowed.
    try:
        routeCount = int(strhtml['data']['limit']['routeCount'])  # number of products
    except (KeyError, TypeError, ValueError):
        return

    for limit in range(0, routeCount, page_size):
        strhtml = get_json(list_url.format(dep_quoted, item_quoted, item_quoted, limit, page_size))
        # Store the raw page together with the keys that identify it.
        result = {
            'date': time.strftime('%Y-%m-%d'),
            'dep': dep,
            'arrive': item,
            'limit': limit,
            'result': strhtml
        }
        sheet_travel.insert_one(result)
        print(limit)

# Fetch a URL, pause for one second, and return the decoded JSON body.
def get_json(url, timeout=10):
    """Return the JSON-decoded response of a GET request to ``url``.

    The function sleeps for one second after the request so that successive
    calls are throttled.

    Args:
        url: Address to request.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
        ValueError: If the response body is not valid JSON.
    """
    strhtml = requests.get(url, timeout=timeout)
    time.sleep(1)
    return strhtml.json()

# Worker entry point: crawl every destination reachable from one departure city.
def get_all_data(dep):
    """Crawl the product lists of all destinations offered from ``dep``.

    The destination index for ``dep`` is downloaded, the ``query`` value of
    every item under ``originalData`` and ``ossData`` is collected once (in
    first-seen order), and ``get_list`` is called for each of them.

    Args:
        dep: Departure city name.
    """
    # urllib.request.quote() percent-encodes characters that are not plain
    # ASCII (e.g. Chinese city names) for use inside a URL.
    dest_url = ('https://touch.dujia.qunar.com/golfz/domestic/domesticDest?dep={}'
                '&exclude=&extensionImg=255,175').format(urllib.request.quote(dep))
    arrive_dict = get_json(dest_url)  # destinations offered from this city

    # A dict is used as an insertion-ordered set of destination queries.
    unique_queries = {}
    for section in ('originalData', 'ossData'):
        for module in arrive_dict['data'][section]:
            for sub_module in module['subModules']:
                for entry in sub_module['items']:
                    unique_queries.setdefault(entry['query'], None)

    # Crawl the product list of every departure/destination pair.
    for item in unique_queries:
        get_list(dep,item)

# Departure cities, one name per line; consumers split this string on
# whitespace to obtain the individual city names.
# NOTE(review): the line "此处省略。。。" is an elision placeholder from the
# article, not a city. As written, dep_list.split() returns it as an entry,
# so it must be replaced by the full city list before running the crawler.
dep_list = '''
    澳门
    阿坝州
    阿克苏地区
    阿拉尔
    阿拉善盟
    此处省略。。。
    资阳
    遵义
'''

 mulpool.py 多进程爬虫

from main import get_all_data # 从文件中导入get_all_data函数
from main import dep_list     # 从文件中导入dep_list字符串
from multiprocessing import Pool

if __name__ == '__main__':
    # Pool() accepts the number of worker processes; when omitted it defaults
    # to the number of CPUs. Using the pool as a context manager makes sure its
    # worker processes are released once the work is finished.
    with Pool() as pool:
        # pool.map() calls get_all_data once for every city name in the list
        # and blocks until all calls have completed.
        pool.map(get_all_data, dep_list.split())

效果: 

 

本机有4个逻辑CPU,Pool()默认开启4个进程,速度明显快于单进程。

你可能感兴趣的:(python,python,json,开发语言)