Python3 爬虫、数据清洗与可视化实战
第4章 大型爬虫案例:抓取某电商网站的商品数据
目录
抓取某电商网站的商品数据
封装代码,提高可读性
写一个程序test.py定时监控运行结果
爬虫效率优化
# 第4章 大型爬虫案例:抓取某电商网站的商品数据
import requests
import urllib
import time
import pymongo
# Open a client for the MongoDB server on localhost, port 27017.
client = pymongo.MongoClient(host='localhost', port=27017)
# Handle to the database named "travel".
book_travel = client['travel']
# Handle to the "travel_all" collection inside that database.
sheet_travel = book_travel['travel_all']
# urllib.request only re-exports quote() from urllib.parse, and a bare
# `import urllib` does not guarantee that the request submodule is loaded, so
# quote() is taken from its home module here.
from urllib.parse import quote

# Seconds to wait for a response before requests raises a timeout error, so a
# stalled connection cannot hang the crawl forever.
REQUEST_TIMEOUT = 10

# Product-list endpoint. Placeholders, in order: URL-encoded departure city,
# URL-encoded destination (query), the same destination again (originalquery)
# and the paging offset.
# NOTE(review): every request asks for `limit=<offset>,8` while the paging loop
# below advances the offset by 20. If the second number is a page size, 12 of
# every 20 products are never fetched -- confirm against the API.
LIST_URL = (
    'https://touch.dujia.qunar.com/list?modules=list%2CbookingInfo%2CactivityDetail&'
    'dep={}&query={}&dappDealTrace=true&mobFunction=%E6%89%A9%E5%B1%95%E8%87%AA%E7%94%B1%E8%A1%8C'
    '&cfrom=zyx&it=dujia_hy_destination&date=&needNoResult=true&'
    'originalquery={}&width=480&height=320&quality=90&limit={},8&includeAD=true&qsact=search'
    '&filterTagPlatform=mobile_touch'
)

# Download the departure cities.
url = 'https://touch.dujia.qunar.com/depCities.qunar'
strhtml = requests.get(url, timeout=REQUEST_TIMEOUT)
dep_dict = strhtml.json()

# Per the original author's notes, 'data' groups the cities under their first
# letter: the outer loop walks the letters, the inner loop the cities.
for dep_item in dep_dict['data']:
    for dep in dep_dict['data'][dep_item]:
        print(dep)
        # quote() percent-encodes the city name (non-ASCII text) for the URL.
        dep_quoted = quote(dep)
        url = ('https://touch.dujia.qunar.com/golfz/domestic/domesticDest?dep={}'
               '&exclude=&extensionImg=255,175').format(dep_quoted)
        time.sleep(1)  # pause before every request
        strhtml = requests.get(url, timeout=REQUEST_TIMEOUT)
        arrive_dict = strhtml.json()  # destinations offered for this departure

        # De-duplicate the destination queries while keeping first-seen order:
        # dict keys are unique and preserve insertion order.
        unique_queries = {}
        for section in ('originalData', 'ossData'):
            for arr_item in arrive_dict['data'][section]:
                for arr_item_1 in arr_item['subModules']:
                    for arr_item_2 in arr_item_1['items']:
                        unique_queries[arr_item_2['query']] = None
        a = list(unique_queries)

        # For every departure/destination pair, page through the product list.
        for item in a:
            item_quoted = quote(item)
            # The first request is only used to read the number of products.
            url = LIST_URL.format(dep_quoted, item_quoted, item_quoted, 0)
            time.sleep(1)
            strhtml = requests.get(url, timeout=REQUEST_TIMEOUT)
            first_page = strhtml.json()
            try:
                routeCount = int(first_page['data']['limit']['routeCount'])
            except (KeyError, TypeError, ValueError):
                # The response carries no usable product count for this pair;
                # skip it instead of aborting the whole crawl.
                continue

            for limit in range(0, routeCount, 20):
                url = LIST_URL.format(dep_quoted, item_quoted, item_quoted, limit)
                time.sleep(1)
                strhtml = requests.get(url, timeout=REQUEST_TIMEOUT)
                # Store the raw page together with the parameters that produced it.
                result = {
                    'date': time.strftime('%Y-%m-%d'),  # today's local date
                    'dep': dep,
                    'arrive': item,
                    'limit': limit,
                    'result': strhtml.json(),
                }
                sheet_travel.insert_one(result)
效果
import requests
import urllib
import time
import pymongo
# Open a client for the MongoDB server on localhost, port 27017.
client = pymongo.MongoClient(host='localhost', port=27017)
# Handle to the database named "travel".
book_travel = client['travel']
# Handle to the "travel_all" collection inside that database.
sheet_travel = book_travel['travel_all']
def get_list(dep, item):
    """Download and store every product-list page for one departure/destination pair.

    The first request only reads the product count; the function then walks the
    offsets 0, 20, 40, ... below that count, inserts one document per page
    into ``sheet_travel`` and prints the offset it has just stored.

    Args:
        dep: Departure city name (unencoded text).
        item: Destination query string (unencoded text).

    Returns:
        None. Returns early, storing nothing, when the first response carries
        no usable ``data.limit.routeCount`` value.
    """
    # urllib.request only re-exports quote() from urllib.parse; importing it
    # from there avoids relying on `import urllib` loading the submodule.
    from urllib.parse import quote

    # Placeholders, in order: departure, destination (query), destination
    # again (originalquery), paging offset.
    # NOTE(review): each request asks for `limit=<offset>,8` while the loop
    # below advances the offset by 20. If the second number is a page size,
    # 12 of every 20 products are never fetched -- confirm against the API.
    list_url = (
        'https://touch.dujia.qunar.com/list?modules=list%2CbookingInfo%2CactivityDetail&'
        'dep={}&query={}&dappDealTrace=true&mobFunction=%E6%89%A9%E5%B1%95%E8%87%AA%E7%94%B1%E8%A1%8C'
        '&cfrom=zyx&it=dujia_hy_destination&date=&needNoResult=true&'
        'originalquery={}&width=480&height=320&quality=90&limit={},8&includeAD=true&qsact=search'
        '&filterTagPlatform=mobile_touch'
    )
    dep_quoted = quote(dep)
    item_quoted = quote(item)

    first_page = get_json(list_url.format(dep_quoted, item_quoted, item_quoted, 0))
    try:
        routeCount = int(first_page['data']['limit']['routeCount'])  # number of products
    except (KeyError, TypeError, ValueError):
        # No usable product count for this pair: nothing to page through.
        return

    for limit in range(0, routeCount, 20):
        strhtml = get_json(list_url.format(dep_quoted, item_quoted, item_quoted, limit))
        # Store the raw page together with the parameters that produced it.
        result = {
            'date': time.strftime('%Y-%m-%d'),  # today's local date
            'dep': dep,
            'arrive': item,
            'limit': limit,
            'result': strhtml,
        }
        sheet_travel.insert_one(result)
        print(limit)
def get_json(url, timeout=10):
    """Request *url* with HTTP GET, pause one second, and return the decoded JSON body.

    Args:
        url: Address to download.
        timeout: Seconds to wait for the server before requests raises a
            timeout error. Without a timeout a stalled connection would block
            the crawler indefinitely.

    Returns:
        The response body parsed as JSON.
    """
    strhtml = requests.get(url, timeout=timeout)
    # Wait one second after every request before the caller can issue the next.
    time.sleep(1)
    return strhtml.json()
if __name__ == "__main__":
    # urllib.request only re-exports quote() from urllib.parse; importing it
    # from there avoids relying on `import urllib` loading the submodule.
    from urllib.parse import quote

    url = 'https://touch.dujia.qunar.com/depCities.qunar'
    dep_dict = get_json(url)  # departure cities

    # Per the original author's notes, 'data' groups the cities under their
    # first letter: the outer loop walks the letters, the inner loop the cities.
    for dep_item in dep_dict['data']:
        for dep in dep_dict['data'][dep_item]:
            # quote() percent-encodes the city name (non-ASCII text) for the URL.
            url = ('https://touch.dujia.qunar.com/golfz/domestic/domesticDest?dep={}'
                   '&exclude=&extensionImg=255,175').format(quote(dep))
            arrive_dict = get_json(url)  # destinations offered for this departure

            # De-duplicate the destination queries while keeping first-seen
            # order: dict keys are unique and preserve insertion order.
            unique_queries = {}
            for section in ('originalData', 'ossData'):
                for arr_item in arrive_dict['data'][section]:
                    for arr_item_1 in arr_item['subModules']:
                        for arr_item_2 in arr_item_1['items']:
                            unique_queries[arr_item_2['query']] = None
            a = list(unique_queries)

            # Crawl the product list of every departure/destination pair.
            for item in a:
                get_list(dep, item)
import time

# Note from the original author: objects created inside a function body cannot
# be imported from another file, so the collection handle is imported as a
# module-level name of main.
from main import sheet_travel

# Seconds between two consecutive checks of the collection.
POLL_INTERVAL = 10

while True:
    # Print the estimated number of documents stored so far.
    print(sheet_travel.estimated_document_count())
    time.sleep(POLL_INTERVAL)
同时运行两个程序
效果:
将爬虫优化成多进程爬虫,以提高效率
主代码main.py
# 第4章 大型爬虫案例:抓取某电商网站的商品数据
import requests
import urllib
import time
import pymongo
# Open a client for the MongoDB server on localhost, port 27017.
client = pymongo.MongoClient(host='localhost', port=27017)
# Handle to the database named "travel".
book_travel = client['travel']
# Handle to the "travel_all" collection inside that database.
sheet_travel = book_travel['travel_all']
def get_list(dep, item):
    """Download and store every product-list page for one departure/destination pair.

    The first request only reads the product count; the function then walks the
    offsets 0, 20, 40, ... below that count, inserts one document per page
    into ``sheet_travel`` and prints the offset it has just stored.

    Args:
        dep: Departure city name (unencoded text).
        item: Destination query string (unencoded text).

    Returns:
        None. Returns early, storing nothing, when the first response carries
        no usable ``data.limit.routeCount`` value (a departure/destination
        pair may have no products).
    """
    # urllib.request only re-exports quote() from urllib.parse; importing it
    # from there avoids relying on `import urllib` loading the submodule.
    from urllib.parse import quote

    # Placeholders, in order: departure, destination (query), destination
    # again (originalquery), paging offset.
    # NOTE(review): each request asks for `limit=<offset>,8` while the loop
    # below advances the offset by 20. If the second number is a page size,
    # 12 of every 20 products are never fetched -- confirm against the API.
    list_url = (
        'https://touch.dujia.qunar.com/list?modules=list%2CbookingInfo%2CactivityDetail&'
        'dep={}&query={}&dappDealTrace=true&mobFunction=%E6%89%A9%E5%B1%95%E8%87%AA%E7%94%B1%E8%A1%8C'
        '&cfrom=zyx&it=dujia_hy_destination&date=&needNoResult=true&'
        'originalquery={}&width=480&height=320&quality=90&limit={},8&includeAD=true&qsact=search'
        '&filterTagPlatform=mobile_touch'
    )
    dep_quoted = quote(dep)
    item_quoted = quote(item)

    first_page = get_json(list_url.format(dep_quoted, item_quoted, item_quoted, 0))
    # Catch only the errors a missing or malformed count can produce; a bare
    # `except:` would also swallow KeyboardInterrupt and SystemExit.
    try:
        routeCount = int(first_page['data']['limit']['routeCount'])  # number of products
    except (KeyError, TypeError, ValueError):
        return

    for limit in range(0, routeCount, 20):
        strhtml = get_json(list_url.format(dep_quoted, item_quoted, item_quoted, limit))
        # Store the raw page together with the parameters that produced it.
        result = {
            'date': time.strftime('%Y-%m-%d'),  # today's local date
            'dep': dep,
            'arrive': item,
            'limit': limit,
            'result': strhtml,
        }
        sheet_travel.insert_one(result)
        print(limit)
def get_json(url, timeout=10):
    """Request *url* with HTTP GET, pause one second, and return the decoded JSON body.

    Args:
        url: Address to download.
        timeout: Seconds to wait for the server before requests raises a
            timeout error. Without a timeout a stalled connection would block
            the calling process indefinitely.

    Returns:
        The response body parsed as JSON.
    """
    strhtml = requests.get(url, timeout=timeout)
    # Wait one second after every request before the caller can issue the next.
    time.sleep(1)
    return strhtml.json()
def get_all_data(dep):
    """Crawl the product lists of every destination offered for one departure city.

    Downloads the destination listing for *dep*, collects the distinct
    ``query`` values found under the ``originalData`` and ``ossData`` sections
    in first-seen order, and calls ``get_list(dep, query)`` for each of them.

    Args:
        dep: Departure city name (unencoded text).

    Returns:
        None.
    """
    # urllib.request only re-exports quote() from urllib.parse; importing it
    # from there avoids relying on `import urllib` loading the submodule.
    from urllib.parse import quote

    # quote() percent-encodes the city name (non-ASCII text) for the URL.
    url = ('https://touch.dujia.qunar.com/golfz/domestic/domesticDest?dep={}'
           '&exclude=&extensionImg=255,175').format(quote(dep))
    arrive_dict = get_json(url)  # destinations offered for this departure

    # dict keys are unique and keep insertion order, which gives an ordered
    # de-duplication without a linear membership scan per item.
    unique_queries = {}
    for section in ('originalData', 'ossData'):
        try:
            for arr_item in arrive_dict['data'][section]:
                for arr_item_1 in arr_item['subModules']:
                    for arr_item_2 in arr_item_1['items']:
                        unique_queries[arr_item_2['query']] = None
        except (KeyError, TypeError):
            # This function runs as a multiprocessing worker; an exception
            # here would abort the whole Pool.map call, so a section that is
            # missing or malformed is reported and skipped instead.
            print('skipping section {!r} for departure {!r}: unexpected response'
                  .format(section, dep))

    # Crawl the product list of every departure/destination pair.
    for item in unique_queries:
        get_list(dep, item)
# Departure cities, one name per line; split on whitespace to get the list.
# NOTE: the published listing abbreviates this list -- the cities between
# 阿拉善盟 and 资阳 are omitted and must be restored for a complete crawl. The
# placeholder line that marked the omission was removed from the string,
# because splitting the string would otherwise yield it as if it were a city.
dep_list = '''
澳门
阿坝州
阿克苏地区
阿拉尔
阿拉善盟
资阳
遵义
'''
mulpool.py 多进程爬虫
from main import get_all_data # 从文件中导入get_all_data函数
from main import dep_list # 从文件中导入dep_list字符串
from multiprocessing import Pool
if __name__ == '__main__':
    # Pool() accepts the number of worker processes; with no argument it
    # starts one worker per CPU. Using it as a context manager makes sure the
    # workers are shut down when the block exits, even if map() raises.
    with Pool() as pool:
        # map() calls get_all_data once for every departure city in the list
        # and blocks until all of the calls have finished.
        pool.map(get_all_data, dep_list.split())
效果:
电脑上有4个逻辑CPU,多进程爬虫的速度明显比单进程快。