一、网页分析
1.在浏览器访问:http://touch.qunar.com/
网页的排版会变,但是衣服变了身体还是那个身体,F12分析一波。
点击主页的“自由行”,用鼠标点击搜索栏,按F12观察网页的标签,发现在JS中有我们需要的网页请求,所有的旅游地区都保存在JSON格式的列表里。
2.查看Headers
我们需要的HTTP请求为:https://touch.dujia.qunar.com/golfz/sight/arriveRecommend?dep=%E4%B8%8A%E6%B5%B7&exclude=&extensionImg=255,175&callback=jsonp_1535868021986_30812;通过分析发现,“dep”为地点。接下来我们点击网页上的“丽江”来继续分析网页,
这次我们所要查看的是旅游城市所在地的所有旅游景点,所以通过查找发现所需内容在XHR中,并且我们需要将网页HTTP请求复制下来进行处理
链接比较复杂,但是仔细查看就会发现,所要查找的内容依旧在“dep”关键字中,只不过其中的中文被URL编码(百分号编码)替换了,如果有需要可以用转码工具(或Python的urllib.parse.unquote)对其解码。
二、实战
首先我们写一个简单版,对此我就不多啰嗦,关键地方我会在代码中给予注释。
# coding=utf-8
# au: Luo
# data:2018.08.30
import urllib.request
import requests # 爬虫requests库,非常强大好用
import time
import pymongo # pymongo库,不懂可以查看上一篇博客
# Open a MongoDB connection on localhost:27017 and select the database and the
# collection that receive the crawled documents.
client = pymongo.MongoClient('localhost', 27017)# Connect to the database
book_qunar = client['qunar']
# NOTE(review): the collection name is spelled 'aunar_zyx' — possibly a typo for
# 'qunar_zyx'; confirm before renaming, since stored data would live under this name.
sheet_qunar_zyx = book_qunar['aunar_zyx']# MongoDB collection used as the output table
# First endpoint requested by the crawl; its JSON 'data' entries are used as `dep` values.
url = 'https://touch.dujia.qunar.com/depCities.qunar'
# Request headers. Per the author, the cookie must be included, otherwise later
# requests are answered with an "illegal request" notice (found by trial and error).
# NOTE(review): the cookie is a hard-coded session value — presumably copied from a
# browser session and likely to expire; confirm and refresh before running.
headers = {'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'accept-encoding': 'gzip, deflate, sdch',
'accept-language': 'zh-CN,zh;q=0.8',
'cache-control': 'max-age=0',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/49.0.2623.75 Safari/537.36',
'cookie': 'QN300=organic; QN1=eIQjm1uFGUGmVZroEV65Ag==; QN277=organic; '
'csrfToken=jlP8SVo79YjStBWaCDZMTzfRdk8vhrer; QN269=D27F47D0AAA6'
'11E8BB00FA163EF78B12; QN57=15354494109820.5188963907166866; '
'i=VInJOQycvl1TIZs3ZNjHGo-NWE6q; _vi=wntLd-u3hhb623qZeAtQj5Re8O'
'a8V_UFxv73OyS0PeOdzXMoC1PScrp4BJxGp_XlCJszxevRwtpqQ9XqrSDXGRtPO'
'2F53lXYVNmkniXsuS4XTTttFgFbhwQ9vP-d0pQNBRMzPvI6WYZPvXzM_cpoVqVIm'
'f_zF1VHFiXdrLBoyuEx; QN58=1535618769166%7C1535618769166%7C1; QN48'
'=tc_363b9bcfab7eadcd_1658a8854b4_34a6; QN267=078324963ae656e5e; _'
'RF1=116.228.53.168; _RSG=pf4OU7iqgs4zEmiQzYI7tA; _RDG=2822ea67142a'
'a520563dbbaa984354cd9a; _RGUID=4daad2c1-de60-4a73-adc7-4a2f5ab303b'
'd; PHPSESSID=sao7528ocg7qp4041ubv5f94a1; QN234=home_free_t; _pk_ref'
'.1.8600=%5B%22%22%2C%22%22%2C1535700974%2C%22http%3A%2F%2Ftouch.quna'
'r.com%2F%22%5D; _pk_id.1.8600=f10de5ca53398f77.1535627735.5.153570134'
'6.1535699139.; _pk_ses.1.8600=*; QN243=137; QN205=organic; QN233=dujia_hy_destination'
}
# Product-list endpoint shared by the "how many routes?" probe and the paging
# requests. Placeholders: departure city, destination (query), destination
# (originalquery), page offset.
list_url_template = (
    'https://touch.dujia.qunar.com/list?modules=list%2CbookingInfo%2CactivityDetail&dep={}'
    '&query={}&dappDealTrace=false&mobFunction=%E6%89%A9%E5%B1%95%E8%87%AA%E7%94%B1%E8%A1%8C'
    '&cfrom=zyx&it=dujia_hy_destination&date=&configDepNew=&needNoResult=true&originalquery={}'
    '&limit={},28&includeAD=true&qsact=search'
)

# Fetch the city index and decode the response body as JSON.
strhtml = requests.get(url, headers=headers)
dep_dict = strhtml.json()

# For every city in the index: collect its recommended destinations, then page
# through the product list of each destination and store every page in MongoDB.
for dep_item in dep_dict['data']:
    for dep in dep_dict['data'][dep_item]:
        print(dep)
        a = []  # destinations in first-seen order, without duplicates
        url = 'https://touch.dujia.qunar.com/golfz/sight/arriveRecommend?dep={}&exclude=&extensionImg=255,175' \
            .format(urllib.request.quote(dep))
        time.sleep(1)  # throttle: one request per second
        # Send the same headers (including the cookie) as every other request;
        # the original omitted them here only.
        strhtml_1 = requests.get(url, headers=headers)
        arrive_dict = strhtml_1.json()
        for arr_item in arrive_dict['data']:
            for arr_item_1 in arr_item['subModules']:
                for query in arr_item_1['items']:
                    # Keep each destination once even if several modules list it.
                    if query['query'] not in a:
                        a.append(query['query'])
        print(a)
        for item in a:
            print(item)
            # Probe the first page to learn how many routes exist for this
            # destination. The URL must be built from `item`: the original used
            # the leftover loop variable `query`, so every destination reused
            # the route count of the last collected one.
            url = list_url_template.format(urllib.request.quote(dep),
                                           urllib.request.quote(item),
                                           urllib.request.quote(item), 0)
            time.sleep(1)
            strhtml_2 = requests.get(url, headers=headers)
            routeCount = int(strhtml_2.json()['data']['limit']['routeCount'])
            # NOTE(review): the offset advances by 20 while each request asks
            # for 28 rows ("limit=<offset>,28"), so consecutive pages appear to
            # overlap — confirm the intended page size against the API.
            for limit in range(0, routeCount, 20):
                url = list_url_template.format(urllib.request.quote(dep),
                                               urllib.request.quote(item),
                                               urllib.request.quote(item), limit)
                time.sleep(1)
                strhtml = requests.get(url, headers=headers)
                # One stored document per (city, destination, page offset).
                result = {
                    'date': time.strftime('%Y-%m-%d', time.localtime(time.time())),
                    'dep': dep,
                    'arrive': item,
                    'limit': limit,
                    'result': strhtml.json(),
                }
                sheet_qunar_zyx.insert_one(result)
代码运行结果:
数据库保存结果:
到此,爬虫工作就结束啦,我也在学习着将数据进行可视化,后续可能会更新这个版本的升级版,一步步进阶。
———————————————————————————————————————上传一个升级版:
# coding=utf-8
# au: Luo
# data:2018.09.02
import urllib.request
import requests
import time
import pymongo
# Open a MongoDB connection on localhost:27017 and select the database and the
# collection that receive the crawled documents.
client = pymongo.MongoClient('localhost', 27017)# Connect to the database
book_qunar = client['qunar']
# NOTE(review): the collection name is spelled 'aunar_zyx' — possibly a typo for
# 'qunar_zyx'; confirm before renaming, since stored data would live under this name.
sheet_qunar_zyx = book_qunar['aunar_zyx']# MongoDB collection used as the output table
# Request headers passed to requests.get by get_json for every request.
# NOTE(review): the cookie is a hard-coded session value — presumably copied from a
# browser session and likely to expire; confirm and refresh before running.
headers = {'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'accept-encoding': 'gzip, deflate, sdch',
'accept-language': 'zh-CN,zh;q=0.8',
'cache-control': 'max-age=0',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/49.0.2623.75 Safari/537.36',
'cookie': 'QN300=organic; QN1=eIQjm1uFGUGmVZroEV65Ag==; QN277=organic; '
'csrfToken=jlP8SVo79YjStBWaCDZMTzfRdk8vhrer; QN269=D27F47D0AAA6'
'11E8BB00FA163EF78B12; QN57=15354494109820.5188963907166866; '
'i=VInJOQycvl1TIZs3ZNjHGo-NWE6q; _vi=wntLd-u3hhb623qZeAtQj5Re8O'
'a8V_UFxv73OyS0PeOdzXMoC1PScrp4BJxGp_XlCJszxevRwtpqQ9XqrSDXGRtPO'
'2F53lXYVNmkniXsuS4XTTttFgFbhwQ9vP-d0pQNBRMzPvI6WYZPvXzM_cpoVqVIm'
'f_zF1VHFiXdrLBoyuEx; QN58=1535618769166%7C1535618769166%7C1; QN48'
'=tc_363b9bcfab7eadcd_1658a8854b4_34a6; QN267=078324963ae656e5e; _'
'RF1=116.228.53.168; _RSG=pf4OU7iqgs4zEmiQzYI7tA; _RDG=2822ea67142a'
'a520563dbbaa984354cd9a; _RGUID=4daad2c1-de60-4a73-adc7-4a2f5ab303b'
'd; PHPSESSID=sao7528ocg7qp4041ubv5f94a1; QN234=home_free_t; _pk_ref'
'.1.8600=%5B%22%22%2C%22%22%2C1535700974%2C%22http%3A%2F%2Ftouch.quna'
'r.com%2F%22%5D; _pk_id.1.8600=f10de5ca53398f77.1535627735.5.153570134'
'6.1535699139.; _pk_ses.1.8600=*; QN243=137; QN205=organic; QN233=dujia_hy_destination'
}
def _list_url(dep, item, offset):
    """Build the product-list URL for one city/destination pair at a page offset."""
    template = (
        'https://touch.dujia.qunar.com/list?modules=list%2CbookingInfo%2CactivityDetail&dep={}'
        '&query={}&dappDealTrace=false&mobFunction=%E6%89%A9%E5%B1%95%E8%87%AA%E7%94%B1%E8%A1%8C'
        '&cfrom=zyx&it=dujia_hy_destination&date=&configDepNew=&needNoResult=true&originalquery={}'
        '&limit={},28&includeAD=true&qsact=search'
    )
    quoted_item = urllib.request.quote(item)
    return template.format(urllib.request.quote(dep), quoted_item, quoted_item, offset)


def get_list(dep, item):
    """Download every result page for one city/destination pair and store it.

    A first request at offset 0 reads ``data.limit.routeCount`` from the
    response; the pages are then fetched in offset steps of 20 and each
    response is inserted into ``sheet_qunar_zyx`` as one document.

    Args:
        dep: City name sent as the ``dep`` query parameter.
        item: Destination name sent as ``query`` and ``originalquery``.
    """
    # The probe URL is built from the `item` argument. The original read the
    # module-level loop variable `query` instead, which yields the route count
    # of a different destination (or a NameError outside the __main__ loop).
    time.sleep(1)
    routeCount = int(get_json(_list_url(dep, item, 0))['data']['limit']['routeCount'])
    # NOTE(review): the offset advances by 20 while each request asks for 28
    # rows ("limit=<offset>,28"), so consecutive pages appear to overlap —
    # confirm the intended page size against the API.
    for limit in range(0, routeCount, 20):
        url = _list_url(dep, item, limit)
        time.sleep(1)  # extra throttle on top of the pause inside get_json
        # One stored document per (city, destination, page offset).
        result = {
            'date': time.strftime('%Y-%m-%d', time.localtime(time.time())),
            'dep': dep,
            'arrive': item,
            'limit': limit,
            'result': get_json(url),
        }
        sheet_qunar_zyx.insert_one(result)
def get_json(url):
    """Fetch *url* with the module-level ``headers`` and return the decoded JSON body.

    Sleeps one second after the request so that successive calls are throttled.

    Args:
        url: Absolute URL to request with HTTP GET.

    Returns:
        The response body parsed by ``Response.json()``.

    Raises:
        requests.exceptions.RequestException: On connection failure, timeout,
            or an HTTP error status.
        ValueError: If the response body is not valid JSON.
    """
    # requests applies no timeout by default, so a stalled connection would
    # block the whole crawl indefinitely.
    strhtml = requests.get(url, headers=headers, timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    strhtml.raise_for_status()
    time.sleep(1)
    return strhtml.json()
if __name__ == "__main__":
    # Driver: read the city index, gather the distinct recommended destinations
    # of every city, then hand each (city, destination) pair to get_list.
    url = "https://touch.dujia.qunar.com/depCities.qunar"
    dep_dic = get_json(url)
    city_groups = dep_dic['data']
    for group_key in city_groups:
        for dep in city_groups[group_key]:
            quoted_dep = urllib.request.quote(dep)
            url = 'https://touch.dujia.qunar.com/golfz/sight/arriveRecommend?dep={}&exclude=&extensionImg=255,175' \
                .format(quoted_dep)
            arrive_dict = get_json(url)
            arrivals = []  # first-seen order, no duplicates
            for module in arrive_dict['data']:
                for sub_module in module['subModules']:
                    # NOTE: `query` is a module-level name here; keep it spelled
                    # this way in case other code in this file reads it.
                    for query in sub_module['items']:
                        name = query['query']
                        if name in arrivals:
                            continue
                        arrivals.append(name)
            for item in arrivals:
                get_list(dep, item)