What time.time() looks like after being formatted into a string (see the results for yourself):
>>> time.time()
1530150193.873144
>>> '{}'.format(time.time())
'1530150224.11'
>>> '{}'.format(str(time.time()))
'1530150237.7'
>>> a = 1.33333
>>> str(a)
'1.33333'
>>> str(time.time())
'1530151047.78'
>>>
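What is going on here: in Python 2, str() and the default '{}'.format() keep only 12 significant digits of a float, while the interactive echo (repr) keeps the full value. A quick illustration of the difference (Python 2):
import time

t = time.time()
print(repr(t))            # full precision, e.g. 1530150193.873144
print(str(t))             # 12 significant digits, e.g. 1530150193.87
print('{}'.format(t))     # same truncation as str()
print('{!r}'.format(t))   # the !r conversion uses repr(), so the full value survives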
pyspider has a built-in URL de-duplication mechanism. My plan was to append the timestamp as a URL parameter and rely on its uniqueness to make every URL distinct, so that requests would not be silently de-duplicated and dropped. In practice, though, a large number of requests were still lost.
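Why unique URLs matter at all: as far as I understand pyspider, a request's identity (its taskid) is by default just an md5 of the URL, so two crawl() calls with the same URL collapse into a single task. Roughly (a sketch of the default behaviour, not the exact pyspider source):
import hashlib

def default_taskid(url):
    # sketch of pyspider's default get_taskid: md5 of the URL only,
    # POST data and headers are not part of the task identity
    return hashlib.md5(url.encode('utf-8')).hexdigest()

print(default_taskid('http://hotels.ctrip.com/Domestic/Tool/AjaxHotelList.aspx?time=1530150224.11'))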
The offending code:
for i in range(1, pages + 1):
    url = 'http://hotels.ctrip.com/Domestic/Tool/AjaxHotelList.aspx?time={}'.format(time.time())
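A quick check of how many distinct URLs that loop actually produces (a sketch; pages is a made-up number here, in the real spider it comes from the listing page):
import time

pages = 100   # hypothetical page count, just for the check
urls = set()
for i in range(1, pages + 1):
    urls.add('http://hotels.ctrip.com/Domestic/Tool/AjaxHotelList.aspx?time={}'.format(time.time()))
print(len(urls))   # on Python 2 this is far below `pages`: the formatted string only changes every 0.01 s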
The CPU runs through that loop so fast that even the sixth decimal place of the raw timestamp can repeat, and after str()/format() only two decimal places survive anyway, so consecutive iterations produce exactly the same URL and pyspider de-duplicates them, as the quick check above shows.
The result: during debugging, out of 529 expected requests, sometimes only forty-odd and sometimes fifty-odd actually went out. I was baffled about why requests were being dropped, until I remembered another problem from a while back:
self.send_message(self.project_name,data,url=time.time())
pyspider also de-duplicates saved data: records with the same identifying feature are not stored, and here that feature is the url passed to send_message.
While debugging I could see that every record was being fetched, but on saving most of it vanished; roughly nine records out of ten were lost, i.e. only one in ten made it into storage.
Then it suddenly dawned on me:
the damned time.time() simply cannot serve as a unique identifier.
So I changed the code. The fix is to change just one thing: swap time.time() for some other unique identifier, for example:
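Two options that are guaranteed unique (a sketch; the fixed code below uses the page number for the list-page URLs and the hotel id for send_message):
import uuid

base = 'http://hotels.ctrip.com/Domestic/Tool/AjaxHotelList.aspx?time={}'
for i in range(1, 4):
    print(base.format(i))                 # the page number: unique within one city's loop
    print(base.format(uuid.uuid4().hex))  # a uuid: unique across cities and runs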
The modified code is as follows:
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-06-21 10:35:32
# Project: ctrip_hotel
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
from pyspider.libs.base_handler import *
from lxml import etree
import random
import json
import time
from pyspider_util import proxy_util
from fake_useragent import UserAgent
import re
import hashlib
city_names = [u'上海', '北京', '广州', '深圳', '南京', '厦门', '大连', '天津', '宁波', '成都', '无锡', '杭州', '武汉', '沈阳', '苏州', '西安', '重庆', '长沙', '青岛', '东莞', '乌鲁木齐', '佛山', '南宁', '南昌', '南通', '合肥', '太原', '常州', '徐州', '惠州', '扬州', '昆明', '汕头', '泉州', '洛阳', '济南', '海口', '温州', '潍坊', '烟台', '珠海', '石家庄', '福州', '贵阳', '郑州', '金华', '长春', '哈尔滨', '三亚', '上饶', '中山', '临沂', '丽江', '保定', '兰州', '包头', '南充', '南平', '台州', '吉林', '呼伦贝尔', '呼和浩特', '咸阳', '唐山', '嘉兴', '大庆', '威海', '安阳', '宜宾', '宜昌', '宜春', '宝鸡', '岳阳', '常德', '廊坊', '张家口', '德阳', '怀化', '抚顺', '揭阳', '柳州', '株洲', '桂林', '梅州', '榆林', '泰州', '泸州', '济宁', '淄博', '淮安', '湖州', '湛江', '漳州', '盐城', '秦皇岛', '绍兴', '绵阳', '舟山', '芜湖', '荆州', '莆田', '蚌埠', '衡阳', '衢州', '西宁', '赣州', '赤峰', '运城', '连云港', '遵义', '邢台', '邯郸', '郴州', '鄂尔多斯', '银川', '镇江', '鞍山', '齐齐哈尔', '龙岩']
# type_code = ['100000', '100100', '100101', '100102', '100103', '100104', '100105', '100200', '100201']
class Handler(BaseHandler):
    crawl_config = {
        'proxy': 'forward.xdaili.cn:80'
    }
    ua = UserAgent()
    # request the city list page
    @every(minutes=365 * 24 * 60)
    def on_start(self):
        headers = {}
        headers = self.url_start(headers)
        url = 'http://hotels.ctrip.com/domestic-city-hotel.html?time={}'.format(time.time())
        self.crawl(url, headers=headers, callback=self.get_allcity_urls, retries=10)
    # request every city's landing page URL
    @config(age=364 * 24 * 60)
    def get_allcity_urls(self, response):
        tree = etree.HTML(response.text)
        all_list = tree.xpath('//dl[@class="pinyin_filter_detail layoutfix"]/dd/a')
        city_urls = []
        url_name = {}
        for i in all_list:
            name = i.xpath('./text()')[0]
            if name in city_names:
                city_urls.append(i.xpath('./@href')[0])
                url_name[i.xpath('./@href')[0]] = name
        print(city_urls)
        for city_url in city_urls:
            headers = {}
            headers = self.url_start(headers)
            city_url = 'http://hotels.ctrip.com' + city_url + '?time={}'.format(time.time())
            self.crawl(city_url, headers=headers, retries=10, callback=self.get_allpages, save=url_name)
    # get the total page count, then issue a request for every list page
    @config(age=364 * 24 * 60)
    def get_allpages(self, response):
        tree = etree.HTML(response.text)
        url_name = response.save
        url_e = re.findall(r'(/hotel/[a-z]+\d+)\?time=', response.url)[0]
        name = url_name[url_e]
        try:
            pages = tree.xpath('//div[@class="c_page_list layoutfix"]/a[@rel="nofollow"]/text()')[0]
            pages = int(pages)
        except:
            pages = 1
        print(pages)
        # python2
        import urllib
        print(name)
        name_code = urllib.quote(name.encode('utf-8'))  # percent-encode the Chinese city name for the form data
        city_id = re.findall(r'/hotel/[a-z]+(\d+)\?time=', response.url)[0]
        city_py = re.findall(r'/hotel/([a-z]+)\d+\?time=', response.url)[0]
        for i in range(1, pages + 1):
            # the page number, not time.time(), keeps every list-page URL distinct
            url = 'http://hotels.ctrip.com/Domestic/Tool/AjaxHotelList.aspx?time={}'.format(i)
            formdata = {
                "__VIEWSTATEGENERATOR": "DB1FBB6D",
                "cityName": name_code,
                "RoomGuestCount": "1,1,0",
                "operationtype": "NEWHOTELORDER",
                "cityId": city_id,
                "cityPY": city_py,
                "page": i,
            }
            headers = {}
            headers = self.url_start(headers)
            self.crawl(url, method='POST', data=formdata, headers=headers, retries=10, callback=self.response_parse, save={'name': name})
    # collect part of each hotel's data from the list page, then request the detail page for the remaining fields
    @config(age=364 * 24 * 60)
    def response_parse(self, response):
        city_name = response.save['name']
        response_json = response.json
        info_list = response_json["hotelPositionJSON"]
        htllist = response_json['HotelMaiDianData']['value']['htllist']
        htllist = eval(htllist)  # the list arrives as a string; eval turns it back into Python objects
        num = 0
        for info in info_list:
            # partial hotel data from the list page, passed on to the detail-page request via save=
            info_json = {}
            info_json['id'] = info['id']
            info_json['名称'] = info['name']
            info_json['地址'] = info['address']
            info_json['评分'] = info['score']
            star = info['star']
            if 'diamond' in star:
                info_json['携程评级'] = star
                info_json['星级'] = ''
            else:
                info_json['携程评级'] = ''
                info_json['星级'] = star
            info_json['类型'] = info['stardesc']
            info_json['省'] = ''
            info_json['市'] = city_name
            lon = info['lon']
            lat = info['lat']
            info_json['中心点'] = '{},{}'.format(lon, lat)  # join lon/lat with a comma so the two coordinates stay separable
            info_json['最低价'] = htllist[num]['amount']
            # for ht in htllist:
            #     if ht['hotelid'] == info['id']:
            #         info_json['最低价'] = ht['amount']
            url = 'http://hotels.ctrip.com' + info['url']
            headers = {}
            headers = self.url_start(headers)
            num += 1
            self.crawl(url, headers=headers, retries=10, callback=self.detail_parse, save=info_json)
    # parse the detail page, pick up the remaining fields, and hand the record off for saving
    def detail_parse(self, response):
        tree = etree.HTML(response.text)
        # price = tree.xpath('//p[@class="staring_price"]/span[@class="price"]/text()')[0]
        special = tree.xpath('//div[@class="grade"]/div[@class="special_label"]/i[@class="i_label"]/text()')
        bar = tree.xpath('//div[@class="path_bar2"]/a/text()')
        if len(bar) == 3:
            district = bar[2]
            brand = ''
        elif len(bar) == 4:
            district = bar[3]
            brand = bar[2]
        else:
            district = ''
            brand = ''
        info_json = response.save
        # info_json['最低价'] = price
        info_json['品牌'] = brand
        info_json['所在区县'] = district
        info_json['特色'] = special
        # the hotel id, not time.time(), is what keeps every saved record unique
        self.send_message(self.project_name, info_json, url=info_json['id'])

    def on_message(self, project_name, msg):
        return msg
    # build per-request headers: a random User-Agent plus the xdaili proxy signature
    def url_start(self, headers):
        times = int(time.time())
        planText = "orderno=ZF20186158891UccNQO,secret=fbba4b982cc64755b23404f99297ecbd,timestamp={}".format(times)
        md = hashlib.md5()
        md.update(planText.encode('utf-8'))
        content = md.hexdigest()
        ua = UserAgent()
        headers['User-Agent'] = ua.random
        headers['Proxy-Authorization'] = 'sign={}&orderno=ZF20186158891UccNQO&timestamp={}'.format(content.upper(), times)
        return headers