import datetime
import io
import json
import os
import sys
import threading
import time
from queue import Empty, Queue

import jsonpath
import requests
from pymongo import MongoClient
# Crawler thread
class ThreadCrawl(threading.Thread):
    """Worker thread that downloads the hotel list for each queued city.

    City names are taken from ``cityQueue``; for each one a JSON POST is sent
    to ``LIST_URL`` and, when the response text contains ``hotelName``, the
    raw text is put on ``dataQueue``.  The loop keeps running until the
    module-level ``CRAWL_EXIT`` flag becomes true.

    Args:
        threadName: Label used in the start/stop messages.
        cityQueue: Queue of city names to query.
        dataQueue: Queue that receives the raw response texts.
    """

    # Endpoint queried for every city.
    LIST_URL = "https://sjz.ihotels.cc//ethank-sjz-web/rest/hotelResource/v2.1/queryHotelList"
    # Seconds before an unanswered request is abandoned; without a timeout a
    # single stalled connection would block this thread forever.
    REQUEST_TIMEOUT = 30
    # Seconds to block on an empty queue before re-checking CRAWL_EXIT.
    POLL_INTERVAL = 0.5

    def __init__(self, threadName, cityQueue, dataQueue):
        super(ThreadCrawl, self).__init__()
        self.threadName = threadName
        self.cityQueue = cityQueue
        self.dataQueue = dataQueue
        # NOTE(review): 'Content-Length' is a fixed value here; presumably the
        # HTTP client recomputes it from the real body -- confirm.
        self.headers = {
            'charset': 'utf-8',
            'Accept-Encoding': 'gzip',
            'referer': 'https://servicewechat.com/wx4a68a5b1b2d89fea/32/page-frame.html',
            'content-type': 'application/json',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 5.1.1; Redmi 3 Build/LMY47V; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 Mobile Safari/537.36 MicroMessenger/7.0.6.1460(0x27000634) Process/appbrand2 NetType/WIFI Language/zh_CN',
            'Content-Length': '415',
            'Host': 'sjz.ihotels.cc',
            'Connection': 'Keep-Alive',
        }
        # Request payload template; run() fills in "cityName" per request.
        self.json = {
            "brandNameList": ["AA连锁酒店"],
            "serviceNameList": [],
            "distance": "",
            "commentLeave": "",
            "beginDate": "2019-08-26",
            "endDate": "2019-08-27",
            # "cityName": "上海",
            "areaList": [],
            "keyword": "",
            "openType": 1,
            "pageIndex": 1,
            "pageSize": 50,
            "sortOpt": "",
            "sortType": "",
            # "lat": 40.001064,
            # "lon": 116.461409,
            "memberId": "",
            "channel": 17,
            "deviceType": "4",
            "tagVersion": "5.0.0",
            "deviceName": "Redmi 3",
            "code": "023NrRa70BYVxF1c7ra70LBGa70NrRan",
        }

    def run(self):
        """Fetch hotel lists for queued cities until CRAWL_EXIT is set."""
        print('启动' + self.threadName + '...')
        while not CRAWL_EXIT:
            try:
                # Block briefly instead of spinning on a non-blocking get();
                # Empty just means "nothing to do yet", so re-check the flag.
                city = self.cityQueue.get(timeout=self.POLL_INTERVAL)
            except Empty:
                continue
            print('{}-----------'.format(city))
            self.json['cityName'] = city
            try:
                content = requests.post(
                    self.LIST_URL,
                    headers=self.headers,
                    json=self.json,
                    timeout=self.REQUEST_TIMEOUT,
                )
            except requests.RequestException as exc:
                # Report the failure and move on to the next city rather than
                # silently discarding it or killing the worker.
                print(
                    '{}: request for {} failed: {!r}'.format(self.threadName, city, exc),
                    file=sys.stderr,
                )
                continue
            if 'hotelName' in content.text:
                self.dataQueue.put(content.text)
        print("结束 " + self.threadName)
# Module-level shutdown flags, both initially off.
# CRAWL_EXIT: set to True by mian() once the city queue has been emptied;
#             the worker loops poll it to know when to stop.
# PARSE_EXIT: set to True by mian() once the data queue has been emptied.
CRAWL_EXIT, PARSE_EXIT = False, False
class ThreadParse(threading.Thread):
    """Worker thread that turns raw hotel-list responses into flat records.

    Response texts are taken from ``dataQueue``; every hotel found under
    ``hotelList`` becomes one dict appended to ``self.strings``.  The loop
    keeps running until the module-level ``PARSE_EXIT`` flag becomes true.

    Args:
        threadName: Label used in the start/stop messages.
        dataQueue: Queue of raw JSON response texts.
        lock: Stored on the instance; not used by this class.
    """

    # Seconds to block on an empty queue before re-checking PARSE_EXIT.
    POLL_INTERVAL = 0.5

    def __init__(self, threadName, dataQueue, lock):
        super(ThreadParse, self).__init__()
        self.threadName = threadName
        self.dataQueue = dataQueue
        self.lock = lock
        # Records parsed by this thread (one dict per hotel).
        self.strings = []

    def run(self):
        """Parse queued responses until PARSE_EXIT is set."""
        print('启动' + self.threadName + '...')
        # The parser must watch PARSE_EXIT, not CRAWL_EXIT: crawlers may still
        # be delivering responses after CRAWL_EXIT has been raised.
        while not PARSE_EXIT:
            try:
                text = self.dataQueue.get(timeout=self.POLL_INTERVAL)
            except Empty:
                continue
            try:
                self.parse(text)
            except Exception as exc:
                # Thread boundary: one malformed payload must not kill the
                # worker, but it must not vanish silently either.
                print(
                    '{}: failed to parse response: {!r}'.format(self.threadName, exc),
                    file=sys.stderr,
                )
        print(self.strings)
        print("结束" + self.threadName)

    @staticmethod
    def _first(node, expr):
        """Return the first JSONPath match of *expr* in *node*, or None."""
        # jsonpath.jsonpath is expected to return a falsy value (not an empty
        # list) when nothing matches, so test truthiness before indexing.
        found = jsonpath.jsonpath(node, expr)
        return found[0] if found else None

    def parse(self, text):
        """Extract one record per hotel from a raw JSON response text.

        Args:
            text: JSON document expected to contain a ``hotelList`` array.

        Raises:
            ValueError: If *text* is not valid JSON.
        """
        # No setup_io() here: mian() already re-wraps the streams once at
        # start-up, and repeating it per payload from several threads detaches
        # the stream other threads are printing to.
        # json.loads() no longer accepts an ``encoding`` argument (3.9+).
        textjson = json.loads(text)
        hotels = self._first(textjson, '$..hotelList')
        if not hotels:
            return
        for hotel in hotels:
            record = {
                'task_id': os.environ.get('CRAWLAB_TASK_ID'),
                'poi_id': self._first(hotel, '$..hotelId'),
                'poi_name': self._first(hotel, '$..hotelName'),
                'city': self._first(hotel, '$..areaName'),
                # NOTE(review): the original reads '$..areaName' for both city
                # and address -- looks like a copy/paste slip; confirm the
                # API's address field before changing it.
                'address': self._first(hotel, '$..areaName'),
                'score': self._first(hotel, '$..score'),
                'crawl_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                'source': "尚美荟",
                'keyword': 'AA连锁',
            }
            # Store first so a console encoding error cannot lose the record.
            self.strings.append(record)
            print(record)
# Workaround for console encoding errors
def setup_io():
    """Switch ``sys.stdout`` and ``sys.stderr`` to line-buffered UTF-8.

    Safe to call repeatedly.  Streams that support ``reconfigure()`` are
    changed in place, so references other code already holds stay valid.
    Otherwise the stream is detached and re-wrapped (the previous behaviour),
    and streams that cannot be detached are left untouched.
    """
    for name in ('stdout', 'stderr'):
        stream = getattr(sys, name)
        reconfigure = getattr(stream, 'reconfigure', None)
        if reconfigure is not None:
            reconfigure(encoding='utf-8', line_buffering=True)
            continue
        try:
            raw = stream.detach()
        except (AttributeError, ValueError, io.UnsupportedOperation):
            # Missing, already detached, or not a detachable text stream.
            continue
        wrapped = io.TextIOWrapper(raw, encoding='utf-8', line_buffering=True)
        # detach() leaves the old wrapper unusable, so replace both the live
        # stream and its saved ``__stdout__``/``__stderr__`` counterpart.
        setattr(sys, name, wrapped)
        setattr(sys, '__' + name + '__', wrapped)
# Program main function
def mian(citylist):
    """Crawl and parse the hotel list for every city in *citylist*.

    Starts three ``ThreadCrawl`` and three ``ThreadParse`` workers, waits for
    the city queue and then the data queue to drain, raises the matching exit
    flag after each, and joins the workers.

    Args:
        citylist: JSON text; every ``city`` value found anywhere in it is
            queued for crawling.
    """
    global CRAWL_EXIT, PARSE_EXIT
    # Reset the flags so a second call does not stop its workers immediately.
    CRAWL_EXIT = False
    PARSE_EXIT = False
    poll_interval = 0.05  # seconds between queue checks while waiting

    setup_io()  # workaround for console encoding errors
    # json.loads() no longer accepts an ``encoding`` argument (3.9+).
    cityjson = json.loads(citylist)
    # jsonpath.jsonpath is expected to return a falsy value when nothing
    # matches; normalise that to an empty list.
    citys = jsonpath.jsonpath(cityjson, '$..city') or []
    print(citys)

    # City queue
    cityQueue = Queue()
    for city in citys:
        cityQueue.put(city)
    # Data queue
    dataQueue = Queue()
    # A real lock object (the original passed the threading.local class).
    lock = threading.Lock()

    # Start the crawler threads
    crawlList = ['采集线程1号', '采集线程2号', '采集线程3号']
    threadcrawl = []
    for threadName in crawlList:
        thread = ThreadCrawl(threadName, cityQueue, dataQueue)
        thread.start()
        threadcrawl.append(thread)

    # Start the parser threads
    parseList = ['解析线程1号', '解析线程2号', '解析线程3号']
    threadparse = []
    for threadName in parseList:
        thread = ThreadParse(threadName, dataQueue, lock)
        thread.start()
        threadparse.append(thread)

    # Wait for the city queue to drain.  Sleep between checks instead of
    # spinning, and stop waiting if no crawler is left to consume it.
    while not cityQueue.empty() and any(t.is_alive() for t in threadcrawl):
        time.sleep(poll_interval)
    # Tell the crawler threads to leave their loop.
    CRAWL_EXIT = True
    print("cityQueue为空")
    # Block until every crawler has finished its in-flight request.
    for thread in threadcrawl:
        thread.join()
        print("1")

    # Wait for the data queue to drain, with the same liveness guard so a
    # parser that already exited cannot hang the main thread.
    while not dataQueue.empty() and any(t.is_alive() for t in threadparse):
        time.sleep(poll_interval)
    if not dataQueue.empty():
        print(
            'warning: {} response(s) left unparsed'.format(dataQueue.qsize()),
            file=sys.stderr,
        )
    # Tell the parser threads to leave their loop.
    PARSE_EXIT = True
    # Block until every parser has finished.
    for thread in threadparse:
        thread.join()
        print("2")
    print("谢谢使用!")
# Program entry point
if __name__ == '__main__':
    # (city, initial) pairs, in the order they are queued for crawling.
    city_initials = (
        ("成都", "c"),
        ("上海", "s"),
        ("中卫", "Z"),
        ("重庆", "C"),
        ("承德", "C"),
        ("沧州", "C"),
        ("长治", "C"),
        ("赤峰", "C"),
        ("朝阳", "C"),
        ("长春", "C"),
        ("常州", "C"),
        ("滁州", "C"),
        ("巢湖", "C"),
        ("池州", "C"),
        ("长沙", "C"),
    )
    cityjson = [
        {"city": name, "initial": letter}
        for name, letter in city_initials
    ]
    # Serialised without ASCII escaping, e.g.
    # [{"city": "成都", "initial": "c"}, {"city": "上海", "initial": "s"}, ...]
    json_str = json.dumps(cityjson, ensure_ascii=False)
    mian(json_str)