抓取全国所有城市的美食店铺信息。
没有IP代理,请勿使用。
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-08-29 14:37:07
# Project: dianping_vi
from pyspider.libs.base_handler import *
import datetime
import re
import json
import copy
from pymongo import MongoClient
# Connect to the offline (internal) MongoDB instance.
# NOTE(review): DB_IP and DB_PORT carry no value here (presumably redacted);
# they must be filled in before the script can run. The commented-out lines
# below show a local default.
DB_IP =
DB_PORT =
#DB_IP = '127.0.0.1'
#DB_PORT = 27017
client = MongoClient(host=DB_IP, port=DB_PORT)
# The account lives in the admin database: connect, authenticate there,
# then switch to the working database.
db_auth = client.admin
# NOTE(review): authenticate() is called without credentials here
# (presumably stripped) -- supply them before running.
db_auth.authenticate( )
# Database and collection that Handler.on_result upserts shop records into.
DB_NAME = 'research'
DB_COL = 'dianping'
db = client[DB_NAME]
col = db[DB_COL]
# Desktop-browser request headers sent with the shop review-page requests
# (www.dianping.com); the listing/city requests use the mobile User-Agent
# from Handler.crawl_config instead.
detail_headers = {
    "Host": "www.dianping.com",
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/66.0.3359.181 Safari/537.36"
    ),
    # NOTE(review): hard-coded session cookie, presumably captured from a
    # browser session -- it may need refreshing when requests start failing.
    "Cookie": "; ".join([
        "hc.v=c433e5ea-ff94-9d82-2544-871b013c64eb.1536116212",
        "_lxsdk_cuid=165a7a93ffcc8-0885d455e400d4-3b7b0d58-1aeaa0-165a7a93ffcc8",
        "_lxsdk=165a7a93ffcc8-0885d455e400d4-3b7b0d58-1aeaa0-165a7a93ffcc8",
        "_lxsdk_s=165a8c7b269-b24-ec-e63%7C%7C135",
    ]),
    "Accept": ",".join([
        "text/html",
        "application/xhtml+xml",
        "application/xml;q=0.9",
        "image/webp",
        "image/apng",
        "*/*;q=0.8",
    ]),
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "en-US,en;q=0.9",
    "Cache-Control": "no-cache",
    "Upgrade-Insecure-Requests": "1",
    "Connection": "keep-alive",
    "Pragma": "no-cache",
}
def parse_score(taste, doc):
    """Turn a list of digit tokens into a one-decimal score.

    Each token is translated to a digit string through the module-level
    ``num_map`` mapping (unknown tokens count as ``'0'``).
    NOTE(review): ``num_map`` is not defined in the visible part of this
    file; it must exist at module level before this function is called.

    Args:
        taste: sequence of tokens. Two tokens give ``<first>.<second>``.
            A single token means the other digit is a literal ``1`` whose
            position is read from the page.
        doc: parsed page exposing ``xpath()``; only consulted in the
            single-token case.

    Returns:
        The score as a float, or ``0`` when ``taste`` has neither one nor
        two tokens.

    Raises:
        IndexError: in the single-token case, when the page has fewer than
            two text nodes under ``span#comment_score/span[1]``.
    """
    if len(taste) == 2:
        integer_digit, fraction_digit = [num_map.get(x, '0') for x in taste]
        return float(integer_digit + '.' + fraction_digit)

    if len(taste) == 1:
        digit = num_map.get(taste[0], '0')
        # The second text node tells on which side of the decimal point the
        # literal "1" sits.
        flag = doc.xpath('//span[@id="comment_score"]/span[1]/text()')[1]
        if flag.startswith('1'):
            return float('1.' + digit)
        # Mirror of the branch above: "<digit>.1". The previous code built
        # "<digit>1." here, which parsed as e.g. 71.0 instead of 7.1.
        return float(digit + '.1')

    return 0
def get_today():
    """Return today's local date as a naive datetime at 00:00:00."""
    midnight = datetime.time.min
    return datetime.datetime.combine(datetime.date.today(), midnight)
class Handler(BaseHandler):
    """pyspider handler that crawls Dianping food-shop listings city by city.

    Crawl chain:
      on_start -> get_city_id (mobile city list)
               -> get_area_food (per-city food landing page)
               -> get_next_page (desktop listing per region/category, all pages)
               -> get_detail_page (shop review page)
    Finished items are upserted into the module-level MongoDB collection
    ``col`` by ``on_result``.
    """

    crawl_config = {
        # NOTE(review): the proxy is blank here; presumably it has to be set
        # to a working IP proxy before the crawl is started.
        'proxy': '',
        "headers": {
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'
        },
        'retries': 6
    }

    # Food landing page of a city on the mobile site; formatted with the
    # city's ``cityEnName``.
    _CITY_FOOD_URL = 'https://m.dianping.com/{}/ch10/d1?from=m_nav_1_meishi'

    # The mobile pages embed their data as a JSON object assigned to
    # window.PAGE_INITIAL_STATE.
    _STATE_RE = re.compile('window.PAGE_INITIAL_STATE = ({.*})')

    # (item key, CSS class suffix of the filter label, text prefix stored
    # in front of the count) for the review filters on the review page.
    _REVIEW_FILTERS = (
        ('pic', 'filter-pic', u'图片'),
        ('good', 'filter-good', u'好评'),
        ('middle', 'filter-middle', u'中评'),
        ('bad', 'filter-bad', u'差评'),
    )

    def _load_initial_state(self, text):
        """Return the decoded ``window.PAGE_INITIAL_STATE`` object of a page.

        Raises:
            ValueError: if the page does not contain the assignment (or the
                captured text is not valid JSON).
        """
        match = self._STATE_RE.search(text)
        if match is None:
            raise ValueError('window.PAGE_INITIAL_STATE not found in response')
        return json.loads(match.group(1))

    def _crawl_city(self, city):
        """Schedule the food landing page for one entry of the city list."""
        item = {
            'city_name': city['cityName'],
            'city_id': city['cityId'],
            'city_pyname': city['cityEnName'],
        }
        self.crawl(self._CITY_FOOD_URL.format(item['city_pyname']),
                   validate_cert=False, callback=self.get_area_food, save=item)

    @every(minutes=24 * 60)
    def on_start(self):
        """Entry point, scheduled once every 24 hours: fetch the city list."""
        url = 'https://m.dianping.com/citylist'
        self.crawl(url, callback=self.get_city_id)

    @config(age=60)
    def get_city_id(self, response):
        """Read every city from the city-list page and schedule its food page.

        Both the "hot city" list and the full city list are walked; each city
        is crawled with ``city_name``, ``city_id`` and ``city_pyname`` saved
        for the next callback.
        """
        city_data = self._load_initial_state(response.text)

        for city in city_data['hotcity']['data']['hotCity']:
            print(city['cityName'])
            self._crawl_city(city)

        for group in city_data['list']['data']['cityData']:
            for city in group['list']:
                print(city['cityName'], '-----', city['cityEnName'])
                self._crawl_city(city)

    @config(age=60)
    def get_area_food(self, response):
        """Schedule one desktop listing per (region, food category) pair.

        Only top-level regions with at least one shop and the direct
        sub-categories of category 10 are used.
        """
        data = self._load_initial_state(response.text)
        item = response.save
        print(item)
        city_pyname = item.pop('city_pyname')
        # city_id is dropped so it is not carried into the stored records
        # (the original code popped it without using the value).
        item.pop('city_id')

        search = data['mapiSearch']['data']
        # Example region entry:
        # {u'count': 9828, u'name': u'\u897f\u57ce\u533a', u'regionType': 0,
        #  u'parentId': 0, u'lat': 0, u'lng': 0, u'id': 16}
        areas = [x for x in search['regionNavs']
                 if x['parentId'] == 0 and x['count'] > 0]
        foods = [x for x in search['categoryNavs']
                 if x['parentId'] == 10 and x["id"] != 10]

        for area in areas:
            for food in foods:
                _item = copy.deepcopy(item)
                _item['region'] = area['name']
                _item['category'] = food['name']
                category_id = food['id']
                region_id = area['id']
                print(area['name'], food['name'], category_id, region_id)
                _url = 'http://www.dianping.com/{}/ch10/g{}r{}p1'.format(
                    city_pyname, category_id, region_id)
                print(_url)
                # Pass the per-pair copy: the original passed ``item``, so the
                # region and category just set on ``_item`` were lost.
                self.crawl(_url, callback=self.get_next_page,
                           save={'item': _item, 'url': _url})

    @config(age=60)
    def get_next_page(self, response):
        """Parse one listing page and schedule each shop's review page.

        The first page of a listing (the only one whose ``save`` carries
        ``url``) additionally schedules pages 2..N of the same listing.
        """
        _item = response.save['item']
        doc = response.etree

        # Pagination: follow-up pages are saved without 'url', so only the
        # first page fans out.
        url = response.save.get('url')
        if url:
            pages = doc.xpath('//a[@class="PageLink"]/text()')
            # ``pages`` is empty when the listing has a single page; the
            # original ``or`` condition indexed it anyway and raised
            # IndexError.
            if pages:
                max_page = int(pages[-1])
                for page in range(2, max_page + 1):
                    # Only swap the trailing page marker, never a "p1" that
                    # happens to occur earlier in the URL.
                    _url = re.sub('p1$', 'p{}'.format(page), url)
                    self.crawl(_url, callback=self.get_next_page,
                               save={'item': _item})

        for shop in doc.xpath('//div[@id="shop-all-list"]/ul/li'):
            item = self._parse_shop(shop, _item)
            print(item)
            _url = "http://www.dianping.com/shop/{}/review_all".format(item['shop_id'])
            self.crawl(_url, callback=self.get_detail_page,
                       save={'item': item}, headers=detail_headers)

    def _parse_shop(self, shop, base_item):
        """Build the item for one ``<li>`` of the shop list.

        Returns a deep copy of ``base_item`` extended with the shop's id,
        name, ad flag, address, area, review count, price, the three
        sub-scores and the star rating.

        Raises:
            IndexError: if the shop id, the title or the star-rating span is
                missing from the entry.
        """
        item = copy.deepcopy(base_item)
        title_link = './/div[@class="tit"]/a[1]'
        item['shop_id'] = shop.xpath(title_link + '/@data-shopid')[0]
        item['name'] = shop.xpath(title_link + '/@title')[0]

        # The second link in the title holds the "advertisement" marker.
        ad_flag = shop.xpath('.//div[@class="tit"]/a[2]/text()')
        item['ad_shop'] = bool(ad_flag) and ad_flag[0] == u'广告'

        addr = shop.xpath('.//span[@class="addr"]/text()')
        item['addr'] = addr[0] if addr else ''
        # Name of the finer-grained district shown next to the address.
        region_tag = shop.xpath('.//div[@class="tag-addr"]/a[2]/span[@class="tag"]/text()')
        item['area'] = region_tag[0] if region_tag else ''

        review = shop.xpath('.//div[@class="comment"]/a[1]/b/text()')
        item['review_count'] = int(review[0]) if review else 0
        price = shop.xpath('.//div[@class="comment"]/a[2]/b/text()')
        item['price_text'] = int(price[0].replace(u'¥', '')) if price else 0

        scores = shop.xpath('.//span[@class="comment-list"]/span/b/text()')
        if len(scores) == 3:
            taste, surrounding, service = scores
            item['taste'] = float(taste)
            item['surrounding'] = float(surrounding)
            item['service'] = float(service)
        else:
            item['taste'] = 0
            item['surrounding'] = 0
            item['service'] = 0

        # The class looks like "sml-rank-stars sml-str45"; the trailing
        # number divided by ten is stored as the star rating.
        star = shop.xpath('.//div[@class="comment"]/span/@class')[0]
        item['star'] = float(star.replace('sml-rank-stars sml-str', '')) / 10
        return item

    @config(age=60)
    def get_detail_page(self, response):
        """Add review tags and review-filter counts to the item and yield it.

        Each of ``pic``/``good``/``middle``/``bad`` is stored as the label
        text followed by the count text from the page, or ``''`` when the
        page has no such filter.
        """
        item = response.save['item']
        doc = response.etree

        review_tags = []
        for span in doc.xpath("//div[@class='reviews-tags']/div[@class='content']/span"):
            texts = span.xpath("./a/text()")
            if not texts:
                # A tag span without link text carries nothing to record.
                continue
            review_tags.append(''.join(part.strip() for part in texts[0].split('\n')))
        item["review_tags"] = review_tags
        print(review_tags)

        for key, css_class, label in self._REVIEW_FILTERS:
            counts = doc.xpath(
                "//label[@class='filter-item {}']/span[@class='count']/text()".format(css_class))
            item[key] = label + counts[0] if counts else ''
            print(item[key])

        item["date"] = get_today()
        item["update_time"] = datetime.datetime.now()
        print(item)
        yield item

    def on_result(self, result):
        """Upsert a finished item into MongoDB, keyed by name, shop id and date."""
        super(Handler, self).on_result(result)
        if not result:
            return
        update_key = {
            'name': result['name'],
            'shop_id': result['shop_id'],
            'date': result["date"]
        }
        # NOTE: Collection.update is deprecated since PyMongo 3.0 and removed
        # in 4.0; switch to update_one(...) when the driver is upgraded.
        col.update(update_key, {'$set': result}, upsert=True)