大众点评的店铺列表页最多只显示50页,要尽可能完整地抓取信息,就需要按区域和店铺类型把查询条件拆分到足够小。以成都餐饮为例,先把每种分类细分到最小一级,区域范围再依次从成都全市缩小到区县、街道:如果某个区域下该分类的结果小于50页就可以直接抓取,否则继续向下拆分。
大众点评的页面有时会对数据做字体加密,即把部分数据的字体设置为它独有的自定义字体来显示;下载对应的字体文件并按映射关系转码即可还原。页面没有加密时,这一步可以直接跳过。
首先按地区和类型把查询拆分到每组结果小于50页,并把这些链接存入数据库;然后逐页抓取基本信息;最后通过观察到的接口获取详细信息,如详细地址、经纬度、各项评分、评价数等。
# -*- coding: utf-8 -*-
import json
import requests
import pymysql
import time
from fontTools.ttLib import TTFont
def woff_dict(key):
    """Build the glyph-name -> character table for one obfuscation font.

    Args:
        key: Which downloaded font to read, ``'address'`` or ``'num'``.

    Returns:
        dict pairing each glyph name, in the font's glyph order, with the
        character at the same position in the hard-coded table below.
        Glyphs 0 and 1 are paired with the placeholders ``'.notdef'`` and
        ``'x'``.

    Raises:
        ValueError: If ``key`` is not a supported font name.
    """
    # Location of the downloaded .woff file for each supported key.
    font_paths = {
        'address': 'C:\\Users\\Administrator\\Desktop\\address.woff',
        'num': 'C:\\Users\\Administrator\\Desktop\\num.woff',
    }
    if key not in font_paths:
        # An unknown key used to fall through both branches and only fail
        # later with an UnboundLocalError on the font variable.
        raise ValueError("unsupported font key: {!r}".format(key))
    # Characters for glyph IDs 2..602 (601 characters per the original note).
    woff_str_601 = '1234567890店中美家馆小车大市公酒行国品发电金心业商司超生装园场食有新限天面工服海华水房饰城乐汽香部利子老艺花专东肉菜学福饭人百餐茶务通味所山区门药银农龙停尚安广鑫一容动南具源兴鲜记时机烤文康信果阳理锅宝达地儿衣特产西批坊州牛佳化五米修爱北养卖建材三会鸡室红站德王光名丽油院堂烧江社合星货型村自科快便日民营和活童明器烟育宾精屋经居庄石顺林尔县手厅销用好客火雅盛体旅之鞋辣作粉包楼校鱼平彩上吧保永万物教吃设医正造丰健点汤网庆技斯洗料配汇木缘加麻联卫川泰色世方寓风幼羊烫来高厂兰阿贝皮全女拉成云维贸道术运都口博河瑞宏京际路祥青镇厨培力惠连马鸿钢训影甲助窗布富牌头四多妆吉苑沙恒隆春干饼氏里二管诚制售嘉长轩杂副清计黄讯太鸭号街交与叉附近层旁对巷栋环省桥湖段乡厦府铺内侧元购前幢滨处向座下臬凤港开关景泉塘放昌线湾政步宁解白田町溪十八古双胜本单同九迎第台玉锦底后七斜期武岭松角纪朝峰六振珠局岗洲横边济井办汉代临弄团外塔杨铁浦字年岛陵原梅进荣友虹央桂沿事津凯莲丁秀柳集紫旗张谷的是不了很还个也这我就在以可到错没去过感次要比觉看得说常真们但最喜哈么别位能较境非为欢然他挺着价那意种想出员两推做排实分间甜度起满给热完格荐喝等其再几只现朋候样直而买于般豆量选奶打每评少算又因情找些份置适什蛋师气你姐棒试总定啊足级整带虾如态且尝主话强当更板知己无酸让入啦式笑赞片酱差像提队走嫩才刚午接重串回晚微周值费性桌拍跟块调糕'
    font = TTFont(font_paths[key])  # read the .woff file
    try:
        # The glyph order lives on the font itself; no need to reach it
        # through a cmap subtable.
        glyph_names = font.getGlyphOrder()
    finally:
        font.close()  # release the underlying file handle
    # Prepend the entries for the two special glyphs numbered 0 and 1.
    characters = ['.notdef', 'x'] + list(woff_str_601)
    return dict(zip(glyph_names, characters))
def decodestr(firststr):
    """Turn a font-obfuscated HTML fragment back into readable text.

    The fragment is cut at every ``<``. Closing ``/d>`` and ``/e>`` markers
    are removed from each piece. A piece that contains ``address`` or
    ``num`` anywhere after its first character is treated as an obfuscated
    glyph: the four characters just before its final character are looked
    up in the module-level ``addressdict`` / ``numdict`` tables. Any other
    non-empty piece is kept verbatim.

    Raises KeyError when a glyph code is missing from the relevant table.
    """
    decoded = []
    for piece in firststr.split("<"):
        piece = piece.replace("/d>", "").replace("/e>", "")
        if piece.find("address") > 0:
            decoded.append(addressdict[piece[-5:-1]])
        elif piece.find("num") > 0:
            decoded.append(numdict[piece[-5:-1]])
        elif piece:
            decoded.append(piece)
    return "".join(decoded)
#根据链接获取当前条件下结果的页数
def getpagecount(URLstr,countryname):
try:
res = requests.get(URLstr,headers=headers).text
except:
time.sleep(120)
return getpagecount(URLstr,countryname)
#如果抓取被限制,休眠后重新抓取
if res.find("403 Forbidden")>0:
time.sleep(60)
print(URLstr+" "+"403 forbidden "+countryname)
return getpagecount(URLstr,countryname)
#当分页栏不存在说明只有一页
if res.find("没有找到符合条件的商户")>0:
pageCount = 0
elif res.find("div class=\"page\"")<0:
pageCount = 1
print(URLstr+" "+"1页 "+countryname)
else:
pagestr = res[res.find("div class=\"page\""):]
pagestr = pagestr[:pagestr.find("
这一步完成后,根据这些分好类的链接逐页抓取基本信息:
# -*- coding: utf-8 -*-
import json
import requests
from fontTools.ttLib import TTFont
import pymysql
import time
def woff_dict(key):
    """Build the glyph-name -> character table for one obfuscation font.

    Args:
        key: Which downloaded font to read, ``'address'`` or ``'num'``.

    Returns:
        dict pairing each glyph name, in the font's glyph order, with the
        character at the same position in the hard-coded table below.
        Glyphs 0 and 1 are paired with the placeholders ``'.notdef'`` and
        ``'x'``.

    Raises:
        ValueError: If ``key`` is not a supported font name.
    """
    # Location of the downloaded .woff file for each supported key.
    font_paths = {
        'address': 'C:\\Users\\Administrator\\Desktop\\address.woff',
        'num': 'C:\\Users\\Administrator\\Desktop\\num.woff',
    }
    if key not in font_paths:
        # An unknown key used to fall through both branches and only fail
        # later with an UnboundLocalError on the font variable.
        raise ValueError("unsupported font key: {!r}".format(key))
    # Characters for glyph IDs 2..602 (601 characters per the original note).
    woff_str_601 = '1234567890店中美家馆小车大市公酒行国品发电金心业商司超生装园场食有新限天面工服海华水房饰城乐汽香部利子老艺花专东肉菜学福饭人百餐茶务通味所山区门药银农龙停尚安广鑫一容动南具源兴鲜记时机烤文康信果阳理锅宝达地儿衣特产西批坊州牛佳化五米修爱北养卖建材三会鸡室红站德王光名丽油院堂烧江社合星货型村自科快便日民营和活童明器烟育宾精屋经居庄石顺林尔县手厅销用好客火雅盛体旅之鞋辣作粉包楼校鱼平彩上吧保永万物教吃设医正造丰健点汤网庆技斯洗料配汇木缘加麻联卫川泰色世方寓风幼羊烫来高厂兰阿贝皮全女拉成云维贸道术运都口博河瑞宏京际路祥青镇厨培力惠连马鸿钢训影甲助窗布富牌头四多妆吉苑沙恒隆春干饼氏里二管诚制售嘉长轩杂副清计黄讯太鸭号街交与叉附近层旁对巷栋环省桥湖段乡厦府铺内侧元购前幢滨处向座下臬凤港开关景泉塘放昌线湾政步宁解白田町溪十八古双胜本单同九迎第台玉锦底后七斜期武岭松角纪朝峰六振珠局岗洲横边济井办汉代临弄团外塔杨铁浦字年岛陵原梅进荣友虹央桂沿事津凯莲丁秀柳集紫旗张谷的是不了很还个也这我就在以可到错没去过感次要比觉看得说常真们但最喜哈么别位能较境非为欢然他挺着价那意种想出员两推做排实分间甜度起满给热完格荐喝等其再几只现朋候样直而买于般豆量选奶打每评少算又因情找些份置适什蛋师气你姐棒试总定啊足级整带虾如态且尝主话强当更板知己无酸让入啦式笑赞片酱差像提队走嫩才刚午接重串回晚微周值费性桌拍跟块调糕'
    font = TTFont(font_paths[key])  # read the .woff file
    try:
        # The glyph order lives on the font itself; no need to reach it
        # through a cmap subtable.
        glyph_names = font.getGlyphOrder()
    finally:
        font.close()  # release the underlying file handle
    # Prepend the entries for the two special glyphs numbered 0 and 1.
    characters = ['.notdef', 'x'] + list(woff_str_601)
    return dict(zip(glyph_names, characters))
def decodestr(firststr):
    """Turn a font-obfuscated HTML fragment back into readable text.

    The fragment is cut at every ``<``. Closing ``/d>`` and ``/e>`` markers
    are removed from each piece. A piece that contains ``address`` or
    ``num`` anywhere after its first character is treated as an obfuscated
    glyph: the four characters just before its final character are looked
    up in the module-level ``addressdict`` / ``numdict`` tables. Any other
    non-empty piece is kept verbatim.

    Raises KeyError when a glyph code is missing from the relevant table.
    """
    decoded = []
    for piece in firststr.split("<"):
        piece = piece.replace("/d>", "").replace("/e>", "")
        if piece.find("address") > 0:
            decoded.append(addressdict[piece[-5:-1]])
        elif piece.find("num") > 0:
            decoded.append(numdict[piece[-5:-1]])
        elif piece:
            decoded.append(piece)
    return "".join(decoded)
if __name__ == '__main__':
woffnum = (str)(woff_dict('num')).replace("{","").replace("}","").replace(" ","").replace("'uni","'")
woffaddress = (str)(woff_dict('address')).replace("{","").replace("}","").replace(" ","").replace("'uni","'")
numdict = {}
newdict = woffnum.split(",")
for d in newdict:
d = '{' + d + '}'
d = eval(d)
numdict.update(d)
addressdict = {}
newdict = woffaddress.split(",")
for d in newdict:
d = '{' + d + '}'
d = eval(d)
addressdict.update(d)
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36",
"Cookie" : "自己的",
'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7'
}
conn = pymysql.connect(host = 'localhost',user = "root",passwd = "自己的",db = "大众点评")
cur=conn.cursor()
querysql = "SELECT url,hasGet,finish FROM dazhong_paging_restaurant"
cur.execute(querysql)
lists = cur.fetchall()
for list in lists:
url = list[0]
hasGet = list[1]
finish = list[2]
if hasGet==None:
hasGet=0
hasGet += 1
if finish!=1:
url += "o3p"
for i in range(1,51):
if hasGet>i:
print("已抓取,跳过该页:"+(str)(i))
continue
urlnew = url+(str)(i)
requeststr0 = urlnew
try:
res = requests.get(requeststr0,headers=headers,timeout=100).text
except:
time.sleep(80)
res = requests.get(requeststr0,headers=headers,timeout=100).text
#如果抓取被限制,休眠后重新抓取
if res.find("403 Forbidden")>0:
print("403访问被限制,已退出")
exit()
#如果页数到尽头 就退出该分类
if res.find("没有找到符合条件的商户")>0:
break
res = res[res.find("shop-all-list"):res.find("商户没有被收录")]
res = res.split("")
for re in res:
if len(re)<50:
continue
shopid = re[re.find("data-shopid=\"")+13:]
shopid = shopid[:shopid.find("\"")]
shopAllname = re[re.find("")+4:re.find("
")].replace("'","\\'")
if re.find("https://www.dianping.com/brands/")>0:
shopGroupId = re[re.find("https://www.dianping.com/brands/")+32:re.find("\" module=\"list-branch\"")]
else:
shopGroupId = ""
if re.find("我要评价")>0:
defaultReviewCount = 0
else:
defaultReviewCount = re[re.find("")+3:re.find("")]
avgPrice = re[re.find("人均"):]
if avgPrice.find("-")==13:
avgPrice=0
else:
avgPrice = avgPrice[avgPrice.find("")+4:avgPrice.find("")]
if re.find("istopTrade")>0:
status = re[re.find("istopTrade")+12:]
status = status[:status.find("")]
else:
status=""
countryAndtype = re[re.find("tag-addr"):]
mainParentCategoryId = countryAndtype[countryAndtype.find("/g")+2:countryAndtype.find("\" data-click-name")]
categoryName = countryAndtype[countryAndtype.find("class=\"tag\">")+12:countryAndtype.find("")]
countryAndtype = countryAndtype[countryAndtype.find("\"sep\""):]
countryid = countryAndtype[countryAndtype.find("/r")+2:countryAndtype.find("\" data-click-name")]
countryname = countryAndtype[countryAndtype.find("class=\"tag\">")+12:countryAndtype.find("")]
if countryid.find("|")>0:
print("该店铺信息异常被跳过:"+shopid)
continue
if re.find("class=\"recommend\"")>0:
recommendstr = re[re.find("class=\"recommend\"")+16:]
recommendstr = recommendstr[:recommendstr.find("
最后通过接口获取更详细、更丰富的信息:
# -*- coding: utf-8 -*-
import json
import requests
from fontTools.ttLib import TTFont
import pymysql
import time
def woff_dict(key):
    """Build the glyph-name -> character table for one obfuscation font.

    Args:
        key: Which downloaded font to read, ``'address'`` or ``'num'``.

    Returns:
        dict pairing each glyph name, in the font's glyph order, with the
        character at the same position in the hard-coded table below.
        Glyphs 0 and 1 are paired with the placeholders ``'.notdef'`` and
        ``'x'``.

    Raises:
        ValueError: If ``key`` is not a supported font name.
    """
    # Location of the downloaded .woff file for each supported key.
    font_paths = {
        'address': 'C:\\Users\\Administrator\\Desktop\\address.woff',
        'num': 'C:\\Users\\Administrator\\Desktop\\num.woff',
    }
    if key not in font_paths:
        # An unknown key used to fall through both branches and only fail
        # later with an UnboundLocalError on the font variable.
        raise ValueError("unsupported font key: {!r}".format(key))
    # Characters for glyph IDs 2..602 (601 characters per the original note).
    woff_str_601 = '1234567890店中美家馆小车大市公酒行国品发电金心业商司超生装园场食有新限天面工服海华水房饰城乐汽香部利子老艺花专东肉菜学福饭人百餐茶务通味所山区门药银农龙停尚安广鑫一容动南具源兴鲜记时机烤文康信果阳理锅宝达地儿衣特产西批坊州牛佳化五米修爱北养卖建材三会鸡室红站德王光名丽油院堂烧江社合星货型村自科快便日民营和活童明器烟育宾精屋经居庄石顺林尔县手厅销用好客火雅盛体旅之鞋辣作粉包楼校鱼平彩上吧保永万物教吃设医正造丰健点汤网庆技斯洗料配汇木缘加麻联卫川泰色世方寓风幼羊烫来高厂兰阿贝皮全女拉成云维贸道术运都口博河瑞宏京际路祥青镇厨培力惠连马鸿钢训影甲助窗布富牌头四多妆吉苑沙恒隆春干饼氏里二管诚制售嘉长轩杂副清计黄讯太鸭号街交与叉附近层旁对巷栋环省桥湖段乡厦府铺内侧元购前幢滨处向座下臬凤港开关景泉塘放昌线湾政步宁解白田町溪十八古双胜本单同九迎第台玉锦底后七斜期武岭松角纪朝峰六振珠局岗洲横边济井办汉代临弄团外塔杨铁浦字年岛陵原梅进荣友虹央桂沿事津凯莲丁秀柳集紫旗张谷的是不了很还个也这我就在以可到错没去过感次要比觉看得说常真们但最喜哈么别位能较境非为欢然他挺着价那意种想出员两推做排实分间甜度起满给热完格荐喝等其再几只现朋候样直而买于般豆量选奶打每评少算又因情找些份置适什蛋师气你姐棒试总定啊足级整带虾如态且尝主话强当更板知己无酸让入啦式笑赞片酱差像提队走嫩才刚午接重串回晚微周值费性桌拍跟块调糕'
    font = TTFont(font_paths[key])  # read the .woff file
    try:
        # The glyph order lives on the font itself; no need to reach it
        # through a cmap subtable.
        glyph_names = font.getGlyphOrder()
    finally:
        font.close()  # release the underlying file handle
    # Prepend the entries for the two special glyphs numbered 0 and 1.
    characters = ['.notdef', 'x'] + list(woff_str_601)
    return dict(zip(glyph_names, characters))
def decodestr(firststr):
    """Turn a font-obfuscated HTML fragment back into readable text.

    The fragment is cut at every ``<``. Closing ``/d>`` and ``/e>`` markers
    are removed from each piece. A piece that contains ``address`` or
    ``num`` anywhere after its first character is treated as an obfuscated
    glyph: the four characters just before its final character are looked
    up in the module-level ``addressdict`` / ``numdict`` tables. Any other
    non-empty piece is kept verbatim.

    Raises KeyError when a glyph code is missing from the relevant table.
    """
    decoded = []
    for piece in firststr.split("<"):
        piece = piece.replace("/d>", "").replace("/e>", "")
        if piece.find("address") > 0:
            decoded.append(addressdict[piece[-5:-1]])
        elif piece.find("num") > 0:
            decoded.append(numdict[piece[-5:-1]])
        elif piece:
            decoded.append(piece)
    return "".join(decoded)
def _glyph_table(key):
    """Return woff_dict(key) with the leading ``uni`` removed from glyph names.

    Produces the same mapping the script used to obtain by stringifying the
    dict, editing the text and eval()-ing it back, without the eval.
    """
    return {
        (name[3:] if name.startswith("uni") else name): char
        for name, char in woff_dict(key).items()
    }


def _dash_to_zero(value):
    """Map the "-" placeholder to 0; return any other value unchanged."""
    return 0 if value == "-" else value


def _fetch_json(url, headers):
    """GET ``url`` and return the decoded JSON body.

    Request errors and non-JSON responses propagate to the caller.
    """
    # A timeout keeps one stalled connection from hanging the whole run.
    return requests.get(url, headers=headers, timeout=100).json()


# Fill in the detail columns of every shop whose `fivescore` is still NULL,
# using three JSON endpoints per shop, and commit after each shop.
if __name__ == '__main__':
    # decodestr() reads these two module-level tables.
    numdict = _glyph_table('num')
    addressdict = _glyph_table('address')

    # Built once: this is the header set the requests were actually sent with
    # (the loop used to rebuild it on every iteration, discarding an earlier
    # dict that was never used for any request).
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36"
    }

    # Placeholders let the driver quote every value, so names or addresses
    # containing quotes or backslashes can no longer break the statement.
    update_sql = (
        "update `大众点评`.`shopdetail_restaurant` SET `fivescore` = %s,"
        "`scoreTaste`=%s,`scoreEnvironment`=%s,`scoreService`=%s,"
        "`avgPrice`=%s,`defaultReviewCount`=%s,`shopName`=%s,"
        "`branchName`=%s,`address`=%s,`phoneNo`=%s,`shopGroupId`=%s,"
        "`glat`=%s,`glng`=%s WHERE shopid = %s"
    )

    conn = pymysql.connect(host = 'localhost',user = "root",passwd = "自己的",db = "大众点评")
    try:
        cur = conn.cursor()
        cur.execute("SELECT shopid FROM shopdetail_restaurant where fivescore is NULL")
        for row in cur.fetchall():
            shopid = row[0]
            requeststr1 = "https://www.dianping.com/ajax/json/shopDynamic/reviewAndStar?shopId={}&cityId=1&mainCategoryId=10".format(shopid)
            requeststr2 = "https://www.dianping.com/ajax/json/shopDynamic/basicHideInfo?shopId="+shopid
            requeststr3 = "https://www.dianping.com/ajax/json/shopDynamic/shopAside?shopId="+shopid

            # Scores, average price and review count.
            res = _fetch_json(requeststr1, headers)
            # "-" is stored as 0, matching the handling of the score fields
            # below; formatted raw it would not be a valid numeric value.
            avgPrice = _dash_to_zero(decodestr(res['avgPrice']))
            defaultReviewCount = _dash_to_zero(decodestr(res['defaultReviewCount']))
            fivescore = _dash_to_zero(res.get('fiveScore', '-'))
            refined = res['shopRefinedScoreValueList']
            scoreTaste = _dash_to_zero(decodestr(refined[0]))
            scoreEnvironment = _dash_to_zero(decodestr(refined[1]))
            scoreService = _dash_to_zero(decodestr(refined[2]))

            # Name, branch, address, phone and group id.
            shopInfo = _fetch_json(requeststr2, headers)['msg']['shopInfo']
            shopName = shopInfo['shopName']
            branchName = shopInfo['branchName']
            if branchName is None:
                branchName = ""
            address = decodestr(shopInfo['address'])
            phoneNo = decodestr(shopInfo['phoneNo'])
            shopGroupId = shopInfo['shopGroupId']
            if shopGroupId == shopid:
                # A group id equal to the shop's own id is stored as empty.
                shopGroupId = ""

            # Coordinates and category.
            res = _fetch_json(requeststr3, headers)
            glat = res['shop']['glat']
            glng = res['shop']['glng']
            categoryName = res['category']['categoryName']

            print(" ".join(str(value) for value in (
                avgPrice, defaultReviewCount, fivescore, scoreTaste,
                scoreEnvironment, scoreService, shopName, branchName,
                address, phoneNo, shopGroupId, glat, glng, categoryName,
            )))
            cur.execute(update_sql, (
                fivescore, scoreTaste, scoreEnvironment, scoreService,
                avgPrice, defaultReviewCount, shopName, branchName,
                address, phoneNo, shopGroupId, glat, glng, shopid,
            ))
            conn.commit()
            time.sleep(2)  # pause between shops
    finally:
        conn.close()
最终结果如下: