Python 新人尝试爬取大众点评齿科信息:获取评分、经纬度、团单销量等信息

新人初次尝试。目前的问题是:访问次数多了会触发大众点评的反爬机制,需要在浏览器中完成滑动验证,暂时还没有学会怎么破解。

20191017092225724.png
import requests
import re
import csv
import time
# Accumulates one entry per search-result item across every marse_page() call.
# Each entry is {'mt': [shop detail dicts], 'cx': [(deal title, sold counts)]}.
mts = []

# Headers sent with every request.
# NOTE(review): the cookie is hard-coded, presumably copied from a browser
# session -- confirm it is still valid before running.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
    'cookie': 'navCtgScroll=100; navCtgScroll=200; _lxsdk_cuid=16d7bde3e45c8-0b491cbf188485-67e1b3f-1fa400-16d7bde3e46c8; _lxsdk=16d7bde3e45c8-0b491cbf188485-67e1b3f-1fa400-16d7bde3e46c8; _hc.v="\"ab6667ff-ff89-4c88-9924-2865edbe01ee.1569741222\""; s_ViewType=10; mpmerchant_portal_shopid=18189287; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; cy=24; cye=shijiazhuang; _lxsdk_s=16dd2e5facb-327-0e0-88a%7C%7C190',
}
SHOP_URL_PREFIX = 'http://www.dianping.com/shop/'
REQUEST_TIMEOUT_SECONDS = 10
# Pause after each shop-page request. The original slept for 0 seconds, which
# does nothing to slow the crawl down.
REQUEST_DELAY_SECONDS = 1.0

# NOTE(review): the published listing lost the HTML-tag portions of most of its
# regex literals. The fragments that survived are kept verbatim
# (`li class=""`, `data-shopid="..."`, `>团购:...\n`, `已售...<`,
# `"item">`, `shopGlat/shopGlng`). Every other anchor below is a
# reconstruction and must be confirmed against the live page markup.
_LIST_ITEM_PATTERN = r'<li class=""(.*?)</li>'  # closing tag added: a trailing lazy group only ever matches ''
_SHOP_ID_PATTERN = r'data-shopid="(.*?)"'
_DEAL_PATTERN = r'>团购:(.*?)\n'
_SOLD_PATTERN = r'已售(.*?)<'
_TAG_PATTERN = r'<[^>]+>'
_DETAIL_PATTERNS = {
    'name': r'<h1 class="shop-name">\s*(.*?)\s*<',
    'title': r'<span title="(.*?)" class="mid-rank-stars',
    'reviewCount': r'id="reviewCount" class="item">(.*?)<',
    'avg': r'id="avgPriceTitle" class="item">(.*?)<',
    'score': r'id="comment_score">.*?"item">(.*?)<.*?"item">(.*?)<.*?"item">(.*?)<',
    'address': r'itemprop="street-address" title="(.*?)"',
    'xy': r'shopGlat: "(.*?)", shopGlng:"(.*?)",',
}


def _fetch(url):
    """Return the UTF-8 decoded body of *url*, or '' when the request fails."""
    try:
        response = requests.get(
            url, headers=HEADERS, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        # Best-effort crawl: report the failure and let the caller see an
        # empty page instead of aborting the whole run.
        print('request failed: {} ({})'.format(url, exc))
        return ''
    return response.content.decode('utf-8', errors='replace')


def _parse_deals(item_html):
    """Return [(deal title, sold-count matches)] found in one list item."""
    deals = []
    for deal_html in re.findall(_DEAL_PATTERN, item_html, re.DOTALL):
        deal_title = re.sub(_TAG_PATTERN, '', deal_html).strip()
        # NOTE(review): the original applied this pattern to an undefined
        # name (`cxre`); the deal snippet is the closest surviving candidate.
        sold = re.findall(_SOLD_PATTERN, deal_html, re.DOTALL)
        deals.append((deal_title, sold))
    return deals


def _parse_shop(page_html):
    """Return a dict of re.findall() results for every detail-page field."""
    return {
        field: re.findall(pattern, page_html, re.DOTALL)
        for field, pattern in _DETAIL_PATTERNS.items()
    }


def marse_page(url):
    """Scrape one search-result page and append its shops to ``mts``.

    For every list item found on the page at *url*, the shop detail page(s)
    referenced by ``data-shopid`` are fetched and parsed, the group-buy deals
    shown in the item are collected, and one ``{'mt': ..., 'cx': ...}`` entry
    is appended to the module-level ``mts`` list.

    Args:
        url: Address of a search-result page.

    Returns:
        None. Results are accumulated in ``mts``.
    """
    text = _fetch(url)
    for li in re.findall(_LIST_ITEM_PATTERN, text, re.DOTALL):
        # The same id can appear on several elements of one item; fetch each
        # shop page only once while keeping first-seen order.
        shop_ids = list(dict.fromkeys(
            re.findall(_SHOP_ID_PATTERN, li, re.DOTALL)))
        listcx = _parse_deals(li)

        mt1 = []
        for shop_id in shop_ids:
            durl = SHOP_URL_PREFIX + shop_id
            print(durl)
            mt2 = _parse_shop(_fetch(durl))
            time.sleep(REQUEST_DELAY_SECONDS)
            print(mt2)
            mt1.append(mt2)

        mts.append({'mt': mt1, 'cx': listcx})


def main():
    """Crawl search-result pages 1-10 and write the shops to '美团.csv'."""
    # range(1, 10) stopped at page 9 although the intent was pages 1-10.
    for i in range(1, 11):
        url = 'http://www.dianping.com/search/keyword/24/0_%E9%BD%BF%E7%A7%91/p{}'.format(i)
        print(url)
        marse_page(url)

    lll = []
    for xx in mts:
        if not xx['mt']:
            # List item without a shop id: nothing to report for this row.
            continue
        shop = xx['mt'][0]
        lll.append((
            shop['name'],
            shop['title'],
            shop['reviewCount'],
            shop['avg'],
            shop['score'],
            shop['address'],
            shop['xy'],
            xx['cx'],
        ))

    tou = ['医院名', '星级', '评论数', '人均', '评分', '地址', '经纬度', '团单']
    # Explicit encoding so the Chinese text does not depend on the platform
    # default; the BOM helps spreadsheet applications detect UTF-8.
    with open('美团.csv', 'w', newline='', encoding='utf-8-sig') as fp:
        writer = csv.writer(fp)
        writer.writerow(tou)
        writer.writerows(lll)
    print(mts)


if __name__ == '__main__':
    main()

你可能感兴趣的:(python新人尝试爬取大众点评齿科信息 获取评分 经纬度 团单销量 等信)