# Filename: 爬取美团网.py (Meituan food-listings crawler)
import re
import time

import pymongo
import requests
from fake_useragent import UserAgent
from lxml import etree

MONGO_URL = 'localhost'
MONGO_DB = 'meituan'
MONGO_TABLE = 'info'
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]


def get_html(url):
    """Fetch the Meituan homepage and return the URL of the food channel."""
    try:
        headers = {
            # Session cookie copied from a logged-in browser; it expires
            # and must be refreshed before each run.
            'Cookie': '_lxsdk_cuid=16610b2b5c816-0f8e1d339cc266-19117259-15f900-16610b2b5cf86; ci=1; rvct=1; client-id=66f49399-b814-4e0b-a704-43986a901666; _ga=GA1.2.865389101.1537880094; mtcdn=K; lsu=; oc=1EwO-ttGytSTskWthdH7HxZrMnwe94wkFZ4Du9QlKrqKeV9VM3ij2DsUCnL31RoSIJyZF7igWcGiXctjDTY_8FOGK67kYoquNpQJ5WqR39uRyDxJ8_hUpPCigykE160rKshldLOpaKXeG7tj7tASZ30u_ktfHhEmNYXTAU8OVpk; uuid=9f79af4c393c4441ac8e.1537921855.2.0.1; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; __mta=222114727.1537878616312.1537968906127.1537969016134.10; _lxsdk=16610b2b5c816-0f8e1d339cc266-19117259-15f900-16610b2b5cf86; _gid=GA1.2.554947807.1537969474; _lxsdk_s=16616171cf8-5f4-e59-e71%7C%7C25',
            'Host': 'bj.meituan.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/69.0.3486.0 Safari/537.36',
        }
        response = requests.get(url=url, headers=headers)
        response.encoding = 'utf-8'
        if response.status_code == 200:
            e = etree.HTML(response.text)
            # The food-channel link sits in the homepage navigation bar.
            return e.xpath('.//a[@class="link nav-text"]/@href')[0]
        return None
    except Exception:
        return None


def get_meishi(page_url, i, retries=3):
    """Fetch page i of the food listings, retrying with a rebuilt URL on failure."""
    try:
        headers = {'User-Agent': UserAgent().chrome}
        response = requests.get(url=page_url, headers=headers)
        response.encoding = 'utf-8'
        if response.status_code == 200:
            return response.text
        # The backend only serves about three pages per session, so rebuild
        # the URL from the homepage and retry to work around the page limit.
        print('Failed to crawl page %i of Meituan' % i)
        if retries <= 0:
            return None
        base = get_html('https://bj.meituan.com/')
        if base is None:
            return None
        print('Retrying page %i of Meituan' % i)
        return get_meishi(base + 'pn{}/'.format(i), i, retries - 1)
    except Exception:
        return None


def get_info(html):
    """Parse shop fields out of the JSON embedded in the page source."""
    # Shop names; the leading matches are navigation items rather than
    # shops, so they are skipped (offset found empirically).
    shop_names = re.findall(r'"title":"([^"]+)', html)[19:]
    # Average ratings.
    avg_scores = re.findall(r'"avgScore":([^,]+)', html)
    # Review counts.
    comment_counts = re.findall(r'"allCommentNum":([^,]+)', html)
    # Shop addresses.
    addresses = re.findall(r'"address":"([^"]+)', html)
    # Average prices; leading matches are likewise skipped.
    avg_prices = re.findall(r'"avgPrice":([^,]+)', html)[15:]
    for name, score, count, address, price in zip(
            shop_names, avg_scores, comment_counts, addresses, avg_prices):
        data = {
            'name': name,
            'avgScore': score + '分',   # 分 = points
            'count': count + '条',      # 条 = review entries
            'address': address,
            'avgPrice': price + '元',   # 元 = yuan
        }
        save_data(data)


def save_data(data):
    """Insert one shop record into MongoDB."""
    try:
        if db[MONGO_TABLE].insert_one(data).inserted_id:
            print('Saved:', data)
    except Exception:
        print('Failed to save:', data)


def main():
    base = get_html('https://bj.meituan.com/')
    if base is None:
        print('Failed to locate the food-channel URL')
        return
    for i in range(1, 11):
        # Build each page URL fresh from the base so that page-number
        # segments do not accumulate across iterations.
        page_url = base + 'pn{}/'.format(i)
        print('Start crawling page %i of Meituan' % i)
        time.sleep(2)
        html = get_meishi(page_url, i)
        if html:
            get_info(html)


if __name__ == '__main__':
    main()
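
# ---------------------------------------------------------------------------
# Usage sketch (assumptions: a MongoDB instance listening on localhost's
# default port, and a fresh Meituan session cookie pasted into get_html):
#
#   $ pip install requests lxml pymongo fake-useragent
#   $ python 爬取美团网.py
#
# Each stored document then looks roughly like (values illustrative only):
#   {'name': '...', 'avgScore': '4.5分', 'count': '1024条',
#    'address': '...', 'avgPrice': '86元'}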