Scraping Meituan

# Filename: 爬取美团网.py
import re
import time

import pymongo
import requests
from fake_useragent import UserAgent
from lxml import etree

MONGO_URL = 'localhost'
MONGO_DB = 'meituan'
MONGO_TABLE = 'info'

client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
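
# Optional sketch (an assumption, not in the original script): a unique index
# on the shop name would keep re-runs of the crawl from inserting duplicate
# documents.
# db[MONGO_TABLE].create_index('name', unique=True)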

def get_html(url):
    """Fetch the Meituan homepage and return the food-category URL."""
    try:
        headers = {
            'Cookie': '_lxsdk_cuid=16610b2b5c816-0f8e1d339cc266-19117259-15f900-16610b2b5cf86; ci=1; rvct=1; client-id=66f49399-b814-4e0b-a704-43986a901666; _ga=GA1.2.865389101.1537880094; mtcdn=K; lsu=; oc=1EwO-ttGytSTskWthdH7HxZrMnwe94wkFZ4Du9QlKrqKeV9VM3ij2DsUCnL31RoSIJyZF7igWcGiXctjDTY_8FOGK67kYoquNpQJ5WqR39uRyDxJ8_hUpPCigykE160rKshldLOpaKXeG7tj7tASZ30u_ktfHhEmNYXTAU8OVpk; uuid=9f79af4c393c4441ac8e.1537921855.2.0.1; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; __mta=222114727.1537878616312.1537968906127.1537969016134.10; _lxsdk=16610b2b5c816-0f8e1d339cc266-19117259-15f900-16610b2b5cf86; _gid=GA1.2.554947807.1537969474; _lxsdk_s=16616171cf8-5f4-e59-e71%7C%7C25',
            'Host': 'bj.meituan.com',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3486.0 Safari/537.36'
        }
        resp = requests.get(url=url, headers=headers)
        resp.encoding = 'utf-8'
        if resp.status_code == 200:
            e = etree.HTML(resp.text)
            # Grab the food-category URL from the first nav link on the homepage.
            return e.xpath('.//a[@class="link nav-text"]/@href')[0]
        return None
    except Exception:
        return None
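
# Note: get_html() returns whatever the first homepage nav link points to; for
# the Beijing homepage that is presumably the food category, e.g. a URL of the
# form 'https://bj.meituan.com/meishi/' (an assumed example), which main()
# extends with 'pn{}/' for pagination.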


def get_meishi(urls, i):
    """Fetch one paginated food-listing page and return its HTML."""
    try:
        headers = {
            'User-Agent': UserAgent().chrome
        }
        resp = requests.get(url=urls, headers=headers)
        resp.encoding = 'utf-8'
        if resp.status_code == 200:
            return resp.text
        # The backend only returns about 3 pages of data per session, so on
        # failure rebuild the category URL from the homepage and retry the page
        # to work around the page-count limit.
        url = 'https://bj.meituan.com/'
        urls = get_html(url) + 'pn{}/'.format(i)
        print('Failed to fetch page %i of Meituan data' % i)
        print('Starting a second fetch of page %i' % i)
        return get_meishi(urls, i)
    except Exception:
        return None
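
# A bounded-retry sketch (an assumption, not part of the original flow): the
# recursive retry in get_meishi() has no depth limit, so a persistently
# failing page would recurse until Python's recursion limit is hit. A fixed
# retry budget avoids that:
def get_meishi_bounded(urls, i, retries=3):
    headers = {'User-Agent': UserAgent().chrome}
    for attempt in range(1, retries + 1):
        try:
            resp = requests.get(url=urls, headers=headers)
            resp.encoding = 'utf-8'
            if resp.status_code == 200:
                return resp.text
        except requests.RequestException:
            pass
        print('Attempt %d/%d for page %d failed' % (attempt, retries, i))
        time.sleep(2)
    return None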

def get_info(html):
    # Shop names; the leading "title" matches are skipped because they are not
    # shop entries.
    shop_names = re.findall(r'"title":"([^"]+)', html)[19:]
    # Shop ratings
    shop_avgScores = re.findall(r'"avgScore":([^,]+)', html)
    # Number of reviews
    comment_counts = re.findall(r'"allCommentNum":([^,]+)', html)
    # Shop addresses
    shop_addresses = re.findall(r'"address":"([^"]+)', html)
    # Average prices; the leading "avgPrice" matches are likewise skipped.
    avgPrices = re.findall(r'"avgPrice":([^,]+)', html)[15:]

    for name, avgScore, count, address, avgPrice in zip(
            shop_names, shop_avgScores, comment_counts, shop_addresses, avgPrices):
        data = {
            'name': name,
            'avgScore': avgScore + '分',
            'count': count + '条',
            'address': address,
            'avgPrice': avgPrice + '元'
        }
        save_data(data)
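
# For example, on a fragment of the page's embedded JSON such as
# '{"title":"Some Shop","avgScore":4.5,"allCommentNum":132,...}' (an
# illustrative fragment, not the real payload), the pattern
# r'"title":"([^"]+)' captures 'Some Shop' and r'"avgScore":([^,]+)'
# captures '4.5'.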



def save_data(data):
    try:
        # insert() is deprecated in recent pymongo; insert_one() is the
        # current API.
        if db[MONGO_TABLE].insert_one(data):
            print('Saved:', data)
    except Exception:
        print('Failed to save:', data)

def main():
    base_url = 'https://bj.meituan.com/'
    food_url = get_html(base_url)
    if not food_url:
        print('Failed to resolve the food-category URL from the homepage')
        return

    i = 1
    while i <= 10:
        # Build each page URL from the category URL (pn1/, pn2/, ...) instead
        # of appending to the same string every iteration.
        page_url = food_url + 'pn{}/'.format(i)
        print('Fetching page %i of Meituan data' % i)
        time.sleep(2)
        html = get_meishi(page_url, i)
        if html:
            get_info(html)
        i += 1

if __name__ == '__main__':
    main()
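
If the crawl succeeds, the saved documents can be read back from MongoDB. A minimal verification sketch, assuming MongoDB runs on localhost and the database/collection names above are unchanged:

# Hypothetical helper, not part of the original script.
import pymongo

client = pymongo.MongoClient('localhost')
db = client['meituan']

# Print the first five saved shop records.
for doc in db['info'].find().limit(5):
    print(doc['name'], doc['avgScore'], doc['count'], doc['avgPrice'])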
