饿了么微信小程序抓取

抓取饿了么需要准备多个账号来获取 cookie,因为 cookie 很容易被封。

1.评论版本(注:下面这段代码实际抓取的是下厨房 xiachufang.com 的菜谱数据,而非饿了么)

import requests
from lxml import etree
import re
import json
import csv
import pandas as pd
import hashlib

# Rows collected by the crawl further down in this script; each appended
# entry is one recipe record that is later written to the output CSV.
product_lists=[]
def down_load(url, timeout=30):
    """Fetch a xiachufang.com page, resolving a 302 redirect by hand.

    The first request is sent with redirects disabled so that the redirect
    target can be handed back to the caller together with the parsed page.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A ``(final_url, tree)`` tuple. ``final_url`` is the redirect target
        when the server answered 302, otherwise ``url`` unchanged. ``tree``
        is the response text parsed with ``etree.HTML``.

    Raises:
        requests.RequestException: Propagated from ``requests.get``, e.g. on
            connection failure or timeout.
        KeyError: If a 302 response carries no ``location`` header.
    """
    # Imported locally because the file's import block does not provide it.
    from urllib.parse import urljoin

    headers1 = {
        "Connection": "keep-alive",
        # Hard-coded session cookie (presumably copied from a browser
        # session -- it will need refreshing once it expires).
        "Cookie": "bid=YTzZGjuP; gr_user_id=97f998e4-aa93-486a-99f8-854c784f6bcd; BAIDU_SSP_lcr=https://www.baidu.com/link?url=Pwq6pphyjxk3oQRJg05jjUGiwtLjL7FBVFdqA8oFqpZoXc0hIxm9p0sa3fyWRSSs&wd=&eqid=c9c1816400092dd0000000035c218bdd; Hm_lvt_ecd4feb5c351cc02583045a5813b5142=1545271479,1545702369; __utmc=177678124; __utmz=177678124.1545702370.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; gr_session_id_8187ff886f0929da=3a10677b-5117-45e8-9de4-d80fc87ffc01; gr_session_id_da48e7b9eb89482489897fc1e45e98b6=3f128e3a-5f5c-42fd-9c45-e0e1ddeb97b2; _ga=GA1.2.377109310.1545271479; _gid=GA1.2.1629322193.1545716866; __utma=177678124.377109310.1545271479.1545702370.1545717020.3; __utmt=1; gr_session_id_8187ff886f0929da_3a10677b-5117-45e8-9de4-d80fc87ffc01=true; Hm_lpvt_ecd4feb5c351cc02583045a5813b5142=1545717163; __utmb=177678124.4.10.1545717020",
        "Host": "www.xiachufang.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36",
    }

    html = requests.get(url=url, headers=headers1, allow_redirects=False,
                        timeout=timeout)
    print(html.status_code)
    if html.status_code == 302:
        # urljoin handles a site-relative Location ("/category/...") exactly
        # like the former string concatenation, and additionally copes with
        # an absolute Location value.
        new_id_url = urljoin(url, html.headers["location"])
        print(new_id_url)
        new_html = requests.get(url=new_id_url, headers=headers1,
                                timeout=timeout).text
        return new_id_url, etree.HTML(new_html)

    print("++++++++++++++++")
    print(url)
    if html.status_code == 200:
        # The body is already in hand; no need to download the page twice.
        return url, etree.HTML(html.text)
    # Any other status: repeat the request with redirects enabled, which is
    # what the non-302 path has always done.
    retry = requests.get(url=url, headers=headers1, timeout=timeout)
    return url, etree.HTML(retry.text)


def down_load1(url, timeout=30):
    """Fetch *url* from xiachufang.com and return it as a parsed lxml tree.

    Redirects are followed automatically by ``requests``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The result of ``etree.HTML`` applied to the response text (a single
        element, not a tuple).

    Raises:
        requests.RequestException: Propagated from ``requests.get``, e.g. on
            connection failure or timeout.
    """
    headers2 = {
        "Connection": "keep-alive",
        # Hard-coded session cookie (presumably copied from a browser
        # session -- it will need refreshing once it expires).
        "Cookie": "bid=YTzZGjuP; gr_user_id=97f998e4-aa93-486a-99f8-854c784f6bcd; BAIDU_SSP_lcr=https://www.baidu.com/link?url=Pwq6pphyjxk3oQRJg05jjUGiwtLjL7FBVFdqA8oFqpZoXc0hIxm9p0sa3fyWRSSs&wd=&eqid=c9c1816400092dd0000000035c218bdd; Hm_lvt_ecd4feb5c351cc02583045a5813b5142=1545271479,1545702369; __utmc=177678124; __utmz=177678124.1545702370.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; gr_session_id_8187ff886f0929da=3a10677b-5117-45e8-9de4-d80fc87ffc01; gr_session_id_da48e7b9eb89482489897fc1e45e98b6=3f128e3a-5f5c-42fd-9c45-e0e1ddeb97b2; _ga=GA1.2.377109310.1545271479; _gid=GA1.2.1629322193.1545716866; __utma=177678124.377109310.1545271479.1545702370.1545717020.3; __utmt=1; gr_session_id_8187ff886f0929da_3a10677b-5117-45e8-9de4-d80fc87ffc01=true; Hm_lpvt_ecd4feb5c351cc02583045a5813b5142=1545717163; __utmb=177678124.4.10.1545717020",
        "Host": "www.xiachufang.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    }

    # Without a timeout a stalled connection would block the crawl forever.
    html = requests.get(url=url, headers=headers2, timeout=timeout).text
    return etree.HTML(html)


def down_load2(url, timeout=30):
    """Fetch *url* from xiachufang.com and return it as a parsed lxml tree.

    Redirects are followed automatically by ``requests``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The result of ``etree.HTML`` applied to the response text.

    Raises:
        requests.RequestException: Propagated from ``requests.get``, e.g. on
            connection failure or timeout.
    """
    headers3 = {
        "Connection": "keep-alive",
        # Hard-coded session cookie (presumably copied from a browser
        # session -- it will need refreshing once it expires).
        "Cookie": "bid=YTzZGjuP; gr_user_id=97f998e4-aa93-486a-99f8-854c784f6bcd; BAIDU_SSP_lcr=https://www.baidu.com/link?url=Pwq6pphyjxk3oQRJg05jjUGiwtLjL7FBVFdqA8oFqpZoXc0hIxm9p0sa3fyWRSSs&wd=&eqid=c9c1816400092dd0000000035c218bdd; Hm_lvt_ecd4feb5c351cc02583045a5813b5142=1545271479,1545702369; __utmc=177678124; __utmz=177678124.1545702370.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; gr_session_id_8187ff886f0929da=3a10677b-5117-45e8-9de4-d80fc87ffc01; gr_session_id_da48e7b9eb89482489897fc1e45e98b6=3f128e3a-5f5c-42fd-9c45-e0e1ddeb97b2; _ga=GA1.2.377109310.1545271479; _gid=GA1.2.1629322193.1545716866; __utma=177678124.377109310.1545271479.1545702370.1545717020.3; __utmt=1; gr_session_id_8187ff886f0929da_3a10677b-5117-45e8-9de4-d80fc87ffc01=true; Hm_lpvt_ecd4feb5c351cc02583045a5813b5142=1545717163; __utmb=177678124.4.10.1545717020",
        "Host": "www.xiachufang.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    }

    # Without a timeout a stalled connection would block the crawl forever.
    html = requests.get(url=url, headers=headers3, timeout=timeout).text
    return etree.HTML(html)




# ---------------------------------------------------------------------------
# Script body: for every keyword, search xiachufang.com, walk the first
# listing page plus two follow-up pages, and collect one row per recipe.
# ---------------------------------------------------------------------------

# Site root that the relative hrefs found in listing pages are appended to.
SITE_ROOT = "http://www.xiachufang.com"

# Selects every recipe <li> in the main column of a listing page.
RECIPE_ITEM_XPATH = (
    '//div[contains(@class,"pure-u-3-4")]//div[@class="normal-recipe-list"]'
    '//ul[@class="list"]/li'
)

# Column titles of the output CSV; the order matches the rows appended in
# _scrape_listing.
RECIPE_CSV_HEADER = ["关键词", "作者", "作者信息", "产品名称", "产品链接", "内容",
                     "产品评分", "做过人数", "7天做过", "产品收藏", "创建时间"]

# Number of follow-up listing pages (page 2, page 3, ...) fetched per keyword.
EXTRA_PAGES = 2


def _first_or(values, default):
    """Return the first element of *values*, or *default* when it is empty."""
    return values[0] if values else default


def _squashed_text(node):
    """Return all text below *node* with spaces and newlines removed."""
    return node.xpath('string(.)').strip().replace(" ", "").replace("\n", "")


def _scrape_listing(key_word, listing_html):
    """Append one row to ``product_lists`` per recipe on a listing page.

    For each recipe this downloads the author's profile page and the recipe
    detail page. Any missing mandatory element raises ``IndexError``, which
    the caller treats as "abort this keyword".
    """
    items = listing_html.xpath(RECIPE_ITEM_XPATH)
    print(len(items))
    # Iterating the <li> elements and querying relative to each one avoids
    # re-running the long absolute XPath with a positional index per field.
    for item in items:
        print("**************")
        product_url = SITE_ROOT + item.xpath('.//p[@class="name"]/a/@href')[0]
        print(product_url)
        product_name = item.xpath('.//p[@class="name"]/a/text()')[0].strip()
        product_star = _first_or(
            item.xpath('.//p[@class="stats"]/span[1]/text()'), "暂无")
        product_make = _first_or(
            item.xpath('.//p[@class="stats"]/span[@class="bold score"]/text()'), 0)
        product_author = item.xpath('.//p[@class="author"]/a[1]/text()')[0]
        product_author_url = SITE_ROOT + item.xpath(
            './/p[@class="author"]/a[1]/@href')[0]

        person_html = down_load2(product_author_url)
        person_info = _squashed_text(
            person_html.xpath('//div[@class="gray-font"]/div[1]')[0])
        print(person_info)
        print(product_name)

        # down_load1 returns the parsed tree itself. The earlier
        # ``down_load1(...)[1]`` picked a child element of the root, which
        # made no difference to the absolute ("//...") queries below but
        # raised IndexError whenever the root had fewer than two children.
        detil_html = down_load1(product_url)
        product_save = detil_html.xpath('//div[@class="pv"]/text()')[0]
        product_content = _squashed_text(
            detil_html.xpath('//div[contains(@class,"recipe-show")]')[0])
        product_time = detil_html.xpath('//div[@class="time"]/span/text()')[0]
        product_meke_all = detil_html.xpath(
            '//div[contains(@class,"cooked")]/span[@class="number"]/text()')[0]
        print(product_meke_all)

        product_lists.append(
            [key_word, product_author, person_info, product_name, product_url,
             product_content, product_star, product_meke_all, product_make,
             product_save, product_time])


def _write_recipe_csv(path):
    """Write the header plus every row collected so far to *path*."""
    with open(path, "w", encoding="utf-8-sig", newline="") as f:
        writer = csv.writer(f, dialect="excel")
        writer.writerow(RECIPE_CSV_HEADER)
        writer.writerows(product_lists)


# NOTE: per-recipe reply scraping (reply author, time, text) is not
# implemented in this script; only the recipe-level columns are collected.

word_pd = pd.read_csv(r"C:\Users\Lavector\Desktop\百事小红书\ele1.csv", engine='python', header=None).values.tolist()
print(word_pd)
# Row 0 of the keyword file is skipped (presumably a header row -- TODO
# confirm against the input file).
for keyword_row in word_pd[1:]:
    key_word = keyword_row[0]
    print(key_word)

    url1 = "http://www.xiachufang.com/search/?keyword={}".format(key_word)

    try:
        product_html_all = down_load(url1)
        _scrape_listing(key_word, product_html_all[1])
        print("++++++++")

        # The search URL already carries a query string (?keyword=...), so
        # its page parameter is joined with "&"; a final URL containing
        # "category" gets "?page=" instead.
        page_sep = "?" if "category" in product_html_all[0] else "&"
        for k in range(EXTRA_PAGES):
            page_url = product_html_all[0] + "{}page={}".format(page_sep, k + 2)
            _scrape_listing(key_word, down_load2(page_url))
    except Exception as exc:
        # Best effort: report the failure, checkpoint what has been collected
        # so far, and move on to the next keyword. KeyboardInterrupt is
        # deliberately not caught so the crawl can be stopped with Ctrl-C.
        print("keyword {!r} aborted: {!r}".format(key_word, exc))
        _write_recipe_csv("下厨房61.csv")

print(product_lists)
print(len(product_lists))
_write_recipe_csv("下厨房62.csv")

2.无评论版

import requests
import json
import csv
import pandas as pd
import time

# ---------------------------------------------------------------------------
# Script body: for every keyword, query the ele.me mini-program search API
# (three pages of ten results) and collect one row per restaurant.
# ---------------------------------------------------------------------------

# Rows collected for the output CSV: one per restaurant found.
lists_content = []

# Column titles of the output CSV; the order matches the appended rows.
SHOP_CSV_HEADER = ["关键词", "地点", "店铺名称", "店铺链接", "店铺评论数", "热销产品"]

# Literal text appended after every food name in the "热销产品" column.
FOOD_SEPARATOR = "{{{{{}}}}}"

# Search endpoint; the two placeholders are the keyword and the result offset.
SEARCH_URL = "https://mainsite-restapi.ele.me/shopping/v2/restaurants/search?extras[]=activities&search_item_type=3&is_rewrite=1&latitude=36.04272&longitude=103.871645&keyword={}&order_by=0&offset={}&limit=10&terminal=weapp"

# Headers sent with every search request, including a hard-coded session id
# cookie (SID).
SEARCH_HEADERS = {
    "charset": "utf-8",
    "Accept-Encoding": "gzip",
    "referer": "https://servicewechat.com/wxece3a9a4c82f58c9/185/page-frame.html",
    "x-shard": "loc=103.871645,36.04272",
    "cookie": "SID=8mNKOGfiLGqBnp1xixcAE3TAtI2B2q6kheHg",
    "content-type": "application/json",
    "User-Agent": "Mozilla/5.0 (Linux; Android 7.1.1; OD103 Build/NMF26F; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/62.0.3202.84 Mobile Safari/537.36 MicroMessenger/6.7.3.1360(0x26070333) NetType/WIFI Language/zh_CN Process/appbrand0",
    "Host": "mainsite-restapi.ele.me",
    "Connection": "Keep-Alive",
}

# Result pages requested per keyword, and results per page (the URL template
# fixes limit=10, so the offset advances in steps of 10).
PAGES_PER_KEYWORD = 3
PAGE_SIZE = 10

# Seconds to wait for a search response before giving up.
REQUEST_TIMEOUT = 30


def _collect_restaurants(key_word, restaurants):
    """Append one row to ``lists_content`` per entry of *restaurants*.

    Each entry is expected to hold a ``"restaurant"`` mapping and a
    ``"foods"`` list, as read from the search response; a missing key raises
    ``KeyError``, which the caller treats as "abort this keyword".
    """
    for entry in restaurants:
        shop = entry["restaurant"]
        shop_name = shop["name"]
        shop_id = shop["id"]
        shop_rate = shop["rating"]
        shop_rely = shop["rating_count"]
        shop_url = shop["scheme"]
        # Every food name is followed by the separator, the last one included.
        shop_foods_name = "".join(
            food["name"] + FOOD_SEPARATOR for food in entry["foods"])
        print(shop_name, shop_id, shop_rate, shop_rely, shop_foods_name)
        lists_content.append(
            [key_word, "兰州", shop_name, shop_url, shop_rely, shop_foods_name])


def _write_shop_csv(path):
    """Write the header plus every row collected so far to *path*."""
    with open(path, "w", encoding="utf-8-sig", newline="") as f:
        writer = csv.writer(f, dialect="excel")
        writer.writerow(SHOP_CSV_HEADER)
        writer.writerows(lists_content)


word_pd = pd.read_csv(r"C:\Users\Lavector\Desktop\百事小红书\ele1.csv", engine='python', header=None).values.tolist()
print(word_pd)
for keyword_row in word_pd:
    key_word = keyword_row[0]
    print(key_word)
    try:
        for page in range(PAGES_PER_KEYWORD):
            url = SEARCH_URL.format(key_word, page * PAGE_SIZE)

            # NOTE(security): verify=False disables TLS certificate checks.
            # It is kept because the script has always run this way; remove
            # it if the endpoint's certificate validates in your environment.
            html = requests.get(url, headers=SEARCH_HEADERS, verify=False,
                                timeout=REQUEST_TIMEOUT).text
            print(html)
            html = json.loads(html)
            restaurants = html['inside']["3"]["restaurant_with_foods"]
            print(restaurants)
            print(len(restaurants))
            _collect_restaurants(key_word, restaurants)
    except Exception as exc:
        # Best effort: report the failure, checkpoint what has been collected
        # so far, and move on to the next keyword. KeyboardInterrupt is
        # deliberately not caught so the crawl can be stopped with Ctrl-C.
        print("keyword {!r} aborted: {!r}".format(key_word, exc))
        _write_shop_csv("佳吉兰州3.csv")

print(lists_content)
_write_shop_csv("佳吉兰州5.csv")






你可能感兴趣的:(爬虫)