美团商品各项数据(1.爬取)

程序

import requests
import time
import json
import pandas as pd

这里只需要这四个库,pandas主要作保存数据用

# HTTP request headers attached to every page fetch.
headers={
     # Fill in the request headers here, mainly Cookie and User-Agent
     # (NOTE(review): presumably copied from a logged-in browser session,
     # since the site likely blocks anonymous requests — confirm).
}
rows = []  # collected result rows, one list of strings per shop
n = 0      # count of completed pages, used only for progress printing
#num_list = ['单1一', '2二双', '3三', '4四', '5五', '6六', '7七', '8八', '9九', '10十']
for i in range(100):
    url = 'https://cd.meituan.com/meishi/c17/pn'+str(i+1)+'/'
    res = requests.get(url, headers=headers)
    html = res.content.decode(res.apparent_encoding)
    for i in range(len(html)):
        if html[i:i+8] == 'poiInfos':
            for j in range(i+8, len(html)):
                if html[j:j+9] == 'comHeader':
                    html = html[i+10:j-3]
    result = json.loads(html)	#json库解析html
    for i in result:
        row = []
        #row.append(i['title'])
        #row.append(str(i['poiId']))
        row.append(str(i['avgScore']))
        row.append(str(i['allCommentNum']))
        row.append(str(int(i['avgScore']*i['allCommentNum'])))
        row.append(str(i['avgPrice']))
        for key, value in i.items():
            if key == 'address':
                address = ''
                for j in value:
                    if j != '区' and j != '市':
                        address += j
                    else:
                        row.append(address+j)
                        break
            if key == 'dealList' and i['dealList'] != None:
                soldCounts_total = 0
                #soldCounts_total_num = 0
                for j in value:
                    if '券' not in j['title']:
                        #num = 0
                        soldCounts_total += j['soldCounts']
                        #for i in range(len(j['title'])):
                            #if j['title'][i] == '人' or '位' or '个' or '份':
                                #for x in range(10):
                                    #if j['title'][i-1] in num_list[x] and num == 0:
                                        #row.append([j['price'], j['soldCounts']])
                                        #soldCounts_total_num += j['soldCounts']*(
                                            #x+1)
                                        # print(x+1, j['soldCounts'])
                                        #num += 1
                if soldCounts_total != 0:
                    row.append(str(soldCounts_total))
                    #row.append(str(soldCounts_total_num))
        # time.sleep(0.5)
        #urls = 'https://www.meituan.com/meishi/'+row[1]+'/'
        #ress = requests.get(urls, headers=headers)
        #htmls = ress.content.decode(ress.apparent_encoding)
        # for i in range(len(htmls)):
            # if htmls[i:i+10] == 'extraInfos':
                # for j in range(i+10, len(htmls)):
                    # if htmls[j:j+15] == 'hasFoodSafeInfo':
                    #htmls = htmls[i+12:j-2]
        #results = json.loads(htmls)
        # if len(results) == 0:
            # row.append(0)
            # row.append(0)
        # elif len(results) == 1:
            # row.append(1)
            # row.append(0)
        # else:
            # row.append(1)
            # row.append(1)

        if len(row) == 6 and row[2] != 0 and row[3] != 0 and row[-1] != 0:
            rows.append(row)
    time.sleep(0.5)
    n += 1
    print('已经完成{}'.format(n))

# Persist every collected row to disk as CSV (default integer header
# row and index column, same as the original output).
pd.DataFrame(rows).to_csv('meituan huoguo.csv')

代码简单易懂,此处不再说明,只需要对照解析的html即可理解,加注释的部分供参考,可自行探索。此后将尝试爬取店铺的评论以及对数据的分析。

你可能感兴趣的:(python,爬虫,美团,python,json,美团,爬虫)