import requests  # HTTP request library
import pprint  # pretty-printing, for inspecting responses
import csv  # saving CSV files
import re  # regular expressions, for text matching
import time  # time module, used to throttle request frequency
# f = open('milk.csv', mode='a', encoding='utf_8_sig', newline='')
# csv_writer = csv.DictWriter(f, fieldnames=[
#     '店铺名称',
#     '店铺详情',
#     '店铺评分',
#     '评价人数',
#     '人均消费',
#     '店铺地址',
# ])
# csv_writer.writeheader()  # write the header row
# # shop name
# findTitle = re.compile(r'(.*)')
# # shop detail link
# findLink = re.compile(r'')  # compile a regex object describing the pattern to match
# # shop rating
# findRating = re.compile(r'')
# # number of reviewers
# findJuge = re.compile(r'(\d*)人评价')
for page in range(0, 1024, 32):
    # Request one page of the shop-search API; offset pages through results 32 at a time
    url = "https://apimobile.meituan.com/group/v4/poi/pcsearch/10"
    time.sleep(3)  # throttle requests
    data = {
        'uuid': '30235d22f37442e1a33a.1641554598.1.0.0',
        'userid': '2226566483',
        'limit': '32',
        'offset': page,
        'cateId': '21329',
        'q': '奶茶',  # search keyword: milk tea
        'token': 'HWhYBIFOfA14T2EVR8sStFLRGHUAAAAA5Q8AALOcbxCEx8TmSFgb1AObqOBrLbEuvHSRCThLA4SlKJ16vkD0nCr1FHg7ti1fw1W19g',
        'areaId': -1
    }
    headers = {
        'Referer': 'https://sh.meituan.com/',  # anti-hotlink check: tells the server which page the request came from
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
    }
    response = requests.get(url=url, params=data, headers=headers)
    # print(response.json())  # the raw JSON response as a dict
    # pprint.pprint(response.json())
    searchResult = response.json()['data']['searchResult']  # pull the result list out by key
    # print(searchResult)
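    # Note (an assumption, not in the original): if the uuid/token has expired,
    # the response may carry no 'data' key and the lookup above raises KeyError.
    # A defensive variant would be:
    #   searchResult = response.json().get('data', {}).get('searchResult', [])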
    for items in searchResult:  # iterate over the list and pull out each shop
        # pprint.pprint(items)
        shop_id = items['id']
        shop_url = f'https://www.meituan.com/meishi/{shop_id}/'
        shop_info = {  # renamed from `data` to avoid shadowing the request params above
            '店铺名称': items['title'],
            '店铺详情': shop_url,
            '店铺评分': items['avgscore'],
            '评价人数': items['comments'],
            '人均消费': items['avgprice'],
            '店铺地址': items['areaname'],
        }
        # csv_writer.writerow(shop_info)  # save one row
        print(shop_info)
# f.close()
print("Done crawling!")
import jieba  # word segmentation, so the word cloud can handle Chinese text
from matplotlib import pyplot as plt  # plotting / data visualisation
from wordcloud import WordCloud, STOPWORDS  # word cloud
from PIL import Image  # image handling
import numpy as np  # array operations
import sqlite3  # database
import pymysql
import pandas as pd
# Prepare the text (words) the word cloud needs
con = pymysql.connect(host='localhost',
                      user='root',
                      password='123456',
                      database='movie_demo')
cur = con.cursor()
sql = 'select 店铺名称 from milk'
# data = cur.execute(sql)
df = pd.read_sql(sql, con)  # reuse the query string defined above
text = ""
df['店铺名称'] = df['店铺名称'].str.split("(").str[0]  # keep only the part before the full-width parenthesis, e.g. '店名(分店)' -> '店名'
print(df['店铺名称'].tolist())
# for item in range(len(data)):
# text = text + item[0]
# # print(item[0])
# # print(text)
# cur.close()
# con.close()
#
# cut = jieba.cut(text)
string = ' '.join(df['店铺名称'].tolist())
# print(len(string))
stopwords = set(STOPWORDS)  # copy the built-in set so the global STOPWORDS is not mutated
stopwords.add('茶馆')
stopwords.add('棋牌')
# stopwords.add('我')
# stopwords.add('人')
# stopwords.add('都')
# stopwords.add('了')
# stopwords.add('是')
img = Image.open(r'.\static\assets\img\tea.jpg')  # open the mask image
img_array = np.array(img)  # convert the image to an array
wc = WordCloud(
    stopwords=stopwords,  # pass the set built above; as originally written this line was commented out, so the added stopwords had no effect
    background_color='white',
    mask=img_array,
    font_path="msyh.ttc"  # font file; Windows fonts live in C:\Windows\Fonts
)
wc.generate_from_text(string)
# Draw the image
fig = plt.figure(1)
plt.imshow(wc)
plt.axis('off')  # hide the axes
plt.show()  # display the generated word cloud
# Save the word-cloud image to a file
# plt.savefig(r'.\static\assets\img\3.jpg', dpi=500)
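One caveat about the commented-out save: with most matplotlib backends, plt.show() closes the figure, so a plt.savefig() placed after it writes a blank image. Saving before showing, or using WordCloud's own writer, avoids this (the output path here is just an example):

wc.to_file(r'.\static\assets\img\wordcloud.png')  # save the rendered cloud directly, no matplotlib needed
# or: call plt.savefig(r'.\static\assets\img\3.jpg', dpi=500) before plt.show()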
import pandas as pd
tea = pd.read_csv('D:/TeaDataAnalysis/tea.csv')  # pd.read_csv is the public alias of pd.io.parsers.read_csv
tea.head()
tea1 = tea.loc[:, ['店铺名称', '店铺详情', '评价人数', '店铺评分']]
tea1.head()
C = tea1['店铺评分'].mean()  # mean rating across all shops
print("Average tea-shop rating: %f" % C)
M = tea1['评价人数'].quantile(0.9)  # review-count threshold: shops with fewer reviews than this are ignored
print("Review-count threshold: %d" % M)  # M is a free choice; the model below uses the 90th percentile, i.e. only the top 10% of shops by review count are analysed
q_tea1 = tea1.copy().loc[tea1['评价人数'] > M]
q_tea1.shape
def weighted_rating(x, M=M, C=C):
    V = x['评价人数']  # number of customers who reviewed this shop
    R = x['店铺评分']  # this shop's own rating
    # blend the shop's rating with the global mean, weighted by review count
    return round((V / (V + M) * R) + (M / (V + M) * C), 2)
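# Worked example with illustrative numbers (not taken from the data): with
# M = 200 and C = 4.5, a shop with V = 500 reviews and rating R = 4.8 scores
#   500/700 * 4.8 + 200/700 * 4.5 = 3.4286 + 1.2857 ≈ 4.71
# so every score is pulled toward the global mean, and shops with few
# reviews are pulled hardest.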
q_tea1['综合得分'] = q_tea1.apply(weighted_rating, axis=1)
q_tea1 = q_tea1.sort_values('综合得分', ascending=False)
df = q_tea1.head(10).copy()  # copy so the insert() below does not modify a view
ids = list(range(1, 11))  # rank ids 1..10 (the original used `list` as a variable name, shadowing the built-in)
# print(ids)
df.insert(0, 'id', ids)
# df.to_csv('recommend-tea.csv', encoding='utf-8', index=False)
print(df)
The full code has been uploaded to GitHub; if you download it, please leave a follow and a star, thanks!
Link: Tea Data Analysis 茶饮数据分析