代码采用模块化设计,每个脚本既可以单独运行,也可以通过可视化界面(GUI)辅助运行
重要依赖包版本:
pyecharts 0.5.5
jinja2 3.0.3
# -*- coding:utf8 -*-
from urllib import request
import json
import pymysql
import re
# Base URL of the user-detail API; the user id is appended to it.
ROOT_URL = 'https://music.163.com/api/v1/user/detail/'
# NOTE: the published listing masked the next three values as `****`, which is
# not valid Python. The values below are the ones the GUI script in this file
# defines (and that saveData_com / saveData_user hard-code); change them if
# your schema uses different names.
DATABASE = 'wangyiyun'
TABLE_USERS = 'users'
TABLE_COMMENTS = 'comments'
# Matches newline, tab, carriage return and '/' so they can be replaced in
# free-text fields before storage.
PATTERN = re.compile(r'[\n\t\r\/]')
def getData_user(url, current_year=None):
    """Fetch one user's profile from the user-detail API.

    Args:
        url: Full user-detail URL (``ROOT_URL`` followed by the user id).
            A falsy value short-circuits to ``None``.
        current_year: Reference year used to convert the birthday timestamp
            into an age. Defaults to the current local year (the original
            code was pinned to 2018, so ages drifted every year).

    Returns:
        A dict with the keys ``userId``, ``userName``, ``avatar``, ``gender``,
        ``age``, ``level``, ``sign``, ``eventCount``, ``followCount``,
        ``fanCount``, ``city`` and ``recordCount``. ``None`` is returned when
        ``url`` is empty, when the request or parsing fails, or when the API
        answers with a code other than 200.
    """
    if not url:
        return None
    print('Crawling>>> ' + url)
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.74 Safari/537.36 Edg/99.0.1150.46',
    }
    try:
        req = request.Request(url, headers=headers)
        # A timeout keeps one stalled request from freezing the whole crawl;
        # the context manager closes the HTTP response deterministically.
        with request.urlopen(req, timeout=10) as resp:
            content = resp.read().decode("utf-8")
        js = json.loads(content)
        if js['code'] != 200:
            return None

        profile = js['profile']

        if current_year is None:
            from datetime import date  # local import: not among this script's imports
            current_year = date.today().year

        # Birthday is a millisecond epoch timestamp; negative values are
        # treated as "unknown" and stored as age 0.
        birthday = int(profile['birthday'])
        if birthday < 0:
            age = 0
        else:
            ms_per_year = 1000 * 365 * 24 * 3600
            age = (current_year - 1970) - birthday // ms_per_year
            if age < 0:
                age = 0

        return {
            'userId': profile['userId'],
            'userName': profile['nickname'],
            'avatar': profile['avatarUrl'],
            'gender': profile['gender'],
            'age': age,
            'level': js['level'],
            # A missing/null signature must not discard the whole profile.
            'sign': PATTERN.sub(' ', profile.get('signature') or ''),
            'eventCount': profile['eventCount'],
            'followCount': profile['follows'],
            'fanCount': profile['followeds'],
            'city': profile['city'],
            'recordCount': js['listenSongs'],
        }
    except Exception as e:
        # Best-effort crawler: report the failure and let the caller move on.
        print('Down err>>> ', e)
        return None
def saveData_user(data):
    """Insert one crawled user profile into the ``users`` table.

    Args:
        data: Dict as produced by ``getData_user``. A falsy value is ignored.

    Returns:
        Always ``None``. Database errors raised while inserting are printed
        and swallowed so that a single bad row does not stop a crawl; a
        failure to connect still propagates to the caller.
    """
    if not data:
        return None
    # utf8mb4 is required so emoji inside signatures can be stored.
    conn = pymysql.connect(host='localhost', user='root', passwd='qwer', db=DATABASE, charset='utf8mb4')
    cursor = conn.cursor()
    sql = 'insert into ' + 'users' + '(id,userName,gender,age,level,city,sign,eventCount,followsCount,followedCount,recordCount,avatar,userId) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
    try:
        # The id column is sent as 0 exactly as before; presumably it is
        # AUTO_INCREMENT so MySQL assigns the real value - verify against the schema.
        params = (
            0,
            data['userName'],
            data['gender'],
            data['age'],
            data['level'],
            data['city'],
            data['sign'],
            data['eventCount'],
            data['followCount'],
            data['fanCount'],
            data['recordCount'],
            data['avatar'],
            data['userId'],
        )
        cursor.execute(sql, params)
        conn.commit()
    except Exception as e:
        # .get() so that a record without 'userId' cannot raise a second
        # exception here and hide the real error.
        print('mysql err>>> ', data.get('userId'), e)
    finally:
        # Closing without a commit discards anything left uncommitted.
        cursor.close()
        conn.close()
def getID_user():
    """Read the ``userId`` column of every row in the comments table.

    Returns:
        The rows as returned by ``cursor.fetchall()`` (one 1-tuple per row).
        When the query fails an empty tuple is returned instead of ``None``,
        because every caller iterates over the result directly.

    Raises:
        Whatever ``pymysql.connect`` raises when the connection itself cannot
        be established (unchanged from the original behaviour).
    """
    conn = pymysql.connect(host='localhost', user='root', passwd='qwer', db=DATABASE, charset='utf8mb4')
    cursor = conn.cursor()
    sql = 'SELECT userId FROM ' + TABLE_COMMENTS
    try:
        cursor.execute(sql)
        return cursor.fetchall()
    except Exception as e:
        print('get err>>> ', e)
        return ()
    finally:
        cursor.close()
        conn.close()
if __name__ == '__main__':
    # Crawl and store the profile of every user id found in the comments table.
    # `or ()` tolerates a None result from getID_user.
    for row in getID_user() or ():
        # str() first: the id may come back from MySQL as an integer, which
        # has no .strip() and cannot be concatenated to the URL.
        user_id = str(row[0]).strip()
        if not user_id:
            continue
        saveData_user(getData_user(ROOT_URL + user_id))
# -*- coding:utf8 -*-
from urllib import request
import json
import pymysql
from datetime import datetime
import re
# Comments API URL template, filled with (song id, page size, offset).
ROOT_URL = 'http://music.163.com/api/v1/resource/comments/R_SO_4_%s?limit=%s&offset=%s'
LIMIT_NUMS = 50  # number of comments requested per page
# NOTE: the published listing masked the next two values as `****`, which is
# not valid Python. The values below are the ones the GUI script in this file
# defines (and that saveData_com hard-codes); change them if your schema differs.
DATABASE = 'wangyiyun'  # database name
TABLE = 'comments'  # table name
# Matches newline, tab, carriage return and '/' so they can be stripped from
# comment text before storage.
PATTERN = re.compile(r'[\n\t\r\/]')
def getData_com(url):
    """Download one page of comments for a song.

    Args:
        url: Fully formatted comments URL (``ROOT_URL`` filled with song id,
            limit and offset). A falsy value short-circuits.

    Returns:
        A ``(total, comments)`` tuple. ``total`` is the comment count reported
        by the API and ``comments`` is a list of dicts with the keys
        ``commentId``, ``content``, ``time`` (a ``datetime``), ``likedCount``
        and ``userId``. On an empty URL or on any download/parsing failure the
        result is ``(None, None)``. The original fell through to a bare
        ``None`` on failure, which made ``total, data = getData_com(...)`` in
        the callers raise ``TypeError``.
    """
    if not url:
        return None, None
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
        "Host": "music.163.com",
    }
    print('Crawling>>> ' + url)
    try:
        req = request.Request(url, headers=headers)
        # Timeout so a stalled page cannot hang the crawl; `with` closes the response.
        with request.urlopen(req, timeout=10) as resp:
            content = resp.read().decode("utf-8")
        js = json.loads(content)
        total = int(js['total'])
        datas = []
        for c in js['comments']:
            datas.append({
                'commentId': c['commentId'],
                # A null content must not abort the whole page.
                'content': PATTERN.sub('', c['content'] or ''),
                # The API reports milliseconds since the epoch.
                'time': datetime.fromtimestamp(c['time'] // 1000),
                'likedCount': c['likedCount'],
                'userId': c['user']['userId'],
            })
        return total, datas
    except Exception as e:
        print('Down err>>> ', e)
        return None, None
def saveData_com(data):
    """Insert a page of comments into the comments table.

    Args:
        data: List of comment dicts as produced by ``getData_com``. A falsy
            value (``None`` or an empty list) is ignored.

    Returns:
        Always ``None``. Each row is committed on its own; a row that fails
        to insert is reported and skipped so the rest of the page is kept.
    """
    if not data:
        return None
    # utf8mb4 is required so emoji inside comments can be stored.
    # DATABASE replaces the previously hard-coded 'wangyiyun' so this function
    # agrees with the module constant and with saveData_user.
    conn = pymysql.connect(host='localhost', user='root', passwd='qwer', db=DATABASE,
                           charset='utf8mb4')
    cursor = conn.cursor()
    sql = 'insert into ' + TABLE + ' (id,commentId,content,likedCount,time,userId) VALUES (%s,%s,%s,%s,%s,%s)'
    try:
        for d in data:
            try:
                # The original also ran `SELECT max(id)` before every insert and
                # never read the result; that wasted round trip is removed.
                cursor.execute(sql, (0, d['commentId'], d['content'], d['likedCount'], d['time'], d['userId']))
                conn.commit()
            except Exception as e:
                # .get() so a malformed record cannot raise again inside the handler.
                print('mysql err>>> ', d.get('commentId'), e)
    finally:
        # Always release the handles, even if iteration itself blows up.
        cursor.close()
        conn.close()
if __name__ == '__main__':
    songId = input('歌曲ID:').strip()
    # `or (None, None)` keeps the unpacking safe even if getData_com returns
    # a bare None on failure.
    total, data = getData_com(ROOT_URL % (songId, LIMIT_NUMS, 0)) or (None, None)
    saveData_com(data)
    if total:
        # The original loop was `range(1, total // num + 1)` with `num` never
        # defined (NameError). The first page is already stored, so walk the
        # remaining offsets in steps of the page size.
        for offset in range(LIMIT_NUMS, total, LIMIT_NUMS):
            _, data = getData_com(ROOT_URL % (songId, LIMIT_NUMS, offset)) or (None, None)
            saveData_com(data)
# -*- coding: utf-8 -*-
import re
import sys
import time
import pandas as pd
import pymysql
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from snownlp import SnowNLP
from pyecharts import Bar, Pie, Line, Scatter, Map
import pymysql
from PyQt5.QtCore import QUrl, QFileInfo
from PyQt5.QtCore import *
from PyQt5.QtWidgets import *
from ui_main import Ui_MainWindow
from PyQt5.QtWidgets import *
from PyQt5 import QtGui, QtWidgets
from PyQt5.QtWidgets import QApplication, QMainWindow, QHeaderView
from ui_show_users import Ui_Form2
from ui_show_comments import Ui_Form
from comment import *
from users import *
from PyQt5.QtWebEngineWidgets import *
# Use matplotlib's 'ggplot' style sheet for figures drawn with plt.
plt.style.use('ggplot')
# Turn off the Unicode minus glyph on axes (commonly done so that negative
# tick labels still render when a CJK font is active).
plt.rcParams['axes.unicode_minus'] = False
# Comments API URL template; p_comments fills it with (song id, limit, offset).
ROOT_URL_com = 'http://music.163.com/api/v1/resource/comments/R_SO_4_%s?limit=%s&offset=%s'
# User-detail API base URL; p_users appends a user id to it.
ROOT_URL = 'https://music.163.com/api/v1/user/detail/'
LIMIT_NUMS = 50 # number of comments requested per page
# MySQL connection settings used by the GUI.
HOST='localhost'
PORT=3306
USER='root'
PWD='qwer'
DATABASE = 'wangyiyun' # database name
TABLE = 'comments' # table name
TABLE_USERS = 'users'
TABLE_COMMENTS = 'comments'
# Matches newline, tab, carriage return and '/' characters.
PATTERN = re.compile(r'[\n\t\r\/]')
class mainwindow(QMainWindow):
    """Main window of the comment-analysis GUI.

    Connects the buttons of ``Ui_MainWindow`` to three groups of actions:
    crawling (comments / users), browsing the stored tables, and rendering
    charts (pyecharts HTML or a word-cloud PNG) that are shown in a
    ``QWebEngineView``.

    Changes compared with the original listing:
      * ``del_all_data`` commits, so the deletes actually persist.
      * Database handles are closed on every path (cursor before connection).
      * Chart handlers report failures in the status bar instead of raising
        out of the Qt slot, and only run the query they need.
      * ``city_group`` tolerates unknown / non-string city codes.
      * The word cloud joins tokens with spaces so the segmentation survives.
      * ``img_show`` builds a proper ``file://`` URL.
    """

    # First two characters of the stored city code -> province shown on the map.
    CITY_MAP = {
        '11': '北京',
        '12': '天津',
        '31': '上海',
        '50': '重庆',
        '5e': '重庆',
        '81': '香港',
        '82': '澳门',
        '13': '河北',
        '14': '山西',
        '15': '内蒙古',
        '21': '辽宁',
        '22': '吉林',
        '23': '黑龙江',
        '32': '江苏',
        '33': '浙江',
        '34': '安徽',
        '35': '福建',
        '36': '江西',
        '37': '山东',
        '41': '河南',
        '42': '湖北',
        '43': '湖南',
        '44': '广东',
        '45': '广西',
        '46': '海南',
        '51': '四川',
        '52': '贵州',
        '53': '云南',
        '54': '西藏',
        '61': '陕西',
        '62': '甘肃',
        '63': '青海',
        '64': '宁夏',
        '65': '新疆',
        '71': '台湾',
        '10': '其他',
        '0': '其他',
    }

    def __init__(self):
        """Build the UI, install the status bar and wire every button."""
        super().__init__()
        self.ui = Ui_MainWindow()
        self.ui.setupUi(self)
        self.show()
        # Note: this attribute shadows QMainWindow.statusBar(); the name is
        # kept because status_msg (and possibly outside code) relies on it.
        self.statusBar = QStatusBar()
        self.setStatusBar(self.statusBar)
        # Database handles managed by connect_mysql() / _close_mysql().
        self.conn = None
        self.cur = None

        wiring = (
            (self.ui.pushButton, self.p_comments),
            (self.ui.pushButton_2, self.p_users),
            (self.ui.pushButton_4, self.show_comments),
            (self.ui.pushButton_3, self.show_users),
            (self.ui.pushButton_5, self.show_com_day),
            (self.ui.pushButton_6, self.show_com_week),
            (self.ui.pushButton_9, self.show_com_hour),
            (self.ui.pushButton_10, self.show_user_city),
            (self.ui.pushButton_11, self.show_user_age),
            (self.ui.pushButton_12, self.show_user_emotion),
            (self.ui.pushButton_13, self.show_emotion_label),
            (self.ui.pushButton_14, self.show_could),
            (self.ui.pushButton_15, self.del_all_data),
            (self.ui.pushButton_16, self.admin_ok),
        )
        for button, handler in wiring:
            button.clicked.connect(handler)

    # ------------------------------------------------------------------ admin
    def admin_ok(self):
        """Enable pushButton_15 (wired to del_all_data) and disable pushButton_16."""
        self.ui.pushButton_15.setEnabled(True)
        self.ui.pushButton_16.setEnabled(False)

    def del_all_data(self):
        """Delete every row from the users and comments tables."""
        if not self.connect_mysql():
            return
        try:
            self.cur.execute('delete from ' + TABLE_USERS)
            self.cur.execute('delete from ' + TABLE_COMMENTS)
            # PyMySQL does not autocommit by default; without this commit the
            # deletes were discarded when the connection was closed.
            self.conn.commit()
        except Exception as e:
            # Nothing was committed, so closing the connection discards the work.
            self._report_error(e)
            return
        finally:
            self._close_mysql()
        self.status_msg('全部数据已清空!', 10000)

    # ---------------------------------------------------------------- helpers
    def city_group(self, cityCode):
        """Map a stored city code to a province name.

        Args:
            cityCode: City code as read from the ``city`` column. It is
                converted with ``str()`` first, so integer codes work too.

        Returns:
            The province for the first two characters of the code, or
            ``'其他'`` when the prefix is not in ``CITY_MAP`` (the original
            raised ``KeyError`` and aborted the whole chart).
        """
        return self.CITY_MAP.get(str(cityCode)[:2], '其他')

    def _report_error(self, error):
        """Print an error and show it in the status bar for five seconds."""
        print(str(error))
        self.status_msg(str(error), 5000)

    def _close_mysql(self):
        """Close the cursor, then the connection, opened by connect_mysql()."""
        for handle in (getattr(self, 'cur', None), getattr(self, 'conn', None)):
            if handle is None:
                continue
            try:
                handle.close()
            except Exception as e:
                print(e)
        self.cur = None
        self.conn = None

    def _fetch_all(self, sql):
        """Run a SELECT and return all rows; ``()`` when anything fails."""
        if not self.connect_mysql():
            return ()
        try:
            self.cur.execute(sql)
            return self.cur.fetchall()
        except Exception as e:
            print(e)
            return ()
        finally:
            self._close_mysql()

    def _read_frame(self, sql):
        """Run a SELECT on a short-lived connection and return a DataFrame."""
        conn = pymysql.connect(host=HOST, port=PORT, user=USER, passwd=PWD, db=DATABASE, charset='utf8mb4')
        try:
            return pd.read_sql(sql, con=conn)
        finally:
            conn.close()

    @staticmethod
    def _count_by(values):
        """Count occurrences per distinct value of a Series.

        Returns:
            ``(keys, counts)`` as two plain lists, with keys in ascending order.
        """
        counts = values.value_counts(sort=False).sort_index()
        return counts.index.tolist(), counts.values.tolist()

    @staticmethod
    def _ensure_parent_dir(path):
        """Create the directory that will hold ``path`` if it is missing."""
        import os  # local import: `os` is not among this file's module-level imports
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    def _render_and_show(self, chart, path):
        """Render a pyecharts chart to ``path`` and open it in the browser view."""
        self._ensure_parent_dir(path)
        chart.render(path)
        self.img_show(path)

    def _show_line(self, title, keys, counts, path):
        """Render a dark-themed filled line chart and display it."""
        line = Line(title)
        line.use_theme('dark')
        line.add(
            '',
            keys,
            counts,
            is_fill=True,
        )
        self._render_and_show(line, path)

    def _show_bar(self, title, counts, path):
        """Render a dark-themed bar chart from a counts Series and display it."""
        bar = Bar(title)
        bar.use_theme('dark')
        bar.add(
            '',
            y_axis=counts.values,
            x_axis=counts.index.values,
            is_fill=True,
        )
        self._render_and_show(bar, path)

    def _show_comment_times(self, title, part, path):
        """Chart the number of comments per ``part`` of their timestamp.

        Args:
            title: Chart title.
            part: Name of the ``Series.dt`` accessor to group by
                (``'date'``, ``'dayofweek'`` or ``'hour'``).
            path: Output HTML file.
        """
        try:
            self.comments = self._read_frame('SELECT id,time FROM ' + TABLE_COMMENTS)
            keys, counts = self._count_by(getattr(self.comments['time'].dt, part))
            self._show_line(title, keys, counts, path)
        except Exception as e:
            self._report_error(e)

    def _sentiment_frame(self):
        """Return the comments frame with ``content`` replaced by its SnowNLP score.

        Scores are rounded to two decimals, as in the original.
        """
        text = self.getText()
        text['content'] = text['content'].apply(lambda x: round(SnowNLP(x).sentiments, 2))
        return text

    def _show_table(self, ui_form, sql):
        """Fill ``ui_form.tableWidget`` with the result of ``sql`` in a dialog."""
        form = QDialog()
        ui_form.setupUi(form)
        form.setWindowModality(Qt.NonModal)
        table = ui_form.tableWidget
        table.clearContents()
        rows = self._fetch_all(sql)
        # setItem() ignores cells outside the current row/column range, so the
        # table has to be sized to the result before it is filled.
        table.setRowCount(len(rows))
        if rows and table.columnCount() < len(rows[0]):
            table.setColumnCount(len(rows[0]))
        for r, row in enumerate(rows):
            for c, value in enumerate(row):
                table.setItem(r, c, QtWidgets.QTableWidgetItem(str(value)))
        form.show()
        QApplication.processEvents()
        form.exec_()

    # --------------------------------------------------------------- crawlers
    def p_comments(self):
        """Crawl the comments of the song id typed into ``lineEdit``.

        The first page is always fetched. Afterwards ``total // num`` further
        pages are requested, where ``num`` is the integer typed into
        ``lineEdit_2`` (10 when left empty). NOTE(review): the intended
        meaning of that field is not visible here; the arithmetic is kept
        exactly as it was.

        Returns:
            ``0`` when no song id was entered, otherwise ``None``.
        """
        try:
            songId = self.ui.lineEdit.text().strip()
            if songId == '':
                return 0
            # `or (None, None)` keeps the unpacking safe if the crawler
            # returns a bare None on failure.
            total, data = getData_com(ROOT_URL_com % (songId, LIMIT_NUMS, 0)) or (None, None)
            saveData_com(data)
            num = self.ui.lineEdit_2.text().strip() or 10
            if total:
                last_page = total // int(num)
                for i in range(1, last_page + 1):
                    _, data = getData_com(ROOT_URL_com % (songId, LIMIT_NUMS, i * LIMIT_NUMS)) or (None, None)
                    saveData_com(data)
        except Exception as e:
            self._report_error(e)

    def p_users(self):
        """Crawl and store the profile of every user id found in the comments table."""
        try:
            for row in getID_user() or ():
                # str() first: the id may come back from MySQL as an integer.
                data = getData_user(ROOT_URL + str(row[0]).strip())
                saveData_user(data)
        except Exception as e:
            self._report_error(e)

    # ---------------------------------------------------------- table viewers
    def show_comments(self):
        """Show the whole comments table in a dialog."""
        self.ui2 = Ui_Form()
        self._show_table(self.ui2, "select * from comments")

    def show_users(self):
        """Show the whole users table in a dialog."""
        self.ui3 = Ui_Form2()
        self._show_table(self.ui3, "select * from users")

    # ----------------------------------------------------------------- charts
    def show_com_day(self):
        """Line chart: number of comments per calendar day."""
        self._show_comment_times('评论时间(按天)分布', 'date', r'./output/com_day.html')

    def show_com_week(self):
        """Line chart: number of comments per day of the week."""
        self._show_comment_times('评论时间(按周)分布', 'dayofweek', r'./output/com_week.html')

    def show_com_hour(self):
        """Line chart: number of comments per hour of the day."""
        self._show_comment_times('评论时间(按小时)分布', 'hour', r'./output/com_hour.html')

    def show_user_city(self):
        """Map chart: number of users per province."""
        try:
            self.users = self._read_frame('SELECT id,gender,age,city FROM ' + TABLE_USERS)
            keys, counts = self._count_by(self.users['city'].apply(self.city_group))
            map_ = Map('用户地区分布图')
            map_.add(
                '',
                keys,
                counts,
                maptype='china',
                is_visualmap=True,
                visual_text_color='#000',
                is_label_show=True,
            )
            self._render_and_show(map_, r'./output/user_city.html')
        except Exception as e:
            self._report_error(e)

    def show_user_age(self):
        """Line chart: number of users per age, ignoring ages below 1."""
        try:
            self.users = self._read_frame('SELECT id,gender,age,city FROM ' + TABLE_USERS)
            known_age = self.users[self.users['age'] > 0]
            keys, counts = self._count_by(known_age['age'])
            # The original stored this Line chart in a local named `Bar`,
            # shadowing the pyecharts class of the same name.
            self._show_line('用户年龄分布', keys, counts, r'./output/user_age.html')
        except Exception as e:
            self._report_error(e)

    def getText(self):
        """Load ``id`` and ``content`` of every comment.

        The connection uses utf8mb4 (the original used utf8 here, unlike every
        other connection in this file, which cannot carry emoji) and is closed
        before returning.

        Returns:
            The DataFrame, which is also stored as ``self.text``.
        """
        sql = 'SELECT id,content FROM ' + TABLE_COMMENTS
        self.text = self._read_frame(sql)
        return self.text

    def show_user_emotion(self):
        """Bar chart: number of comments per sentiment score."""
        try:
            text = self._sentiment_frame()
            semiscore = text.id.groupby(text['content']).count()
            self._show_bar('评论情感得分', semiscore, r'./output/user_emotion.html')
        except Exception as e:
            self._report_error(e)

    def show_emotion_label(self):
        """Bar chart: comments labelled 1 (score > 0.5) versus -1 (otherwise)."""
        try:
            text = self._sentiment_frame()
            text['content_num'] = text['content'].apply(lambda x: 1 if float(x) > 0.5 else -1)
            semilabel = text.id.groupby(text['content_num']).count()
            self._show_bar('评论情感标签', semilabel, r'./output/emotion_label.html')
        except Exception as e:
            self._report_error(e)

    def show_could(self):
        """Build a word cloud from all comments and display the saved PNG."""
        try:
            text = self.getText()
            corpus = ' '.join(str(s) for s in text['content'] if s)
            with open(r'./StopWords.txt', 'r', encoding='utf-8') as fh:
                # A set makes the per-token stop-word test O(1).
                stopwords = {line.strip() for line in fh}
            words = [
                seg for seg in jieba.cut(corpus, cut_all=False)
                if seg.strip() and seg not in stopwords
            ]
            # Tokens must be separated by spaces: WordCloud splits on word
            # boundaries, so joining with '' would undo jieba's segmentation.
            clean_text = ' '.join(words)

            cloud = WordCloud(
                font_path=r'C:/Windows/Fonts/msyh.ttc',
                background_color='white',
                max_words=800,
                max_font_size=64
            )
            word_cloud = cloud.generate(clean_text)

            fig = plt.figure(figsize=(12, 12))
            plt.imshow(word_cloud)
            plt.axis('off')
            img_name = str(int(time.mktime(time.localtime())))
            img_path = "./output/" + img_name + ".png"
            self._ensure_parent_dir(img_path)
            plt.savefig(img_path)
            # Release the figure; otherwise every click keeps one more alive.
            plt.close(fig)
            self.img_show(img_path)
        except Exception as e:
            self._report_error(e)

    # --------------------------------------------------------- infrastructure
    def connect_mysql(self):
        """Open ``self.conn`` / ``self.cur`` using the module-level settings.

        Returns:
            ``True`` on success. On failure the error is shown in the status
            bar, both attributes are reset to ``None`` and ``False`` is
            returned (the original returned ``None`` in both cases).
        """
        try:
            self.conn = pymysql.connect(host=HOST, port=PORT, user=USER, password=PWD, db=DATABASE,
                                        charset='utf8mb4')
            self.cur = self.conn.cursor()
            return True
        except Exception as e:
            self.conn = None
            self.cur = None
            self.status_msg('[-]数据库连接错误!<' + str(e) + '>', 5000)
            return False

    def img_show(self, url):
        """Open a local HTML or image file in a new ``QWebEngineView``.

        Args:
            url: Path of the file, relative to the working directory.
        """
        self.ui.browser = QWebEngineView()
        local_path = QFileInfo("./" + str(url)).absoluteFilePath()
        # fromLocalFile() yields a proper file:// URL; passing the bare path
        # to QUrl() produces a URL without a scheme.
        self.ui.browser.load(QUrl.fromLocalFile(local_path))
        self.ui.browser.show()

    def f5_data(self):
        """Reconnect to the database and refresh the table view.

        NOTE(review): ``renew_table`` is not defined anywhere in this class as
        shown, so unless it is provided elsewhere this always ends in the
        error branch. Behaviour is left unchanged.
        """
        try:
            self.connect_mysql()
            self.renew_table()
            self.status_msg('[+]刷新界面成功!', 5000)
        except Exception as e:
            self.status_msg('[-]刷新界面有误!<' + str(e) + '>', 5000)
        QApplication.processEvents()

    def status_msg(self, msg, time):
        """Show ``msg`` in the status bar for ``time`` milliseconds."""
        self.statusBar.showMessage(str(msg), int(time))
        QApplication.processEvents()
if __name__ == "__main__":
    # Create the Qt application, build the main window (it shows itself in
    # its constructor) and hand the event loop's exit status to the shell.
    qt_app = QApplication(sys.argv)
    window = mainwindow()  # keep a reference for the lifetime of the event loop
    exit_code = qt_app.exec_()
    sys.exit(exit_code)
有问题评论区告诉我!