需求:下载爬取隔壁老樊的热门歌曲以及热门评论,并保存到数据库中,然后从数据库导入excel 数据库格式: 数据库名称:163com 数据库表结构:歌曲表(id+歌曲id+歌曲名称) 评论表(id+歌曲id+评论内容)
数据库操作:
#建表
import pymysql
#连接数据库
connection = pymysql.connect(
host = 'localhost',
user = 'root',
password = '******',
db = '163com',
charset = 'utf8mb4'
)
with connection.cursor() as cursor:
sq1 = 'create table song (id int primary key auto_increment , song_name varchar(50),song_id int) default charset=utf8MB4'
sq2 = 'create table conment (id int primary key auto_increment, song_name varchar(100), song_id int, context varchar(200)) default charset=utf8MB4'
cursor.execute(sq1)
cursor.execute(sq2)
connection.commit()
#数据库函数
import pymysql
connection = pymysql.connect(
host = 'localhost',
user = 'root',
password = '******',
db = '163com',
charset = 'utf8mb4'
)
#插入歌曲id和名称
def insert_song(song_name,song_id):
with connection.cursor() as cursor:
sql = 'insert into song (song_name,song_id) values (%s,%s)'
cursor.execute(sql,(song_name,song_id))
connection.commit()
#查询歌曲名称和id
def get_info():
with connection.cursor() as cursor:
sql = 'select song_name,song_id from song'
cursor.execute(sql)
return cursor.fetchall()
#插入歌曲评论
def insert_conment(song_name,song_id,context):
with connection.cursor() as cursor:
sql = 'insert into conment (song_name,song_id,context) values (%s,%s,%s)'
cursor.execute(sql,(song_name,song_id,context))
connection.commit()
#查看歌曲名及相应的精彩评论
def get_info_con(song_name):
with connection.cursor() as cursor:
sql = "select context from conment where song_name = %s limit 15"
cursor.execute(sql,song_name)
return cursor.fetchall()
#歌曲信息和评论信息操作
#爬取歌曲信息
from selenium import webdriver
from nest163 import sql
url = 'https://music.163.com/#/artist?id=4292'
# 采用selenium
def save_song(url):
driver = webdriver.Chrome()
driver.get(url)
driver.switch_to.frame('g_iframe') # 进入iframe
div_list = driver.find_elements_by_xpath("//span[@class='txt']")
for div in div_list:
song_id = div.find_element_by_xpath('a').get_attribute('href').replace('https://music.163.com/song?id=','')
song_name = div.find_element_by_xpath('a/b').get_attribute('title')
sql.insert_song(song_name,song_id)
if __name__=='__main__':
save_song(url)
#爬取每首歌的精彩评论
from selenium import webdriver
from nest163 import sql
import time
import random
def save_conment():
url_part = 'https://music.163.com/#/song?id='
#从数据库导入歌曲id 歌曲名
song_id_name= sql.get_info() #(('麻雀', 1407551413), ('年少有为', 1293886117))
driver = webdriver.Chrome()
for song_id in song_id_name:
url = url_part + str(song_id[1])
driver.get(url)
time.sleep(random.randint(2,5))
# driver.page_source #查看网页源码 与实际不同
driver.switch_to.frame('g_iframe')
time.sleep(random.randint(1,3))
div_list = driver.find_elements_by_xpath("//div[@class ='cnt f-brk']")
for div in div_list:
context = div.text
# song_name,song_id,context
song_name = song_id[0]
print(context)
sql.insert_conment(song_name,song_id[1],context)
if __name__=='__main__':
save_conment()
保存txt和xlsx
#保存txt
from nest163 import sql
def toTxt():
#取歌曲名
song_id_name = sql.get_info() #(('麻雀', 1407551413), ('年少有为', 1293886117))
for name in song_id_name:
with open('lrh.txt', 'a+',encoding='utf-8') as f:
f.write('\n')
text = '歌曲:'+name[0]
f.write(text+'\n')
f.write('评论:'+'\n')
# 取歌曲评论def
context_list = sql.get_info_con(name[0])
for context in context_list:
with open('lrh.txt','a+',encoding='utf-8') as f:
f.write(' '+context[0]+'\n')
if __name__ == '__main__':
toTxt()
#保存xlsx
from nest163 import sql
import xlrd, xlwt
from xlutils.copy import copy
def toExc():
dir = r'E:\pycharmcode\music_spyder\nest163\lirh.xlsx'
wb = xlrd.open_workbook(dir)
wb_n = copy(wb)
# 取歌曲名
song_id_name = sql.get_info() #(('麻雀', 1407551413), ('年少有为', 1293886117))
print(song_id_name)
sheet_num = 1
for name in song_id_name:
con_num = 0
wb_n.add_sheet(name[0])
ws_n = wb_n.get_sheet(sheet_num)
ws_n.write(con_num,0,name[0])
# 取歌曲评论def
context_list = sql.get_info_con(name[0])
con_num = con_num+1
for context in context_list:
ws_n.write(con_num, 0, context)
con_num = con_num + 1
wb_n.save(r'E:\pycharmcode\music_spyder\nest163\lirh.xlsx')
sheet_num = sheet_num+1
if __name__ == '__main__':
toExc()