网易云歌曲评论爬取(selenium+python)

需求:下载爬取隔壁老樊的热门歌曲以及热门评论,并保存到数据库中,然后从数据库导入excel
数据库格式:
数据库名称:163com
数据库表结构:歌曲表(id+歌曲id+歌曲名称)
             评论表(id+歌曲id+评论内容) 

数据库操作:

#建表
import pymysql
#连接数据库
connection = pymysql.connect(
    host = 'localhost',
    user = 'root',
    password = '******',
    db = '163com',
    charset = 'utf8mb4'
)

with connection.cursor() as cursor:
    sq1 = 'create table song (id int primary key auto_increment , song_name varchar(50),song_id int) default charset=utf8MB4'
    sq2 = 'create table conment (id int primary key auto_increment, song_name varchar(100), song_id int, context varchar(200)) default charset=utf8MB4'
    cursor.execute(sq1)
    cursor.execute(sq2)
connection.commit()

#数据库函数
import pymysql
connection = pymysql.connect(
    host = 'localhost',
    user = 'root',
    password = '******',
    db = '163com',
    charset = 'utf8mb4'
)

#插入歌曲id和名称
def insert_song(song_name,song_id):
    with connection.cursor() as cursor:
        sql = 'insert into song (song_name,song_id) values (%s,%s)'
        cursor.execute(sql,(song_name,song_id))
    connection.commit()

#查询歌曲名称和id
def get_info():
    with connection.cursor() as cursor:
        sql = 'select song_name,song_id from song'
        cursor.execute(sql)
        return cursor.fetchall()

#插入歌曲评论
def insert_conment(song_name,song_id,context):
    with connection.cursor() as cursor:
        sql = 'insert into conment (song_name,song_id,context) values (%s,%s,%s)'
        cursor.execute(sql,(song_name,song_id,context))
    connection.commit()

#查看歌曲名及相应的精彩评论
def get_info_con(song_name):
    with connection.cursor() as cursor:
        sql = "select context from conment where song_name = %s limit 15"
        cursor.execute(sql,song_name)
        return cursor.fetchall()

网易云歌曲评论爬取(selenium+python)_第1张图片 

 网易云歌曲评论爬取(selenium+python)_第2张图片

#歌曲信息和评论信息操作

#爬取歌曲信息
from selenium import webdriver
from nest163 import sql

url = 'https://music.163.com/#/artist?id=4292'

# 采用selenium
def save_song(url):
    driver = webdriver.Chrome()
    driver.get(url)
    driver.switch_to.frame('g_iframe')  # 进入iframe
    div_list = driver.find_elements_by_xpath("//span[@class='txt']")
    for div in div_list:
        song_id = div.find_element_by_xpath('a').get_attribute('href').replace('https://music.163.com/song?id=','')
        song_name = div.find_element_by_xpath('a/b').get_attribute('title')
        sql.insert_song(song_name,song_id)

if __name__=='__main__':
    save_song(url)


#爬取每首歌的精彩评论
from selenium import webdriver
from nest163 import sql
import time
import random


def save_conment():
    url_part = 'https://music.163.com/#/song?id='
    #从数据库导入歌曲id  歌曲名
    song_id_name= sql.get_info()  #(('麻雀', 1407551413), ('年少有为', 1293886117))
    driver = webdriver.Chrome()
    for song_id in song_id_name:
        url = url_part + str(song_id[1])
        driver.get(url)
        time.sleep(random.randint(2,5))
        # driver.page_source #查看网页源码  与实际不同
        driver.switch_to.frame('g_iframe')
        time.sleep(random.randint(1,3))
        div_list = driver.find_elements_by_xpath("//div[@class ='cnt f-brk']")
        for div in div_list:
            context = div.text
            # song_name,song_id,context
            song_name = song_id[0]
            print(context)
            sql.insert_conment(song_name,song_id[1],context)


if __name__=='__main__':
    save_conment()

保存txt和xlsx

#保存txt
from nest163 import sql

def toTxt():
    #取歌曲名
    song_id_name = sql.get_info() #(('麻雀', 1407551413), ('年少有为', 1293886117))
    for name in song_id_name:
        with open('lrh.txt', 'a+',encoding='utf-8') as f:
            f.write('\n')
            text = '歌曲:'+name[0]
            f.write(text+'\n')
            f.write('评论:'+'\n')

        # 取歌曲评论def
        context_list = sql.get_info_con(name[0])
        for context in context_list:
            with open('lrh.txt','a+',encoding='utf-8') as f:
                f.write('  '+context[0]+'\n')

if __name__ == '__main__':
    toTxt()

#保存xlsx
from nest163 import sql
import xlrd, xlwt
from xlutils.copy import copy


def toExc():
    dir = r'E:\pycharmcode\music_spyder\nest163\lirh.xlsx'
    wb = xlrd.open_workbook(dir)
    wb_n = copy(wb)
    # 取歌曲名
    song_id_name = sql.get_info() #(('麻雀', 1407551413), ('年少有为', 1293886117))
    print(song_id_name)
    sheet_num = 1
    for name in song_id_name:
        con_num = 0
        wb_n.add_sheet(name[0])
        ws_n = wb_n.get_sheet(sheet_num)
        ws_n.write(con_num,0,name[0])
        # 取歌曲评论def
        context_list = sql.get_info_con(name[0])
        con_num = con_num+1
        for context in context_list:
            ws_n.write(con_num, 0, context)
            con_num = con_num + 1
        wb_n.save(r'E:\pycharmcode\music_spyder\nest163\lirh.xlsx')
        sheet_num = sheet_num+1

if __name__ == '__main__':
    toExc()

网易云歌曲评论爬取(selenium+python)_第3张图片 

网易云歌曲评论爬取(selenium+python)_第4张图片 

网易云歌曲评论爬取(selenium+python)_第5张图片

你可能感兴趣的:(selenium,python,数据库)