本篇博客利用Python爬取网易云音乐的歌词,并利用jieba对歌词进行分词,然后分析周杰伦top50歌曲中最常出现的词语。
我们先尝试获取一首歌的歌词吧,比如《等你下课》这首歌。
下面代码中的531051217就是《等你下课》这首歌的id,但是这种手动填写id的方式不适合批量提取,所以最好从网页的html中提取歌曲的id,这才是正确方法。
import requests
import json
import re
# Fetch the lyric of a single song by its song id and save it to a text file.
lrc_url = 'http://music.163.com/api/song/lyric?' + 'id=' + str(531051217) + '&lv=1&kv=1&tv=-1'
lyric = requests.get(lrc_url)
json_obj = lyric.text
j = json.loads(json_obj)
lrc = j['lrc']['lyric']
# Remove bracketed tags (presumably LRC-style timestamps) from every line.
pat = re.compile(r'\[.*\]')
lrc = re.sub(pat, "", lrc)
lrc = lrc.strip()
print(lrc)
# The context manager guarantees the file is flushed and closed; the explicit
# UTF-8 encoding avoids UnicodeEncodeError on platforms whose default codec
# cannot represent every character of the lyric.
with open("D:/等你下课.txt", 'w', encoding='utf-8') as f:
    f.write(lrc)
import requests
from bs4 import BeautifulSoup
# Download the artist page (artist id 6452) and collect the id and name of
# every song linked from it.
singer_url = 'http://music.163.com/artist?id=' + str(6452)
web_data = requests.get(singer_url)
soup = BeautifulSoup(web_data.text, 'lxml')
singer_name = soup.select("#artist-name")  # elements matching the artist-name id
# The <ul class="f-hide"> element holds one <a> per song.
r = soup.find('ul', {'class': 'f-hide'}).find_all('a')
r = list(r)
music_id_set = []  # song ids of this artist
music_name_set = []  # song names of this artist, in the same order as the ids
for each in r:
    song_name = each.text
    # This append was previously lost inside a trailing comment, which left
    # the name list (and therefore `dic`) empty.
    music_name_set.append(song_name)
    song_id = each.attrs["href"]
    # Drop the first 9 characters of the href (presumably the "/song?id="
    # prefix) so that only the id remains.
    music_id_set.append(song_id[9:])
print(music_id_set)
# Map each song id to its song name.
dic = dict(zip(music_id_set, music_name_set))
print(dic)
根据上面的id批量获取歌词,把上面的根据id获取单曲歌词的代码写成函数形式
def get_lyric_by_music_id(music_id):
    """Fetch the lyric text of one song from the lyric API.

    Args:
        music_id: Song id; it is converted with ``str()`` and placed in the
            query string of the request URL.

    Returns:
        The lyric with bracketed tags removed and surrounding whitespace
        stripped, or ``None`` when the response carries no usable
        ``['lrc']['lyric']`` string (some songs have no lyric).

    Raises:
        Exceptions raised by ``requests.get`` (including a timeout) and by
        ``json.loads`` propagate to the caller.
    """
    lrc_url = 'http://music.163.com/api/song/lyric?' + 'id=' + str(music_id) + '&lv=1&kv=1&tv=-1'
    # A timeout keeps a batch run from hanging forever on one stalled request.
    lyric = requests.get(lrc_url, timeout=10)
    j = json.loads(lyric.text)
    try:
        lrc = j['lrc']['lyric']
    except (KeyError, TypeError):
        # Missing keys, or a null/non-dict value where a dict was expected.
        return None
    if not isinstance(lrc, str):
        # A null lyric would otherwise make re.sub raise TypeError.
        return None
    # Remove bracketed tags (presumably LRC-style timestamps) from every line.
    return re.sub(r'\[.*\]', "", lrc).strip()
# Save each song's lyric to its own file, named after the song.
for i in music_id_set:
    lyric = get_lyric_by_music_id(i)  # lyric of one song, or None
    if lyric is None:  # some songs have no lyric
        print("No lyric")
        continue
    print(dic[i])
    # The file is opened only once there is something to write, so songs
    # without a lyric no longer leave an empty file or an unclosed handle.
    try:
        with open("D:/untitled/" + dic[i] + ".txt", 'w', encoding='utf-8') as f:
            f.write(lyric)
    except UnicodeEncodeError:
        # Best effort, as before: skip this song and carry on with the rest,
        # but report it instead of failing silently.
        print("Skipped (encoding error): " + dic[i])
我们把上面的代码改一下,让50首歌曲的歌词全部写入同一个文件(D:/untitled/周杰伦歌词.txt),为下面分词做准备
from collections import Counter

import jieba
# `Workbook` was used without ever being imported; the add_sheet / write /
# save calls below match the xlwt API.
from xlwt import Workbook

# Read the combined lyric file. The platform default encoding is kept on
# purpose: it has to match whatever encoding the file was written with.
with open("D:/untitled/周杰伦歌词.txt", 'r') as file:
    lyric_str = file.read()

seg = jieba.cut(lyric_str)  # segment the text into words with jieba
# Keep only words longer than one character.
word_list = [each for each in seg if len(each) > 1]
# Word -> number of occurrences.
word_dict = Counter(word_list)
# Most frequent words first. The previous `sorted(...)` call discarded its
# result, so nothing was actually ordered by frequency.
sorted_items = word_dict.most_common()

# Write every word followed by its count to a text file, one per line.
with open("D:/untitled/fenci.txt", 'w') as fc:
    for item in sorted_items:
        print(item)
        fc.write(item[0] + str(item[1]) + '\n')

# Write the words and their counts to an Excel sheet: word in column 0,
# count (as text) in column 1.
workbook = Workbook()
table = workbook.add_sheet('data')
# At most 1000 rows are exported; slicing avoids the IndexError the fixed
# range(1000) raised when fewer than 1000 distinct words exist.
for i, (word, count) in enumerate(sorted_items[:1000]):
    table.write(i, 0, word)
    table.write(i, 1, str(count))
workbook.save("D:/untitled/fenci.xls")
参考博客:
https://www.cnblogs.com/Beyond-Ricky/p/6757954.html