先看结果
1、获取列表页信息,url为https://c.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg?tpl=3&page=detail&date=2019_02&topid=26&type=top&song_begin=0&song_num=30&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0
json样式为:
2、获取详情页
# Request headers for the lyric endpoint.
# NOTE(review): the values look like they were captured from a browser session
# (cookie, referer, user-agent); "authority"/"method"/"path"/"scheme" are sent
# as ordinary header fields here.
# The "path" value previously contained "¬ice=0", an HTML-entity corruption of
# "&notice=0"; the query string is restored below.
headers = {
    "authority": "c.y.qq.com",
    "method": "GET",
    "path": "/lyric/fcgi-bin/fcg_query_lyric_yqq.fcg?nobase64=1&musicid=225716644&-=jsonp1&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0",
    "scheme": "https",
    "accept": "application/json, text/javascript, */*; q=0.01",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "zh-CN,zh;q=0.9",
    "cookie": "pgv_pvi=5936793600; pt2gguin=o1952436511; RK=g+4hNa7BQD; ptcz=653047c5b0174eb6b929c242110d08693b9dfcbaa701ddbf37ccc23c3366b94c; pgv_pvid=9049425500; ts_uid=9851761599; o_cookie=1952436511; tvfe_boss_uuid=5e81ff5fb8d5a1ea; yqq_stat=0; pgv_info=ssid=s484511232; ts_refer=ADTAGbaiduald; pgv_si=s21197824; yq_index=0; player_exist=1; qqmusic_fromtag=66; yplayer_open=0; ts_last=y.qq.com/n/yqq/song/002krvKI4Jgvq9.html",
    "origin": "https://y.qq.com",
    "referer": "https://y.qq.com/n/yqq/song/002krvKI4Jgvq9.html",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
}
# Query-string parameters for the lyric endpoint. Only "musicid" varies:
# it is read from `item`, one entry of the chart's song list (bound by the
# loop in the full crawler script further down).
jsond = {
    "nobase64": "1",
    "musicid": item['data']['songid'],
    "-": "jsonp1",
    "g_tk": "5381",
    "loginUin": "0",
    "hostUin": "0",
    "format": "json",
    "inCharset": "utf8",
    "outCharset": "utf-8",
    "notice": "0",
    "platform": "yqq.json",
    "needNewCode": "0",
}

# Fetch the lyric payload for that song, sending the headers defined above.
r = requests.get(
    "https://c.y.qq.com/lyric/fcgi-bin/fcg_query_lyric_yqq.fcg",
    params=jsond,
    headers=headers,
)
json样式为:
3、将歌词存到文件test.txt里,用于读取。
4、逐行读取文件、构建要处理的数据字符串
5、jieba库、词云制作。
上爬虫代码:
# -*-coding:UTF-8 -*-
import json
import re
import requests
# Request headers sent with every lyric request.
# NOTE(review): the values look like they were captured from a browser session
# (cookie, referer, user-agent) -- confirm they are still accepted.
# The "path" value previously contained "¬ice=0", an HTML-entity corruption of
# "&notice=0"; the query string is restored below.
headers = {
    "authority": "c.y.qq.com",
    "method": "GET",
    "path": "/lyric/fcgi-bin/fcg_query_lyric_yqq.fcg?nobase64=1&musicid=225716644&-=jsonp1&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0",
    "scheme": "https",
    "accept": "application/json, text/javascript, */*; q=0.01",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "zh-CN,zh;q=0.9",
    "cookie": "pgv_pvi=5936793600; pt2gguin=o1952436511; RK=g+4hNa7BQD; ptcz=653047c5b0174eb6b929c242110d08693b9dfcbaa701ddbf37ccc23c3366b94c; pgv_pvid=9049425500; ts_uid=9851761599; o_cookie=1952436511; tvfe_boss_uuid=5e81ff5fb8d5a1ea; yqq_stat=0; pgv_info=ssid=s484511232; ts_refer=ADTAGbaiduald; pgv_si=s21197824; yq_index=0; player_exist=1; qqmusic_fromtag=66; yplayer_open=0; ts_last=y.qq.com/n/yqq/song/002krvKI4Jgvq9.html",
    "origin": "https://y.qq.com",
    "referer": "https://y.qq.com/n/yqq/song/002krvKI4Jgvq9.html",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
}

# Query-string parameters for the chart (top-list) endpoint.
jsonlist = {
    "tpl": "3",
    "page": "detail",
    "date": "2019_02",
    "topid": "26",
    "type": "top",
    "song_begin": "0",
    "song_num": "100",
    "g_tk": "5381",
    "loginUin": "0",
    "hostUin": "0",
    "format": "json",
    "inCharset": "utf8",
    "outCharset": "utf-8",
    "notice": "0",
    "platform": "yqq.json",
    "needNewCode": "0",
}

TOPLIST_URL = "https://c.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg"
LYRIC_URL = "https://c.y.qq.com/lyric/fcgi-bin/fcg_query_lyric_yqq.fcg"

# Runs of characters in U+4E00..U+9FA5, optionally mixed with ':'.
# Compiled once instead of once per song.
CH_PAT = re.compile(r'[\u4e00-\u9fa5:]+')


def extract_lyric_words(text):
    """Return the lyric words found in *text* as a comma-terminated string.

    The text is split into tokens matching ``CH_PAT``. Tokens containing ':'
    are treated as non-lyric entries. Within the first half of the tokens the
    function looks for the first run of three consecutive colon-free tokens
    (searching from the first token, after index 0, that has ':' past its
    first character) and takes that as the start of the lyrics. If no such
    run exists, the start falls back to that colon token, or to index 0.

    Every colon-free token from the start position onward is emitted followed
    by ",", so a non-empty result always ends with a trailing comma.
    """
    ch_words = CH_PAT.findall(text)
    half = len(ch_words) // 2

    # First token (skipping index 0) that contains ':' after its first char.
    first = 0
    for i in range(1, half):
        if ch_words[i].find(':') > 0:
            first = i
            break

    # First window of three colon-free tokens. The length check prevents the
    # IndexError the unguarded ch_words[i + 2] lookup raised on short input.
    flag = first
    for i in range(first, half):
        window = ch_words[i:i + 3]
        if len(window) == 3 and all(':' not in word for word in window):
            flag = i
            break

    return "".join(word + "," for word in ch_words[flag:] if ':' not in word)


def crawl_lyrics():
    """Fetch the chart, then append one line of lyric words per song to test.txt.

    Each song's ``item['data']['songid']`` from the chart's ``songlist`` is
    used as ``musicid`` for the lyric request. The extracted words are also
    printed.
    """
    r1 = requests.get(TOPLIST_URL, params=jsonlist)
    jlist = json.loads(r1.text)

    # 'a+' appends, so repeated runs accumulate lines. The platform default
    # encoding is kept on purpose: the word-cloud script reads the file back
    # with the default encoding as well.
    with open('test.txt', 'a+') as f:
        for item in jlist['songlist']:
            jsond = {
                "nobase64": "1",
                "musicid": item['data']['songid'],
                "-": "jsonp1",
                "g_tk": "5381",
                "loginUin": "0",
                "hostUin": "0",
                "format": "json",
                "inCharset": "utf8",
                "outCharset": "utf-8",
                "notice": "0",
                "platform": "yqq.json",
                "needNewCode": "0",
            }
            r = requests.get(LYRIC_URL, params=jsond, headers=headers)
            r.encoding = "utf-8"
            strquqita = extract_lyric_words(r.text)
            f.write(strquqita + "\n")
            print(strquqita)


if __name__ == "__main__":
    crawl_lyrics()
上词云代码
#-*-coding:UTF-8 -*-
import jieba
from wordcloud import WordCloud
def read_lyrics(path='test.txt'):
    """Return the text of *path* with its first line dropped.

    Every remaining line (including its trailing newline) is followed by "。"
    and the pieces are concatenated into one string.
    """
    # Read-only mode is enough (the file is never written here), and the
    # context manager closes the handle, which the script never did before.
    with open(path, 'r') as f:
        # The first line is discarded, as in the original script.
        # NOTE(review): the reason for skipping it is not evident -- confirm.
        f.readline()
        return "".join(line + "。" for line in f)


def build_wordcloud():
    """Segment the saved lyrics with jieba, then show and save a word cloud.

    The image is displayed with matplotlib and written to jieguo.png.
    """
    import matplotlib.pyplot as plt

    strchuli = read_lyrics()
    wordlist = jieba.cut(strchuli, cut_all=False)
    word_string = " ".join(wordlist)

    # Raw string: the Windows path contains backslashes that are not valid
    # escape sequences in a normal string literal.
    wordcloud = WordCloud(
        font_path=r'C:\Windows\Fonts\simkai.ttf',
        background_color="white",
        width=1000,
        height=860,
        margin=2,
    ).generate(word_string)

    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
    wordcloud.to_file('jieguo.png')


if __name__ == "__main__":
    build_wordcloud()