1、通过热门歌手,抓取歌曲ID。
2、通过歌曲ID,抓取评论用户ID。
3、通过评论用户ID,发送定向推送消息。
上一篇完成了步骤1,本文完成步骤2。
题外话:上篇用的requests无页面的方法获取歌曲ID,速度比较快,但是获取到2000条左右就会被服务器识别成爬虫而被禁IP,通过连接手机热点,重启飞行模式后再连接就又可以获取2000条。
上篇我们用MYSQL存储爬取结果,本次也将使用相同方法,同时本篇将支持错误重做,每处理完一条记录就打一个处理标志位Y,和我们生产系统的做法类似。
这里又需要创建一个叫userinf的表,存储用户的ID、评论时间和主页地址。
建表语句如下:
-- Recreate the commenter table from scratch.
DROP TABLE IF EXISTS `userinf`;
-- One row per scraped commenter.
-- clbz is the processed flag: 'N' = pending, 'Y' = done (enables restart/resume).
CREATE TABLE `userinf` (
-- Surrogate primary key.
`id` int(12) NOT NULL AUTO_INCREMENT,
-- NetEase numeric user id (stored as text).
`user_id` varchar(30) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
-- Display name shown next to the comment.
`user_name` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci ,
-- Comment timestamp as scraped (free-form text, not normalized).
`user_time` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci ,
-- User profile page URL.
`user_url` varchar(400) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
-- Processed flag: 'N' or 'Y'.
`clbz` varchar(1) CHARACTER SET utf8 COLLATE utf8_general_ci ,
-- Spare numeric column; purpose not shown here -- presumably reserved. TODO confirm.
`bysz` float(3, 0) NULL DEFAULT 0.00,
PRIMARY KEY (`id`) USING BTREE,
-- Secondary index for lookups by NetEase user id.
INDEX `user_id`(`user_id`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;
创建完以后,我们需要创建一个python程序来插入这个表。
python程序命名为:useridSpiderSQL.py,代码为:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
__author__ = 'luoji'
import pymysql
# from ,where, group by, select, having, order by, limit
class Mysql_pq(object):
    """Thin wrapper around a pymysql connection to the local `python` DB.

    Opens the connection on construction and exposes `modify_sql` for
    parameterized INSERT/UPDATE statements with an immediate commit.
    """

    def __init__(self):
        # utf8mb4 so 4-byte characters (emoji etc.) can be stored.
        self.conn = pymysql.Connect(host='127.0.0.1',
                                    port=3306,
                                    user='root',
                                    passwd='root',
                                    db='python',
                                    charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def modify_sql(self, sql, data):
        """Execute a parameterized statement and commit immediately."""
        self.cursor.execute(sql, data)
        self.conn.commit()

    def __del__(self):
        # Guard with getattr: if __init__ raised (e.g. connection refused)
        # the attributes were never set, and __del__ must not raise.
        cursor = getattr(self, 'cursor', None)
        if cursor is not None:
            cursor.close()
        conn = getattr(self, 'conn', None)
        if conn is not None:
            conn.close()
def insert_userinf(user_id, user_name, user_time, user_url, clbz):
    """Insert one commenter record into the `userinf` table.

    Args mirror the table columns; `clbz` is the processed flag
    ('N' = not yet processed, 'Y' = done).
    """
    helper = Mysql_pq()
    # Original message said "歌曲信息" (song info) but this inserts user info.
    print('连接上了数据库python,准备插入用户信息')
    # Parameterized statement: values are escaped by the driver.
    insert_sql = ('insert into userinf(user_id,user_name,user_time,user_url,clbz)'
                  ' values (%s,%s,%s,%s,%s)')
    data = (user_id, user_name, user_time, user_url, clbz)
    helper.modify_sql(insert_sql, data)
if __name__ == '__main__':
    # Smoke test: insert one hand-written commenter record.
    sample_id = '519250015'
    sample_name = '请记住我'
    sample_url = 'https://music.163.com/#/song?id=1313052960&lv=-1&kv=-1&tv=-1'
    sample_time = '2021年2月18日'
    insert_userinf(sample_id, sample_name, sample_time, sample_url, 'N')
    print('test over')
为了错误重做,我们处理完一条songinf就更新处理标志为Y,当出错时,程序自动跳过处理标志位Y的记录,仅仅处理处理标志位N的记录,这样就可以完成接力。
所以为了完成这个接力,我们需要在爬完一首歌的评论用户后,回头更新songinf。我们需要创建一个python程序来更新这个表。
python程序命名为:updateSongURLSQL.py,代码为:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
__author__ = 'luoji'
import pymysql
# from ,where, group by, select, having, order by, limit
class Mysql_pq(object):
    """Thin wrapper around a pymysql connection to the local `python` DB.

    This copy only opens the connection; callers use `cursor` / `conn`
    directly for their statements.
    """

    def __init__(self):
        # utf8mb4 so 4-byte characters (emoji etc.) can be stored.
        self.conn = pymysql.Connect(host='127.0.0.1',
                                    port=3306,
                                    user='root',
                                    passwd='root',
                                    db='python',
                                    charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def __del__(self):
        # Guard with getattr: if __init__ raised (e.g. connection refused)
        # the attributes were never set, and __del__ must not raise.
        cursor = getattr(self, 'cursor', None)
        if cursor is not None:
            cursor.close()
        conn = getattr(self, 'conn', None)
        if conn is not None:
            conn.close()
def updater_songurl(url):
    """Mark the song at `url` as processed (clbz = 'Y') in `songinf`.

    Called after all commenter ids of a song were stored, so a restarted
    run can skip songs already flagged 'Y' (error-redo/resume support).
    """
    helper = Mysql_pq()
    print('连接上了数据库python,准备更新歌曲处理标志')
    # Parameterized query instead of %-string formatting: avoids SQL
    # injection and quoting bugs when the URL contains special characters.
    sql = "UPDATE songinf SET clbz = 'Y' WHERE song_url = %s"
    print('sql is :', sql)
    helper.cursor.execute(sql, (url,))
    helper.conn.commit()
if __name__ == '__main__':
    # Flag one known song URL as processed, then report.
    test_url = 'https://music.163.com/#/song?id=569213220&lv=-1&kv=-1&tv=-1'
    updater_songurl(test_url)
    print('urllist = ', test_url)
    print('update over')
为了防止被服务器禁,我们本次使用selenium自动化控制模块来控制浏览器访问,这样服务器无法区分是爬虫还是用户的访问,缺点是速度比较慢,目前爬取速度大概为1小时1000条用户数据。
运行了一晚上,目前获得10万+用户ID。这里需要使用上篇获取的song的URL信息,我们需要创建一个python程序来查询这个表。
python程序命名为:getSongURLSQL.py,代码为:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
__author__ = 'luoji'
import pymysql
# from ,where, group by, select, having, order by, limit
class Mysql_pq(object):
    """Thin wrapper around a pymysql connection to the local `python` DB.

    This copy only opens the connection; callers use `cursor` / `conn`
    directly for their queries.
    """

    def __init__(self):
        # utf8mb4 so 4-byte characters (emoji etc.) can be stored.
        self.conn = pymysql.Connect(host='127.0.0.1',
                                    port=3306,
                                    user='root',
                                    passwd='root',
                                    db='python',
                                    charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def __del__(self):
        # Guard with getattr: if __init__ raised (e.g. connection refused)
        # the attributes were never set, and __del__ must not raise.
        cursor = getattr(self, 'cursor', None)
        if cursor is not None:
            cursor.close()
        conn = getattr(self, 'conn', None)
        if conn is not None:
            conn.close()
def select_songurl():
    """Return the URLs of all songs not yet processed (clbz = 'N').

    Reads the `songinf` table and collects `song_url` (column index 3)
    for every row whose processed flag is still 'N'.
    """
    helper = Mysql_pq()
    # Original message said "插入歌曲信息" (insert) but this only queries.
    print('连接上了数据库python,准备查询未处理歌曲')
    sql = "SELECT * FROM songinf WHERE clbz = 'N'"
    helper.cursor.execute(sql)
    urllist = []
    for row in helper.cursor.fetchall():
        # Avoid shadowing the builtin `id`; only the key and URL are used.
        row_id = row[0]
        song_url = row[3]
        print('id =', row_id)
        print('song_url =', song_url)
        urllist.append(song_url)
    return urllist
if __name__ == '__main__':
    # Pull the pending song URLs and show what we got.
    pending_urls = select_songurl()
    print('urllist = ', pending_urls)
    print('test over')
所以数据库mysql很重要。
代码为:
import re
import time
import numpy as np
from flask_cors.core import LOG
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ChromeOptions
from getSongURLSQL import *
from useridSpiderSQL import *
from updateSongURLSQL import *
def is_number(s):
    """Return True if `s` represents a number.

    Accepts anything float() parses, plus single Unicode numeric
    characters (fractions, Roman numerals, ...) via unicodedata.
    Non-string inputs (e.g. None) simply yield False.
    """
    try:
        float(s)
        return True
    except (TypeError, ValueError):
        # Also catch TypeError so non-string objects return False
        # instead of propagating (the original only caught ValueError).
        pass
    try:
        import unicodedata
        unicodedata.numeric(s)
        return True
    except (TypeError, ValueError):
        pass
    return False
def geturl(urllist):
    """Scrape commenter ids/names/times from each song page and store them.

    For every URL in `urllist`: load the page with Selenium, switch into
    the `g_iframe` frame (NetEase renders page content inside it), collect
    the commenter profile links and comment timestamps, insert each
    commenter into `userinf` with flag 'N', then mark the song processed
    via updater_songurl() so a restarted run skips it.
    """
    # If the driver binary is not on PATH, its location must be given
    # explicitly. Verified working on 2021-02-19.
    driver = webdriver.Firefox()
    driver.maximize_window()
    driver.set_page_load_timeout(30)
    driver.set_window_size(1124, 850)
    for url in urllist:
        print('now the url is :', url)
        driver.get(url)
        time.sleep(3)
        print('开始登陆')
        # All music elements live inside this iframe -- switch first.
        driver.switch_to.frame('g_iframe')
        href_xpath = ("//div[contains(@class,'cntwrap')]"
                      "//div[contains(@class,'cnt f-brk')]"
                      "//a[contains(@class,'s-fc7')]")
        anchors = driver.find_elements_by_xpath(href_xpath)
        useridlist = []
        usernamelist = []
        for anchor in anchors:
            userurl = anchor.get_attribute('href')
            # Profile href ends with ...home?id=<digits>; drop the fixed
            # 35-char prefix to get the numeric user id.
            userid = userurl[35:]
            print('userid = ', userid)
            username = anchor.text
            print('username = ', username)
            try:
                print('userid is ', userid)
                if is_number(userid):  # keep only purely numeric ids
                    print('用户id是数字,保留')
                    useridlist.append(userid)
                    usernamelist.append(username)
                # Non-numeric ids are skipped. The original had a bare
                # `iter` expression here, which was a no-op.
            except (TypeError, ValueError):
                print('用户id非数字,丢弃')
        # Collect the comment timestamps.
        commenttimelist = []
        time_xpath = ("//div[contains(@class,'cntwrap')]"
                      "//div[contains(@class,'rp')]"
                      "//div[contains(@class,'time s-fc4')]")
        time_divs = driver.find_elements_by_xpath(time_xpath)
        for itime in time_divs:
            commenttime = itime.text
            print('commenttime = ', commenttime)
            commenttimelist.append(commenttime)
        # Pad with a placeholder date when fewer times than ids were found,
        # so the positional pairing below stays aligned.
        if len(commenttimelist) < len(useridlist):
            for _ in range(len(useridlist) - len(commenttimelist)):
                commenttimelist.append('2021年2月18日')
        print('len(useridlist) is = ', len(useridlist))
        for i in range(len(useridlist)):
            userid_i = useridlist[i]
            username_i = usernamelist[i]
            commenttime_i = commenttimelist[i]
            print('userid_i=', userid_i)
            print('username_i=', username_i)
            print('commenttime_i=', commenttime_i)
            userurl_i = 'https://music.163.com/#/user/home?id=' + userid_i.strip()
            print('userurl_i=', userurl_i)
            clbz = 'N'
            try:
                insert_userinf(userid_i, username_i, commenttime_i,
                               userurl_i, clbz)
            except Exception:
                # Best-effort: one bad row must not stop the whole crawl.
                print('插入数据库有错')
        time.sleep(5)
        # Mark this song done so a restart skips it (resume support).
        updater_songurl(url)
def is_login(source):
    """Return True when the page source reports CONFIG['islogin'] == '1'."""
    match = re.search("CONFIG\['islogin'\]='(\d)'", source)
    return match is not None and int(match.group(1)) == 1
if __name__ == '__main__':
    # Fetch the unprocessed song URLs, then crawl their comment pages.
    pending_urls = select_songurl()
    geturl(pending_urls)
抓的结果如下:
这里有几点说明:
1、 最新评论的翻页我没有做,要做的话,需要爬取翻页的按钮并点击,然后又重新抓取用户id。
2、具体评论的内容暂时没有存储。
3、 爬取到的评论日期信息格式很不规范,需要后续处理。
下篇,将完成步骤3,就具备向10w级别的用户推送歌曲了。