使用python对斗鱼平台进行自动化爬取。
爬取斗鱼直播分类里面每个房间的信息:
地址url = “https://www.douyu.com/directory/all”;
该url为get请求,项目使用的是selenium.webdriver模块,调用webdriver.get就可以获取页面信息;selenium对cookie、user-agent等header这块是不需要设置的。浏览器打开后,获取信息会用到selenium内部的信息提取技术,这一块附有中文文档链接selenium中文文档,建议大家多看文档。提取到的信息保存到MySQL中,这里本人在保存数据库这块引用了自定义模块,代码在下面。
from selenium import webdriver
import time
import sys
import os
sys.path.append(os.getcwd() + "\\Douyu_spider\\mysql") # 这里是vscode的问题需要添加全局路径
from mysql import Mysql # 自定义模块,数据库建表、存储等
class DouyuSpider:
    """Crawl the Douyu "all rooms" directory with Selenium and store rows in MySQL.

    The spider opens ``start_url`` in Chrome, scrapes every room card on the
    page, saves the rows through the custom ``Mysql`` helper, then clicks the
    "next page" control until that control is missing or disabled.

    NOTE(review): ``executable_path`` and ``find_element(s)_by_xpath`` are the
    Selenium 3 style API; confirm the installed Selenium version still
    provides them.
    """

    def __init__(self):
        """Start Chrome, open the DB connection and prepare the table DDL."""
        super().__init__()
        self.start_url = "https://www.douyu.com/directory/all"
        # Path to the chromedriver binary used by Selenium. Raw string: the
        # backslashes are Windows path separators, not escape sequences.
        self.driver = webdriver.Chrome(
            executable_path=r".\Douyu_spider\chromedriver.exe")
        self.mysql = Mysql()
        # Table definition; adjust it to your own schema. IF NOT EXISTS lets
        # the spider be run more than once without failing on table creation.
        self.sql = """CREATE TABLE IF NOT EXISTS douyu_item(
            home_id INT(11) NOT NULL AUTO_INCREMENT,
            home_name VARCHAR(100) NOT NULL DEFAULT '',
            home_url VARCHAR(100) NOT NULL,
            home_type CHAR(20) NOT NULL DEFAULT '',
            home_heat CHAR(20) NOT NULL DEFAULT '',
            PRIMARY KEY(home_id,home_url)
            )"""

    def _parse_room(self, li):
        """Return one room card (an ``<li>`` element) as a dict of four fields."""
        return {
            'home_name': li.find_element_by_xpath(".//h3").text,
            'home_url': li.find_element_by_xpath(
                ".//a[@class='DyListCover-wrap']").get_attribute("href"),
            'home_type': li.find_element_by_xpath(
                ".//span[@class='DyListCover-zone']").text,
            'home_heat': li.find_element_by_xpath(
                ".//span[@class='DyListCover-hot']").text,
        }

    def _find_next_button(self):
        """Return the clickable "next page" element, or None on the last page.

        None is returned when the pagination ``<li>`` is not found, when its
        ``aria-disabled`` attribute is the string ``"true"``, or when it has
        no ``<span>`` child to click.
        """
        # find_elements returns an empty list instead of raising when the
        # control is absent.
        candidates = self.driver.find_elements_by_xpath(
            "//div//li[@class=' dy-Pagination-next']")
        if not candidates:
            return None
        next_li = candidates[0]
        disabled = next_li.get_attribute("aria-disabled")
        print(disabled)
        # get_attribute yields a string (or None), never the bool True, so the
        # value must be compared as text.
        if disabled == "true":
            return None
        spans = next_li.find_elements_by_xpath("./span")
        return spans[0] if spans else None

    def get_content_list(self):
        """Scrape the current page.

        Returns:
            tuple: ``(rooms, next_button)`` where ``rooms`` is a list of dicts
            with keys ``home_name``, ``home_url``, ``home_type`` and
            ``home_heat``, and ``next_button`` is the element to click for the
            next page, or None when there is no further page.
        """
        li_list = self.driver.find_elements_by_xpath(
            "//ul[@class='layout-Cover-list']/li")
        print("list is good!!!")
        home_item = []
        # Extract the data of every room card.
        for li in li_list:
            item = self._parse_room(li)
            home_item.append(item)
            print(item)
        # One short pause per page (not per room) before touching pagination.
        time.sleep(1)
        # Locate the next-page control.
        next_url = self._find_next_button()
        return home_item, next_url

    def save_content_list(self, content_list):
        """Insert every scraped room dict of ``content_list`` into ``douyu_item``."""
        print(content_list)
        # Parameterized INSERT; adjust it together with the table definition.
        into = "INSERT INTO douyu_item(home_name,home_url,home_type,home_heat) VALUES (%s,%s, %s, %s)"
        for item_dict in content_list:
            values = (
                item_dict['home_name'],
                item_dict['home_url'],
                item_dict['home_type'],
                item_dict['home_heat'],
            )
            self.mysql.write(into, values)
        print("Saved successfully!!!")

    def run(self):
        """Main flow: load the start page, then scrape and save page by page.

        The DB connection and the browser are released even when scraping or
        saving raises.
        """
        try:
            # Request the start page and give it time to render.
            self.driver.get(self.start_url)
            time.sleep(8)
            # Extract the first page and the next-page control.
            content_list, next_url = self.get_content_list()
            # Create the table, then store the first page.
            self.mysql.create(self.sql)
            self.save_content_list(content_list)
            # Click through the remaining pages.
            while next_url is not None:
                next_url.click()
                time.sleep(2)
                content_list, next_url = self.get_content_list()
                self.save_content_list(content_list)
            print("carry out !!!")
        finally:
            try:
                self.mysql.close()
            finally:
                self.driver.quit()
if __name__ == '__main__':
    # Script entry point: build the spider and start the crawl right away.
    DouyuSpider().run()
# 自定义模块部分,主要功能引用SQL第三方模块
import MySQLdb
class Mysql:
    """Small wrapper around a single MySQLdb connection and its cursor."""

    def __init__(self):
        """Open the connection and create the cursor used by every statement."""
        super().__init__()
        # The password and database name are masked placeholders; fill in
        # real values before running.
        self.conn = MySQLdb.Connect(
            host='127.0.0.1',
            user='root',
            passwd='******',
            db='******',
            charset='utf8',
        )
        self.cur = self.conn.cursor()

    def _execute(self, sql, params=None):
        """Run one statement and commit; roll back and re-raise on failure."""
        try:
            if params is None:
                self.cur.execute(sql)
            else:
                self.cur.execute(sql, params)
        except Exception:
            # Do not leave a half-finished transaction on the connection.
            self.conn.rollback()
            raise
        self.conn.commit()

    def create(self, sql):
        """Execute a table-creation statement and commit it."""
        self._execute(sql)

    def write(self, into, values):
        """Execute the parameterized statement ``into`` with ``values`` and commit."""
        self._execute(into, values)

    def close(self):
        """Close the cursor, then the connection (even if the cursor close fails)."""
        try:
            self.cur.close()
        finally:
            self.conn.close()
"1","今天目标传说 11点15分看发布会","https://www.douyu.com/21267","炉石传说","107.5万"
"2","蛋白:我来了!!!","https://www.douyu.com/762505","我的世界","77.1万"
"3","8点PK赛求助啦。。。。","https://www.douyu.com/6515025","舞蹈","36.2万"
"4","巅峰赛冲冲冲 1811143","https://www.douyu.com/1811143","王者荣耀","162.2万"
"5","","https://www.douyu.com/11579","",""
"6","","https://www.douyu.com/607575","",""
"7","","https://www.douyu.com/6868395","",""
"8","","https://www.douyu.com/93589","",""
"9","","https://www.douyu.com/6512","",""
"10","","https://www.douyu.com/25515","",""
"11","LPL春季赛WEvsFPX","https://www.douyu.com/288016","英雄联盟","683万"
"12","王者1500 小号上分教学!!!","https://www.douyu.com/7846871","lol云顶之弈","8.8万"
"13","我是一个被13卷伤过的人","https://www.douyu.com/99999","DNF","763.4万"
"14","众生皆苦我超甜(*˘︶˘*).。.:*♡","https://www.douyu.com/7711664","颜值","9.1万"
"15","北大学霸首秀活动","https://www.douyu.com/7781490","穿越火线","38.4万"
"16","下午冒险岛,7点LOL,10点吃鸡","https://www.douyu.com/101","英雄联盟","624.2万"
"17","骚白巅峰赛系列","https://www.douyu.com/911","王者荣耀","417.7万"
"18","东北大鹌鹑 相声艺术家 他来了","https://www.douyu.com/96291","英雄联盟","303.3万"
"19","希望我能成为你坚定的选择","https://www.douyu.com/8043680","颜值","4.1万"
"20","一周年快乐 洪湖小肖337852","https://www.douyu.com/337852","户外","373.1万"
"21","今晚是火箭筒大战+飞机大战","https://www.douyu.com/2222","绝地求生","298.3万"
"22","呆妹:嘿~你的小奶猪上线啦~","https://www.douyu.com/92000","绝地求生","236万"
"23","回头看看走过的路,满满的青春和回忆~","https://www.douyu.com/728","穿越火线","233万"
"24","吃个冰棍呀呀呀。","https://www.douyu.com/4626326","颜值","15.3万"
"25","来,进来我们好好聊聊呗","https://www.douyu.com/8145735","颜值","9.6万"
"26","cod教学 9418","https://www.douyu.com/9418","CS:GO","221.1万"
"27","好!!!!! 9999","https://www.douyu.com/9999","DOTA2","189.9万"
"28","挑战uno连胜,输了唱征服","https://www.douyu.com/5524515","绝地求生","177.9万"
"29","期待有你的每一天~","https://www.douyu.com/7665617","颜值","7.3万"
"30","可以做我的小幸运吗?","https://www.douyu.com/7951501","颜值","16.5万"
"31","神秘的直播间 88080","https://www.douyu.com/88080","绝地求生","173.7万"
"32","秋风:全忍者教学直播,四月新版本即将来袭","https://www.douyu.com/535534","火影忍者","170.8万"
"33","淑怡:爷来了","https://www.douyu.com/22222","英雄联盟","168.5万"
"34","快点来保护我吧~~~","https://www.douyu.com/7555161","颜值","3.9万"
"35","明天去海南","https://www.douyu.com/777","户外","338.2万"
"36","巅峰赛冲冲冲 1811143","https://www.douyu.com/1811143","王者荣耀","162.2万"
"37","吃可爱长大的蘑菇~","https://www.douyu.com/3733860","欢乐麻将","161.9万"
"38","二指巅峰,至尊手速!","https://www.douyu.com/9595","和平精英","152.7万"
"39","9点皇冠大作战","https://www.douyu.com/485503","户外","223.6万"
"40","一个普普通通小姑娘","https://www.douyu.com/6677","颜值","155.8万"
"41","Gemini:猛男上分记!","https://www.douyu.com/36252","王者荣耀","152.1万"
"42","最强身法,秀翻全场","https://www.douyu.com/101367","绝地求生","151.5万"
"43","滴神上线了 找怪碰瓷!","https://www.douyu.com/6512","传奇","148.5万"
"44","我是个篮球手,晚点恐怖游戏。","https://www.douyu.com/554559","户外","145.1万"
"45","张文中的数字零售革命","https://www.douyu.com/2631870","直播中国","120.6万"
"46","pigff【COD快递模式】","https://www.douyu.com/24422","绝地求生","147.3万"
"47","冲国服露娜!!!","https://www.douyu.com/2124270","王者荣耀","139.7万"
"48","【斯祥】逃离塔科夫最强男人!锤!!!","https://www.douyu.com/2311698","绝地求生","130.4万"
"49","奥利给!! 533813","https://www.douyu.com/533813","户外","113.7万"
"50","户外美食(≧ω≦)/","https://www.douyu.com/55777","美食","84.1万"
"51","寅子的游戏教室","https://www.douyu.com/71415","腐烂国度","127.5万"
"52","小乔新皮肤玩玩","https://www.douyu.com/5063899","王者荣耀","122.6万"
"53","双排认证号冲战神_","https://www.douyu.com/10086","和平精英","122万"
"54","给我一首歌的时间 861644","https://www.douyu.com/861644","颜值","76.4万"
"55","美女来别墅做节目。","https://www.douyu.com/2279072","户外","69.8万"
"56","来来来,进来找个位子坐坐","https://www.douyu.com/25515","王者模拟战","121.8万"
"57","欢迎收看浪子断的1000种死法","https://www.douyu.com/29599","和平精英","120.5万"
"58","COD吃鸡真好玩 70002","https://www.douyu.com/70002","绝地求生","119.1万"
"59","豆子:周二晚上八点见哦~( ˘ ³˘)❤","https://www.douyu.com/1209","二次元","62.4万"
"60","小时榜第二名加油","https://www.douyu.com/607575","二次元","60.7万"
"61","叶知秋: 吃鸡环节","https://www.douyu.com/111111","绝地求生","117万"
"62","老李:全民免费COD,BR战区!","https://www.douyu.com/480298","COD16","113.1万"
"63","全DY家庭地位最高的男人","https://www.douyu.com/63136","CS:GO","111.6万"
"64","想谈一场甜甜的恋爱~","https://www.douyu.com/7476010","颜值","58.9万"
"65","大家晚上见 7932689","https://www.douyu.com/7932689","颜值","56.1万"
"66","放手一搏吧别顾虑太多,这是男人该有的性格","https://www.douyu.com/66666","DNF","109.9万"
"67","装男人教学和小姐姐玩耍!","https://www.douyu.com/123455","英雄联盟","109.5万"
"68","小朋友 你是否有很多问好","https://www.douyu.com/109027","COD16","109.5万"
"69","【交友】 招收优质全职女主持!","https://www.douyu.com/7597559","交友","52.9万"
"70","8.30左右播","https://www.douyu.com/1221923","户外","52.2万"
"71","海岛带妹手册创始人","https://www.douyu.com/16166","和平精英","109.4万"
"72","百级待出 我将冲起来!!!","https://www.douyu.com/486808","DNF","107.7万"
"73","今天目标传说 11点15分看发布会","https://www.douyu.com/10029","炉石传说","107.5万"
"74","涂乙冬:疫情中的员工压力管理","https://www.douyu.com/7824526","直播中国","51.4万"
"75","录-是你有温度的播放器~","https://www.douyu.com/5767883","电台","51.3万"
"76","三指巅峰 1v4 开锤!!","https://www.douyu.com/6550676","和平精英","106.8万"
"77","看看青训队的比赛,然后随便单排2把~","https://www.douyu.com/88660","DOTA2","106.2万"
"78","胡凯利: 欢乐时光已经开始了!","https://www.douyu.com/138243","英雄联盟","106.1万"
"79","80级冲鸭","https://www.douyu.com/4238637","颜值","50.2万"
"80","【交友】招收交友主持/颜值第一","https://www.douyu.com/7255932","交友","50.2万"
"81","《剑网3》竞技大师赛-俱乐部争锋赛","https://www.douyu.com/641986","剑网3","105.8万"
"82","乌鸦:半蛇兜教学!无限上超!","https://www.douyu.com/9293","火影忍者","100.5万"
"83","今晚olimo联赛","https://www.douyu.com/3484","星际争霸","99.5万"
"84","【交友】本厅由“四万操碎心”冠名!","https://www.douyu.com/7597583","交友","49.3万"
"85","静待有缘人 7528274","https://www.douyu.com/7528274","舞蹈","49万"
"86","九日:欢乐排位","https://www.douyu.com/2127419","王者荣耀","98.1万"
"87","KPL精彩回顾 3月18日春季赛开幕","https://www.douyu.com/998","王者荣耀","97.8万"
"88","【曜】英雄教学系列..","https://www.douyu.com/2207607","王者荣耀","92.8万"
"89","哪有什么四季,遇见你四季如春","https://www.douyu.com/7034996","颜值","47.5万"
"90","进来不后悔?","https://www.douyu.com/6294690","舞蹈","46.3万"
"91","宇宇:落地不努力 监狱做兄弟","https://www.douyu.com/3572778","COD16","92万"
"92","【Sopa】COD16使命必达","https://www.douyu.com/651654","COD16","89.5万"
"93","吹事班开饭了今日菜谱5鸡","https://www.douyu.com/48699","炉石传说","88.5万"
"94","萌新她不香嘛???","https://www.douyu.com/8332732","颜值","46.2万"
"95","三憨憨砥砺前行,","https://www.douyu.com/7626940","户外","46.1万"
"96","西瓜表弟:3出1,210个盒子,醉了","https://www.douyu.com/147887","DNF","86万"
"97","【错觉】今天的我.猛到看不懂!","https://www.douyu.com/12345","绝地求生","85.8万"
"98","COD:世界第一胜利全平台排行榜","https://www.douyu.com/4087173","COD16","84.8万"
"99","等一个英雄腾云驾雾为我而来","https://www.douyu.com/7148558","颜值","44.3万"
"100","丛林争霸,极限挑战","https://www.douyu.com/7714592","户外","41.4万"
首先,selenium的用处大部分是在登录处理这块,后续我也会更新登录处理这块的文章。selenium优缺点有很多,优点:就是类人化,服务器是分辨不出来的。缺点:效率低,如果说要用来大量爬取网页的话,还是算了吧,在这里我爬取斗鱼平台的信息也只是爬取少量部分,仅供学习使用。
其次,这一个小项目主要的学习方向,还是构思代码、selenium的使用、以及python与MySQL的交互等等,当然selenium在爬虫的主要应用还是在登录处理,以及cookie获取等等,优点还是很明显的。
最后,小编声明此次小项目只是研究学习,别无用处,后续会继续更新关于登录处理这块文章,来自电脑小白的总结,学艺不精,献丑了。