python原生爬虫爬取熊猫TV LOL主播人气排行

本文采用 Python 原生方式编写爬虫,没有使用常见的爬虫框架,比较适合新手练手。

首先进入熊猫TV英雄联盟主页————https://www.panda.tv/cate/lol?pdt=1.24.s1.2.4jhlr7qfu0h

按 F12 打开浏览器开发者工具,找到要爬取的标签。

 直接上代码

from urllib import request
import re

class Spider():
    """Scrape a Panda TV category page and print streamers ranked by popularity.

    The pipeline is: download the page, cut out one HTML fragment per
    streamer card, extract name and viewer count from each fragment,
    normalise them, sort by viewer count (descending) and print the ranking.
    """

    # Category page to scrape.
    url = 'https://www.panda.tv/cate/lol?pdt=1.24.s1.2.4jhlr7qfu0h'

    # NOTE(review): the HTML tags inside these three patterns were stripped
    # when the article was published (name_pattern was left completely empty).
    # They are reconstructed here from the original comment, which names the
    # "video-info", "video-nickname" and "video-number" elements -- verify
    # them against the real page markup before relying on the results.
    root_pattern = r'<div class="video-info">([\s\S]*?)</div>'
    name_pattern = r'<span class="video-nickname" title="([\s\S]*?)">'
    number_pattern = r'<span class="video-number">([\s\S]*?)</span>'

    def __fetch_content(self):
        """Download the page at ``url`` and return it decoded as UTF-8 text."""
        # The context manager closes the HTTP response even if read() fails.
        with request.urlopen(self.url) as response:
            # read() returns bytes; decode them into a str.
            return str(response.read(), encoding="utf-8")

    def __analysis(self, htmls):
        """Extract raw name/number matches for every streamer card.

        Returns a list of ``{'name': [...], 'number': [...]}`` dicts whose
        values are the (possibly empty) lists produced by ``re.findall``.
        """
        return [
            {
                'name': re.findall(self.name_pattern, fragment),
                'number': re.findall(self.number_pattern, fragment),
            }
            for fragment in re.findall(self.root_pattern, htmls)
        ]

    def refine(self, anchors):
        """Yield ``{'name': str, 'number': str}`` for each usable raw anchor.

        The first match of each field is taken and the name is stripped of
        surrounding whitespace. Anchors missing a name or a number are
        skipped instead of raising ``IndexError``.
        """
        for anchor in anchors:
            if anchor["name"] and anchor["number"]:
                yield {
                    "name": anchor["name"][0].strip(),
                    "number": anchor["number"][0],
                }

    def __show(self, anchors):
        """Print one ``rank<N> : <name> <number>`` line per anchor."""
        for rank, anchor in enumerate(anchors, start=1):
            print('rank' + str(rank) + " : " + anchor['name']
                  + " " + anchor['number'])

    def __sort(self, anchors):
        """Return the anchors sorted by viewer count, highest first."""
        return sorted(anchors, key=self.__sort_seed, reverse=True)

    def __sort_seed(self, anchor):
        """Convert an anchor's viewer-count text into a float sort key.

        The first decimal number found in the text is used; a "万" suffix
        multiplies it by 10000. Text without any digits sorts as 0.0.
        """
        # Match the fractional part too, so "1.5万" becomes 15000, not 10000.
        match = re.search(r'\d+(?:\.\d+)?', anchor['number'])
        if match is None:
            return 0.0
        number = float(match.group())
        if "万" in anchor['number']:
            number *= 10000
        return number

    def go(self):
        """Run the whole pipeline: fetch, parse, refine, sort and print."""
        htmls = self.__fetch_content()
        anchors = self.__analysis(htmls)
        anchors = list(self.refine(anchors))
        anchors = self.__sort(anchors)
        self.__show(anchors)


# Only hit the network when executed as a script, not when imported.
if __name__ == "__main__":
    spider = Spider()
    spider.go()

注意要是想计算别的游戏的主播排行,可以直接把网址换一下,代码不用换

你可能感兴趣的:(python,python,pachong,spider,lol)