Python爬虫学习笔记

参照七月的原生爬虫教程所做的练习，部分代码有修改。

代码

from urllib import request
import re
import operator

class Spider(object):
    """Download one web page, extract (name, number) pairs and print a ranking.

    The page at ``url`` is fetched, split into per-entry HTML fragments with
    ``root_pattern``, and each fragment is searched for a name and a number.
    Entries are then sorted by number, largest first, and printed.
    """

    # Page whose HTML is downloaded and parsed.
    url = 'https://www.panda.tv/cate/lol'

    # NOTE(review): in the published snippet the HTML tags inside these three
    # patterns were missing (only the capture groups survived), which left an
    # unterminated string literal and patterns that match the empty string.
    # The tags below are a reconstruction -- TODO confirm against the markup
    # of the page actually being scraped.
    # Raw strings keep ``\s`` / ``\S`` from being read as string escapes.
    root_pattern = r'<div class="video-info">([\s\S]*?)</div>'
    name_pattern = r'</i>([\s\S]*?)</span>'
    number_pattern = r'<span class="video-number">([\s\S]*?)</span>'

    def __fetch_content(self):
        """Return the body of ``Spider.url`` decoded as UTF-8 text."""
        # The context manager closes the HTTP response even if read() fails.
        with request.urlopen(Spider.url) as response:
            raw = response.read()
        return raw.decode('utf-8')

    @staticmethod
    def __parse_number(text):
        """Convert a number string such as ``'345'`` or ``'1.2万'`` to a float.

        A value containing ``万`` (the Chinese numeral for 10,000) is
        multiplied by 10000.

        Raises:
            ValueError: if the remaining text is not a valid number.
        """
        text = text.strip()
        if '万' in text:
            # round() removes binary floating-point noise from the product,
            # so the later int() conversion cannot truncate one too low.
            return float(round(float(text.rstrip('万')) * 10000))
        return float(text)

    def __analysis(self, htmls):
        """Extract entries from the page HTML.

        Args:
            htmls: full page source as a string.

        Returns:
            A list of ``{'name': str, 'number': float}`` dicts in page order.
            Fragments that lack a name or a number, or whose number cannot be
            parsed, are skipped instead of aborting the whole run.
        """
        anchors = []
        for html in re.findall(Spider.root_pattern, htmls):
            names = re.findall(Spider.name_pattern, html)
            numbers = re.findall(Spider.number_pattern, html)
            if not names or not numbers:
                continue
            try:
                number = self.__parse_number(numbers[0])
            except ValueError:
                continue
            anchors.append({'name': names[0].strip(), 'number': number})
        return anchors

    def __sort(self, anchors):
        """Return a new list of entries ordered by ``'number'``, largest first."""
        return sorted(anchors, key=operator.itemgetter('number'), reverse=True)

    def __show(self, anchors):
        """Print one line per entry, numbered from 1 in the given order."""
        for ranking, anchor in enumerate(anchors, start=1):
            print(f"第{ranking}位, {anchor['name']} --- {int(anchor['number'])} ")

    def go(self):
        """Run the whole pipeline: fetch, parse, sort, print."""
        htmls = self.__fetch_content()
        anchors = self.__analysis(htmls)
        anchors = self.__sort(anchors)
        self.__show(anchors)


# Only hit the network when executed as a script, not when imported.
if __name__ == '__main__':
    spider = Spider()
    spider.go()

你可能感兴趣的:(Python爬虫学习笔记)