爬虫入门学习

跟着教程学习了爬取虎牙 LOL 主播名字与流量，并按流量排序输出列表。

from urllib import request
from io import BytesIO
import gzip
import re
from re import sub
# 断点调试,

class Spider():
    """Scrape Huya's LoL category page and print anchors ranked by viewer count.

    NOTE(review): the three regex literals below were garbled when this file
    was extracted (the HTML-tag text inside the strings was stripped and
    replaced by bullet markers). They are reconstructed from the standard
    version of this tutorial -- confirm them against the live page markup.
    """

    url = 'https://www.huya.com/g/1'
    # Outer block that contains one anchor's name and viewer count.
    root_pattern = r'<span class="txt">([\s\S]*?)</span>'
    # Anchor nickname inside a root block.
    name_pattern = r'<i class="nick"[\s\S]*?>([\s\S]*?)</i>'
    # Viewer-count text (e.g. '12.3万') inside a root block.
    # (original attribute was misspelled 'numnber_pattern')
    number_pattern = r'<i class="js-num">([\s\S]*?)</i>'

    def __fetch_content(self):
        """Download the category page and return it as a UTF-8 str.

        If the server ever returns gzip-compressed bytes, decompress first:
            buff = BytesIO(raw); raw = gzip.GzipFile(fileobj=buff).read()
        """
        r = request.urlopen(Spider.url)
        htmls = r.read()
        return str(htmls, encoding='utf-8')

    def __analysis(self, htmls):
        """Extract a list of {'name': [...], 'number': [...]} dicts from the page."""
        anchors = []
        for html in re.findall(Spider.root_pattern, htmls):
            name = re.findall(Spider.name_pattern, html)
            number = re.findall(Spider.number_pattern, html)
            anchors.append({'name': name, 'number': number})
        return anchors

    def __refine(self, anchors):
        """Flatten the findall lists to plain strings and strip the name."""
        l = lambda anchor: {'name': anchor['name'][0].strip(),
                            'number': anchor['number'][0]}
        return map(l, anchors)

    def __sort(self, anchors):
        """Sort anchors by numeric viewer count, largest first."""
        return sorted(anchors, key=self.__sort_seed, reverse=True)

    def __sort_seed(self, anchor):
        """Convert a viewer-count string such as '1,234' or '12.3万' to a float."""
        # Drop thousands separators and any other non-numeric characters.
        r = re.sub(r'[^\d.]', '', anchor['number'])
        r = re.findall(r'[1-9]\d*\.?\d*', r)
        # Guard: the pattern may match nothing (e.g. empty/odd input).
        number = float(r[0]) if r else 0.0
        if '万' in anchor['number']:
            number *= 10000  # '万' means 'ten thousand'
        return number

    def __show(self, anchors):
        """Print the ranked list, one line per anchor."""
        for rank, anchor in enumerate(anchors, start=1):
            print('rank ' + str(rank) + ' : ' + anchor['name']
                  + ' :' + anchor['number'])

    def go(self):
        """Entry point: fetch, parse, normalize, sort and display."""
        htmls = self.__fetch_content()
        anchors = self.__analysis(htmls)
        anchors = list(self.__refine(anchors))
        anchors = self.__sort(anchors)
        self.__show(anchors)


# Guarded so importing this module does not trigger a network request.
if __name__ == '__main__':
    s = Spider()
    s.go()

    你可能感兴趣的:(聚沙成塔,python)