A mini web-crawler demo in plain Python

from urllib import request
import re

'''
Frameworks such as BeautifulSoup or Scrapy could do this instead; real-world
crawling also involves anti-crawling measures, counter-measures, IP bans and
proxy IPs.

The pipeline of this demo:

    fetch the content
    extract the content
    refine the content
    sort the content
    output it (print, store in a database, etc.)
'''

class Spider():

    url = "https://www.panda.tv/cate/lol"

    # Regex notes: [] defines a character class, * repeats it any number of
    # times, ? after * makes the repetition non-greedy, and () captures only
    # the part in between; [\s\S] matches any character, newlines included.
    # The HTML tags below reflect panda.tv's listing markup at the time.
    root_pattern = r'<div class="video-info">([\s\S]*?)</div>'
    name_pattern = r'</i>([\s\S]*?)</span>'
    number_pattern = r'<span class="video-number">([\s\S]*?)</span>'
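    # The non-greedy '?' is what keeps each match inside a single tag pair.
    # Illustration (made-up markup, not the real page):
    #   re.findall(r'<b>([\s\S]*?)</b>', '<b>A</b><b>B</b>')  ->  ['A', 'B']
    #   re.findall(r'<b>([\s\S]*)</b>',  '<b>A</b><b>B</b>')  ->  ['A</b><b>B']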

    def __fetch_content(self):  # fetch the raw HTML
        r = request.urlopen(Spider.url)
        htmls = r.read()
        htmls = str(htmls, encoding='utf-8')  # decode bytes -> str
        return htmls
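    # Note: if the server rejects urllib's default user agent, wrapping the
    # URL in a request.Request with a browser-like header usually helps, e.g.
    #   req = request.Request(Spider.url, headers={'User-Agent': 'Mozilla/5.0'})
    #   r = request.urlopen(req)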

    def __analysis(self, htmls):  # extract the content
        root_html = re.findall(Spider.root_pattern, htmls)
        # print(root_html[0])
        anchors = []
        for html in root_html:
            name = re.findall(Spider.name_pattern, html)
            number = re.findall(Spider.number_pattern, html)
            anchor = {'name': name, 'number': number}
            anchors.append(anchor)
        return anchors
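    # re.findall returns a list, so at this point each entry still looks
    # something like {'name': ['\n  SomeStreamer\n'], 'number': ['1.5万']},
    # which is why __refine below indexes with [0].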

    def __refine(self, anchors):  # refine (strip whitespace, newlines, etc.)
        l = lambda anchor: {  # lambda expression
            'name': anchor['name'][0].strip(),
            'number': anchor['number'][0]
        }
        return list(map(l, anchors))
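    # Example: {'name': ['\n  Anna  \n'], 'number': ['2.1万']}
    # refines to {'name': 'Anna', 'number': '2.1万'}.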

    def __sort(self, anchors):  # sort
        # reverse=True sorts in descending order (largest audience first)
        anchors = sorted(anchors, key=self.__sort_seed, reverse=True)
        return anchors

    def __sort_seed(self, anchor):  # sort key: entries with '万' count x10000
        r = re.findall(r'\d+\.?\d*', anchor['number'])  # keep the decimal part too
        number = float(r[0])
        if '万' in anchor['number']:
            number *= 10000
        return number
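    # Example: '2.1万' -> 2.1 * 10000 = 21000.0, while '980' -> 980.0,
    # so counts with and without the 万 unit compare correctly.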

    def __show(self, anchors):  # print the result
        for rank in range(0, len(anchors)):
            print('rank:' + str(rank + 1) + ';' + 'name:' + anchors[rank]['name']
                  + ';' + 'number:' + anchors[rank]['number'] + ';')

    def go(self):  # entry point: run the whole pipeline
        htmls = self.__fetch_content()
        anchors = self.__analysis(htmls)
        anchors = self.__refine(anchors)
        anchors = self.__sort(anchors)
        self.__show(anchors)

spider = Spider()

spider.go()
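The header comment mentions BeautifulSoup as an alternative to hand-rolled
regexes. Below is a minimal sketch of the same extraction step with bs4
(install with pip install beautifulsoup4); the tag and class names are
assumptions inferred from the regex patterns above, not verified against the
live page:

    from bs4 import BeautifulSoup

    def analysis_bs4(htmls):
        # Parse once, then walk the tree instead of regex-scanning raw text.
        soup = BeautifulSoup(htmls, 'html.parser')
        anchors = []
        for item in soup.find_all('div', class_='video-info'):
            name = item.find('span', class_='video-nickname')   # assumed class name
            number = item.find('span', class_='video-number')   # assumed class name
            if name and number:
                anchors.append({'name': name.get_text(strip=True),
                                'number': number.get_text(strip=True)})
        return anchors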


Crawl results:
