A beginner crawler for scraping novels from Biquge (笔趣阁)

import requests
from pyquery import PyQuery as pq 

def get_content(a):
    response = requests.get(a)
    response.encoding = 'gbk'              # the site serves GBK-encoded pages
    doc = pq(response.text)
    text = doc('#content.showtxt')         # chapter body
    bookname = doc('div.bookname h1')      # chapter title
    c = str(bookname)                      # title as an HTML string
    a = str(text)                          # body as an HTML string (reuses the parameter name)
    # basic filtering: strip the <br/> line breaks, the site's promo line, and non-breaking spaces
    # (the literal <br/> tags were swallowed when the post was rendered, so they are reconstructed here)
    b = a.replace("<br/><br/>", "\n") \
         .replace('<br /><br />', '\n') \
         .replace('<br/> 请记住本书首发域名:www.biqugexsw.com。笔趣阁小说网手机版阅读网址:m.biqugexsw.com<br/>', '') \
         .replace('\xa0', '') \
         .replace('<br/>', '')
    file = open(r'F:\python\小说下载区\小说.txt', 'a+')
    file.write(c)                          # write the chapter title (the original wrote `a`, the unfiltered body)
    file.write(b)                          # write the cleaned chapter text
    file.close()

def get_mulu():
    index_url = 'https://www.biqugexsw.com/71_71883/'   # replace with the index URL of any novel on the site
    response = requests.get(index_url)
    response.encoding = response.apparent_encoding      # let requests guess the page encoding
    doc = pq(response.text)
    urls = doc('div.listmain a')                        # links to every chapter
    length = int(len(urls))
    count = 0
    for i in urls.items():
        a = 'https://www.biqugexsw.com/' + i.attr.href  # absolute URL of one chapter
        get_content(a)
        count += 1
        print('进度:%0.5f' % (count / length * 100) + '%')   # print progress as a percentage

get_mulu()

This is an entry-level crawler. A novel of about 3,500 chapters took roughly 20 minutes to download, which is acceptable. There were no stalls or timeouts along the way, so this site presumably has no anti-scraping measures in place.
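The site tolerated a long run of sequential requests, but a crawler aimed at a less forgiving site would want a few defensive tweaks. Below is a minimal sketch of a hardened fetch helper built on the same requests library; the User-Agent string, timeout, retry count, and back-off delay are illustrative assumptions, not part of the original script.

import time
import requests

session = requests.Session()
# a browser-like User-Agent makes outright rejection less likely
session.headers.update({'User-Agent': 'Mozilla/5.0'})

def fetch(url, retries=3):
    # GET a page with a timeout, simple retries, and exponential back-off
    for attempt in range(retries):
        try:
            response = session.get(url, timeout=10)    # fail fast instead of hanging
            response.encoding = 'gbk'                   # same encoding assumption as the main script
            return response.text
        except requests.RequestException:
            time.sleep(2 ** attempt)                    # wait 1s, 2s, 4s ... before retrying
    raise RuntimeError('giving up on ' + url)

Swapping this in for the bare requests.get calls in get_content and get_mulu would reuse one connection via the Session and avoid hanging on a dead chapter page.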
