python爬取小说

一、准备

安装 requests、pyquery 两个库

二、使用

定义了 searchbook 类,初始化时传入小说第一章 url 和小说名即可
再调用all_content方法即可

# -*-coding:utf8-*-
import re
from urllib.parse import urljoin

import requests
from requests.exceptions import RequestException
from pyquery import PyQuery as pq

class searchbook:
    """Scrape a web novel chapter by chapter, appending each chapter to
    '<bookname>.txt'.

    Construct with the URL of the novel's first chapter and the book name,
    then call all_content().
    """

    # Browser-like User-Agent so the site does not reject the scraper.
    _headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
    }

    def __init__(self, url, bookname):
        """url: URL of the first chapter; bookname: output file stem."""
        self.url = url
        self.bookname = bookname
        # scheme://host, kept for backward compatibility with callers that
        # read it (chapter links are now resolved with urljoin instead).
        self.baseurl = url.split('/')[0] + '//' + url.split('/')[2]

    def __writeintxt__(self, info):
        """Append one piece of text plus a '=' separator line to the book file."""
        newname = self.bookname + '.txt'
        with open(newname, 'a+', encoding='utf-8') as file:
            file.write(info)
            file.write('\n' + '=' * 20 + '\n')

    def __getallchapter__(self, url):
        """Fetch chapters starting at *url*, following the '下一章' (next
        chapter) link until the navigation bar disappears.

        Iterative rather than recursive: a long novel (thousands of
        chapters) would otherwise exceed Python's recursion limit.
        """
        while url:
            try:
                resp = requests.get(url, headers=self._headers)
            except RequestException:
                print('error')
                return
            # Sites often mis-declare their charset; codemode() picks a
            # usable encoding before .text decodes the body.
            resp.encoding = self.codemode(resp)
            if resp.status_code != 200:
                print('end')
                return
            doc = pq(resp.text)
            # Chapter title and body text (site-specific CSS selectors).
            title = doc('.bookname h1').text()
            self.__writeintxt__(title)
            self.__writeintxt__(doc('#content').text())
            # Collect the navigation hrefs; a full bar has exactly three
            # links (prev / table of contents / next) and the last is "next".
            nav = [a.attr.href for a in doc('.bottem1 a').items()
                   if a.text() in ('上一章', '章节目录', '下一章', '章节列表')]
            if len(nav) != 3 or not nav[-1]:
                return  # no next-chapter link: book finished
            # urljoin resolves absolute, root-relative and relative hrefs
            # against the current page, replacing the old ad-hoc string
            # concatenation which broke on relative links.
            next_url = urljoin(url, nav[-1])
            print(title, '\n', next_url)
            url = next_url

    def getcharset(self, content):
        """Return every charset declared in <meta> tags of the HTML *content*.

        Adapted from requests.utils.get_encodings_from_content (the basis of
        Response.apparent_encoding); kept here so the detection can be
        overridden, but unused in this example.
        """
        # NOTE(review): the original regex was garbled to r']' in the posted
        # source; restored from the requests source it claims to copy.
        charset = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
        found = charset.findall(content)
        print(found)
        return found

    def all_content(self):
        """Public entry point: download every chapter starting at self.url."""
        self.__getallchapter__(self.url)

    def getcontent(self, info):
        """Placeholder for a future refactor of __getallchapter__; unused."""
        pass

    def codemode(self, getrequest):
        """Pick a usable text encoding for *getrequest* (a requests.Response).

        When a server sends no charset, requests falls back to
        ISO-8859-1/Windows-1252; the target novel sites are GBK-encoded
        Chinese pages, so override that default. Otherwise trust
        apparent_encoding's content-based detection.
        """
        if getrequest.encoding in ('Windows-1252', 'ISO-8859-1'):
            return 'gbk'
        return getrequest.apparent_encoding
        
   

你可能感兴趣的:(Python)