Using a web crawler to scrape novels and assemble them into a readable file

Scraping novels keeps getting harder: sites add more and more restrictions, and their markup changes constantly. Below are some old scripts that used to work; with small tweaks they can still be reused.

```python
# Scrape a novel from biqugexx.com (笔趣阁)
import re
import time

import requests
from requests.exceptions import RequestException

# Widely copied boilerplate for flaky connections; see the notes after
# this block — neither line actually does what its comment claims.
requests.adapters.DEFAULT_RETRIES = 5  # intended: more retries
s = requests.session()
s.keep_alive = False  # intended: close idle connections

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
}

params = {'enc': 'utf-8'}

urlfirst = "https://www.biqugexx.com/0_427/"  # table-of-contents page

response = requests.get(url=urlfirst, params=params, headers=headers)
# The server misreports its charset; response.apparent_encoding is the
# encoding sniffed from the content (utf-8 here), so round-trip through it.
content = response.text.encode(response.encoding).decode(response.apparent_encoding)

k = ""  # accumulates the whole book

# Index entries look like:  <a href='/0_427/123456.html' >第1章 重返地球</a>
# The published post stripped the HTML tags out of the original pattern;
# this is a reconstruction — check it against the live markup.
pattern = r"<a href='(/0_427/[0-9]+)\.html'\s*>(第[0-9]+章[^<]*)</a>"
result = re.findall(pattern, content)

for pp in result:
    time.sleep(3)  # throttle: hitting the site too often gets your IP banned
    k += "\n" + pp[1] + "\n"  # chapter title
    print(pp[1])
    chapter = "https://www.biqugexx.com" + pp[0] + ".html"
    print(chapter)
    try:
        response = requests.get(url=chapter, params=params, headers=headers)
        content = response.text.encode(response.encoding).decode(response.apparent_encoding)
        # Chapter body (reconstructed pattern, same caveat as above)
        body = re.findall(r'<div id="content">(.*?)</div>', content, re.S)
        for r in body:
            r = r.replace("&nbsp;", " ")
            r = r.replace("<br/>", "\n")
            k += r
    except RequestException as e:
        print(e)

with open("biquge.txt", "w+", encoding="utf8") as f:
    f.write(k)
```
A second copy of the same script targeted bokan.cc. Its body is identical to the one above; only the index URL, the chapter-link pattern, and the base URL used to build chapter pages differ, so just those lines are shown:

```python
# Variant for bokan.cc — only these lines differ from the script above.
urlfirst = "https://www.bokan.cc/12/12620/"  # table-of-contents page

# Reconstructed link pattern (the published post lost the HTML tags here too):
pattern = r"<a href='(/12/12620/[0-9]+)\.html'\s*>(第[0-9]+章[^<]*)</a>"

# ...and inside the download loop:
chapter = "https://www.bokan.cc" + pp[0] + ".html"
```

On this site the chapter body also starts with an indented `第…` prefix, so the content pattern may need the same kind of adjustment.
```python
# Scrape a novel from xsbiquge.com
import re
import time

import requests

s = requests.Session()
# url = 'https://www.xxbiquge.com/2_2634/'  # earlier target, superseded
url = 'https://www.xsbiquge.com/91_91879/'  # table-of-contents page
html = s.get(url)
html.encoding = 'utf-8'

# Chapter links and chapter numbers from the index page. Both patterns
# lost their HTML tags when the post was published; these are
# reconstructions — check them against the live markup.
caption_title_1 = re.findall(r'<a href="(/91_91879/[0-9]+\.html)">', html.text)
caption_title_2 = re.findall(r'第([0-9]+)章', html.text)  # e.g. '第1章' -> '1'
print(len(caption_title_1), len(caption_title_2))  # sanity check: same length

# Output file; change the path to taste. Append mode lets reruns
# continue the same file.
path = r'title.txt'
file_name = open(path, 'a', encoding='utf-8')

k = 0
# Download each chapter in order
for i in caption_title_1:
    nn = int(caption_title_2[k])  # chapter number
    k = k + 1
    print(nn)
    if nn > 1000:  # crude resume: skip chapters already downloaded
        time.sleep(1)
        # (the original rebound caption_title_1 here and mixed in the
        # xxbiquge domain; a separate variable avoids both problems)
        chapter_url = 'https://www.xsbiquge.com' + i
        r1 = requests.Session().get(chapter_url)
        r1.encoding = 'utf-8'

        # Chapter title (reconstructed pattern)
        name = re.findall(r'<h1>(.*?)</h1>', r1.text)[0]
        print(name)
        file_name.write(name)
        file_name.write('\n')

        # Chapter body (reconstructed pattern), then scrub site debris
        chapters = re.findall(r'<div id="content">(.*?)</div>', r1.text, re.S)[0]
        chapters = chapters.replace('&nbsp;', '')
        chapters = chapters.replace('readx();', '')
        chapters = chapters.replace('&lt;!--go--&gt;', '')
        chapters = chapters.replace('<!--go-->', '')
        chapters = chapters.replace('()', '')

        # Convert line breaks, then strip any leftover tags
        s_replace = str(chapters).replace('<br/>', '\n')
        while True:
            index_begin = s_replace.find('<')
            if index_begin == -1:
                break
            index_end = s_replace.find('>', index_begin + 1)
            s_replace = s_replace.replace(s_replace[index_begin:index_end + 1], '')
        fiction = re.compile(r'&nbsp;', re.I).sub(' ', s_replace)
        file_name.write(fiction)
        file_name.write('\n')

file_name.close()
```
