Scraping novels again, 100% original content.

I recently found a website for reading novels online on a phone, and it has no ads at all. Link first:

https://m.lread.net/

That said, reading online is still pretty tiring; it is more enjoyable to download a txt file and read it offline in the 掌阅书城 app, so I set out to scrape it.

Scraping a novel is not hard in itself. The problem is that this site paginates the text, so you cannot get a whole chapter's source from a single page. Each chapter is stored as several HTML pages, you cannot fetch all of them in one go, and the index page gives no hint of how many sub-pages there are.
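To make the structure concrete: from the page analysis behind the code below, a chapter page such as .../read/208/<id>.html is actually served as sub-pages <id>-1.html, <id>-2.html, and so on, and the first sub-page carries a "1/N"-style marker with the total count. Here is a minimal sketch of that URL scheme; the chapter id is made up purely for illustration, the real ids come from the index page.

# Hypothetical chapter URL, only to illustrate the "-<n>.html" naming scheme;
# real chapter URLs are taken from https://m.lread.net/read/208/.
chapter_url = "https://m.lread.net/read/208/12345.html"

def subpage_urls(chapter_url, total_pages):
    # The first sub-page is <id>-1.html, the second <id>-2.html, and so on.
    return [chapter_url.replace(".html", f"-{i}.html")
            for i in range(1, total_pages + 1)]

print(subpage_urls(chapter_url, 3))
# ['https://m.lread.net/read/208/12345-1.html', ..., '.../12345-3.html']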

Still, after a round of HTML analysis and a few regular expressions, I did manage to scrape the novel down. It is single-threaded and not distributed, so it is quite slow, but it will do for now (a threaded sketch follows the code below).

The code is posted below for reference. The key part is the analysis of the page structure; only once you understand that do you know how to scrape it.

#!/usr/bin/env python3
# _*_ coding: utf-8 _*_
# File  : 爬小说.py
# Author: DaShenHan&道长-----先苦后甜,任凭晚风拂柳颜------
# Date  : 2019/11/13

import requests
import re

def getCode(url):  # fetch the page source of the given URL
    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Cookie": "l=AurqcPuigwQdnQv7WvAfCoR1OlrRQW7h; isg=BHp6mNB79CHqYXpVEiRteXyyyKNcg8YEwjgLqoRvCI3ddxqxbLtOFUBGwwOrZ3ad; thw=cn; cna=VsJQERAypn0CATrXFEIahcz8; t=0eed37629fe7ef5ec0b8ecb6cd3a3577; tracknick=tb830309_22; _cc_=UtASsssmfA%3D%3D; tg=0; ubn=p; ucn=unzbyun; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; miid=981798063989731689; hng=CN%7Czh-CN%7CCNY%7C156; um=0712F33290AB8A6D01951C8161A2DF2CDC7C5278664EE3E02F8F6195B27229B88A7470FD7B89F7FACD43AD3E795C914CC2A8BEB1FA88729A3A74257D8EE4FBBC; enc=1UeyOeN0l7Fkx0yPu7l6BuiPkT%2BdSxE0EqUM26jcSMdi1LtYaZbjQCMj5dKU3P0qfGwJn8QqYXc6oJugH%2FhFRA%3D%3D; ali_ab=58.215.20.66.1516409089271.6; mt=ci%3D-1_1; cookie2=104f8fc9c13eb24c296768a50cabdd6e; _tb_token_=ee7e1e1e7dbe7; v=0",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
    }
    resp = requests.request("GET",url,headers=headers)
    resp.encoding = resp.apparent_encoding
    if resp.status_code == requests.codes.ok:
        return resp.text
    else:
        return False
def getNovelList(page_source, reurl="https://m.lread.net"):
    # Match the chapter list on the index page: group 1 is the relative
    # chapter URL, group 2 is the chapter title.
    # NOTE: the HTML tags inside the original pattern were swallowed when the
    # blog rendered this post; the anchor-tag pattern below is a reconstruction
    # and must be checked against the real page source.
    rl = re.compile(r'<a href="(/read/\d+/\d+\.html)">(.*?)</a>')
    novels = rl.findall(page_source)
    novel_list = []
    for i in novels:
        novel_dict = (i[1], reurl + i[0])  # (title, absolute URL)
        novel_list.append(novel_dict)
    return novel_list

def getOneNovel(pagecode):
    # Match the chapter text of a single sub-page.
    # NOTE: reconstructed pattern; the original tags were lost in rendering.
    rl = re.compile(r'<div id="chaptercontent"[^>]*>(.*?)</div>', re.S)
    content = str(rl.findall(pagecode)[0])
    # Strip layout whitespace, then turn HTML entities and line breaks back into text.
    content = (content.replace('\n', '').replace('\t', '').replace(' ', '')
                      .replace("&nbsp;", " ").replace("<br/>", "\n"))
    return content

def oneCapter(url):
    # A chapter at xxx.html is served as sub-pages xxx-1.html, xxx-2.html, ...
    # The first sub-page carries a "(1/N)" marker with the total page count.
    page_source = getCode(url.replace(".html", "-1.html"))
    # NOTE: reconstructed pattern; the original tags were lost in rendering.
    rl = re.compile(r'(.*?)\(1/(.*?)\)')  # match the total page count
    content = rl.findall(page_source)
    page = int(content[0][1])
    all_content = ""
    for i in range(1, page + 1):
        v_url = url.replace(".html", f"-{i}.html")
        all_content += getOneNovel(getCode(v_url))
    return all_content

def download_novel(url="https://m.lread.net/read/208/", name="修仙狂少.txt"):
    page_source = getCode(url=url)
    novelList = getNovelList(page_source)
    print(novelList)
    f = open(name, encoding="utf-8", mode="w+")
    for novel in range(len(novelList)):
        title = novelList[novel][0]
        page_url = novelList[novel][1]
        all_content = title + "\n\n" + oneCapter(page_url) + "\n\n"
        print(f"\rWriting: {title} {novel + 1}/{len(novelList)}", end="")
        f.write(all_content)
    f.close()

if __name__ == '__main__':
    download_novel()

 
