Don't repeat yourself — 温故而知新

import requests, re
from urllib.parse import urljoin
# Write to file
def fileWrite(title, message, name):
    """Append one titled section to the text file ``name + ".txt"``.

    The title and the message are each written followed by a newline, and a
    success line is printed afterwards.

    Args:
        title: Heading line for the section.
        message: Body text for the section.
        name: Output path without the ``.txt`` suffix.
    """
    # Explicit UTF-8: the platform default codec may be unable to encode
    # the text being written (e.g. a legacy code page on Windows).
    with open(name + ".txt", "a", encoding="utf-8") as f:
        f.write(title + "\n" + message + "\n")
    print(title, "下载成功!")
# Fetch a web page
def getWebPage(url, timeout=10):
    """Download *url* and return the response body as text.

    The response encoding is set from ``apparent_encoding`` (the encoding
    detected from the body) before the text is decoded.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded page text, or None when the request fails (an error
        line is printed in that case).
    """
    headers = {"User-Agent": "Mozilla/5.0 (iPad; CPU OS 11_0 like Mac OS X) "
                             "AppleWebKit/604.1.34 (KHTML, like Gecko) Versio"
                             "n/11.0 Mobile/15A5341f Safari/604.1", }
    try:
        # headers must be passed by keyword: the second positional parameter
        # of requests.get is `params`, which would put the User-Agent into
        # the query string instead of the request headers.
        res = requests.get(url, headers=headers, timeout=timeout)
        res.encoding = res.apparent_encoding
        return res.text
    except requests.RequestException:
        print("页面请求出错!")
        return None
# Extract links
def getLink(page, url, paternName, paternLink):
    """Extract a name and the list of links from an index page.

    Args:
        page: Page text to search (None is tolerated and treated as failure).
        url: Base URL used to resolve relative links.
        paternName: Regex whose first match is returned as the name.
        paternLink: Regex whose matches are the link targets; it is applied
            with the MULTILINE, DOTALL and IGNORECASE flags.

    Returns:
        A ``(name, links)`` tuple where ``links`` is an iterator of absolute
        URLs, or None when extraction fails (an error line is printed).
    """
    try:
        links = re.findall(paternLink, page, re.M | re.S | re.I)
        # Convert relative paths to absolute ones
        links = iter([urljoin(url, link) for link in links])
        name = re.findall(paternName, page)[0]
        return name, links
    except (TypeError, IndexError, ValueError, re.error):
        # TypeError: page is None or a pattern produced tuple matches;
        # IndexError: the name pattern matched nothing;
        # ValueError: urljoin rejected a URL; re.error: invalid pattern.
        print("提取页面信息错误!")
        return None
# Fetch chapter content
def getContent(name, link, paternTitle, paternBody):
    """Download one chapter page and append it to the novel's text file.

    Args:
        name: Novel name; passed to fileWrite as the output file base name.
        link: URL of the chapter page.
        paternTitle: Regex whose first match is the chapter title.
        paternBody: Regex whose first match is the chapter body.

    Failures (page not fetched, pattern not matched, file not writable) are
    reported with a printed message and the chapter is skipped.
    """
    page = getWebPage(link)
    try:
        title = re.findall(paternTitle, page)[0]
        content = re.findall(paternBody, page)[0]
        # NOTE(review): the two tokens being replaced were lost when this
        # file was extracted from a rendered web page. They are assumed to
        # have been the non-breaking-space entity and the <br /> tag;
        # confirm against the original source.
        content = content.replace("&nbsp;", " ").replace("\xa0", " ")
        content = re.sub(r"<br\s*/?>", "\n", content)
        fileWrite(title, content, name)
    except (TypeError, IndexError, AttributeError, re.error, OSError):
        # TypeError: page is None; IndexError: a pattern matched nothing;
        # AttributeError: a multi-group pattern yielded a tuple;
        # OSError: the output file could not be written.
        print("提取小说页面错误!")


# main
def main(url, paternName, paternLink, paternTitle, paternBody):
    """Download every chapter linked from the index page at *url*.

    Args:
        url: Index page of the novel.
        paternName: Regex for the novel name on the index page.
        paternLink: Regex for the chapter links on the index page.
        paternTitle: Regex for the chapter title on a chapter page.
        paternBody: Regex for the chapter body on a chapter page.
    """
    page = getWebPage(url)
    result = getLink(page, url, paternName, paternLink)
    if result is None:
        # getLink returns None (after printing its own error) on failure;
        # unpacking that directly would raise TypeError.
        return
    name, links = result
    for link in links:
        getContent(name, link, paternTitle, paternBody)
    print("小说下载完毕!")


if __name__ == '__main__':
    # NOTE(review): the HTML tags inside these four patterns were stripped
    # when this file was extracted from a rendered web page; only the
    # "(.*?)" / ".*?" parts survived. The tags below are a reconstruction
    # and must be confirmed against the target site's actual markup.
    paternName = r"<h1>(.*?)</h1>"
    paterLink = r"<dd><a href='(.*?)' >.*?</a></dd>"
    paternTitle = r"<h1>(.*?)</h1>"
    paternBody = r'<div id="content">(.*?)<p>.*?</p></div>'
    main("http://www.xbiquge.la/2/2208/", paternName, paterLink, paternTitle, paternBody)

你可能感兴趣的:(Python)