先上代码
小说的目录页面是
http://www.lhh1.com/modules/article/reader.php?aid=33
# coding: utf-8
import urllib
from urllib import request
import os
from os import path
import re
# Output directory (inside the current working directory) that receives
# one text file per downloaded chapter.
result_dir = path.join(os.getcwd(), 'result')
# exist_ok=True makes the creation idempotent and avoids the race between
# checking for the directory and creating it.
os.makedirs(result_dir, exist_ok=True)
# Encoding trouble early on led to copying the index page's HTML by hand
# into src.txt saved as UTF-8; the page itself turned out to be GBK-encoded,
# but the workaround was left in place.
# The context manager closes the file handle as soon as the text is read.
with open('src.txt', encoding='utf-8') as src_file:
    src = src_file.read()
# Find the chapter entries in the saved index page, then fetch each chapter
# page and write its text to a file in result_dir.
# NOTE(review): this pattern has a single capture group, yet the loop below
# reads name[0] as the URL and name[1] as the title. The HTML tag delimiters
# of the regex appear to have been lost -- restore them before running.
pattern = '(.+?)'
res1 = re.compile(pattern,re.S).findall(src)
# The slice [1:2] limits the run to the second match only.
for name in res1[1:2]:
url = name[0]
# Clean up the URL.
pattern_url = "(.+?)amp;(.*)" # strip the escape-sequence text
# NOTE(review): this rebinds res1, the name the loop iterable was built from.
res1 = re.compile(pattern_url, re.S).findall(url)
# Join the text before and after the first "amp;", dropping "amp;" itself.
url = res1[0][0] + res1[0][1]
title =name[1]
print(url,title)
# NOTE(review): the string literal below is unterminated; the rest of the
# pattern seems to have been lost -- TODO recover it.
pattern_br = '\ \ \ \ (.+?)
html = request.urlopen(url).read()
html = html.decode('gbk') # encoding used to decode the chapter page
res2 = re.compile(pattern_br, re.S).findall(html)
# print(html)
# print(res2)
# NOTE(review): opened without an explicit encoding, and no close() is
# visible in this excerpt -- confirm against the full script.
resfile = open(path.join(result_dir,title+'.txt'),'w') # one file per chapter
words = ''
for x in res2:
# NOTE(review): the statement below is cut off in this excerpt.
res3 = re.compile('(.+)\
里面有两个坑:一个是编码问题,另一个是 URL 中的转义字符问题。HTML 中常见的转义字符有 5 个:&lt;、&gt;、&amp;、&quot;、&copy;,分别是 <、>、&、"、© 的转义字符。因此从页面源码中提取出的 URL 需要进行校正(例如把 &amp; 还原为 &)。
结果: