# 正则爬虫 (regex-based crawler)
import requests
import re
# Script body: requests a book index page, pulls a chapter title and chapter
# text out of HTML with regular expressions, prints a progress percentage and
# appends the chapter to a UTF-8 text file.
#
# NOTE(review): this text looks damaged by extraction. Several statements and
# quoted literals below are split across physical lines, and the names
# `chapter`, `name` and `count` are read but never assigned anywhere in the
# visible code. Presumably a loop over chapters, the per-chapter request and
# parts of the regex patterns were lost -- confirm against the original file
# before running or refactoring.

# Index page to fetch, plus a browser-like User-Agent header sent with the
# request.
target="https://www.vodtw.com/Html/Book/59/59089/"
headers ={
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36'
}
# Fetch the page and set the response encoding to GBK. The response object is
# not read again in the visible code.
req_text = requests.get(url=target,headers=headers)
req_text.encoding='gbk'
# Each lookup below keeps only the first regex match found in `chapter`; the
# `[0]` raises IndexError when the pattern does not match at all.
# NOTE(review): `li_list` ends up as that first match, so `len(li_list)`
# further down would not be a number of chapters -- presumably a step that
# built the chapter list is missing; verify.
li_list = re.findall('i>(.*?)',chapter)[0]
chapter_title = re.findall('id="htmltimu"> (.*?) ',chapter)[0]
# Extract the chapter body and remove three substrings from it through chained
# `.replace(..., '')` calls.
# NOTE(review): the quoted literals on the following lines are broken across
# physical lines exactly as shown; what they originally contained cannot be
# determined from here.
chapter_text=(re.findall('3px;">
([\W\w]*?)
',chapter)[0]).replace('
'
,'').replace
('
','').replace
(' ','')
# Progress expressed as a percentage of `len(li_list)`.
percent
= count
/ len(li_list
) * 100
# Print the progress for `name`; `end='\r'` returns the cursor to the start of
# the line instead of emitting a newline, so the next print overwrites it.
print('%s 下载进度 %0.1f %%'%(name
,percent
),end
='\r')
# Advance the progress counter by one.
count
= count
+ 1
# Append the chapter title and text, each followed by a newline, to
# `<name>.txt`. Mode 'a' keeps whatever was written to the file earlier.
with open(name
+'.txt', 'a',encoding
='utf-8') as f
:
f
.write
(chapter_title
+'\n'+chapter_text
+'\n')
# Final status message, printed on a fresh line.
print('\n'+'下载完成...')