'''
Novel scraper, by @asdfv
Saves each novel's chapter content to local files.
'''
import re
import threading
import urllib2

from bs4 import BeautifulSoup

def get_html_content(url):
    '''Download a page, sending a desktop-browser User-Agent to get past naive bot blocking.'''
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    header = {'User-Agent': user_agent}
    request = urllib2.Request(url=url, headers=header)
    html = urllib2.urlopen(request).read()
    return html
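
# Shared accumulator: [title, url] pairs gathered from the catalogue page.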
novel_list = []

def get_novels_list(html):
    '''Parse the catalogue page and collect a [title, url] pair for every novel.'''
    soup_novels = BeautifulSoup(html, 'html.parser')
    for string in soup_novels.find_all(attrs="l"):
        for str_name in string.find_all(attrs="clearfix stitle"):
            novel_list.append([str_name.get_text().encode('utf-8'), str_name.get('href')])
    return novel_list

def turn2novel(novel_chapters_url):
    '''Open a novel's landing page and return the link to its reader (chapter list) page.'''
    html = get_html_content(novel_chapters_url)
    if html:
        soup_novel = BeautifulSoup(html, 'html.parser')
        return soup_novel.find(attrs="reader").get('href')

def novel_chapters_content(chapter):
    '''Return (url, name) pairs for every chapter listed on the reader page.

    NOTE: the original regex literals lost their HTML tags when this file was
    extracted; the patterns below are guessed placeholders and must be adapted
    to the target site's actual markup.
    '''
    html = get_html_content(chapter)
    if html:
        reg_bookname = re.compile(r'<title>(.*?)</title>')
        bookname = re.findall(reg_bookname, html)
        reg = re.compile(r'<a href="(.*?)">(.*?)</a>')
        url_chapters_name = re.findall(reg, html)
        return url_chapters_name

def get_chapter_novel_content(chapter_txt_url):
    '''Download one chapter page and pull out its body text.'''
    html = get_html_content(chapter_txt_url)
    if html:
        # The site serves GBK-encoded pages; re-encode to UTF-8 before matching.
        html = html.decode('gbk').encode('utf-8')
        # NOTE: the original pattern was truncated in the source; this is a
        # guessed placeholder for the chapter-body markup.
        reg = re.compile(r'<div id="content">(.*?)</div>', re.S)
        content = re.findall(reg, html)
        return content[0] if content else ''
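
# --- Usage sketch (not part of the original file, which is truncated above) ---
# A minimal driver under these assumptions: CATALOGUE_URL is a hypothetical
# placeholder for the site's novel index, extracted hrefs are absolute URLs,
# and the otherwise-unused threading import fetches one novel per thread.

def save_novel(name, url):
    # Resolve the reader page, then append every chapter to <name>.txt.
    reader_url = turn2novel(url)
    chapters = novel_chapters_content(reader_url) if reader_url else None
    if not chapters:
        return
    with open(name + '.txt', 'a') as f:
        for chapter_url, chapter_name in chapters:
            f.write(chapter_name + '\n')
            f.write((get_chapter_novel_content(chapter_url) or '') + '\n')

if __name__ == '__main__':
    CATALOGUE_URL = 'http://example.com/novels'  # hypothetical placeholder
    threads = []
    for name, url in get_novels_list(get_html_content(CATALOGUE_URL)):
        t = threading.Thread(target=save_novel, args=(name, url))
        t.start()
        threads.append(t)
    for t in threads:
        t.join()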