Practice: Crawling Novels with Python

  • First post from a newbie, any pointers are welcome >_<
# -*- coding:utf-8 -*-
'''
    Crawl novels by @asdfv
    Save each novel's chapter contents to local files
'''

import re
import threading
import urllib2

from bs4 import BeautifulSoup

# Request a page and return its HTML source
def get_html_content(url):
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    header = {'User-Agent':user_agent}
    request = urllib2.Request(url=url, headers=header)
    html = urllib2.urlopen(request).read()
    return html
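
As a quick sanity check, the helper can be exercised on its own (the URL below is a placeholder, not one from the original post):

    # hypothetical usage; substitute the real novel-site URL
    page = get_html_content('http://example.com/novels')
    print len(page)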

# Accumulates [novel name, novel url] pairs
novel_list = []

def get_novels_list(html):
    soup_novels = BeautifulSoup(html, 'html.parser')
    # Passing a bare string as attrs is BeautifulSoup shorthand for matching
    # on the CSS class, i.e. tags whose class is "l" here
    for string in soup_novels.find_all(attrs="l"):
        for str_name in string.find_all(attrs="clearfix stitle"):
            novel_list.append([str_name.get_text().encode('utf-8'), str_name.get('href')])
    return novel_list
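
After one pass over the listing page, novel_list holds pairs that unpack cleanly (the values shown are illustrative, not taken from the site):

    # e.g. [['Novel A', '/book/123/'], ['Novel B', '/book/456/']]
    for name, url in novel_list:
        print name, url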

# Follow a novel's landing page to the link of its chapter-list page
def turn2novel(novel_chapters_url):
    html = get_html_content(novel_chapters_url)
    if html:
        soup_novel = BeautifulSoup(html, 'html.parser')
        # find() returns None when no tag with class "reader" exists,
        # in which case the .get() chain would raise AttributeError
        return soup_novel.find(attrs="reader").get('href')  # chapter-list url

# Get all chapter names and links for one novel.
# NOTE: the regexes here were mangled when the post was published (the HTML
# tags inside them were stripped); the patterns below are hypothetical
# reconstructions, not the originals.
def novel_chapters_content(chapter):
    html = get_html_content(chapter)
    if html:
        reg_bookname = re.compile(r'<h1>(.*?)</h1>')  # placeholder pattern
        bookname = re.findall(reg_bookname, html)
        reg = re.compile(r'<li><a href="(.*?)">(.*?)</a></li>')  # placeholder pattern
        url_chapters_name = re.findall(reg, html)
        return url_chapters_name

# Get the text content of one chapter
def get_chapter_novel_content(chapter_txt_url):
    html = get_html_content(chapter_txt_url)
    if html:
        html = html.decode('gbk').encode('utf-8')
        # The post breaks off here; a plausible completion, again with a placeholder pattern:
        reg = re.compile(r'<div id="content">(.*?)</div>', re.S)
        return re.findall(reg, html)
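
The end of the original post, where the pieces are tied together and chapters are written to disk (presumably where the threading import comes in), did not survive publication. Below is a minimal driver sketch under those assumptions; the listing URL, the file naming, and the tuple order yielded by the placeholder chapter regex are all guesses:

    # --- hypothetical driver, not part of the original post ---
    def save_novel(novel_name, novel_url):
        chapter_page = turn2novel(novel_url)
        if not chapter_page:
            return
        with open(novel_name + '.txt', 'w') as f:
            for chapter_url, chapter_name in novel_chapters_content(chapter_page):
                content = get_chapter_novel_content(chapter_url)
                f.write(chapter_name + '\n')
                if content:
                    f.write(content[0] + '\n')

    if __name__ == '__main__':
        index_html = get_html_content('http://example.com/novels')  # placeholder URL
        threads = []
        # one thread per novel, joined so the script waits for all downloads
        for name, url in get_novels_list(index_html):
            t = threading.Thread(target=save_novel, args=(name, url))
            t.start()
            threads.append(t)
        for t in threads:
            t.join()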
