My First Little Crawler: Downloading a Novel from Quanben5

Environment: Python 3.7.0

Library to install: requests

Step 1: fetch the page source (test URL: http://quanben5.com/n/yuzui/xiaoshuo.html)

import requests

# Fetch the page source
def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException:
        return None


def main():
    url = 'http://quanben5.com/n/yuzui/xiaoshuo.html'
    html = get_one_page(url)
    print(html)

main()

Screenshot of the run:

[screenshot: the page's raw HTML printed to the console]
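Before moving on, one thing worth checking: requests guesses the text encoding from the HTTP headers, and for Chinese pages that guess is sometimes wrong, which shows up as garbled characters in response.text. A minimal sketch of falling back to the encoding detected from the page body (apparent_encoding is part of requests; whether quanben5.com actually needs this is an assumption, the site may already declare UTF-8 correctly):

import requests

url = 'http://quanben5.com/n/yuzui/xiaoshuo.html'
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get(url, headers=headers)

print(response.encoding)           # the encoding requests guessed from the headers
# If response.text looks garbled, override the guess with the encoding
# detected from the page body before reading .text again.
response.encoding = response.apparent_encoding
print(response.text[:200])         # first 200 characters, decoded with the detected encoding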

Step 2: collect the URLs of all chapters

This step also uses Python's built-in re module.

import re
import requests

# Fetch the page source
def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException:
        return None

# Collect the URL of every chapter
def get_url(html):
    # Pull the chapter links out with a regular expression.
    # Note: this pattern assumes the chapter list is made of
    # <li class="c3"><a href="..." title="..."> entries; adjust it if the site's markup differs.
    URL = re.findall('<li class="c3"><a href="(.*?)" title=".*?">', html, re.S)
    list_url = []    # a list that stores every chapter URL
    for url_ in URL:
        list_url.append('http://quanben5.com' + url_)
    for url_ in list_url:
        print(url_)


def main():
    url = 'http://quanben5.com/n/yuzui/xiaoshuo.html'
    html = get_one_page(url)
    get_url(html)

main()
Output:

[screenshot: the chapter URLs printed one per line]
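Because the exact regular expression depends on the site's current markup, it pays to sanity-check the pattern against a tiny hand-written snippet before pointing it at the real page. A quick sketch (the HTML fragment below is made up purely for illustration):

import re

# A made-up fragment shaped like the chapter list, just to exercise the pattern
sample = '''
<li class="c3"><a href="/n/yuzui/41935.html" title="Chapter 1">Chapter 1</a></li>
<li class="c3"><a href="/n/yuzui/41936.html" title="Chapter 2">Chapter 2</a></li>
'''

pattern = '<li class="c3"><a href="(.*?)" title=".*?">'
# re.S lets '.' also match newlines, so the pattern still works if an
# entry happens to be wrapped across lines in the real page.
print(re.findall(pattern, sample, re.S))
# ['/n/yuzui/41935.html', '/n/yuzui/41936.html']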

Grab the content of a single chapter:

import re
import requests

# Fetch the page source
def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException:
        return None

'''
# Collect the URL of every chapter (not needed for this step, so it stays commented out)
def get_url(html):
    # Pull the chapter links out with a regular expression
    URL = re.findall('<li class="c3"><a href="(.*?)" title=".*?">', html, re.S)
    list_url = []    # a list that stores every chapter URL
    for url_ in URL:
        list_url.append('http://quanben5.com' + url_)
    for url_ in list_url:
        print(url_)
'''

# Grab the content of a single chapter
def get_content(html):
    # Note: these patterns assume the chapter title sits in an <h1> tag and
    # each paragraph in a <p> tag; adjust them if the site's markup differs.
    title = re.findall('<h1.*?>(.*?)</h1>', html, re.S)
    title = title[0]
    print(title)
    content = re.findall('<p>(.*?)</p>', html, re.S)
    for sentence in content:
        print(sentence)

def main():
    url = 'http://quanben5.com/n/yuzui/41935.html'
    html = get_one_page(url)
    #get_url(html)
    get_content(html)

main()
Output:

[screenshot: the chapter title followed by its paragraphs]
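Depending on the page, paragraphs pulled out with a bare regex can still carry stray tags or HTML entities such as &quot;. If that happens, a small clean-up helper (a hypothetical addition, not part of the original script) built only on the standard library is enough:

import html
import re

def clean_sentence(sentence):
    # Drop any leftover tags, then decode entities like &quot; or &nbsp;
    sentence = re.sub(r'<[^>]+>', '', sentence)
    return html.unescape(sentence).strip()

print(clean_sentence('<p>He said &quot;hello&quot;<br/></p>'))
# He said "hello"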

Download a single chapter to a file:

import re
import requests

# Fetch the page source
def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException:
        return None

'''
# Collect the URL of every chapter (not needed for this step, so it stays commented out)
def get_url(html):
    # Pull the chapter links out with a regular expression
    URL = re.findall('<li class="c3"><a href="(.*?)" title=".*?">', html, re.S)
    list_url = []    # a list that stores every chapter URL
    for url_ in URL:
        list_url.append('http://quanben5.com' + url_)
'''

# Grab the content of a single chapter
def get_content(html):
    # Note: these patterns assume an <h1> title and <p> paragraphs; adjust if needed
    title = re.findall('<h1.*?>(.*?)</h1>', html, re.S)
    title = title[0]
    print(title)
    write_to_file(title)
    content = re.findall('<p>(.*?)</p>', html, re.S)
    for sentence in content:
        print(sentence)
        write_to_file(sentence)
    write_to_file('\n')

# Save the content to a local file
def write_to_file(content):
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(content + '\n')

def main():
    url = 'http://quanben5.com/n/yuzui/41935.html'
    html = get_one_page(url)
    #get_url(html)
    get_content(html)

main()
Output:

[screenshot: the chapter printed to the console]

[screenshot: result.txt containing the saved chapter]
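A side note on write_to_file: it reopens the file for every single line, which is fine for one chapter but wasteful over a whole book. A sketch of an alternative that opens the file once per chapter (write_chapter is a hypothetical helper, not part of the script above):

def write_chapter(title, sentences, filename='result.txt'):
    # Open the file once and append the whole chapter in one go
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(title + '\n')
        for sentence in sentences:
            f.write(sentence + '\n')
        f.write('\n')

# Example call with made-up content:
write_chapter('Chapter 1', ['First paragraph.', 'Second paragraph.'])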

Download the whole novel:

import re
import requests

# Fetch the page source
def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException:
        return None

# Collect the URL of every chapter
def get_url(html):
    # Pull the chapter links out with a regular expression (same pattern assumption as above)
    URL = re.findall('<li class="c3"><a href="(.*?)" title=".*?">', html, re.S)
    list_url = []    # a list that stores every chapter URL
    for url_ in URL:
        list_url.append('http://quanben5.com' + url_)
    return list_url

# Grab the content of a single chapter
def get_content(html):
    title = re.findall('<h1.*?>(.*?)</h1>', html, re.S)
    title = title[0]
    print(title + ' download started')
    write_to_file(title)
    content = re.findall('<p>(.*?)</p>', html, re.S)
    for sentence in content:
        write_to_file(sentence)
    write_to_file('\n')

# Save the content to a local file
def write_to_file(content):
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(content + '\n')

# Download every chapter
def save_content(list_url):
    for url_ in list_url:
        html_ = get_one_page(url_)
        get_content(html_)

def main():
    url = 'http://quanben5.com/n/yuzui/xiaoshuo.html'
    html = get_one_page(url)
    list_url = get_url(html)
    save_content(list_url)

main()
Output:

[screenshot: chapters being downloaded one after another]

[screenshot: result.txt containing the whole novel]
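Downloading every chapter fires requests in a tight loop, which is exactly the kind of traffic a site tends to throttle (see the note at the very end of this post). A hedged tweak to save_content that pauses between chapters, reusing get_one_page and get_content from the script above; the one-second delay is an arbitrary choice:

import time

def save_content(list_url, delay=1.0):
    # Same loop as above, with a short pause between requests so the
    # server is hit less aggressively and rate limiting is less likely.
    for url_ in list_url:
        html_ = get_one_page(url_)
        if html_ is not None:     # get_one_page returns None on failure
            get_content(html_)
        time.sleep(delay)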

Adding a search feature:

[screenshot: the site's search page]

After searching:

[screenshot: the search result page]

Use a regular expression to pull out the target URL we want:

import re
import requests

# keyword is a global variable; it will also be used later to name the text file
keyword = input('Enter the novel you want to download: ')

# Fetch the page source
def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException:
        return None

def main():
    url = 'http://quanben5.com/index.php?c=book&a=search&keywords=' + keyword
    html = get_one_page(url)
    # Pull the book's URL path out of the search result page.
    # Note: this pattern assumes the first result links to the book as /n/<book>/;
    # adjust it if the result page's markup differs.
    url1 = re.findall(r'<a href="(/n/.*?)/"', html, re.S)
    url1 = url1[0]
    print(url1)
    url2 = 'http://quanben5.com' + url1 + '/xiaoshuo.html'
    print(url2)

main()

     

Run it:

Sometimes the search finds nothing; url1[0] then raises an IndexError on the empty list and the program exits.

Next step: add handling for that case:

import re
import requests

# keyword is a global variable; it will also be used later to name the text file
keyword = input('Enter the novel you want to download: ')

# Fetch the page source
def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException:
        return None

def main():
    url = 'http://quanben5.com/index.php?c=book&a=search&keywords=' + keyword
    html = get_one_page(url)
    # Same pattern assumption as above: the first result links to the book as /n/<book>/
    url1 = re.findall(r'<a href="(/n/.*?)/"', html, re.S)
    if url1 == []:
        print('Nothing found!!!')
    else:
        url1 = url1[0]
        url2 = 'http://quanben5.com' + url1 + '/xiaoshuo.html'
        print(url2)

main()

     

Output:

[screenshot: a successful search printing the catalogue URL]

[screenshot: an unsuccessful search printing the "Nothing found" message]
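One more detail about the search URL before putting everything together: the keyword is concatenated into the URL as-is. requests usually copes, but Chinese characters in a query string are normally percent-encoded; urllib.parse.quote from the standard library makes that explicit. A small sketch (the keyword below is just an example):

from urllib.parse import quote

keyword = '小说名'    # example input
url = 'http://quanben5.com/index.php?c=book&a=search&keywords=' + quote(keyword)
print(url)
# http://quanben5.com/index.php?c=book&a=search&keywords=%E5%B0%8F%E8%AF%B4%E5%90%8D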

--------------------------------------------------------------------------------

Finally, the complete code:

import os
import re
import sys
import requests

# keyword is a global variable; it is also used to name the output text file
keyword = input('Enter the novel you want to download: ')
name = str(keyword) + '.txt'

# Fetch the page source
def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except requests.RequestException:
        return None

# Collect the URL of every chapter
def get_url(html):
    # Note: the patterns in this script assume the markup described in the earlier
    # steps; adjust them if the site's pages differ.
    URL = re.findall('<li class="c3"><a href="(.*?)" title=".*?">', html, re.S)
    list_url = []    # a list that stores every chapter URL
    for url_ in URL:
        list_url.append('http://quanben5.com' + url_)
    return list_url

# Grab the content of a single chapter
def get_content(html):
    title = re.findall('<h1.*?>(.*?)</h1>', html, re.S)
    title = title[0]
    print(title + ' download started')
    write_to_file(title)
    content = re.findall('<p>(.*?)</p>', html, re.S)
    for sentence in content:
        write_to_file(sentence)
    write_to_file('\n')

# Save the content to a local file
def write_to_file(content):
    with open(name, 'a', encoding='utf-8') as f:
        f.write(content + '\n')

# Save every chapter to the local file
def save_content(list_url):
    for url_ in list_url:
        html_ = get_one_page(url_)
        get_content(html_)

def main():
    url = 'http://quanben5.com/index.php?c=book&a=search&keywords=' + keyword
    html = get_one_page(url)
    url1 = re.findall(r'<a href="(/n/.*?)/"', html, re.S)
    if url1 == []:
        print('Nothing found!!!')
        flag = input('Quit? (Y or N): ')
        if flag == 'Y':
            sys.exit()
        elif flag == 'y':
            sys.exit()
        else:
            print("Can't find it; nothing I can do -.-||")
    else:
        url1 = url1[0]                                           # the novel's URL path
        url2 = 'http://quanben5.com' + url1 + '/xiaoshuo.html'   # the novel's catalogue page URL
        html2 = get_one_page(url2)
        list_url = get_url(html2)
        print(name + ' download started!!!')
        save_content(list_url)

main()

Finally, run it!!! (While writing this post I visited the site a bit too often and got rate-limited, so the later chapters are not shown here.)
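Since the site does throttle frequent visitors, retrying with a growing pause is a reasonable safeguard. A hedged sketch of a drop-in variant of get_one_page; the retry count and delays are arbitrary choices, not anything the site documents:

import time
import requests

def get_one_page(url, retries=3):
    headers = {'User-Agent': 'Mozilla/5.0'}
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                return response.text
        except requests.RequestException:
            pass
        # Wait a little longer after each failed attempt: 1s, 2s, 4s, ...
        time.sleep(2 ** attempt)
    return None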
