python爬虫笔记(八) 实例3:用Python批量爬取全站小说【以书趣阁为例】

1. 用Python批量爬取全站小说

目标:爬取书趣阁上的这本小说(以《九星毒奶》为例),其目录页地址为:http://www.shuquge.com/txt/89644/index.html

python爬虫笔记(八) 实例3:用Python批量爬取全站小说【以书趣阁为例】_第1张图片

python爬虫笔记(八) 实例3:用Python批量爬取全站小说【以书趣阁为例】_第2张图片

2. 爬取一本书

# -*- coding: utf-8 -*-
"""
Created on Sat Feb  8 20:31:43 2020

@author: douzi
"""

import requests
from parsel import Selector
import re
import time
    

def main():
    """Download every numbered chapter of one shuquge.com novel into ./jiuxin/.

    Fetches the novel's index page, collects the chapter links found under
    ``.listmain``, then downloads each chapter page and stores its text in
    ``./jiuxin/<chapter number>.txt``. Chapters whose title does not start
    with a token containing a number are skipped (there is no number to
    name the file after).

    Returns:
        None. The function is used for its side effects only (HTTP requests,
        console output and files written under ``./jiuxin/``).

    Raises:
        requests.RequestException: if a page cannot be fetched in time.
        OSError: if the output directory or a chapter file cannot be written.
    """
    # Function-scope import: the script header above does not import os.
    import os

    index_url = 'http://www.shuquge.com/txt/89644/index.html'  # novel to crawl
    tpl = 'http://www.shuquge.com/txt/89644/'
    out_dir = './jiuxin/'

    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"}

    # open() does not create missing directories, so make sure it exists.
    os.makedirs(out_dir, exist_ok=True)

    # Fetch the table of contents; let requests guess the page encoding the
    # same way the chapter requests below do.
    urllist = requests.get(index_url, headers=headers, timeout=30)
    urllist.encoding = urllist.apparent_encoding
    index_sel = Selector(urllist.text)

    # NOTE(review): the original listing carried an HTML sample here whose
    # tags were lost; it showed a "最新章节" (latest chapters) heading followed
    # by a chapter link such as "1040 养龙皮?". Presumably the latest chapters
    # are listed a second time in the full list - confirm against the page.
    index = index_sel.css('.listmain a::attr(href)').getall()

    # dict.fromkeys() drops repeated links while keeping first-seen order, so
    # a chapter that is linked twice is only downloaded once.
    # NOTE(review): the original comment said "save 10 chapters", but the loop
    # has always walked every link; that behaviour is kept.
    for n in dict.fromkeys(index):
        url = tpl + n

        # Chapter page n
        response = requests.get(url, headers=headers, timeout=30)
        response.encoding = response.apparent_encoding
        print(response.request.url)

        # parsel extracts data from the HTML with CSS / XPath selectors
        sel = Selector(response.text)
        title = sel.css('h1::text').get()
        print(title)

        # The title may be missing (error page) or empty; only the first
        # whitespace-separated token is searched for the chapter number.
        tokens = title.split() if title else []
        # '+' instead of '*': '[0-9]*' also matches the empty string, which
        # made every unnumbered chapter overwrite the same '.txt' file.
        match = re.search(r'[0-9]+', tokens[0]) if tokens else None
        if match:
            with open(out_dir + match.group(0) + '.txt', 'w', encoding='utf-8') as f:
                f.write(title)
                # Text nodes of the element with id="content" (chapter body)
                f.writelines(sel.css('#content::text').getall())

        # Be polite to the server between chapter requests.
        time.sleep(0.5)


if __name__ == '__main__':
    main()

 python爬虫笔记(八) 实例3:用Python批量爬取全站小说【以书趣阁为例】_第3张图片

3. 爬取一个分类

# -*- coding: utf-8 -*-
"""
Created on Sat Feb  8 20:31:43 2020

@author: douzi
"""

import requests
from parsel import Selector
import re
import time
import os


headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"}


def _safe_filename(name):
    """Return *name* with characters that are illegal in file names replaced by '_'."""
    return re.sub(r'[\\/:*?"<>|\0]', '_', name).strip()


# Download one chapter
def download_one_chapter(url, book_name):
    """Download a single chapter page and save it as a text file.

    Args:
        url: Absolute URL of the chapter page.
        book_name: Directory (relative to the working directory) that
            receives the file ``<chapter title>.txt``. It is created if it
            does not exist yet.

    Returns:
        None. If the page has no ``<h1>`` title nothing is written.

    Raises:
        requests.RequestException: if the page cannot be fetched in time.
        OSError: if the directory or file cannot be written.
    """
    # Chapter page
    response = requests.get(url, headers=headers, timeout=30)
    response.encoding = response.apparent_encoding

    print(response.request.url)
    # parsel extracts data from the HTML with CSS / XPath selectors
    sel = Selector(response.text)
    title = sel.css('h1::text').get()
    print(title)

    # An error page may have no <h1>; there is then no file name to use.
    if title and title.strip():
        book_dir = os.path.join('.', book_name)
        os.makedirs(book_dir, exist_ok=True)
        # Titles can contain characters such as '?' that are not valid in
        # file names on every platform.
        path = os.path.join(book_dir, _safe_filename(title) + '.txt')
        # 'w' rather than 'a+': re-downloading a chapter must replace the
        # file, not append a second copy of the text to it.
        with open(path, 'w', encoding='utf-8') as f:
            f.write(title)
            # Text nodes of the element with id="content" (chapter body)
            for line in sel.css('#content::text').getall():
                f.write(line)
                # The original wrote '\n\0' here, which embedded NUL bytes
                # in the text file; a plain newline is what is wanted.
                f.write('\n')

    # Be polite to the server between chapter requests.
    time.sleep(0.5)


# Download one book
def download_one_book(index_url, bname):
    """Download every chapter linked from a novel's index page.

    Args:
        index_url: URL of the novel's table of contents, e.g.
            ``http://www.shuquge.com/txt/89644/index.html``.
        bname: Directory name the chapters are stored under; passed through
            to ``download_one_chapter``.

    Returns:
        None.

    Raises:
        requests.RequestException: if the index page cannot be fetched.
    """
    # The second-to-last path segment is the numeric book id, e.g. 89644.
    book_name = index_url.split('/')[-2]
    tpl = 'http://www.shuquge.com/txt/' + book_name + '/'

    # Fetch the table of contents
    urllist = requests.get(index_url, headers=headers, timeout=30)
    urllist.encoding = urllist.apparent_encoding
    index_sel = Selector(urllist.text)

    # NOTE(review): the original listing carried an HTML sample here whose
    # tags were lost; it showed a "最新章节" (latest chapters) heading followed
    # by a chapter link such as "1040 养龙皮?". Presumably the latest chapters
    # are listed a second time in the full list - confirm against the page.
    index = index_sel.css('.listmain a::attr(href)').getall()

    # dict.fromkeys() drops repeated links while keeping first-seen order.
    for n in dict.fromkeys(index):
        download_one_chapter(tpl + n, bname)


# Download one category
def download_one_category():
    """Download every book listed on the first three pages of category 7.

    For each book link found under ``span.s2`` a directory named after the
    book is created (if needed) and the whole book is downloaded into it.

    Returns:
        None.

    Raises:
        requests.RequestException: if a category page cannot be fetched.
        OSError: if a book directory cannot be created.
    """
    tpl = 'http://www.shuquge.com/category/7_{}.html'   # category to crawl
    # Pages 1..3
    for page in range(1, 4):
        category_url = tpl.format(page)
        print(category_url)

        # Fetch the category listing page
        cate_list = requests.get(category_url, headers=headers, timeout=30)
        cate_list.encoding = cate_list.apparent_encoding
        index_sel = Selector(cate_list.text)

        # Read URL and name from the same <a> element so they can never get
        # out of step (the original looked the name up via list.index(),
        # which is quadratic and wrong when a URL appears twice).
        for link in index_sel.css('span.s2 a'):
            book_url = link.xpath('./@href').get()
            book_name = link.xpath('./text()').get()
            if not book_url or not book_name:
                continue
            # e.g. 变成随身老奶奶 http://www.shuquge.com/txt/109203/index.html
            print(book_name, book_url)

            # The original removed an existing directory and then did not
            # recreate it, so every later write failed. Reuse it instead.
            book_dir = _safe_filename(book_name)
            os.makedirs(os.path.join('.', book_dir), exist_ok=True)

            # Download the whole book
            download_one_book(book_url, book_dir)


if __name__ == '__main__':
    download_one_category()

 python爬虫笔记(八) 实例3:用Python批量爬取全站小说【以书趣阁为例】_第4张图片

python爬虫笔记(八) 实例3:用Python批量爬取全站小说【以书趣阁为例】_第5张图片

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

你可能感兴趣的:(python爬虫)