Scraping all chapters of the novel "黑客" ("Hacker") with Python 3

# Goal: scrape every chapter of the novel and save the whole book locally
# Target URL: https://www.liaobige.com/dush/44901/

import requests                 # send HTTP requests and fetch HTML pages
from bs4 import BeautifulSoup   # parse HTML/XML pages and extract data
import time

def Zj():
    # Collect the absolute URL of every chapter from the table of contents
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
    resp = requests.get("https://www.liaobige.com/dush/44901/", headers=headers)
    # print("Status code:", resp.status_code)
    resp.encoding = resp.apparent_encoding  # fix the page encoding
    soup = BeautifulSoup(resp.text, 'html.parser')  # HTML parser
    urls = 'https://www.liaobige.com/dush/44901/'
    zj = []

    for a in soup.select("body div div div span a"):
        ss = urls + a.get('href')  # get() reads the href attribute; hrefs here are relative paths
        zj.append(ss)
    return zj
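
# Note: the string concatenation above assumes every href is relative to the
# book's directory. A hedged alternative sketch using urllib.parse.urljoin,
# which also copes with absolute hrefs or hrefs starting with "/" (the
# function name Zj_urljoin is mine, not from the original post):
from urllib.parse import urljoin

def Zj_urljoin():
    headers = {'User-Agent': 'Mozilla/5.0'}
    resp = requests.get("https://www.liaobige.com/dush/44901/", headers=headers)
    resp.encoding = resp.apparent_encoding
    soup = BeautifulSoup(resp.text, 'html.parser')
    # urljoin resolves each href against the URL the response actually came from
    return [urljoin(resp.url, a.get('href')) for a in soup.select("body div div div span a")]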


def Bt():
    # Download every chapter page and save each chapter to its own .txt file
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
    for link in Zj():
        time.sleep(3)  # throttle requests so we do not hammer the server
        resp = requests.get(link, headers=headers)
        if resp.status_code == 200:
            resp.encoding = resp.apparent_encoding  # fix the page encoding
            soup = BeautifulSoup(resp.text, 'html.parser')  # HTML parser
            titles = [h2.string for h2 in soup.select("div h2")]  # chapter title(s)
            for title in titles:
                try:
                    # "with" closes the file even on error; the original called
                    # file.close() inside the write loop, which broke chapters
                    # whose text spans more than one vcontent block
                    with open("D:\\IT\\小说\\" + title + ".txt", "a+", encoding='utf-8') as f:
                        for block in soup.find_all(class_="vcontent"):
                            f.write(block.get_text())
                    print(title)
                except (OSError, TypeError) as e:
                    print('Failed to save this chapter:', e)
        else:
            print('Request failed:', resp.status_code)
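
# Chapter titles scraped from the page can contain characters that Windows
# forbids in file names (\ / : * ? " < > |), which would make open() fail.
# A minimal sanitizer sketch (safe_name is a hypothetical helper, not part of
# the original script); it could wrap the title before open():
import re

def safe_name(title):
    # replace characters Windows forbids in file names with an underscore
    return re.sub(r'[\\/:*?"<>|]', '_', title).strip()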
					
					
Bt()
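
One further refinement worth noting: both functions send every request with a freshly built header dict and a new connection. A requests.Session applies default headers automatically and reuses pooled connections, which adds up over dozens of chapter pages. A minimal sketch (the session object and the fetch helper are my own additions, not part of the original script):

import requests

session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0'})

def fetch(url):
    # each call reuses the pooled connection and the session's default headers
    resp = session.get(url, timeout=10)
    resp.encoding = resp.apparent_encoding
    return resp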

Sample output: the script prints each chapter title in turn as its .txt file is written.

