"""
爬取起点小说网免费小说

python 3.7
设置了0.5秒存入一个章节
所以有点慢
运行的时候在py文件的同级目录下创建目标的小说文件夹
在文件夹中写入小说章节
headers完全没有引用= =(主要是起点没有怎么反爬取)
"""
import requests
from bs4 import *
from lxml import etree
import os
import time
res = requests.session()
head = {}
book_name = []
book_link = []
book_dict = {}
book_list = []
#得到起点免费小说的主页
def get_index():
   # head["Host"] = "www.qidian.com"
   # head["User - Agent"]= "Mozilla / 5.0(WindowsNT10.0;WOW64;rv: 63.0) Gecko / 20100101Firefox / 63.0"
    return res.get("https://www.qidian.com/free")
#获得小说的内容
def get_book_content(href, book_page , file_path):
    response = res.get("http:" + href)
    html = etree.HTML(response.text)
    content = '\n'.join(html.xpath('//div[@class="read-content j_readContent"]/p/text()'))
   # print(content)
    file_name = book_page + ".txt"
    print("正在下载: " + file_name)
    with open(file_path + "\\" + file_name , "a" , encoding='utf-8' ) as f :
        f.write(content)

#得到小说的章节名
def get_book_host(str_name):
    if os.path.exists(str_name) == False:
        print("正在创建小说文件夹...")
        os.mkdir(str_name)
        print("创建成功!")
    book_index = res.get("http:" + book_dict[str_name])
    html = etree.HTML(book_index.text)
    book_list = html.xpath('//ul[@class="cf"]/li/a/text()')
    book_href = html.xpath('//ul[@class="cf"]/li/a/@href')
    for name , href   in zip(book_list , book_href) :
        get_book_content(href, name, str_name)
        time.sleep(0.5)
def main():
    #得到主页
    host = get_index()
    soup = BeautifulSoup(host.text , 'lxml')
   # soup = soup.prettify()
    #提取书籍信息
    for i in soup.find_all('div' , class_= "book-mid-info"):
        book_name.append(i.h4.a.text)
        book_link.append(i.h4.a["href"])
        book_dict[i.h4.a.text] = i.h4.a["href"]
    for i in book_name :
        print(i)
    name = input("请输入需要下载的小说:\n")
    get_book_host(name)





if __name__ == '__main__':
    main()



# 你可能感兴趣的:(Python爬虫)