简单小说爬取

本文演示如何使用 requests 和 PyQuery 编写一个简单的小说爬虫：先从目录页获取各章节的链接，再逐章下载并保存为文本文件。

import requests  #requests是一个HTTP请求库
from pyquery import PyQuery#网页解析 原生CSS解析器 css层叠样式表

# chapter_url: URL of a single chapter page; name: title of the book

def get_one_chapter(chapter_url=None,name=None,shunxu=None):
    '''
    Download one chapter page and append its text to a local ``.txt`` file.

    The title is the text of the page's ``h1`` selection and the body is the
    text of the element with id ``nr1``. The output file is written to the
    current working directory and is named ``<shunxu><title>.txt``, with
    characters that are unsafe in file names replaced by ``_``.

    :param chapter_url: URL of the chapter page to fetch
    :param name: book title; accepted from callers but not used here
    :param shunxu: sequence number of the chapter, used as the file-name prefix
    :return: None
    :raises requests.RequestException: if the request fails, times out, or
        the server answers with an HTTP error status
    '''
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url=chapter_url, timeout=10)
    # Fail loudly instead of saving an error page as if it were a chapter.
    response.raise_for_status()
    doc = PyQuery(response.text)
    title = doc("h1").text()
    content1 = doc("#nr1").text()
    print(shunxu,title,content1)
    # The title comes from the remote page and may contain path separators or
    # other characters that are not allowed in file names on common platforms.
    unsafe_chars = '\\/:*?"<>|\r\n\t'
    safe_title = title.translate({ord(ch): "_" for ch in unsafe_chars})
    # Append mode is kept from the original: re-running adds to an existing file.
    with open(file=str(shunxu) + safe_title + ".txt", mode='a', encoding='utf_8') as f:
        # write() takes text only, so title and body are joined into one string.
        f.write(title + " \n\n " + content1)


# Fetch the table-of-contents page, then download every chapter it links to.
def get_index(index_url='https://www.luoxia.com/qing/', skip=9):
    '''
    Crawl a book's table-of-contents page and save each linked chapter.

    Every ``a`` element under the element with id ``content-list`` is treated
    as a chapter link. The text of the page's ``h1`` selection is passed on
    as the book name.

    :param index_url: URL of the table-of-contents page
    :param skip: number of leading links to ignore (the original hard-coded
        9; presumably these are not chapter links -- confirm against the page)
    :return: None
    :raises requests.RequestException: if fetching the index page (or, via
        ``get_one_chapter``, a chapter page) fails or returns an HTTP error
    '''
    # Imported locally so this function does not rely on a file-level import.
    from urllib.parse import urljoin

    # A timeout keeps a stalled connection from hanging the script forever.
    response = requests.get(url=index_url, timeout=10)
    # Do not try to parse an error page as if it were the chapter list.
    response.raise_for_status()
    doc = PyQuery(response.text)
    links = doc('#content-list a')
    name = doc("h1").text()
    # Number chapters from 1, counting only the links that remain after skipping.
    for shunxu, link in enumerate(list(links.items())[skip:], start=1):
        href = link.attr.href
        if not href:
            # An anchor without an href has nothing to download; its number
            # is left unused so later chapters keep their position.
            continue
        # Resolve relative links against the index page; absolute URLs pass
        # through unchanged.
        chapter_url = urljoin(index_url, href)
        print(chapter_url)
        get_one_chapter(chapter_url=chapter_url, name=name, shunxu=shunxu)

# Start the crawl. This call is at module level, so it runs whenever the file
# is executed and also when it is imported.
get_index()

简单的实战练习

你可能感兴趣的:(基础)