Scraping a novel site

'''
import os

import requests
from lxml import etree


class Spider(object):

    def start_request(self):
        # Fetch the "all books" listing page
        response = requests.get("https://www.qidian.com/all")
        html = etree.HTML(response.content.decode())  # parse the page into an element tree
        # Book titles: text of the <a> inside <h4> under div.book-mid-info
        Bigtit_list = html.xpath('//div[@class="book-mid-info"]/h4/a/text()')
        # Book links: href attribute of the same <a>
        Bigtit_src = html.xpath('//div[@class="book-mid-info"]/h4/a/@href')
        for bigtit, bigsrc in zip(Bigtit_list, Bigtit_src):  # pair each title with its link
            if not os.path.exists(bigtit):
                os.mkdir(bigtit)  # one directory per book
                self.file_data(bigtit, bigsrc)  # fetch this book's chapter list

    def file_data(self, bigtit, bigsrc):
        # Fetch the book's detail page (hrefs are protocol-relative, so prepend "https:")
        response = requests.get("https:" + bigsrc)
        html = etree.HTML(response.content.decode())  # parse the page into an element tree
        # Chapter titles: text of the <a> inside <li> under ul.cf
        Little_list = html.xpath('//ul[@class="cf"]/li/a/text()')
        # Chapter links: href attribute of the same <a>
        Little_src = html.xpath('//ul[@class="cf"]/li/a/@href')
        for littit, litsrc in zip(Little_list, Little_src):  # pair each chapter with its link
            self.finally_file(littit, litsrc, bigtit)

    def finally_file(self, littit, litsrc, bigtit):
        # Fetch a single chapter page and join its paragraphs into one text block
        response = requests.get("https:" + litsrc)
        html = etree.HTML(response.content.decode())  # parse the page into an element tree
        content = "\n".join(html.xpath('//div[@class="read-content j_readContent"]/p/text()'))
        file_name = os.path.join(bigtit, littit + ".txt")
        print("Fetching chapter: " + file_name)
        with open(file_name, "a", encoding="utf-8") as f:  # use "wb" when writing binary/media files
            f.write(content)


spider = Spider()
spider.start_request()  # run the start_request method of the Spider class
'''
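
The script above issues bare requests.get calls. Many sites (qidian.com may be one of them) reject requests that carry no browser-like User-Agent, and rapid-fire fetching is unfriendly to the server. A minimal sketch of a wrapper you could swap in is shown below; polite_get, the header value, and the 1-second delay are illustrative assumptions, not part of the original post.

'''
import time

import requests

# Hypothetical helper: a shared session with a browser-like User-Agent
# and a fixed delay before each request.
session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0"})

def polite_get(url, delay=1.0):
    time.sleep(delay)              # pause between requests to avoid hammering the server
    response = session.get(url)
    response.raise_for_status()    # fail fast on HTTP errors instead of parsing an error page
    return response
'''

To use it, replace each requests.get(...) call in the spider with polite_get(...); the rest of the parsing logic stays the same.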
