Scraping Novels from a Pirate Fiction Site with a Python Crawler

Preface

I split the program into two features: one crawls a novel's latest chapter, the other crawls all of its chapters.

For learning purposes only.

Fetching the HTML of the novel's detail page

This is implemented by the function gethtml().

def gethtml(url):  # fetch the HTML of the novel's detail page
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56"
    }
    try:
        r = requests.get(url, headers=headers)
        # time.sleep(5)
        r.raise_for_status()
        # print(r.request.headers)
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return "Exception occurred (1)."

Getting the URLs of the individual chapters

Crawling all chapters and crawling only the latest chapter each need this step, so it is handled by two functions: geturl_all() and getnewtexturl().

Both functions parse the fetched HTML; geturl_all() extracts the URLs of all chapters, while getnewtexturl() extracts only the URL of the latest chapter.


def geturl_all(url):  # url is the novel's detail page; returns the URLs of all chapters
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56"
    }
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        # print(r.request.headers)
        r.encoding = r.apparent_encoding
        r = r.text
        r = str(r)
        url1 = BeautifulSoup(r, 'html.parser')
        ans = url1.find_all("dd")  # each chapter link sits inside a <dd> tag
        url_list = []
        # print(type(url_list))
        for i in range(0, len(ans)):
            url_list.append(re.findall(r"href=.*html", str(ans[i])))
        return url_list
    except:
        return "Exception occurred (2)."

def getnewtexturl(url):  # url is the novel's detail page; returns the URL of the latest chapter
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56"
    }
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        # print(r.request.headers)
        r.encoding = r.apparent_encoding
        r = r.text
        r = str(r)
        url1 = BeautifulSoup(r, 'html.parser')
        ans = url1.find_all("dd")
        r = str(ans[-1])  # the last <dd> holds the latest chapter
        # print(url1)
        ans = re.findall(r"href=.*html", r)
        print(ans)
        return str(ans)
    except:
        return "Exception occurred (2)."

Extracting the body text of a chapter

Regular expressions are used to clean the data and pull out the body text.

def find_text(html):  # extract the chapter body text
    texts = re.findall(r'id="content"(.*)', html)
    texts = str(texts)
    # The split pattern is partly an assumption: the tag between the two \r sequences was
    # lost in the post's formatting; the site appears to separate paragraphs with a literal
    # "\r<br />\r<br />" sequence.
    texts = re.split(r"\\r<br />\\r<br />", texts)
    # print(texts)
    ans = " "
    for i in range(0, len(texts)):
        ans += " " + texts[i] + "\n"
    return ans

Full code:

import requests
import os
from bs4 import BeautifulSoup
import re
import time

def gethtml(url):  # fetch the HTML of the novel's detail page
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56"
    }
    try:
        r = requests.get(url, headers=headers)
        # time.sleep(5)
        r.raise_for_status()
        # print(r.request.headers)
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return "Exception occurred (1)."

def geturl_all(url):  # url is the novel's detail page; returns the URLs of all chapters
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56"
    }
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        # print(r.request.headers)
        r.encoding = r.apparent_encoding
        r = r.text
        r = str(r)
        url1 = BeautifulSoup(r, 'html.parser')
        ans = url1.find_all("dd")  # each chapter link sits inside a <dd> tag
        url_list = []
        # print(type(url_list))
        for i in range(0, len(ans)):
            url_list.append(re.findall(r"href=.*html", str(ans[i])))
        return url_list
    except:
        return "Exception occurred (2)."

def getnewtexturl(url):  # url is the novel's detail page; returns the URL of the latest chapter
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56"
    }
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        # print(r.request.headers)
        r.encoding = r.apparent_encoding
        r = r.text
        r = str(r)
        url1 = BeautifulSoup(r, 'html.parser')
        ans = url1.find_all("dd")
        r = str(ans[-1])  # the last <dd> holds the latest chapter
        # print(url1)
        ans = re.findall(r"href=.*html", r)
        print(ans)
        return str(ans)
    except:
        return "Exception occurred (2)."

def find_text(html):  # extract the chapter body text
    texts = re.findall(r'id="content"(.*)', html)
    texts = str(texts)
    # assumed paragraph separator, see the note above
    texts = re.split(r"\\r<br />\\r<br />", texts)
    # print(texts)
    ans = " "
    for i in range(0, len(texts)):
        ans += " " + texts[i] + "\n"
    return ans

moshi = input("1. Crawl the whole novel.\n2. Crawl the latest chapter.\n")

def hind_all():
    r = open("novel_all_chapters.txt", 'w+', encoding="utf-8")
    url = "为了过审QAQ"  # the novel's detail page (real site URL redacted)
    url_list = geturl_all(url)
    # print(url_list)
    for i in range(0, len(url_list)):
        print("Crawling, please wait. Currently at chapter", i)
        # print(url_list[i])
        url = str(url_list[i])
        url = "为了过审QAQ" + url[8:-2]  # site root (redacted) + relative chapter path
        html = gethtml(url)        # fetch the chapter's HTML
        text = find_text(html)     # extract the chapter body
        r.write(text)
    r.close()
    print("Done crawling.")

def hind_last():
    url = "为了过审QAQ"  # the novel's detail page (real site URL redacted)
    url = getnewtexturl(url)  # URL of the latest chapter
    print(url)
    url = "为了过审QAQ" + url[8:-2]
    # print(url)
    html = gethtml(url)        # fetch the latest chapter's HTML
    # print(html)
    # print(type(html))
    text = find_text(html)     # extract the chapter body
    # print(text)
    with open("novel_latest_chapter.txt", 'w+', encoding="utf-8") as f:
        f.write(text)

if moshi == "1":
    hind_all()
else:
    hind_last()

Update:

After a while, the original version felt too bare-bones: to crawl a novel you had to supply the URL of its introduction page yourself. So I improved the program and added a few functions that behave like a small search engine.

First, impersonate a browser and send a POST request carrying the name of the novel you want to search for. The following function does this.

def findbook(url):  # url is the site's search endpoint; returns the search-result HTML
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56"
    }
    name = input("Enter the name of the novel to search for: ")
    keyword = {
        "searchkey": name  # form field used by the site's search box
    }
    try:
        # r = requests.get(url, headers=headers, params=keyword)
        r = requests.post(url=url, data=keyword, headers=headers)
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return "Search request failed."

Then clean the data: scan the response page for the search results and extract each matching novel along with the URL of its introduction page.

def findbookurl(html):  # parse the search-result page; returns a list of "url + title" strings
    soup = BeautifulSoup(html, 'html.parser')
    soup = soup.find_all("td", attrs={"class": "even"})
    soup = str(soup)
    # The split delimiter and the regex below were partly lost when the post was published;
    # splitting on the opening of each result link and keeping the rest of the line is an
    # assumption that matches how the results are sliced later on.
    listurl = soup.split('<a href="')
    urllist = []
    for i in range(0, len(listurl)):
        data = re.findall(r"http.*", listurl[i])  # + re.findall(r">.*", listurl[i])
        # urllist[i] = listurl[i][22:]
        if len(data) == 0:
            continue
        # print(str(data))
        data = str(data)
        urllist.append(data)
        # print(listurl[i])
    return urllist

The return value is a list; each element holds the URL and title of one novel found by the search.

Then the user enters which of the search results to crawl.

listurl = findbookurl(text)
print(listurl)
num = input("Which of the novels do you want to crawl? ")
urlnum = listurl[int(num) - 1].find("\"")
# print(listurl[int(num)-1][2:urlnum])
url = listurl[int(num) - 1][2:urlnum]
namenumstar = listurl[int(num) - 1].find(">")
namenumend = listurl[int(num) - 1].find("<")
# print(listurl[int(num)-1][namenumstar+1:namenumend])
name = listurl[int(num) - 1][namenumstar + 1:namenumend]
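With a made-up entry, the slicing above behaves like this:

entry = '[\'https://example-novel-site.com/book/123/" target="_blank">Some Novel</a\']'  # illustrative element of listurl
urlnum = entry.find("\"")
print(entry[2:urlnum])                        # -> https://example-novel-site.com/book/123/
start, end = entry.find(">"), entry.find("<")
print(entry[start + 1:end])                   # -> Some Novel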

After that, the chosen URL is passed into the functions written earlier and everything works as before; the novel's content still ends up in a TXT file.

 

Full code:

import requests
import os
from bs4 import BeautifulSoup
import re
import time
# import pa  # unused import; commented out
def findbook(url):  # url is the site's search endpoint; returns the search-result HTML
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56"
    }
    name = input("Enter the name of the novel to search for: ")
    keyword = {
        "searchkey": name  # form field used by the site's search box
    }
    try:
        # r = requests.get(url, headers=headers, params=keyword)
        r = requests.post(url=url, data=keyword, headers=headers)
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return "Search request failed."

def findbookurl(html):  # parse the search-result page; returns a list of "url + title" strings
    soup = BeautifulSoup(html, 'html.parser')
    soup = soup.find_all("td", attrs={"class": "even"})
    soup = str(soup)
    # Assumed delimiter and regex, as noted above.
    listurl = soup.split('<a href="')
    urllist = []
    for i in range(0, len(listurl)):
        data = re.findall(r"http.*", listurl[i])  # + re.findall(r">.*", listurl[i])
        # urllist[i] = listurl[i][22:]
        if len(data) == 0:
            continue
        # print(str(data))
        data = str(data)
        urllist.append(data)
        # print(listurl[i])
    return urllist

def gethtml(url):  # fetch the HTML of the novel's detail page
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56"
    }
    try:
        r = requests.get(url, headers=headers)
        # time.sleep(5)
        r.raise_for_status()
        # print(r.request.headers)
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return "Exception occurred (1)."

def geturl_all(url):  # url is the novel's detail page; returns the URLs of all chapters
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56"
    }
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        # print(r.request.headers)
        r.encoding = r.apparent_encoding
        r = r.text
        r = str(r)
        url1 = BeautifulSoup(r, 'html.parser')
        ans = url1.find_all("dd")  # each chapter link sits inside a <dd> tag
        url_list = []
        # print(type(url_list))
        for i in range(0, len(ans)):
            url_list.append(re.findall(r"href=.*html", str(ans[i])))
        return url_list
    except:
        return "Exception occurred (2)."

def getnewtexturl(url):  # url is the novel's detail page; returns the URL of the latest chapter
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56"
    }
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        # print(r.request.headers)
        r.encoding = r.apparent_encoding
        r = r.text
        r = str(r)
        url1 = BeautifulSoup(r, 'html.parser')
        ans = url1.find_all("dd")
        r = str(ans[-1])  # the last <dd> holds the latest chapter
        # print(url1)
        ans = re.findall(r"href=.*html", r)
        print(ans)
        return str(ans)
    except:
        return "Exception occurred (2)."

def find_text(html):  # extract the chapter body text
    texts = re.findall(r'id="content"(.*)', html)
    texts = str(texts)
    # assumed paragraph separator, see the note in the first version
    texts = re.split(r"\\r<br />\\r<br />", texts)
    # print(texts)
    ans = " "
    for i in range(0, len(texts)):
        ans += " " + texts[i] + "\n"
    return ans

def hind_all(url):
    r = open(name + "_all_chapters.txt", 'w+', encoding="utf-8")
    url_list = geturl_all(url)
    # print(url_list)
    for i in range(0, len(url_list)):
        print("Crawling, please wait. Currently at chapter", i)
        # print(url_list[i])
        url = str(url_list[i])
        url = "为了过审QAQ" + url[8:-2]  # site root (redacted) + relative chapter path
        html = gethtml(url)        # fetch the chapter's HTML
        text = find_text(html)     # extract the chapter body
        r.write(text)
    r.close()
    print("Done crawling.")

def hind_last(url):
    url = getnewtexturl(url)  # URL of the latest chapter
    print(url)
    url = "为了过审QAQ" + url[8:-2]
    # print(url)
    html = gethtml(url)        # fetch the latest chapter's HTML
    # print(html)
    # print(type(html))
    text = find_text(html)     # extract the chapter body
    # print(text)
    with open(name + "_latest_chapter.txt", 'w+', encoding="utf-8") as f:
        f.write(text)

def ma(url):
    moshi = input("1. Crawl the whole novel.\n2. Crawl the latest chapter.\n")
    # url = "为了过审QAQ"
    if moshi == "1":
        hind_all(url)
    else:
        hind_last(url)

url = "为了过审QAQ"  # the site's search endpoint (real URL redacted)
text = findbook(url)
listurl = findbookurl(text)
print(listurl)
num = input("Which of the novels do you want to crawl? ")
urlnum = listurl[int(num) - 1].find("\"")
# print(listurl[int(num)-1][2:urlnum])
url = listurl[int(num) - 1][2:urlnum]
namenumstar = listurl[int(num) - 1].find(">")
namenumend = listurl[int(num) - 1].find("<")
# print(listurl[int(num)-1][namenumstar+1:namenumend])
name = listurl[int(num) - 1][namenumstar + 1:namenumend]
ma(url)
