因为我是飞天鱼的粉丝,而且爱看万古神帝,但是现在很多地方都看不到这本小说了,那些所谓的正版网站又满是广告。所以可以采用爬虫去抓取小说内容,这样既能去掉广告,又能提升我们这些小说迷的阅读体验。
话不多说,代码如下:
# Standard scraping toolkit: requests fetches pages, BeautifulSoup parses the HTML.
import re

import requests
from bs4 import BeautifulSoup

BASE_URL = 'https://www.23uswx.com'
# Browser-like User-Agent so the site's basic anti-scraping check lets us through.
# Shared by both request sites (was duplicated in each function before).
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36 Edg/108.0.1462.76'
}


def get_novel_chapters():
    """Scrape the novel's index page and collect every chapter's link and title.

    Returns:
        list[tuple[str, str]]: (absolute chapter URL, chapter title) pairs,
        in the order they appear on the index page.
    """
    data_1 = []
    root_url = BASE_URL + '/1_1509/'  # the novel's table-of-contents page
    r = requests.get(root_url, headers=HEADERS, timeout=3)  # 3-second request timeout
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, 'html.parser')
    # Each chapter entry lives inside a <dd> element on this site.
    for dd in soup.find_all('dd'):
        anchor = dd.a
        # Bug fix: the original appended BEFORE checking the link, so the
        # guard never fired; also guard against <dd> without an <a> at all.
        if anchor is None or not anchor.get('href'):
            continue
        data_1.append((BASE_URL + anchor['href'], anchor.text))
    return data_1


def get_chapter_content(url):
    """Fetch one chapter page and return its body text.

    Args:
        url: absolute URL of a chapter page.

    Returns:
        str: the chapter text extracted from the <div id="content"> element.

    Raises:
        AttributeError: if the page has no <div id="content"> (site layout change).
    """
    r = requests.get(url, headers=HEADERS, timeout=3)
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, 'html.parser')
    contents = soup.find('div', id='content')  # the chapter body container
    return contents.text


if __name__ == '__main__':
    i = 0  # running chapter counter, printed as progress feedback
    for link, name in get_novel_chapters():
        i += 1
        # Chapter titles may contain characters that are illegal in filenames
        # (e.g. ? : " on Windows) — replace them so open() cannot fail.
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', name)
        with open('%s.txt' % safe_name, 'w', encoding='utf-8') as f:
            f.write(get_chapter_content(link))
        print('第%d章' % i)