References: 小甲鱼's videos on Bilibili, and the CSDN veteran Jack-Cui.
I also consulted assorted websites and videos along the way, heh.
This is mostly my own notes and summary.
Python version: Python 3
Platform: Windows
IDE: PyCharm
Target site: http://www.biqukan.net/
To start, I picked a random, simple chapter page and scraped just the novel text.
# coding: utf-8
import urllib.request
from bs4 import BeautifulSoup

url = 'http://www.biqukan.net/book/116945/40009955.html'
html = urllib.request.urlopen(url).read().decode('gbk')  # the site serves GBK-encoded pages
bf = BeautifulSoup(html, 'html5lib')  # a warning pops up here
title = bf.find_all('h1', class_='readTitle')  # grab the chapter title
text = bf.find_all('div', class_='panel-body')  # grab the chapter body
print(text[0].text.replace('\xa0'*4, ''))  # strip the 4-NBSP paragraph indents
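A quick aside on that last replace call: '\xa0' is the non-breaking space that &nbsp; decodes to, and this site indents every paragraph with four of them, so stripping runs of four cleans the text. A minimal sketch (the sample string is made up):

# '\xa0' is the non-breaking space produced by &nbsp; in the HTML source.
sample = '\xa0' * 4 + 'First line of the chapter body'  # hypothetical paragraph with a 4-NBSP indent
print(sample.replace('\xa0' * 4, ''))  # -> 'First line of the chapter body'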
import re  # needed for the regex below

def get_url():
    url = 'http://www.biqukan.net/book/116945/'
    html = urllib.request.urlopen(url).read().decode('gbk')
    # The original regex got eaten by the blog platform's HTML rendering;
    # reconstructed here from the v2.0 code further down, which targets
    # <dd class="col-md-3"><a href="..." title="..."> entries.
    reg = re.compile(r'<dd class="col-md-3"><a href="(.*?)" title="(.*?)">')
    data = re.findall(reg, html)
    for i in data:
        book_url = url + i[0]
        print(i[1] + ': ' + book_url)
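To make the two capture groups concrete, here is a toy run against a made-up fragment of the chapter list (the markup shape is inferred from the BeautifulSoup version below, so treat it as an assumption):

import re

# Hypothetical fragment mimicking one entry in the chapter list
snippet = '<dd class="col-md-3"><a href="40009955.html" title="Chapter 1">Chapter 1</a></dd>'
reg = re.compile(r'<dd class="col-md-3"><a href="(.*?)" title="(.*?)">')
print(re.findall(reg, snippet))  # -> [('40009955.html', 'Chapter 1')]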
Notes (the full v1.0 script):
# coding: utf-8
import urllib.request
from bs4 import BeautifulSoup
import re
import os

def url_open(url):
    html = urllib.request.urlopen(url).read().decode('gbk')
    return html

# Fetch the text of one chapter
def get_content(url):
    html = url_open(url)
    bf = BeautifulSoup(html, 'html5lib')  # a warning pops up here
    text = bf.find_all('div', class_='panel-body')  # chapter body
    content = text[0].text.replace('\xa0'*4, '')
    return content

# Collect chapter links + titles
def get_book_url():
    url = 'http://www.biqukan.net/book/116945/'
    html = url_open(url)
    book_title = []
    book_url = []
    book_num = 0
    # Both regexes were mangled by the blog platform; reconstructed from the
    # v2.0 selectors below (dd.col-md-3 > a, and h1.bookTitle).
    reg = re.compile(r'<dd class="col-md-3"><a href="(.*?)" title="(.*?)">')
    book_name_reg = re.compile(r'<h1 class="bookTitle">(.*?)</h1>', re.S)
    book_name = re.findall(book_name_reg, html)
    data = re.findall(reg, html)
    for i in data:
        book_num += 1
        book_url.append(url + i[0])
        book_title.append(i[1])
    return book_title, book_url, book_name[0], book_num

if __name__ == '__main__':
    book_title, book_url, book_name, book_num = get_book_url()
    path = book_name + ' (' + str(book_num) + ' chapters)'
    if not os.path.exists('./' + path):
        print('Creating folder [' + path + '] in the current directory,\nstarting download')
        os.mkdir(path)
    os.chdir(path)  # chdir even when the folder already exists
    for i in range(book_num):  # range(book_num - 1) would skip the last chapter
        print('Progress: %.3f%%' % float(i / book_num * 100))
        with open(str(i) + '.' + book_title[i] + '.txt', 'a', encoding='utf-8') as f:  # note ①
            f.write(get_content(book_url[i]))  # note ②
    print('Done!')
Notes on the above:
pyinstaller -F demo.py
(PyInstaller's -F flag bundles the script into a single standalone executable under dist/.)
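A side note on url_open(): decode('gbk') will throw UnicodeDecodeError the moment a page sneaks in bytes outside GBK. A minimal tolerant variant one could try, assuming gb18030 (a superset of GBK) covers the stray bytes:

import urllib.request

def url_open_tolerant(url):
    raw = urllib.request.urlopen(url).read()
    # gb18030 is a superset of GBK, and errors='replace' keeps a single
    # bad byte from aborting a whole chapter download.
    return raw.decode('gb18030', errors='replace')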
Doing it yourself is really fun, haha. Why did I use re for this second stage? Because I find the regex (.*?) capture group just too convenient for pulling out data sometimes! Also, I had barely used BeautifulSoup before, so I was feeling my way along, heh. Still, a problem showed up: look closely at the screenshot above and you can see it too. Every chapter whose number is a multiple of 4 got swallowed!!!
I tried many times and couldn't recover those chapters.
If anyone passing by can take a look, please help: why on earth does this happen? I'm losing my mind.
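For anyone debugging along: a quick way to narrow this down is to count what the regex finds versus what an HTML parser finds on the same page. If the counts differ, the missing entries are marked up differently from the pattern. This is only a diagnostic sketch, and the regex is my reconstruction from above:

import re
import urllib.request
from bs4 import BeautifulSoup

url = 'http://www.biqukan.net/book/116945/'
html = urllib.request.urlopen(url).read().decode('gbk')

reg = re.compile(r'<dd class="col-md-3"><a href="(.*?)" title="(.*?)">')
regex_hits = re.findall(reg, html)

soup = BeautifulSoup(html, 'html5lib')
parser_hits = soup.find_all('dd', class_='col-md-3')

# If these counts differ, the "missing" chapters deviate in markup
# (extra attributes, different attribute order, line breaks inside the tag...)
print(len(regex_hits), len(parser_hits))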
Afterwards I studied bs4 a bit more and polished the code; here is v2.0 (facepalm).
# coding: utf-8
import urllib.request
from bs4 import BeautifulSoup
import os

def url_open(url):
    html = urllib.request.urlopen(url).read().decode('gbk')  # the site serves GBK pages
    return html

# Fetch the text of one chapter
def get_content(url):
    html = url_open(url)                                 # download the chapter page
    bf = BeautifulSoup(html, 'html5lib')                 # parse it (a warning pops up here)
    text = bf.find_all('div', class_='panel-body')       # the chapter body lives in div.panel-body
    content = text[0].text.replace('\xa0'*4, '')         # strip the 4-NBSP paragraph indents
    return content

# Collect chapter links + titles
def get_book_url(url):
    html = url_open(url)
    book_title = []
    book_url = []
    book_num = 0
    # The steps below take a little while to digest
    # 1. First pass
    bf_1 = BeautifulSoup(html, 'html5lib')               # parse the table-of-contents page
    chapters = bf_1.find_all('dd', class_='col-md-3')    # each chapter entry is a dd.col-md-3
    # 2. Second pass
    bf_2 = BeautifulSoup(str(chapters), 'html5lib')      # re-parse just those entries
    all_data = bf_2.find_all('a')                        # pull out every <a> tag inside them
    # 3. Grab the book title
    title_set = bf_1.find_all('h1', class_='bookTitle')  # the title sits in h1.bookTitle
    title = BeautifulSoup(str(title_set), 'html5lib')    # re-parse to drop the tags
    book_name = title.get_text()                         # book_name is a str, noted!
    for data in all_data:
        book_num += 1
        book_url.append(url + data.get('href'))
        book_title.append(data.get('title'))
    return book_title, book_url, book_name, book_num

# Download and write out the whole book
def pack_book(url):
    book_title, book_url, book_name, book_num = get_book_url(url)
    path = book_name + ' (' + str(book_num) + ' chapters)'
    if not os.path.exists('./' + path):
        print('Creating folder [' + path + '] in the current directory,\nstarting download')
        os.mkdir(path)
    os.chdir(path)  # chdir even when the folder already exists
    for i in range(book_num):  # range(book_num - 1) would skip the last chapter
        print('Progress: %.2f%%' % float(i / book_num * 100))
        with open(str(i + 1) + '.' + book_title[i] + '.txt', 'a', encoding='utf-8') as f:  # note ①
            f.write(get_content(book_url[i]))  # note ②
    print('Progress: 100%')
    print('Done!')

if __name__ == '__main__':
    print('''
    Pick any book on biqukan (http://www.biqukan.net/) and paste its URL; some examples:
    1. (大龟甲师): http://www.biqukan.net/book/116945/
    2. (不朽凡人): http://www.biqukan.net/book/26870/
    3. (圣域): http://www.biqukan.net/book/16962/
    …………
    ''')
    url = input('Enter a URL: ')
    while True:
        # crude check: strip the book id and make sure what is left is the site's book path
        if url.replace(url.split('/')[-2], '') == 'http://www.biqukan.net/book//':
            break
        else:
            url = input('That is not a valid biqukan book URL; try again: ')
    pack_book(url)
v2.0 above is still a bit clumsy, but hahaha I can't be bothered to fix it now. Someone was staging a confession outside the dorm just now, so I went out to watch.
Hahaha, judging by the cheers it worked out. v3.0 will happen whenever. This version uses bs4 modules throughout and drops the regexes; bs4 turns out to be pretty nice to use.
Feature-wise, it can now download novels; anything on biqukan should be scrapeable, and there are prompts too. The user still has to go through some fiddly steps, though; something to improve later!
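As a taste of that improvement, the pasted URL could be validated with urllib.parse instead of the string surgery above. A minimal sketch, not part of the scripts in this post:

from urllib.parse import urlparse

def is_biqukan_book_url(url):
    """Loosely check for http://www.biqukan.net/book/<id>/ style URLs."""
    parts = urlparse(url)
    segs = [s for s in parts.path.split('/') if s]
    return (parts.netloc == 'www.biqukan.net'
            and len(segs) == 2
            and segs[0] == 'book'
            and segs[1].isdigit())

print(is_biqukan_book_url('http://www.biqukan.net/book/116945/'))  # True
print(is_biqukan_book_url('http://www.biqukan.net/'))              # False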