这是一个练习作品。用python脚本爬取笔趣阁上面的免费小说。
环境:python3
类库:requests、BeautifulSoup(bs4)
数据源:http://www.biqukan.cc
原理就是伪装正常http请求,正常访问网页。然后通过bs4重新解析html结构来提取有效数据。
config.py包含了用于伪装的请求头部和数据源配置(如果不考虑扩展其他数据源,也可以直接写死在代码里)。
#!/usr/bin/python
#coding:utf-8
import sys

# Python 2 only: make UTF-8 the implicit str/unicode codec.  Python 3 has no
# reload() builtin and no sys.setdefaultencoding(), and already defaults to
# UTF-8, so the hack is skipped there instead of raising NameError.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding('utf8')
# Root of the biquge site; every other URL of this data source hangs off it.
_BIQUGE_BASE_URL = 'http://www.biqukan.cc'

# Data-source registry, keyed by source name.
source = {
    'biquge': {
        'base_url': _BIQUGE_BASE_URL,
        # Category ids are walked with range(category_min, category_max),
        # so category_max itself is not visited.
        'category_min': 1,
        'category_max': 2,
        # '{id}' is substituted with the category id by the crawler.
        'category_url': _BIQUGE_BASE_URL + '/fenlei{id}/1.html',
    },
}
# User-Agent strings the crawler rotates through so that requests look like
# they come from ordinary browsers.
_USER_AGENTS = (
    'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
)

# One ready-to-use HTTP header dict per User-Agent.
header = [{'User-Agent': agent} for agent in _USER_AGENTS]
以上是config.py文件。
#!/usr/bin/python
#coding:utf-8
import random
import requests
import re
from bs4 import BeautifulSoup
#from BeautifulSoup import BeautifulSoup
import sys

# Python 2 only: make UTF-8 the implicit str/unicode codec.  Python 3 has no
# reload() builtin and no sys.setdefaultencoding(), and already defaults to
# UTF-8, so the hack is skipped there instead of raising NameError.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding('utf8')
from config import source
from config import header
import hashlib
import time
hash_md5 = hashlib.md5()
def fiction(timeout=10):
    """Crawl the configured category listings and collect every novel found.

    For each category id in ``range(category_min, category_max)`` (the upper
    bound is exclusive) the listing is fetched page by page, following the
    ``a.next`` link inside ``ul#pagelink`` until no further page is offered.
    Every fetched page is scraped, including the last page of a category and
    categories that consist of a single page.

    Args:
        timeout: Seconds to wait for each HTTP response before ``requests``
            gives up and raises.

    Returns:
        dict: The merged results of ``_cur_page`` for all visited pages.
    """
    # Local import keeps the block usable on both Python 2 and Python 3.
    try:
        from urllib.parse import urljoin
    except ImportError:
        from urlparse import urljoin

    url_template = source['biquge']['category_url']
    novels = {}
    for category_id in range(source['biquge']['category_min'],
                             source['biquge']['category_max']):
        page_url = url_template.replace('{id}', '%s' % category_id)
        visited = set()  # guards against a pager that links back to itself
        while page_url and page_url not in visited:
            visited.add(page_url)
            req = requests.get(page_url, headers=random.choice(header),
                               timeout=timeout)
            bs = BeautifulSoup(req.content.decode('gbk'), "html.parser")

            # Scrape the page before looking at the pager, so a page without
            # a "next" link still contributes its novels.
            page = _cur_page(bs)
            print('page.length = %d' % len(page))
            novels.update(page)

            pager = bs.find('ul', id='pagelink')
            next_link = pager.find('a', 'next') if pager is not None else None
            if next_link is None:
                break
            href = next_link.get('href')
            # urljoin leaves absolute links untouched and resolves relative ones.
            page_url = urljoin(page_url, href) if href else None

            # Short random pause between page requests.
            time.sleep(random.random())
    return novels
def _cur_page(bs):
    """Collect the novels listed on one parsed category page.

    Two sections are read independently: the ``li.list-group-item`` "top"
    list and the ``tr`` rows of the "recently updated" table.

    Args:
        bs: Parsed page. Only ``findAll`` is called on it; ``find``,
            ``findAll``, ``get_text`` and ``attrs`` are used on its tags.

    Returns:
        dict: Maps the MD5 hex digest of each novel's link to a dict with
        ``name``, ``link`` and ``author``, plus ``readers`` (top list) or
        ``status`` (recent list).  A novel present in both sections keeps its
        top-list entry and additionally receives ``status``.  Fields whose
        tag is missing are set to ``''``.
    """
    def link_key(link):
        # Use a fresh digest per link.  Feeding every link into one shared
        # md5 object makes each key depend on all links hashed before it, so
        # the same novel could never be matched across the two sections.
        return hashlib.md5(link.encode('utf-8')).hexdigest()

    def text_of(tag):
        # Tolerate layout variations: a missing tag yields an empty field.
        return tag.get_text() if tag is not None else ''

    novels = {}

    # "Top" list.
    for item in bs.findAll('li', 'list-group-item'):
        a_tag = item.find('a')
        if a_tag is None or 'href' not in a_tag.attrs:
            continue
        entry = {'name': a_tag.get_text(), 'link': a_tag.attrs['href']}
        entry['author'] = text_of(item.find('small')).replace('/ ', '')
        entry['readers'] = text_of(item.find('span'))
        novels[link_key(entry['link'])] = entry

    # "Recently updated" table; a single row or none means nothing to read.
    rows = bs.findAll('tr')
    if len(rows) <= 1:
        return novels
    for row in rows:
        a_tag = row.find('a')
        if a_tag is None or 'href' not in a_tag.attrs:
            continue  # rows without a link carry no novel
        entry = {'name': a_tag.get_text(), 'link': a_tag.attrs['href']}
        entry['author'] = text_of(row.find('td', 'text-muted'))
        cells = row.findAll('td')
        # The status is read from the last cell of the row.
        entry['status'] = cells[-1].get_text() if cells else ''

        key = link_key(entry['link'])
        if key in novels:
            novels[key]['status'] = entry['status']
        else:
            novels[key] = entry
    return novels
if __name__ == "__main__":
    # Manual run: crawl the configured categories, then report completion.
    fiction()
    print('done')
以上是fiction.py文件。
#!/usr/bin/python
#coding:utf-8
import random
import requests
import re
from bs4 import BeautifulSoup
#from BeautifulSoup import BeautifulSoup
import sys

# Python 2 only: make UTF-8 the implicit str/unicode codec.  Python 3 has no
# reload() builtin and no sys.setdefaultencoding(), and already defaults to
# UTF-8, so the hack is skipped there instead of raising NameError.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding('utf8')
from config import source
from config import header
def summary(url, timeout=10):
    """Fetch a novel's landing page and extract its summary record.

    Fields of the returned dict (all strings, ``''`` when not found):

    * ``title``     -- book title, from ``h1.bookTitle``
    * ``author``    -- first link inside ``p.booktag``
    * ``category``  -- second link inside ``p.booktag`` (e.g. urban romance)
    * ``words``     -- first ``span`` inside ``p.booktag`` (word count,
                       e.g. "2 million")
    * ``readers``   -- second ``span`` (reader count, e.g. "2w+")
    * ``status``    -- third ``span`` (serialising or completed)
    * ``cover_img`` -- ``src`` of the image inside ``p#bookIntro``
    * ``summary``   -- text of ``p#bookIntro`` with its line-break padding
                       removed (the book's synopsis)

    Args:
        url: Address of the novel's landing page.
        timeout: Seconds to wait for the HTTP response before ``requests``
            gives up and raises.

    Returns:
        dict: The record described above.
    """
    result = {
        'title': '', 'author': '', 'category': '', 'words': '',
        'readers': '', 'status': '', 'cover_img': '', 'summary': '',
    }
    req = requests.get(url, headers=random.choice(header), timeout=timeout)
    bs = BeautifulSoup(req.content.decode('gbk'), "html.parser")

    title_tag = bs.find('h1', 'bookTitle')
    if title_tag is not None:
        result['title'] = title_tag.get_text()

    book_tag = bs.find('p', 'booktag')
    if book_tag is not None:
        # zip() stops at the shorter sequence, so a page exposing fewer
        # links/spans than expected leaves the remaining fields empty
        # instead of raising IndexError.
        for key, tag in zip(('author', 'category'), book_tag.findAll('a')):
            result[key] = tag.get_text()
        for key, tag in zip(('words', 'readers', 'status'),
                            book_tag.findAll('span')):
            result[key] = tag.get_text()

    intro_tag = bs.find('p', id='bookIntro')
    if intro_tag is not None:
        img_tag = intro_tag.find('img')
        if img_tag is not None:
            result['cover_img'] = img_tag.attrs.get('src', '')
        result['summary'] = (
            intro_tag.get_text().replace('\n\r\n ', '').replace('\r\n ', '')
        )
    return result
if __name__ == "__main__":
    # Manual check: print the summary record of one sample novel.
    sample_summary = summary('http://www.biqukan.cc/book/47583/')
    print(sample_summary)
以上是summary.py文件。
#!/usr/bin/python
#coding:utf-8
import random
import requests
import re
from bs4 import BeautifulSoup
#from BeautifulSoup import BeautifulSoup
import sys

# Python 2 only: make UTF-8 the implicit str/unicode codec.  Python 3 has no
# reload() builtin and no sys.setdefaultencoding(), and already defaults to
# UTF-8, so the hack is skipped there instead of raising NameError.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding('utf8')
from config import source
from config import header
def catalog(url, timeout=10):
    """Fetch a novel's landing page and return its chapter list.

    Chapter links are read from ``dl.panel-chapterlist`` inside
    ``div#list-chapterAll``.

    Args:
        url: Address of the novel's landing page; chapter links are resolved
            against it.
        timeout: Seconds to wait for the HTTP response before ``requests``
            gives up and raises.

    Returns:
        list: One ``{'name': ..., 'link': ...}`` dict per chapter, in page
        order.  Empty when the chapter list is not present on the page.
    """
    # Local import keeps the block usable on both Python 2 and Python 3.
    try:
        from urllib.parse import urljoin
    except ImportError:
        from urlparse import urljoin

    req = requests.get(url, headers=random.choice(header), timeout=timeout)
    bs = BeautifulSoup(req.content.decode('gbk'), "html.parser")

    all_list = bs.find('div', id='list-chapterAll')
    if all_list is None:
        return []
    list_tag = all_list.find('dl', 'panel-chapterlist')
    if list_tag is None:
        return []

    chapters = []
    for a_tag in list_tag.findAll('a'):
        href = a_tag.attrs.get('href')
        if not href:
            continue  # an anchor without a target is not a chapter
        # urljoin equals plain concatenation for "file.html"-style links on a
        # URL ending in "/", and also handles root-relative/absolute links.
        chapters.append({'name': a_tag.get_text(), 'link': urljoin(url, href)})
    return chapters
if __name__ == "__main__":
    # Manual check: print the chapter list of one sample novel.
    sample_chapters = catalog('http://www.biqukan.cc/book/47583/')
    print(sample_chapters)
以上是catalog.py文件。
#!/usr/bin/python
#coding:utf-8
import random
import requests
import re
from bs4 import BeautifulSoup
#from BeautifulSoup import BeautifulSoup
import sys

# Python 2 only: make UTF-8 the implicit str/unicode codec.  Python 3 has no
# reload() builtin and no sys.setdefaultencoding(), and already defaults to
# UTF-8, so the hack is skipped there instead of raising NameError.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding('utf8')
from config import source
from config import header
def detail(url, timeout=10):
    """Download the text of one chapter, which may span several pages.

    Page 1 is ``url`` itself; page N (N >= 2) is ``url`` with ``_N`` inserted
    before ``.html``.  Pages are fetched in order until the page limit is
    reached or a page has no ``div#htmlContent``.

    Args:
        url: Address of the chapter's first page (ending in ``.html``).
        timeout: Seconds to wait for each HTTP response before ``requests``
            gives up and raises.

    Returns:
        str: The concatenated chapter text with the site's boilerplate
        markers removed; ``''`` when no page had any content.
    """
    page_limit = 3  # exclusive upper bound: pages 1 and 2 are requested
    parts = []
    for page_no in range(1, page_limit):
        suffix = '' if page_no == 1 else '_%s' % page_no
        page_url = url.replace('.html', suffix + '.html')
        req = requests.get(page_url, headers=random.choice(header),
                           timeout=timeout)
        bs = BeautifulSoup(req.content.decode('gbk'), "html.parser")

        content_tag = bs.find('div', id='htmlContent')
        if content_tag is None:
            break  # no such page: the chapter ended on the previous one

        # Empty the p.text-danger paragraph before extracting text
        # (presumably the "continued on next page" hint -- verify on the site).
        hint_tag = content_tag.find('p', 'text-danger')
        if hint_tag is not None:
            hint_tag.clear()

        parts.append(
            content_tag.get_text()
            .replace('-->>', '')
            .replace('一秒记住【笔趣阁 www.biqukan.cc】,更新快,无弹窗,免费读!', '')
        )
    return ''.join(parts)
def filter(content):
    """Strip CRLF line breaks and spaces from chapter text.

    The text is split on ``'\\r\\n'``, every space is removed from each
    piece, and the pieces are joined back together without a separator.
    Bare ``'\\n'`` or ``'\\r'`` characters are left untouched.

    NOTE: the name shadows the ``filter`` builtin inside this module; it is
    kept because callers use it.

    Args:
        content: Text to clean; ``None`` or ``''`` is accepted.

    Returns:
        str: The cleaned text (``''`` for ``None``/empty input).
    """
    if not content:
        return ''
    return ''.join(
        piece.replace(' ', '') for piece in content.split('\r\n')
    )
if __name__ == "__main__":
    # Manual check: download one sample chapter and print its cleaned text.
    chapter_text = detail('http://www.biqukan.cc/book/20461/12592815.html')
    print(filter(chapter_text))
以上是article.py文件。
暂时没有做数据保存模块。如果需要把各部分串起来做成一个完整的项目,只需要保存小说的基本数据结构即可(节省磁盘空间),因为通过小说url可以很快速地提取出小说简介、目录和每一章的正文。
如果想要做得更好,可以把目录、介绍、正文等部分缓存起来,当然前提是有足够的存储空间。