Libraries used:
# import the required libraries
import requests
from pyquery import PyQuery as pq
import re
import os
from multiprocessing import Process
from redis import StrictRedis
import logging
import chardet
Global variables:
# Redis set holding the URLs that have not been downloaded yet
con = StrictRedis(host='localhost', port=6379, db=10, password='')
# Redis set holding the URLs that have already been downloaded
con2 = StrictRedis(host='localhost', port=6379, db=10, password='')
base_file = os.path.join(os.path.dirname(os.path.abspath(__file__)),'小说')
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0',
}
base_url = "https://www.cdzdgw.com"
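Before crawling it is worth confirming that Redis is actually reachable; a minimal sanity check, assuming a local server with no password on the default port:
# ping the Redis server that backs the two URL sets
try:
    con.ping()
except Exception as exc:
    raise SystemExit("Redis is not reachable: %s" % exc)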
Two recurring problems are factored out into helper functions to reduce code duplication: inconsistent string encodings, and characters that are not allowed in file names.
# fix text whose encoding differs from page to page
def fix_code(text):
    try:
        text = text.encode('iso-8859-1').decode('gbk')
    except (UnicodeEncodeError, UnicodeDecodeError) as e:
        if "latin-1" in str(e):
            return text
        elif 'gbk' in str(e):
            print(e)
            return text.encode('iso-8859-1').decode()
    return text
# strip characters that are not allowed in file names
def format_filename(filename):
    filename = re.sub("[、\"<>\/!,;:??“”\\'\*]", '', filename)
    if "(" in filename:
        # drop any parenthesized part; flags must be passed as a keyword,
        # otherwise re.S would be interpreted as the count argument
        filename = re.sub(r"\(.*?\)", '', filename, flags=re.S)
    return filename
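A quick check of the two helpers with made-up inputs (the strings below are hypothetical and only illustrate the behaviour):
# hypothetical examples, not taken from the site
print(format_filename('第1章 重生?(修)'))  # -> '第1章 重生'
print(fix_code('正常的中文文本'))           # text that cannot be re-encoded is returned unchanged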
The class that crawls a novel from 笔趣网:
class Spider():
    """
    This class crawls one complete novel.
    It takes no constructor arguments; the novel index URLs are popped
    from the Redis set 'novel' (see get_article_url below).
    """
    def __init__(self):
        self.base_url = "https://www.cdzdgw.com"
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:68.0) Gecko/20100101 Firefox/68.0',
        }
    def get_article_url(self):
        # pop one pending novel URL and mark it as taken
        url = con.spop('novel')
        if url:
            con2.sadd('down', url)
            return url.decode()
        return None
    def run(self):
        """
        Fetch the novel's index page, then walk its chapter list.
        :return:
        """
        url = self.get_article_url()
        while url:
            aa = requests.get(url, headers=self.headers).text
            # NOTE: the HTML tags inside the original patterns were stripped
            # when the post was published; the patterns below are a best-effort
            # reconstruction based on the usual 笔趣网 index-page layout.
            pattern = '<div id="info">.*?<h1>(.*?)</h1>.*?<div id="list">.*?<dl>(.*?)</dl>'
            results = re.findall(pattern, aa, re.S)[0]
            # the novel title is kept in a module-level global because detail() reads it later
            global caption
            caption = fix_code(results[0])
            if not os.path.exists(os.path.join(base_file, caption)):
                os.makedirs(os.path.join(base_file, caption))  # one directory per novel
            pattern = '<dd>.*?<a.*?href="(.*?)".*?>(.*?)</a>'
            res = re.findall(pattern, results[1], re.S)
            for i in res:
                title = fix_code(i[1])
                title_url = self.base_url + fix_code(i[0])
                if "第" in title and "章" in title:
                    self.detail(title_url)
            # move on to the next pending novel
            url = self.get_article_url()
    def detail(self, url):
        """
        Fetch the body of one chapter and save it to disk.
        :param url: chapter URL
        :return:
        """
        res = requests.get(url, headers=self.headers).text
        doc = pq(res)
        title = fix_code(doc('.content h1').text())
        # the chapter text ends with the site's own URL, which is cut off here
        pattern = '(.*?)https://www'
        texts = doc(".showtxt").text()
        txts = re.findall(pattern, texts, re.S)[0].strip().replace('\n\n', '\n')
        txts = fix_code(txts)
        title = format_filename(title)
        capt = format_filename(caption)
        filename = os.path.join(base_file, capt, title + ".txt")
        with open(filename, 'w', encoding='utf-8', errors='ignore') as f:
            f.write(txts)
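To try the class on its own you can seed the Redis set with one index URL and run a single worker; the path '/0_1/' below is a placeholder, not a real novel page:
# minimal single-process test; '/0_1/' is a hypothetical novel path
con.sadd('novel', base_url + '/0_1/')
Spider().run()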
Collecting the novel URLs and storing them in Redis
(readers who are interested can extend this into a distributed crawl; a sketch follows the function below)
# collect the index URL of every novel on 笔趣网
def get_caption():
    """
    Top-level novel categories:
        Xuanhuan fantasy
        Cultivation
        Urban
        Time travel
        Online games
        Science fiction
        Other
    # the remaining two categories, rankings and completed novels,
    # only repeat entries from the lists above
    Each category page has two parts: the update list and the recommendations.
    :return: the set of novel URLs is stored in Redis
    # the category paths below were collected manually beforehand
    """
urls = ["/xuanhuanxiaoshuo/","/xiuzhenxiaoshuo/","/dushixiaoshuo/","/chuanyuexiaoshuo/",
"/wangyouxiaoshuo/","/kehuanxiaoshuo/","/qitaxiaoshuo/"]
for url in urls:
res = requests.get(base_url+url, headers=headers).text
doc = pq(res)
for cap_url in doc('.s2 a').items():
cap_url = base_url + cap_url.attr.href
is_exist = con2.sadd('down',cap_url)
if not is_exist:
con.sadd('novel',cap_url) #集合
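Because the producer (get_caption) and the consumers (Spider) only talk to each other through Redis, extending this to several machines is mostly a configuration change; a minimal sketch, assuming a shared Redis server reachable under the placeholder hostname 'redis-host':
# on every worker machine, point both connections at the shared server ('redis-host' is a placeholder)
con = StrictRedis(host='redis-host', port=6379, db=10, password='')   # pending URLs
con2 = StrictRedis(host='redis-host', port=6379, db=10, password='')  # downloaded URLs
# one machine runs get_caption() once to seed the queue; every other machine just runs Spider().run()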
Here I implemented a multi-process crawler, and its throughput is only moderate; if you are interested you can combine asynchronous I/O, processes, and threads on top of this crawler (a rough thread-pool sketch follows the code below).
if __name__=="__main__":
spider = Spider()
queue = []
for i in range(6):
p = Process(target=spider.run)
queue.append(p)
p.start()
for i in queue:
i.join()
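One possible way to combine processes with threads is to let each worker process fetch several chapter pages at once through a small thread pool; the helper below is only a sketch (fetch_chapters is hypothetical and would still have to be wired into Spider.run/detail):
from concurrent.futures import ThreadPoolExecutor

def fetch_page(url):
    # plain blocking fetch, executed in a worker thread
    return requests.get(url, headers=headers).text

def fetch_chapters(chapter_urls, workers=8):
    # download a batch of chapter pages concurrently inside one process
    with ThreadPoolExecutor(max_workers=workers) as pool:
        return list(pool.map(fetch_page, chapter_urls))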
That is the complete code of the 笔趣网 crawler. Feel free to copy and run it yourself, or modify it to make it faster.
The code is not yet fully polished and still has a few rough edges; copy it and see what you can find.
If you have suggestions, or would like to chat offline, you can add me on WeChat:
qqtxw128