Python Crawler Series 1 (2): Scraping a Web Novel (圣墟)

Reinforcement exercise: crawl the latest web novel, 圣墟.
[Screenshot 1]
Code:

#coding=utf-8
import os
import sys
reload(sys)
sys.setdefaultencoding('utf8')
from Spider import getHtmlCode 
from bs4 import BeautifulSoup
import re

# URL of the first chapter
url = 'https://www.biquge5200.com/52_52542/20380548.html'
def getTree(url):
    temp = getHtmlCode(url)
    soup = BeautifulSoup(temp,'html.parser')
    return soup


# Input: the chapter page URL
# Output: (chapter name, chapter content)
def getAll(url):
    temp = getTree(url)
    chaptername = temp.h1.string
    print u'章节名:',chaptername
    content = temp.find_all('div',id='content')
    content = str(content[0])
    content = content.replace('<br/>','\n')  # turn <br/> tags into real line breaks
    pattern = re.compile('<(.*)>')
    list_line = pattern.findall(content)
    for line in list_line:
        line = '<' + line + '>'
        content = content.replace(line,'')  # strip any remaining HTML tags
    # print u'内容 :',content,'\n'
    return (chaptername,content)

# Input: chapter URL
# Output: creates a txt file named after the chapter
def creatFile(url):
    (fileName,txt) = getAll(url)
    fileName = fileName + '.txt'
    f = open(fileName,'a+')
    f.write(txt)
    f.close()

# Find the link to the next chapter on the current page
def nextUrl(url):
    tree = getTree(url)
    aSpan = tree('a',href=re.compile('.*52_52542'))
    for nextChapter in aSpan:
        # print type(nextChapter.string)
        if u'下一章' == nextChapter.string:
            pathUrl = nextChapter['href']
            print pathUrl
            break
        else:
            pathUrl = ''
    return pathUrl

# nextUrl(url)

# Input: chapter URL
# Output: the whole novel (every 100 chapters are moved into their own folder)
def main(url):
    count = 1
    flag = True
    cmd = 'del /q /s *.txt'  # clear any leftover txt files (Windows shell)
    os.system(cmd)
    while flag:
        creatFile(url)
        print 'address = ',url
        url = nextUrl(url)
        count = count + 1
        if 0 == (count % 100):
            filename = count / 100
            cmd_md = 'md ' + str(filename)
            cmd_mv = 'move *.txt ' + str(filename)
            os.system(cmd_md)
            os.system(cmd_mv)
        if -1 == url.find('.html'):
            filename = count / 100 + 1
            cmd_md = 'md ' + str(filename)
            cmd_mv = 'move *.txt ' + str(filename)
            os.system(cmd_md)
            os.system(cmd_mv)
            flag = False

main(url)
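Both versions import getHtmlCode from a local Spider module that is not shown in this post. As a rough idea of what that helper might look like, here is a minimal sketch assuming the requests library; the actual Spider module may well differ.

# Spider.py -- hypothetical sketch, not the original module
#coding=utf-8
import requests

def getHtmlCode(url):
    # Fetch the page and return its HTML as text.
    # A browser-like User-Agent makes the request less likely to be blocked.
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers, timeout=10)
    response.encoding = response.apparent_encoding  # let requests guess the page encoding
    return response.text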

Result screenshots:
[Screenshot 2]

While running it, I found this approach error-prone, so I later changed it to merge all chapters into a single book file.
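If the errors come from occasional failed or empty page fetches, one way to harden the crawl, independent of how the chapters are stored, is to retry a fetch a few times before giving up. A minimal sketch, assuming the getHtmlCode helper above; the wrapper below is not part of the original code.

# Hypothetical retry wrapper around getHtmlCode -- sketch only
import time
from Spider import getHtmlCode

def getHtmlCodeWithRetry(url, retries=3, delay=2):
    for attempt in range(retries):
        try:
            html = getHtmlCode(url)
            if html:                     # treat an empty response as a failure
                return html
        except Exception as e:
            print 'fetch failed:', e
        time.sleep(delay)                # wait a little before trying again
    return ''                            # caller has to handle an empty result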
Code:

#coding=utf-8
import os
import sys
reload(sys)
sys.setdefaultencoding('utf8')
from Spider import getHtmlCode 
from bs4 import BeautifulSoup
import re

# URL of the first chapter
url = 'https://www.biquge5200.com/52_52542/20380548.html'
def getTree(url):
    temp = getHtmlCode(url)
    soup = BeautifulSoup(temp,'html.parser')
    return soup


# Input: the chapter page URL
# Output: (chapter name, chapter content)
def getAll(url):
    temp = getTree(url)
    chaptername = temp.h1.string
    print u'章节名:',chaptername
    content = temp.find_all('div',id='content')
    content = str(content[0])
    content = content.replace('<br/>','\n')  # turn <br/> tags into real line breaks
    pattern = re.compile('<(.*)>')
    list_line = pattern.findall(content)
    for line in list_line:
        line = '<' + line + '>'
        content = content.replace(line,'')  # strip any remaining HTML tags
    # print u'内容 :',content,'\n'
    return (chaptername,content)

# Input: chapter URL
# Output: appends the chapter to a single txt file named after the novel
def creatFile(url):
    (fileName,txt) = getAll(url)
    txt = fileName + '\n' + txt
    storyFileName = u'圣墟.txt'
    f = open(storyFileName,'a+')
    f.write(txt)
    f.close()

# Find the link to the next chapter on the current page
def nextUrl(url):
    tree = getTree(url)
    aSpan = tree('a',href=re.compile('.*52_52542'))
    for nextChapter in aSpan:
        # print type(nextChapter.string)
        if u'下一章' == nextChapter.string:
            pathUrl = nextChapter['href']
            print pathUrl
            break
        else:
            pathUrl = ''
    return pathUrl

# nextUrl(url)

# Input: chapter URL
# Output: the whole novel in one file
def main(url):
    flag = True
    cmd = 'del /q /s *.txt'  # clear any leftover txt files (Windows shell)
    os.system(cmd)
    while flag:
        creatFile(url)
        print 'address = ',url
        url = nextUrl(url)
        if -1 == url.find('.html'):
            flag = False

main(url)

Results:
[Screenshot 3]
[Screenshot 4]

(Key point) All the code and the finished novel file are in my download resources; if you don't have enough points, send me a private message on QQ.
