A whole-novel crawler written in Python (and saving the output to a txt file)

I've finally finished polishing this whole-novel crawler. Here it crawls the book 《剑来》 from the site 去看看小说网.

The parts of the code that extract page content are the ones to look at and then adapt to whatever site you want to crawl; a short adaptation sketch follows the full listing.

(The code below is the complete script and has been tested. It is written for Python 2.)

# -*- coding: utf-8 -*-

import urllib2
import urlparse
import Queue
import time
import codecs
from bs4 import BeautifulSoup

# Crawl every chapter of the novel and write it to the txt file
def link_crawler(seed_url):
    firstTitle = BeautifulSoup(download(seed_url)).title.text.split('（')[0]     # grab the novel's title from the page <title>
    filename = firstTitle+'.txt'
    file = codecs.open(filename,'w+',encoding='utf-8')      # create a txt file named after the novel, UTF-8 encoded so Chinese text can be written directly
    file.write(firstTitle+'\n\n')
#     print firstTitle
    crawler_queue = Queue.deque([seed_url])     # deque of URLs still to crawl, seeded with the index page (Queue.deque is collections.deque)
    seen = set(crawler_queue)       # links that have already been queued, so no page is processed twice
    while crawler_queue:
        url = crawler_queue.pop()   # take the most recently added link (pop() removes from the right end)
        html = download(url)
#         soup = BeautifulSoup(html)
        soup = BeautifulSoup(html).find_all('dd',{'class':'col-md-3'})      # each <dd class="col-md-3"> on the index page is one chapter entry
#         print soup
        for link in soup:
            title = link.string     # chapter title
            file.write(title+'\n')
            link = link.find('a')['href']
            print title   
#             print link
            if link not in seen:
                # the first entry is the author's preface (新书感言); its page layout differs from the
                # regular chapters, so record its number and handle it separately below
                first = link.split('.')[0]      # part of the href before '.', e.g. '1.html' gives '1'
                link = urlparse.urljoin(url,link)   # turn the relative href into an absolute URL
                html2 = download(link)
                content1 = BeautifulSoup(html2).find(id='htmlContent')      # the chapter text (first page)
                neilink = BeautifulSoup(html2).find(id='linkNext')          # link to the next/second page of the chapter
                bb = neilink['href']
#                 print bb
                if first == '1':    # the preface fits on a single page
                    content = content1.text
                    file.write(content+'\n\n')
                else:               # regular chapters are split across two pages
                    html3 = download(bb)
                    content2 = BeautifulSoup(html3).find(id='htmlContent')
                    content1 = content1.text
                    content2 = content2.text
                    file.write(content1+'\n')
                    file.write(content2+'\n\n')
#                     content = content1+content2    (wrong code: the two cannot be joined with +)
#                 print content.text
                 
                seen.add(link)
                crawler_queue.append(link)
                time.sleep(1)       # sleep one second between requests to avoid hammering the site
                 
 
# Fetch the HTML for a given URL (each line is explained in earlier posts, so it is not repeated here)
def download(url,user_agent = 'wswp',proxy = None,num_retries = 2):
    print 'downloading:',url
    headers = {'User-agent':user_agent}
    request = urllib2.Request(url,headers = headers)
      
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme:proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        html = opener.open(request).read()
    except urllib2.URLError as e:
        print 'download error',e.reason
        html = None
        if num_retries > 0:
            if hasattr(e,'code') and 500 <= e.code < 600:
                html = download(url,user_agent,proxy,num_retries-1)
                  
    return html
 
seed_url = 'http://www.7kankan.la/book/1/'
# seed_url = 'http://www.biquge5200.com/52_52542/'
link_crawler(seed_url)
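
To point the script at a different site, only the BeautifulSoup lookups need to change: the selector for the chapter list on the index page, the id of the element that holds the chapter text, and (if the site splits chapters across two pages) the "next page" link. Below is a minimal sketch of that adaptation, assuming a hypothetical site whose chapter list sits in <li class="chapter"> items, whose text lives in <div id="content">, and whose chapters fit on a single page; those selectors are placeholders, not the markup of any real site. The download() helper and the one-second delay carry over unchanged.

# Hypothetical adaptation: single-page chapters, placeholder selectors
def link_crawler_simple(seed_url):
    index = BeautifulSoup(download(seed_url))
    book_title = index.title.text
    out = codecs.open(book_title+'.txt','w',encoding='utf-8')
    out.write(book_title+'\n\n')
    for item in index.find_all('li',{'class':'chapter'}):      # assumed: one <li class="chapter"> per chapter
        a = item.find('a')
        chapter_url = urlparse.urljoin(seed_url,a['href'])      # make the relative href absolute
        chapter = BeautifulSoup(download(chapter_url)).find(id='content')   # assumed id of the text container
        out.write(a.text+'\n')
        out.write(chapter.text+'\n\n')
        print 'saved:',a.text
        time.sleep(1)       # stay polite between requests
    out.close()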

