Scraping a Novel with Python

After a few days of study I put together a simple script that scrapes a novel. It works, though the scraping is somewhat slow (a thread-pool sketch for speeding it up follows the main script below). Here is the code:

# _*_ coding:utf-8 _*_
import urllib2
import sys
from bs4 import BeautifulSoup
import random

# Let implicit str/unicode conversions default to utf8 (Python 2 hack).
reload(sys)
sys.setdefaultencoding('utf8')

def getHtml(url):
    # A pool of User-Agent strings; one is picked at random per request
    # so the traffic looks less uniform.
    user_agents = [
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
        'Opera/9.25 (Windows NT 5.1; U; en)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
        'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
        'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7',
        'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0',
    ]
    user_agent_random = random.choice(user_agents)
    header = {
        'User-Agent': user_agent_random,
        'Host': 'www.biquge.com.tw',
        'Referer': 'http://www.biquge.com.tw/',
    }
    request = urllib2.Request(url, headers=header)
    html = urllib2.urlopen(request).read()
    # The site serves GBK; 'ignore' skips the occasional byte GBK cannot decode.
    html = html.decode('gbk', 'ignore').encode('utf8')
    return html
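# A minimal, hedged sketch for resilience: long runs against one host can hit
# transient network errors, so a retry wrapper around getHtml helps. The retry
# count and delay below are assumptions, not part of the original script.
import time

def getHtml_retry(url, tries=3, delay=2):
    for attempt in range(tries):
        try:
            return getHtml(url)
        except urllib2.URLError:
            if attempt == tries - 1:
                raise          # give up after the final attempt
            time.sleep(delay)  # brief pause before retrying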
# Collect each chapter's URL and title from the table-of-contents page.
def getht(h):
    soup = BeautifulSoup(h,'html.parser')
    html_ = soup.find_all('dd')

    book = []
    book_mark = []
    for im in html_:
        s = str(im)
        # Slice the href and the title out of markup shaped like
        # <dd><a href="/16_16273/xxxxxxx.html">chapter title</a></dd>
        html_url = 'http://www.biquge.com.tw' + s[13:35]
        book.append(html_url)
        book_mark.append(s[37:-9])

    return book, book_mark
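# A hedged sketch of a sturdier parser: reading the <a> tag's attributes avoids
# the fixed-offset slicing above, which breaks as soon as an href's length
# changes. Assumes each useful <dd> wraps a single <a> with a site-relative href.
def getht_robust(h):
    soup = BeautifulSoup(h, 'html.parser')
    book, book_mark = [], []
    for dd in soup.find_all('dd'):
        a = dd.find('a')
        if a is None or not a.get('href'):
            continue  # skip empty or decorative <dd> entries
        book.append('http://www.biquge.com.tw' + a['href'])
        book_mark.append(a.get_text())
    return book, book_mark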
# Extract the body text of one chapter page and append it to a local file.
def getContent(html_book,html_book_mark):
    soup = BeautifulSoup(html_book,'html.parser')
    b = soup.find_all('div',id='content')[0]
    fh = open('E://python/2.txt','a')

    s = b.get_text()
    st = html_book_mark + str(s) + '\n'
    fh.write(st)
    fh.close()
    print html_book_mark + ' saved'
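# A hedged sketch of a variant that keeps one file handle open for the whole
# run instead of reopening the file once per chapter; getContent_into and its
# fh parameter are hypothetical names, not from the original script.
def getContent_into(html_book, html_book_mark, fh):
    soup = BeautifulSoup(html_book, 'html.parser')
    b = soup.find_all('div', id='content')[0]
    fh.write(html_book_mark + b.get_text().encode('utf8') + '\n')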

# Fetch and save every chapter in order.
def get_par(books,book_marks):
    for (bo,bo_mark) in zip(books,book_marks):
        getContent(getHtml(bo),bo_mark)
    print "All chapters saved"


url = 'http://www.biquge.com.tw/16_16273/'
# Fetch the book's table-of-contents page
html = getHtml(url)
# Get each chapter's URL and title
book,book_m = getht(html)
# Fetch and save the content of each chapter
get_par(book,book_m)
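
Since chapters are downloaded strictly one after another, the run is network-bound and slow. Below is a minimal sketch of one way to speed it up with a thread pool; multiprocessing.dummy is in the standard library, but the pool size and the fetch_chapters_parallel helper are assumptions, not part of the original script. Pages are fetched in parallel and then written sequentially so the output file keeps chapter order.

from multiprocessing.dummy import Pool  # thread-based Pool with the multiprocessing API

def fetch_chapters_parallel(books, book_marks, workers=8):
    pool = Pool(workers)              # 8 worker threads is a guess; tune to taste
    pages = pool.map(getHtml, books)  # downloads run concurrently
    pool.close()
    pool.join()
    for page, mark in zip(pages, book_marks):
        getContent(page, mark)        # sequential writes preserve chapter order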


Note: during testing, one chapter raised 'gbk' codec can't decode bytes in position 7782-7783: illegal multibyte sequence. Passing 'ignore' as the second argument to decode resolves the problem.
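A minimal reproduction of the error and the fix; the byte string below is an illustrative GBK fragment with a truncated trailing byte, not taken from the actual page:

data = '\xd6\xd0\xce\xc4\xd5'       # "中文" in GBK plus a stray lead byte
# data.decode('gbk')                # raises UnicodeDecodeError: illegal multibyte sequence
print data.decode('gbk', 'ignore')  # the bad byte is dropped and decoding continues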

