抓取天涯文章的蜘蛛代码,刚经过更新(因为天涯页面HTML代码变化)

#_*_coding:utf-8-*-

import urllib2

import traceback

import codecs

from BeautifulSoup import BeautifulSoup



def openSoup(url,code):

    page = urllib2.urlopen(url)

    soup = BeautifulSoup(page,fromEncoding=code)#,fromEncoding="gb2312"

    #soup = BeautifulSoup(page,code)

    return soup



def getContentFromDiv(contents):

    s = ""

    for content in contents:

        try:

            s += content

        except:

            pass

    

    s = s.lstrip().rstrip()

    if len(s) < 50:

        return ""

    else:

        return "    "+s+"\r\n"+"\r\n"



def readHtml(soup,fp,authname):

    pageContent = ""

    item = soup.find(name='div', attrs={'class':'bbs-content clearfix'})

    if item != None:

        pageContent += getContentFromDiv(item.contents)



    items = soup.findAll(name='div', attrs={'class':'atl-item'})

    for item in items:

        userItem = item.find(name='a', attrs={'class':'js-vip-check'})

        if userItem == None or userItem.contents[0] != authname:

            continue



        contentItem = item.find(name='div', attrs={'class':'bbs-content'})

        pageContent += getContentFromDiv(contentItem.contents)

    

    fp.write(pageContent)

   

def getNextPage(soup,pno):

    nextlink = soup.find(name="a",attrs={"class":"js-keyboard-next"})

    if nextlink != None:

        return "http://bbs.tianya.cn"+nextlink["href"]

    else:

        return 'OVER'



def getHtml(url,filename,authname):

    p = 1

    fp = codecs.open(filename,'w','utf-8')

    while True:

        soup = openSoup(url,'utf-8')

        readHtml(soup,fp,authname)

        url = getNextPage(soup,p+1)

        if url == 'OVER' :

            break

        print 'PAGE '+str(p)+' OK'

        p = p + 1

       

    print 'It\'s Over'

    fp.close()





if __name__ == '__main__':

    getHtml('http://bbs.tianya.cn/post-no05-143258-1.shtml','krzc.txt',u'关河五十州')

    #getHtml('http://bbs.tianya.cn/post-no05-143258-1036.shtml','krzc.txt',u'关河五十州')

 

你可能感兴趣的:(html)