Python 爬取糗事百科(糗百)

#coding=utf-8
# Requires BeautifulSoup 3 (the `BeautifulSoup` module): http://crummy.com/software/BeautifulSoup

import urllib
import urllib2
from xml.sax.saxutils import unescape
from BeautifulSoup import BeautifulSoup          # For processing HTML

def formalize(text):
    """Reflow *text* so every non-blank line is followed by a blank line.

    The input is split on ``\\n``; each piece is stripped of surrounding
    whitespace and pieces that are empty after stripping are discarded.
    Every remaining line is emitted followed by two newlines.

    Args:
        text: The text to normalize (unicode).

    Returns:
        The normalized unicode text; an empty unicode string when *text*
        contains no non-blank lines.
    """
    stripped = (line.strip() for line in text.split(u'\n'))
    # A single join avoids the quadratic cost of repeated ``+=`` and always
    # yields a unicode result, even when nothing survives the filter.
    return u''.join(line + u'\n\n' for line in stripped if line)

# Crawl pages 1-100 of the "best/all" listing and write the plain text of
# every story to qiushi.txt, one numbered section per story.
count = 0
# `with` guarantees the file is flushed and closed even if a request or a
# parse step raises part-way through the crawl.
with open("qiushi.txt", "w") as outfile:
    for i in range(1, 101):
        url = "http://qiushibaike.com/qiushi/best/all/page/%d" % i
        response = urllib2.urlopen(url)
        try:
            html = response.read()
        finally:
            # Release the connection promptly instead of waiting for GC.
            response.close()
        soup = BeautifulSoup(html)
        # Stories are taken from the <div> elements matched by "content".
        for content in soup.findAll('div', "content"):
            count += 1
            print("processing page %d, %d items added" % (i, count))
            # Re-parse the serialized tag on its own so that only this
            # story's nodes are walked.
            minisoup = BeautifulSoup(str(content))
            text = ''.join([e for e in minisoup.recursiveChildGenerator()
                            if isinstance(e, unicode)])
            # unescape() already handles &amp;, &lt; and &gt;; the extra
            # mapping adds &quot;. (The previous mapping replaced '"' with
            # itself, which did nothing.)
            text = urllib.unquote(unescape(text, {'&quot;': '"'}))
            text = formalize(text).encode("utf-8")
            # Section header followed by a blank line, then the story body.
            outfile.write('-' * 20 + " %05d " % count + '-' * 20 + "\n\n")
            outfile.write(text + "\r\n\n")


基本的操作步骤如下(只抓取热门页面并打印第一条内容):


# -*- coding: cp936 -*-
import urllib
import urllib2
from xml.sax.saxutils import unescape
from BeautifulSoup import BeautifulSoup          # For processing HTML

# Minimal demo: fetch the "hot" page and print the text of the first story.
url = "http://www.qiushibaike.com/hot"
response = urllib2.urlopen(url)
try:
    data = response.read()
finally:
    # Release the connection promptly instead of waiting for GC.
    response.close()
soup = BeautifulSoup(data)
contents = soup.findAll('div', "content")
# Only the first match is printed; nothing is printed when the page has no
# matching <div>.
if contents:
    # Re-parse the serialized tag on its own so that only this story's
    # nodes are walked.
    minisoup = BeautifulSoup(str(contents[0]))
    # Walk every node in the fragment and keep the unicode text nodes.
    text = ''.join([e for e in minisoup.recursiveChildGenerator() if isinstance(e, unicode)])
    # NOTE(review): a removed draft of this demo encoded each node as GB2312
    # (not UTF-8) before printing, presumably for a cp936 console; confirm
    # the console encoding if this print raises UnicodeEncodeError.
    print(text)


你可能感兴趣的:(python爬糗百)