Embarrassingly, I'm not familiar with Python's common regular expressions at all, so I wrote this bug-ridden "show only the OP's posts" script. Posting it here as-is; I'll polish it when I have time.
# -*- coding: utf8 -*-
import urllib2
import re
#from BeautifulSoup import BeautifulSoup


def Extract(rawdata):
    """First draft: find the title, the OP's name, and the OP's posts in one pass."""
    author = ''
    title = ''
    content = []
    i = 0
    while i < len(rawdata):
        m = rawdata[i].find('<title>')
        if m != -1:
            title = rawdata[i+1]
            print title
        # Find the author: two lines above the 'topic-doc' marker sits the
        # avatar <img>, whose alt attribute holds the author's name.
        tmpline = rawdata[i].find('topic-doc')
        if tmpline != -1:
            print rawdata[i-2]
            posBeg = rawdata[i-2].find('alt=')
            posEnd = rawdata[i-2].find('/>')
            author = rawdata[i-2][posBeg+5:posEnd-1]
            print "author: " + author
            i = i + 1
            # Find the content written by the author: each of the OP's posts
            # is preceded by an avatar with alt="<author>".
            while i < len(rawdata):
                contentLine = rawdata[i].find('alt="' + author + '"')
                if contentLine != -1:
                    # Skip ahead to the opening <p> of the post ...
                    while i < len(rawdata):
                        pLine = rawdata[i].find('<p>')
                        if pLine != -1:
                            # ... then collect lines up to the closing </p>.
                            while i < len(rawdata):
                                pEndLine = rawdata[i].find('</p>')
                                if pEndLine != -1:
                                    print rawdata[i]
                                    content.append(rawdata[i])
                                    break
                                i += 1
                            break
                        i += 1
                i += 1
        i += 1


def ExtractLink(rawdata, start):
    """Collect the page links from the paginator row of a topic page."""
    i = start
    links = []
    while i < len(rawdata):
        line = rawdata[i].find('paginator')
        if line != -1:
            tmpRow = rawdata[i]
            posBeg = tmpRow.find('href=')
            if posBeg != -1:
                tmpRow = tmpRow[posBeg+6:-1]   # skip past href=" to the URL itself
                posBeg = 0
                while posBeg != -1:
                    posEnd = tmpRow.find('>')
                    links.append(tmpRow[posBeg:posEnd-1])  # drop the closing quote before >
                    posBeg = tmpRow.find('href')
                    if posBeg == -1:
                        break
                    tmpRow = tmpRow[posBeg+6:-1]
                    posBeg = 0
            break
        i += 1
    return links[:-1]   # the last href is the "next page" link, drop it


def ExtractAuthorContent(rawdata):
    """Find the title and author, then pull out every section the author wrote."""
    pos = 0
    print len(rawdata)
    title, pos = findTitle(rawdata, pos)
    author, pos = findAuthor(rawdata, pos)
    # find the content written by the author
    content, pEnd = extractAllSections(rawdata, author, pos)
    return author


def ExtractContent(rawdata, author):
    """Extract the author's posts from a follow-up page (author already known)."""
    pos = 0
    print len(rawdata)
    print author
    # find the content written by the author
    content, pEnd = extractAllSections(rawdata, author, pos)


def extractSection(rawdata, start):
    """From `start`, skip to the next <p> and collect lines up to </p>."""
    i = start
    content = []
    while i < len(rawdata):
        if rawdata[i].find('<p>') != -1:
            break
        i += 1
    while i < len(rawdata):
        pEndLine = rawdata[i].find('</p>')
        content.append(rawdata[i])
        if pEndLine != -1:
            print rawdata[i]
            break
        i += 1
    return (content, i)


def extractAllSections(rawdata, author, start):
    """Collect every <p>...</p> section that follows an alt="<author>" avatar."""
    i = start
    content = []
    pEnd = 0
    count = 0
    while i < len(rawdata):
        contentLine = rawdata[i].find('alt="' + author + '"')
        if contentLine != -1:
            tmpContent, i = extractSection(rawdata, i)
            content.append(tmpContent)
            if i != len(rawdata):   # was `if i!=rawdata:` -- count only complete sections
                count += 1
            pEnd = i
            i += 1
        i += 1
    return (content, pEnd)


def findTitle(rawdata, start):
    """Return the line after <title> and the position where it was found."""
    i = start
    title = ''
    while i < len(rawdata):
        if rawdata[i].find('<title>') != -1:
            title = rawdata[i+1]
            print title
            break
        i += 1
    return (title, i)


def findAuthor(rawdata, start):
    """The author's name is the alt text of the avatar two lines above 'topic-doc'."""
    i = start
    author = ''
    while i < len(rawdata):
        if rawdata[i].find('topic-doc') != -1:
            posBeg = rawdata[i-2].find('alt=')
            posEnd = rawdata[i-2].find('/>')
            author = rawdata[i-2][posBeg+5:posEnd-1]
            print "author: " + author
            break
        i += 1
    return (author, i)


# Local test with a saved page:
#data = file('E:/petrelli/play/crawl_douban/douban_2.htm', 'r').readlines()
#soup = BeautifulSoup(data)
#print soup.prettify()
#Extract(data)

data = urllib2.urlopen('http://www.douban.com/group/topic/9737262/').readlines()
links = ExtractLink(data, 0)
author = ExtractAuthorContent(data)
for link in links:
    print link
##    data = urllib2.urlopen(link).readlines()
##    ExtractContent(data, author)
#ExtractAuthorContent(data)
#for line in data:
#    print line
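Since regex is the admitted weak spot: the byte-counting slices above (posBeg+5, posEnd-1, and the inner loop of ExtractLink) are exactly what the unused `import re` could replace. Below is a minimal sketch of those two lookups done with re.search and re.findall; findAuthorRe and extractLinksRe are hypothetical names, and the alt="..."/href="..." patterns assume the same Douban markup the find-based code above already relies on.

def findAuthorRe(rawdata):
    # Assumption: the avatar line two rows above 'topic-doc' looks like
    # <img ... alt="NAME" ... />, as the slicing in findAuthor implies.
    for i, row in enumerate(rawdata):
        if 'topic-doc' in row:
            m = re.search(r'alt="([^"]*)"', rawdata[i-2])
            if m:
                return m.group(1)   # the captured group is the alt text
    return ''

def extractLinksRe(rawdata):
    # One findall over the paginator row pulls out every href="..." value,
    # replacing the whole posBeg/posEnd slicing loop in ExtractLink.
    for row in rawdata:
        if 'paginator' in row:
            return re.findall(r'href="([^"]*)"', row)
    return []

The `[^"]*` pattern means "everything up to the next double quote", so there is no offset arithmetic to get wrong; the trailing "next page" link would still need dropping with links[:-1] as above.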