import urllib
from pyquery import PyQuery as pq
import codecs
import Queue
class Fetcher:
    """Queue-driven scraper that walks a listing page and saves linked pages.

    The work queue holds ``(url, kind)`` tuples. ``kind == 0`` marks a
    listing page whose links are harvested and enqueued; any other value
    marks a content page whose ``#BookText`` text is appended to ``aa.txt``.
    """

    def __init__(self):
        """Create the work queue and seed it with the first listing page."""
        self.q = Queue.Queue()
        self.q.put(("http://www.7dsw.com/toplastupdate/1.html", 0))

    def work(self):
        """Drain the queue, dispatching each fetched page on its kind tag.

        Listing pages may enqueue further items while the loop runs, so the
        emptiness check is repeated on every iteration.
        """
        while not self.q.empty():
            url, tp = self.q.get()
            page = self.getPage(url)
            if tp == 0:
                self.getCapUrl(page)
            else:
                self.getContent(page)

    def getPage(self, url):
        """Download ``url`` and return its body decoded from GBK.

        Errors from ``urllib.urlopen`` propagate to the caller, as does
        ``UnicodeDecodeError`` when the body is not valid GBK.
        """
        print('fetch page...')
        resp = urllib.urlopen(url)
        try:
            raw = resp.read()
        finally:
            # Release the connection even when the read fails.
            resp.close()
        return raw.decode('gbk')

    def getCapUrl(self, page):
        """Enqueue every second link found under ``#newscontent ul``.

        NOTE(review): the loop header and the href lookup were garbled in
        the original text; they are reconstructed here from the surviving
        fragments (``i = 1``, ``i += 2`` and a call ending in ``"href")``).
        Confirm against the intended behavior.
        """
        doc = pq(page)
        wanted = doc('#newscontent ul a')
        # Start at index 1 and step by 2, i.e. take the 2nd, 4th, ... anchor.
        # presumably the anchors come in pairs and only the second of each
        # pair is wanted -- TODO confirm against the page markup.
        for i in range(1, len(wanted), 2):
            u = wanted[i].get("href")
            if not u:
                # An anchor without an href cannot be fetched; skip it.
                continue
            print(u)
            self.q.put((u, 1))

    def saveFile(self, filename, data):
        """Append ``data`` and a separator line to ``filename`` as UTF-8.

        The file is opened in append mode, so repeated calls accumulate.
        """
        # The context manager closes the handle even if a write raises.
        with codecs.open(filename, 'a', 'utf-8') as fp:
            fp.write(data)
            fp.write("\r\n------------------------\r\n")

    def getContent(self, page):
        """Extract the text of ``#BookText`` from ``page`` and append it to aa.txt."""
        doc = pq(page)
        wanted = doc('#BookText')
        self.saveFile("aa.txt", wanted.text())
# Script entry: build a Fetcher (which seeds its own queue in __init__) and
# run it until the queue is empty. This executes at import time.
f = Fetcher()
f.work()