请尊重原创作品。转载请保持文章完整性，并以超链接形式注明原始作者“tingsking18”和主站点地址，方便其他朋友提问和指正。
二进制文件下载地址：
SinaGetBook
效果如图：
代码：
#!/usr/bin/env python #coding=utf-8 #!/usr/bin/env python #coding=utf-8 import traceback import sys import wx import re import urllib import wx.richtext as rt import wx.lib.buttonpanel as bp import Casing import Debug def trace_back(): try: return traceback.print_exc() except: return '' class Window(wx.Frame): def __init__(self): sys.setdefaultencoding("utf-8") wx.Frame.__init__(self,None,-1,u'新浪网图书频道抓取工具',pos=wx.Point(0, 0),size=(800,620)) l1 = wx.StaticText(self, -1, u"目录URL:") self.t1 = wx.TextCtrl(self, -1, "http://vip.book.sina.com.cn/book/?book=27633", size=(500, -1)) l2 = wx.StaticText(self, -1, u"内容URL前缀:") self.t2 = wx.TextCtrl(self, -1, "http://vip.book.sina.com.cn/book/", size=(500, -1)) l3 = wx.StaticText(self, -1, u"替换的内容:") self.t3 = wx.TextCtrl(self, -1, u"阅读‘刘猛’的其他作品: /n" u"http://vip.book.sina.com.cn/book/?book=39011《狼牙》作者新作:冰是睡着的水/n" u"http://vip.book.sina.com.cn/book/?book=41217刘猛展示狙击手神秘生活:刺客/n" u"http://vip.book.sina.com.cn/book/?book=38884中国特种部队生存实录:狼牙/n" u"http://vip.book.sina.com.cn/book/?book=43226刘猛最新力作:如临大敌", size=(500, 100), style=wx.TE_MULTILINE|wx.TE_PROCESS_ENTER) self.t3.SetInsertionPoint(0) l4 = wx.StaticText(self, -1, u"内容") #self.t4 = wx.TextCtrl(self, -1,"", # size=(600, 400), style=wx.TE_MULTILINE|wx.TE_PROCESS_ENTER) self.t4 = rt.RichTextCtrl(self,-1,"",size=(600, 400), style=wx.VSCROLL|wx.HSCROLL|wx.NO_BORDER); #self.t4.SetInsertionPoint(0) self.b = wx.Button(self, -1, u"开始抓取") self.Bind(wx.EVT_BUTTON, self.OnTestReplace, self.b) space = 2 bsizer = wx.BoxSizer(wx.VERTICAL) bsizer.Add(self.b, 0, wx.GROW|wx.ALL, space) sizer = wx.FlexGridSizer(cols=3, hgap=space, vgap=space) sizer.AddMany([ l1, self.t1, (0,0), l2, self.t2, (0,0), l3, self.t3, bsizer, l4, self.t4, (0,0), ]) border = wx.BoxSizer(wx.VERTICAL) border.Add(sizer, 0, wx.ALL, 15) self.SetSizer(border) self.SetAutoLayout(True) self.Show(True) def OnTestReplace(self, evt): #dlg = wx.MessageDialog(None, u'Data file is not exist,please download it!',u'Error',wx.OK | 
wx.ICON_INFORMATION) #dlg.ShowModal() #dlg.Destroy() listurl = self.t1.GetValue() prefix = self.t2.GetValue() #print prefix replace = self.t3.GetValue() #print replace.decode("utf-8").encode("GBK") rep = replace.split("/n") def f(): try: sock = urllib.urlopen(listurl) strhtml = sock.read() strhtml = unicode(strhtml, 'gb2312','ignore').encode('utf-8','ignore') strhtml =strhtml.lower() list = re.findall('''''', strhtml) for one in list: try: sock1 = urllib.urlopen(prefix+one) htmlcontent = sock1.read() htmlcontent = unicode(htmlcontent, 'gb2312','ignore').encode('utf-8','ignore') title = re.findall(''' ([/s/S]*?) ","") s_content = s_content.replace("(.*?)
''', htmlcontent)[0] s_content = re.findall('''