Python Primer (3): Cleaning Up Leiphone Web Page Content

This program extracts the article body from saved web pages and aggregates the results into combined output files. It uses multithreading, locks, regular expressions, and the open-source Beautiful Soup library.

Processing the 6,300-odd downloaded pages took about five minutes with 8 worker threads.
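
Before the full listing, here is a minimal single-threaded sketch of the cleanup step, just to show the idea: load a saved page with Beautiful Soup, grab the <article> tag, clear the boilerplate elements inside it, and append what is left to an output file. The tag and class names are the ones the full program targets on Leiphone pages; clean_page and its paths are made-up names for illustration only.

# A stripped-down, single-threaded sketch of the cleanup step.
# clean_page() is an illustrative helper, not part of the program below.
from bs4 import BeautifulSoup

def clean_page(in_path, out_path):
    soup = BeautifulSoup(open(in_path).read())
    article = soup.find("article")            # the article body
    if article is None:
        return False                          # not an article page
    # clear share widgets, footer and other site chrome inside the article
    for tag in article.find_all(["footer", "center"]):
        tag.clear()
    for cls in ("alipayzone", "authorpigtwo", "wumii-hook"):
        for tag in article.find_all(class_=cls):
            tag.clear()
    open(out_path, "a").write(str(article))   # append to the combined file
    return True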

The code is as follows:

#!/usr/bin/python
"""
parser
    for parsing html files from leiphone.com and 36kr.com
author
    xiaoyang
contact
    [email protected]
version

describe
    parse html files from leiphone.com
log
   1.2012-11-22 create
   2.2012-11-23 add FileCollect and ParseTask class
   3.2012-11-23 add multithread support
"""

import sys
import os
import re
import inspect
import Queue
import threading
from bs4 import BeautifulSoup

# shared state for the worker threads; don't change these vars directly
OUT_CNT_LOCK = threading.Lock()
PRINT_LOCK = threading.Lock()
OUT_FILE_PREFIX = "out"
WORKER_NUM = 8
OUT_CNT = 0
MAX_ITEM_CNT = 100
PRINT_DBG = True

# for debug
FileCollectDBG = False
ParseTaskDbg = False

# thread-safe error print; optionally exit the calling thread
def errPrint(ifExit=True, msg='_'):
    PRINT_LOCK.acquire()
    try:
        print >> sys.stderr, msg
    finally:
        PRINT_LOCK.release()
    if ifExit:
        sys.exit()

# thread-safe debug print, gated by the PRINT_DBG flag
def dbgPrint(msg):
    if not PRINT_DBG:
        return
    PRINT_LOCK.acquire()
    try:
        print msg
    finally:
        PRINT_LOCK.release()

def lineno():
    """Return the caller's current line number as a string."""
    return str(inspect.currentframe().f_back.f_lineno)

# save extracted content for leiphone.com
def SaveResLP(doc, filename, mode="a"):
    fp = None
    try:
        fp = open(filename, mode)
        fp.write(doc)
    except IOError as errStr:
        dbgPrint("lines:" + lineno())
        errPrint(True, errStr)
    finally:
        if fp is not None:
            fp.close()
    return True

# save extracted content for 36kr.com (not implemented yet)
def SaveRes36K(doc, filename):
    return True

class FileCollect:
    def __init__(self, root):
        self.root = root
        self.dlist = []
        self.fqueue = Queue.Queue(0)
    def init(self):
        for root, dirs, files in os.walk(self.root):
            self.dlist += dirs
            for afile in files:
                # if the file name ends with '.html', queue it
                # (the dot is escaped so it matches a literal '.')
                if re.search(r'\.html$', afile) is not None:
                    self.fqueue.put(os.path.join(root, afile))
        return True

class ParseTask:
    def __init__(self, savedFileName=None):
        self.soup = None
        self.savedFileName = savedFileName
    def parse(self, readFileName):
        fp = None
        try:
            fp = open(readFileName, "r")
            self.soup = BeautifulSoup(fp.read())
            content = self.soup.find("article")
            if content is None:
                return False
            # remove the link inside the first paragraph
            tag = content.find("p").find("a")
            if not tag:
                return False
            tag.clear()
            # strip the remaining boilerplate elements; if any of them is
            # missing, the page layout is unexpected, so skip the file
            for criteria in ({"name": "footer"},
                             {"class_": "alipayzone"},
                             {"class_": "authorpigtwo"},
                             {"id": "jiathis_style_32x32"},
                             {"class_": "wumii-hook"},
                             {"name": "center"}):
                tag = content.find(**criteria)
                if not tag:
                    return False
                tag.clear()
            # clear the "related posts" links
            for tag in content.find_all(rel="bookmark"):
                tag.clear()
            SaveResLP(str(content), self.savedFileName)
            # file handled
            return True
        except IOError as errStr:
            errPrint(True, errStr)
        except Exception as errStr:
            dbgPrint("lines:" + lineno())
            errPrint(True, errStr)
        finally:
            if fp is not None:
                fp.close()

# build the next output filename; thread safe
def newOutName():
    global OUT_CNT
    # block here until we get the lock
    OUT_CNT_LOCK.acquire()
    try:
        OUT_CNT += 1
        return OUT_FILE_PREFIX + str(OUT_CNT) + ".html"
    finally:
        OUT_CNT_LOCK.release()

class TaskThread(threading.Thread):
    def __init__(self, tid, tname, queue):
        threading.Thread.__init__(self, name=tname)
        self.tid = tid
        self.queue = queue
        self.parserTask = None
        self.stop = False
        self.savedCnt = 0
    def run(self):
        outName = newOutName()
        self.parserTask = ParseTask()
        while not self.stop:
            try:
                # raises Queue.Empty if no item is left
                inName = self.queue.get_nowait()
                dbgPrint("handle:" + inName)
                self.parserTask.savedFileName = outName
                if self.parserTask.parse(inName):
                    self.savedCnt += 1
                    if self.savedCnt > MAX_ITEM_CNT:
                        # start a new output file
                        outName = newOutName()
                        self.savedCnt = 0
                # if parsing failed, just move on to the next file
            except Queue.Empty:
                self.stop = True
                if self.savedCnt != 0:
                    msg = ("thread [" + self.name + "] out:'" + outName +
                           "' with " + str(self.savedCnt) + " items success")
                    errPrint(False, msg)
                else:
                    msg = ("thread [" + self.name + "] exit with " +
                           str(self.savedCnt) + " items")
                    errPrint(False, msg)
                return
            except Exception as ex:
                errPrint(True, "lines:" + lineno() + "," + str(ex))
                return

# main
def main():
    taskThreads = {}
    #fc = FileCollect(r"E:\project\python\Parser\page")
    fc = FileCollect(r"F:\myweb\leiphone\web")
    print "Start adding files..."
    fc.init()
    print "Added files count: %d" % fc.fqueue.qsize()
    print "Starting threads ..."
    try:
        for tid in range(0, WORKER_NUM):
            tobj = TaskThread(tid, "thread-" + str(tid), fc.fqueue)
            taskThreads[tid] = tobj
            tobj.start()
        for tid in range(0, WORKER_NUM):
            taskThreads[tid].join()
    except Exception as ex:
        errPrint(True, ex)
    print "All threads have terminated."

if __name__ == '__main__':
    main()
    # quick sanity check for the extension regex
    afile = "03-31-dan-talk-omgpop.html"
    if re.search(r'\.html$', afile) is not None:
        print "matched!"
    else:
        print "mismatched!"
    if re.search(r'\.jpg$', afile) is not None:
        print "matched2!"
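
A note on the quick check at the end of the script: in a pattern like '.html$' an unescaped dot matches any character, which is why the listing above escapes it as r'\.html$'. For a plain extension test, a regex-free alternative is str.endswith; a tiny example:

# extension check without a regular expression
afile = "03-31-dan-talk-omgpop.html"
print afile.endswith(".html")   # True
print afile.endswith(".jpg")    # False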

Original page content:

[Screenshot 1: the original Leiphone article page]

Cleaned result:

[Screenshot 2: the cleaned article content]
