本程序用于提取网页正文，并将提取出的内容聚合写入若干个输出文件（out1.html、out2.html……）。使用了多线程、锁、正则表达式以及开源组件 Beautiful Soup。
抓下来的6300多个网页处理了大约五分钟。用了8个线程。
代码如下:
#!/usr/bin/python
"""
parser
    for parsing html file from leiphone.com and 36kr.com
author
    xiaoyang
contact
    [email protected]
version

describe
    parse a html file from leiphone.com
log
    1.2012-11-22 create
    2.2012-11-23 add FileCollect and ParseTask class
    3.2012-11-23 add MutiThreads support
"""

import sys
import os
from bs4 import BeautifulSoup
try:
    import Queue  # Python 2 module name
except ImportError:
    import queue as Queue  # same module under its Python 3 name
import threading
import re
import inspect

# Shared state for the worker threads; do not modify these directly.
OUT_CNT_LOCK = threading.Lock()   # guards OUT_CNT
PRINT_LOCK = threading.Lock()     # serialises console output
OUT_FILE_PREFIX = "out"
WORKER_NUM = 8
OUT_CNT = 0
MAX_ITEM_CNT = 100                # articles written to one output file
PRINT_DBG = True

# Debug switches (not read anywhere in this script).
FileCollectDBG = False
ParseTaskDbg = False


def errPrint(ifExit=True, msg='_'):
    """Write ``msg`` to stderr under PRINT_LOCK and optionally exit.

    When ``ifExit`` is true ``sys.exit()`` is called afterwards, which
    raises ``SystemExit`` in the calling thread.
    """
    with PRINT_LOCK:
        sys.stderr.write("%s\n" % (msg,))
    if ifExit:
        sys.exit()


def dbgPrint(msg):
    """Write ``msg`` to stdout under PRINT_LOCK."""
    # ``with`` guarantees the lock is released even if the write fails.
    with PRINT_LOCK:
        print(msg)


def lineno():
    """Return the caller's current line number as a string."""
    return str(inspect.currentframe().f_back.f_lineno)


def SaveResLP(doc, filename, mode="a"):
    """Write ``doc`` to ``filename`` (leiphone.com output).

    The file is opened with ``mode`` (append by default).  An IOError is
    reported and terminates the calling thread through ``errPrint``.
    Returns True otherwise.
    """
    try:
        # ``with`` closes the file only if it was actually opened; the
        # previous ``finally: fp.close()`` crashed when ``open`` failed.
        with open(filename, mode) as fp:
            fp.write(doc)
    except IOError as errStr:
        dbgPrint("lines:" + lineno())
        errPrint(True, errStr)
    return True


def SaveRes36K(doc, filename):
    """Placeholder for 36kr.com output; does nothing and returns True."""
    return True


class FileCollect:
    """Collect the ``.html`` files below a root directory into a queue.

    Attributes:
        root:   root directory, normalised to end with a path separator.
        dlist:  names of all sub-directories seen by ``init``.
        fqueue: unbounded queue of paths to the ``.html`` files found.
    """

    def __init__(self, root):
        # Keep the trailing separator the original code added, but do not
        # index into an empty string and do not hard-code the backslash.
        if root and not root.endswith(('\\', '/')):
            root += os.sep
        self.root = root
        self.dlist = []
        self.fqueue = Queue.Queue(0)

    def init(self):
        """Walk ``self.root`` and enqueue every file ending in '.html'.

        Returns True.
        """
        for root, dirs, files in os.walk(self.root):
            self.dlist += dirs
            for afile in files:
                if afile.endswith('.html'):
                    # os.walk yields sub-directory roots without a trailing
                    # separator, so plain concatenation built wrong paths.
                    self.fqueue.put(os.path.join(root, afile))
        return True


class ParseTask:
    """Extract the ``<article>`` element of a page and append it to a file."""

    # Keyword arguments for ``find``: each selects one block whose contents
    # are emptied.  A page lacking any of them is skipped.
    _NOISE_SELECTORS = (
        {"name": "footer"},
        {"class_": "alipayzone"},
        {"class_": "authorpigtwo"},
        {"id": "jiathis_style_32x32"},
        {"class_": "wumii-hook"},
        {"name": "center"},
    )

    def __init__(self, savedFileName=None):
        self.soup = None
        self.savedFileName = savedFileName

    def _clearNoise(self, content):
        """Empty the unwanted blocks inside ``content``.

        Returns False as soon as an expected block is missing, True when
        all of them were found and cleared.
        """
        firstPara = content.find("p")
        # Guard against articles without any <p>; calling find on None
        # used to raise and take the whole worker thread down.
        link = firstPara.find("a") if firstPara is not None else None
        if link is None:
            return False
        link.clear()

        for selector in self._NOISE_SELECTORS:
            tag = content.find(**selector)
            if tag is None:
                return False
            tag.clear()

        for tag in content.find_all(rel="bookmark"):
            tag.clear()
        return True

    def parse(self, readFileName):
        """Parse ``readFileName`` and append its article to the output file.

        The cleaned article is written to ``self.savedFileName`` through
        ``SaveResLP``.

        Returns True when the article was saved.  Returns False when the
        page has no ``<article>``, lacks one of the expected blocks, or
        could not be read or parsed; the failure is reported on stderr so
        the calling worker can go on with the next file.
        """
        try:
            with open(readFileName, "r") as fp:
                self.soup = BeautifulSoup(fp.read())
            content = self.soup.find("article")
            if content is None or not self._clearNoise(content):
                return False
            SaveResLP(str(content), self.savedFileName)
            return True
        except Exception as errStr:
            # One unreadable or malformed page must not stop the worker.
            errPrint(False, "parse '%s' failed: %s" % (readFileName, errStr))
            return False


def newOutName():
    """Return the next output file name ("out1.html", "out2.html", ...).

    OUT_CNT is incremented under OUT_CNT_LOCK.
    """
    global OUT_CNT
    with OUT_CNT_LOCK:
        OUT_CNT += 1
        return str(OUT_FILE_PREFIX) + str(OUT_CNT) + ".html"


class TaskThread(threading.Thread):
    """Worker that drains the shared file queue and parses each file.

    Attributes:
        tid:        numeric id given by the creator.
        queue:      queue of input file paths shared by all workers.
        parserTask: the ParseTask used by ``run``.
        stop:       set to True once the queue is found empty.
        savedCnt:   number of articles saved into the current output file.
    """

    def __init__(self, tid, tname, queue):
        threading.Thread.__init__(self, name=tname)
        self.tid = tid
        self.queue = queue
        self.parserTask = None
        self.stop = False
        self.savedCnt = 0

    def run(self):
        """Parse queued files until the queue is empty, then report."""
        outName = newOutName()
        self.parserTask = ParseTask()
        while not self.stop:
            try:
                # Raises Queue.Empty when no work is left.
                inName = self.queue.get_nowait()
                dbgPrint("handle:" + inName)
                # Switch to a new output file only when the current one is
                # full AND there is more work, so no unused name is taken
                # and each file holds exactly MAX_ITEM_CNT articles.
                if self.savedCnt >= MAX_ITEM_CNT:
                    outName = newOutName()
                    self.savedCnt = 0
                self.parserTask.savedFileName = outName
                if self.parserTask.parse(inName):
                    self.savedCnt += 1
            except Queue.Empty:
                self.stop = True
                if self.savedCnt != 0:
                    msg = "ethread [" + self.name + "] out:'" + outName + "' with " + str(self.savedCnt) + " items success"
                else:
                    msg = "ethread [" + self.name + "] exit with " + str(self.savedCnt) + " items"
                errPrint(False, msg)
                return
            except Exception as ex:
                # str(ex): concatenating the exception object itself raised
                # TypeError inside this handler.
                errPrint(True, "lines:" + lineno() + "," + str(ex))
                return


def main(root=r"F:\myweb\leiphone\web"):
    """Collect the html files below ``root`` and parse them in parallel.

    Starts WORKER_NUM TaskThread workers on the collected queue and waits
    for all of them to finish.
    """
    # alternative input root used by the author: E:\project\python\Parser\page
    fc = FileCollect(root)
    print("Start add files...")
    fc.init()
    print("Added files count:%d" % fc.fqueue.qsize())
    print("Starting threads ...")
    try:
        workers = []
        for tid in range(WORKER_NUM):
            tobj = TaskThread(tid, "thread-" + str(tid), fc.fqueue)
            workers.append(tobj)
            tobj.start()
        for tobj in workers:
            tobj.join()
    except Exception as ex:
        errPrint(True, ex)
    print('All threads have terminated.')


if __name__ == '__main__':
    # An optional first command-line argument overrides the default root.
    main(*sys.argv[1:2])

    # Regex scratch check that closed the original listing.
    # NOTE(review): the flattened source does not show whether this ran
    # inside the guard or at module level - confirm.
    afile = "03-31-dan-talk-omgpop.html"
    if re.search('.html$', afile) is not None:
        print("matched!")
    else:
        print("mismatched!")
    if re.search('.jpg$', afile) is not None:
        print("matched2!")
网页原内容:
净化效果: