(1) Get a file's path
from unipath import Path

ofn = 'test.txt'
cwd = Path(__file__).ancestor(1)  # directory containing the current file
ofn_path = Path(cwd, ofn)         # full path to test.txt in that directory
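If unipath is not installed, the standard library gives the same result; a minimal equivalent sketch with os.path:

import os

ofn = 'test.txt'
cwd = os.path.dirname(os.path.abspath(__file__))  # directory containing the current file
ofn_path = os.path.join(cwd, ofn)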
(2) Log the full error stack trace
import traceback

# only meaningful inside an except block: format_exc() returns the
# current exception's stack trace as a string (logger assumed configured)
logger.error('error [%s]' % traceback.format_exc())
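A self-contained sketch of the pattern; the basicConfig() setup and the failing division are illustrative only:

import logging
import traceback

logging.basicConfig()
logger = logging.getLogger(__name__)

try:
    1 / 0
except Exception:
    logger.error('error [%s]' % traceback.format_exc())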
(3) Time a function with timeit
from timeit import Timer

t1 = Timer("test()", "from __main__ import test")
print t1.timeit(10000)          # total seconds for 10000 calls
print min(t1.repeat(3, 10000))  # best of 3 runs of 10000 calls, suppresses scheduling noise
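The snippet assumes a function named test() defined in __main__; a minimal runnable sketch with an illustrative workload:

from timeit import Timer

def test():
    return sum(range(100))  # illustrative workload

t1 = Timer("test()", "from __main__ import test")
print min(t1.repeat(3, 10000))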
(4) Inspect garbage collection and live object counts
import gc
import objgraph

gc.collect()                               # force a full collection pass first
objgraph.show_most_common_types(limit=50)  # print the 50 most common live object types
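To hunt for leaks rather than take a one-off snapshot, objgraph can also diff object counts between two points; a sketch in which the leak list is illustrative:

import gc
import objgraph

gc.collect()
objgraph.show_growth(limit=10)        # establish a baseline
leak = [dict() for _ in range(1000)]  # simulated leak
gc.collect()
objgraph.show_growth(limit=10)        # prints types whose counts grew since the baseline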
(5) Time a block of code with datetime
import datetime

before = datetime.datetime.now()
...  # code under test
end = datetime.datetime.now()
logger.error('init [%s]' % (end - before))
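The same before/after pattern can be wrapped into a reusable decorator; a hedged sketch (the timed name is illustrative):

import datetime

def timed(func):
    # wraps func and prints how long each call takes
    def wrapper(*args, **kwargs):
        before = datetime.datetime.now()
        result = func(*args, **kwargs)
        end = datetime.datetime.now()
        print '%s took [%s]' % (func.__name__, end - before)
        return result
    return wrapper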
(6) Fetch a web page
import traceback
import urllib

def fetch(url):
    content = u''
    try:
        # Python 2 API; in Python 3 this lives in urllib.request
        status = urllib.urlopen(url)
        if status.getcode() == 200:
            content = status.read()
            # decode as UTF-8, dropping undecodable bytes
            content = unicode(content.strip(), 'utf-8', 'ignore')
        else:
            logger.error('fetch error [%s]' % url)
    except Exception:
        logger.error('fetch error %s' % traceback.format_exc())
    return content
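Example usage; the URL is illustrative:

content = fetch('http://example.com/')
if content:
    print content[:200]  # first 200 characters of the decoded page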
(7) Parse HTML with regular expressions
import re

def parse_html(url, content):
    # the movie list sits inside <ul class="movie_list">...</ul>
    title_pattern = r'<ul class="movie_list">(?P<detail_content>.*?)</ul>'
    # each <li> carries a detail-page href and a title attribute
    detail_pattern = r'<li>.*?href="(?P<detail_url>.*?)".*?title="(?P<detail_title>.*?)">.*?</li>'
    list_res = []
    res = re.search(title_pattern, content, re.S)
    if res:
        detail_content = res.group('detail_content')
        # findall returns a list of (detail_url, detail_title) tuples
        list_res = re.findall(detail_pattern, detail_content, re.S)
    return list_res
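Putting the two helpers together; the URL is illustrative and assumes a page whose markup matches the patterns above:

url = 'http://example.com/movies'
content = fetch(url)
for detail_url, detail_title in parse_html(url, content):
    print detail_title, detail_url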