PycURL is reputed to be the most efficient solution for multi-threaded web fetching in Python; it is essentially a wrapper around the libcurl C library.
Linux users know the ubiquitous curl command (extremely handy); what powers curl is the famous libcurl library, which is both feature-rich and highly efficient. Besides its native C API, libcurl has bindings for some 40 programming languages, and PycURL, the subject of this post, is its Python binding.
When you need high-performance GET/POST requests in Python, libcurl is an excellent choice: it is generally noticeably faster than urllib and urllib2, and may well be more efficient than Requests too, especially when issuing many concurrent requests through PycURL's multi interface. Its one drawback, in my opinion, is that because it calls the libcurl C library directly, PycURL's API still looks very C-like rather than Pythonic, so its learning curve is a bit steeper than urllib's.
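To make the speed claim concrete, here is a rough single-fetch timing sketch. This is my own illustration, not a rigorous benchmark: the URL is just the placeholder reused from example 1 below, and the numbers vary with network and server state.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Rough timing sketch, NOT a rigorous benchmark: one fetch each through
# pycurl and urllib2.
import time, pycurl, urllib2, cStringIO

url = 'http://www.dianping.com/shanghai'   # placeholder target

t0 = time.time()
b = cStringIO.StringIO()
c = pycurl.Curl()
c.setopt(c.URL, url)
c.setopt(c.WRITEFUNCTION, b.write)
c.perform()
c.close()
print 'pycurl : %.3fs, %d bytes' % (time.time() - t0, len(b.getvalue()))

t0 = time.time()
body = urllib2.urlopen(url).read()
print 'urllib2: %.3fs, %d bytes' % (time.time() - t0, len(body))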
Below are a few examples from my own practice.
1. The simplest page fetch
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys, pycurl, time, cStringIO

sys.stderr.write("pycURL version [%s]\n" % pycurl.version)

start_time = time.time()
url = 'http://www.dianping.com/shanghai'
b = cStringIO.StringIO()              # buffer that receives the body
c = pycurl.Curl()
c.setopt(c.URL, url)
c.setopt(c.WRITEFUNCTION, b.write)    # libcurl hands each chunk to b.write
c.perform()
end_time = time.time()

content = b.getvalue()
duration = end_time - start_time
print c.getinfo(pycurl.HTTP_CODE), c.getinfo(pycurl.EFFECTIVE_URL)
c.close()
print 'pycurl takes [%s] seconds to get [%s]' % (duration, url)
print 'length of the content is [%d]' % len(content)
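Note that this post's examples are Python 2 code (cStringIO, print statements). Under Python 3 the same fetch would look roughly like the following sketch, with io.BytesIO taking the place of cStringIO:

# Rough Python 3 equivalent of the example above; the body arrives as bytes.
import io
import pycurl

url = 'http://www.dianping.com/shanghai'
buf = io.BytesIO()                      # bytes buffer instead of cStringIO
c = pycurl.Curl()
c.setopt(c.URL, url)
c.setopt(c.WRITEFUNCTION, buf.write)    # libcurl calls this with byte chunks
c.perform()
print(c.getinfo(pycurl.HTTP_CODE), c.getinfo(pycurl.EFFECTIVE_URL))
c.close()
print('length of the content is [%d]' % len(buf.getvalue()))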
2. A simple pycURL wrapper class
#!/usr/bin/env python
# encoding: utf-8
import sys, pycurl, cStringIO, urllib

class Curl:
    def __init__(self):
        self.c = pycurl.Curl()

    def __del__(self):
        self.c.close()

    def init(self, verbose):
        c = self.c
        c.setopt(c.FOLLOWLOCATION, 1)      # follow HTTP redirects
        c.setopt(c.MAXREDIRS, 5)
        c.setopt(c.CONNECTTIMEOUT, 30)
        c.setopt(c.TIMEOUT, 300)
        c.setopt(c.NOSIGNAL, 1)            # required for multi-threaded use
        c.setopt(c.USERAGENT, "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36")
        c.setopt(c.VERBOSE, verbose)

    def get(self, url):
        b = cStringIO.StringIO()
        c = self.c
        c.setopt(c.URL, url)
        c.setopt(c.WRITEFUNCTION, b.write)
        c.perform()
        content = b.getvalue()
        print "HTTP CODE: ", c.getinfo(c.HTTP_CODE)
        b.close()
        return content

    def post(self, url, data):
        b = cStringIO.StringIO()
        c = self.c
        c.setopt(c.POSTFIELDS, urllib.urlencode(data))  # url-encoded body
        c.setopt(c.URL, url)
        c.setopt(c.WRITEFUNCTION, b.write)
        c.perform()
        content = b.getvalue()
        print "HTTP CODE: ", c.getinfo(c.HTTP_CODE)
        b.close()
        return content

    def purge(self, url):
        cmd = 'PURGE'            # custom method; libcurl adds the space itself
        proxy = '127.0.0.1:8080'
        c = self.c
        c.setopt(c.URL, url)
        c.setopt(c.PROXY, proxy)
        c.setopt(c.CUSTOMREQUEST, cmd)
        c.perform()
        status = c.getinfo(c.HTTP_CODE)
        print "HTTP CODE: ", status
        return status

if __name__ == '__main__':
    page = 'http://news.sohu.com/'
    c = Curl()
    c.init(True)
    c.get(page)

    page1 = 'http://www.google.com/'
    post_data_dic = {"name": "value"}
    c.post(page1, post_data_dic)

    page2 = 'http://m3.biz.itc.cn/pic/new/n/94/87/Img7798794_n.jpg'
    c.purge(page2)
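A handy extension for such a wrapper: after perform(), libcurl keeps per-phase timing on the handle, retrievable via getinfo with standard constants. A minimal standalone sketch (the URL is just a placeholder):

#!/usr/bin/env python
# encoding: utf-8
# Sketch: per-phase timing from libcurl's getinfo after perform().
import pycurl, cStringIO

c = pycurl.Curl()
c.setopt(c.URL, 'http://news.sohu.com/')
b = cStringIO.StringIO()
c.setopt(c.WRITEFUNCTION, b.write)
c.perform()
print 'DNS lookup : %.3fs' % c.getinfo(pycurl.NAMELOOKUP_TIME)
print 'TCP connect: %.3fs' % c.getinfo(pycurl.CONNECT_TIME)
print 'first byte : %.3fs' % c.getinfo(pycurl.STARTTRANSFER_TIME)
print 'total      : %.3fs' % c.getinfo(pycurl.TOTAL_TIME)
c.close()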
3. A simple wrapper class around pycURL's multi interface
#!/usr/bin/env python
# encoding: utf-8
import sys, pycurl, cStringIO

class MCurl:
    def __init__(self, tasks, concurrent):
        self.taskQ = tasks
        self.taskQ_size = len(tasks)
        self.max_conn = concurrent
        self.resp_dict = {}
        self.m = pycurl.CurlMulti()

    def __del__(self):
        self.m.close()

    def add_tasks(self):
        self.max_conn = min(self.taskQ_size, self.max_conn)
        assert 1 <= self.max_conn <= 100, "invalid number of concurrent urls"
        print "===Getting %d urls using %d concurrent cURL handle pool===" % (self.taskQ_size, self.max_conn)
        # build a fixed pool of easy handles that get reused for every task
        self.m.handles = []
        for i in range(self.max_conn):
            c = pycurl.Curl()
            c.fp = None
            c.setopt(pycurl.FOLLOWLOCATION, 1)
            c.setopt(pycurl.MAXREDIRS, 5)
            c.setopt(pycurl.CONNECTTIMEOUT, 30)
            c.setopt(pycurl.TIMEOUT, 300)
            c.setopt(pycurl.NOSIGNAL, 1)
            self.m.handles.append(c)
        self.resp_dict['total'] = self.taskQ_size
        self.resp_dict['succ'] = []
        self.resp_dict['fail'] = []

    def process_tasks(self):
        freelist = self.m.handles[:]
        queue = self.taskQ
        num_processed = 0
        while num_processed < self.taskQ_size:
            # if there is an url to process and a free curl handle, add to multi stack
            while queue and freelist:
                url, filename = queue.pop(0)
                c = freelist.pop()
                c.fp = open(filename, "wb")
                c.setopt(pycurl.URL, url)
                c.setopt(pycurl.WRITEDATA, c.fp)
                self.m.add_handle(c)
                # store some info for use later
                c.filename = filename
                c.url = url
            # run the internal curl state machine for the multi stack
            while 1:
                ret, num_handles = self.m.perform()
                if ret != pycurl.E_CALL_MULTI_PERFORM:
                    break
            # check if curl handles have terminated, and put them back on the freelist
            while 1:
                num_q, ok_list, err_list = self.m.info_read()
                for c in ok_list:
                    c.fp.close()
                    c.fp = None
                    self.resp_dict['succ'].append(c.url)
                    self.m.remove_handle(c)
                    print "Success:", c.filename, c.url, c.getinfo(pycurl.EFFECTIVE_URL)
                    freelist.append(c)
                for c, errno, errmsg in err_list:
                    c.fp.close()
                    c.fp = None
                    self.resp_dict['fail'].append(c.url)
                    self.m.remove_handle(c)
                    print "Failed: ", c.filename, c.url, errno, errmsg
                    freelist.append(c)
                num_processed = num_processed + len(ok_list) + len(err_list)
                if num_q == 0:
                    break
            # no more I/O is pending; call select() to sleep until more data arrives
            self.m.select(1.0)

    def del_tasks(self):
        for c in self.m.handles:
            if c.fp is not None:
                c.fp.close()
                c.fp = None
            c.close()

    def dump_process(self):
        print self.resp_dict

#========= main entry point ==========
# task list: urls to fetch (blank lines and '#' comments are skipped below)
urls = ["http://m3.biz.itc.cn/pic/new/n/94/87/Img7798794_n.jpg",
        "http://m3.biz.itc.cn/pic/new/n/94/87/Img7798794_n.jpg",
        "",
        "http://m2.biz.itc.cn/pic/new/n/93/87/Img7798793_n.jpg",
        "http://m1.biz.itc.cn/pic/new/n/92/87/Img7798792_n.jpg",
        "http://m3.biz.itc.cn/pic/new/n/94/91/Img7799194_n.jpg",
        "http://m1.biz.itc.cn/pic/new/n/96/87/Img7798796_n.jpg",
        "http://m2.biz.itc.cn/pic/new/n/97/87/Img7798797_n.jpg",
        "http://m1.biz.itc.cn/pic/new/n/16/88/Img7798816_n.jpg",
        "http://m2.biz.itc.cn/pic/new/n/17/88/Img7798817_n.jpg",
        "http://m4.biz.itc.cn/pic/new/n/95/87/Img7798795_n.jpg",
        "http://m4.biz.itc.cn/pic/new/n/91/91/Img7799191_n.jpg"]
concurr = 6
queue = []
for url in urls:
    url = url.strip()
    if not url or url[0] == "#":
        continue
    filename = "./sohu_%03d.jpg" % (len(queue) + 1)
    queue.append((url, filename))

mc = MCurl(queue, concurr)
mc.add_tasks()
mc.process_tasks()
mc.del_tasks()
mc.dump_process()
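The class above follows the handle-pool pattern of pycurl's retriever-multi.py and writes each response to a file. If you want the bodies in memory instead, only the handle setup and completion handling change; the perform()/info_read()/select() loop stays the same. A hedged sketch (function name and defaults are my own):

#!/usr/bin/env python
# encoding: utf-8
# Sketch: same multi-interface loop, buffering each body in memory.
import pycurl, cStringIO

def fetch_all(urls, concurrent=6):
    m = pycurl.CurlMulti()
    freelist = []
    for _ in range(min(concurrent, len(urls))):
        c = pycurl.Curl()
        c.setopt(pycurl.FOLLOWLOCATION, 1)
        c.setopt(pycurl.NOSIGNAL, 1)
        freelist.append(c)
    handles = freelist[:]
    queue = list(urls)
    results = {}
    pending = len(queue)
    while pending:
        # hand queued urls to free handles
        while queue and freelist:
            c = freelist.pop()
            c.url = queue.pop(0)
            c.buf = cStringIO.StringIO()
            c.setopt(pycurl.URL, c.url)
            c.setopt(pycurl.WRITEFUNCTION, c.buf.write)
            m.add_handle(c)
        # drive libcurl's state machine
        while 1:
            ret, num_handles = m.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break
        # collect finished transfers and recycle their handles
        while 1:
            num_q, ok_list, err_list = m.info_read()
            for c in ok_list:
                results[c.url] = c.buf.getvalue()
                m.remove_handle(c)
                freelist.append(c)
            for c, errno, errmsg in err_list:
                results[c.url] = None       # mark the failure
                m.remove_handle(c)
                freelist.append(c)
            pending -= len(ok_list) + len(err_list)
            if num_q == 0:
                break
        m.select(1.0)
    for c in handles:
        c.close()
    m.close()
    return results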
4. Implementing PURGE and other custom requests
#!/usr/bin/env python
# encoding: utf-8
import sys, pycurl, cStringIO, urllib

url = 'http://m3.biz.itc.cn/pic/new/n/94/87/Img7798794_n.jpg'
cmd = 'PURGE'          # the custom method sent in place of GET
#cmd = 'DELETE'
proxy = '127.0.0.1:8080'

c = pycurl.Curl()
c.setopt(c.URL, url)
c.setopt(c.VERBOSE, 1)
c.setopt(c.PROXY, proxy)
c.setopt(c.CUSTOMREQUEST, cmd)
try:
    c.perform()
except Exception as e:
    print e
status = c.getinfo(c.HTTP_CODE)
print "HTTP CODE: ", status
c.close()
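For a quick manual test, the same request can be sent from the shell with curl -X PURGE -x 127.0.0.1:8080 <url>; the status the proxy returns depends on whether it is configured to accept PURGE at all.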
Screenshot of a sample run (image not reproduced here).
Notes:
1. For a form POST, you only need to set the POSTFIELDS option with a url-encoded body, as in the post method of example 2. The bundled source pycurl/examples/quickstart/form_post.py is a standard reference; a multipart sketch follows after these notes.
2. A representative example of asynchronous batch prefetching ships with the source as pycurl/examples/retriever-multi.py; example 3 above follows the same pattern.
3. Custom request methods such as PURGE are sent by setting the CUSTOMREQUEST option, as shown in example 4 above.
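As referenced in note 1, here is a hedged sketch of a multipart form POST in the style of pycurl's form_post.py quickstart example; the endpoint, field names, and local file name are placeholders of my own.

#!/usr/bin/env python
# encoding: utf-8
# Sketch: multipart form POST via HTTPPOST (placeholders throughout).
import pycurl, cStringIO

c = pycurl.Curl()
c.setopt(c.URL, 'http://www.google.com/')        # placeholder endpoint
c.setopt(c.HTTPPOST, [
    ('name', 'value'),                           # plain form field
    ('attachment', (c.FORM_FILE, 'body.txt')),   # local file must exist
])
b = cStringIO.StringIO()
c.setopt(c.WRITEFUNCTION, b.write)
c.perform()
print 'HTTP CODE: ', c.getinfo(c.HTTP_CODE)
c.close()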