PycURL - 绝版聊神 - ITeye技术网站 教主张

PycURL - 绝版聊神 - ITeye技术网站

http://wiki.woodpecker.org.cn/moin/zspy

代码见 http://zspy.googlecode.com



张沈鹏 [email protected] http://zsp.iteye.com/



2008-1-23 16:42





1. PycURL

Pycurl http://pycurl.sourceforge.net/



PycURL 是 libcurl 的 Python 接口,用 C 编写,比 urllib 快,功能也更强。它支持设置最大重定向次数,可以防止陷入循环重定向的陷阱。常用于编写网络爬虫、抓取网页。



从 http://pycurl.sourceforge.net/download/ 下载 pycurl-ssl-7.16.4.win32-py2.5.exe 安装.



参考文献1,测试代码





# Fetch one page with PycURL and report the final HTTP status and URL.
#
# StringIO lets the response body be collected as if it were a file;
# "from cStringIO import StringIO" is a drop-in alternative that should
# perform somewhat better.
import StringIO

import pycurl

html = StringIO.StringIO()
curl = pycurl.Curl()

# Options are applied in the order listed here.
options = (
    (pycurl.URL, 'http://www.baidu.com'),
    # Write callback: each received chunk is appended to the buffer.
    (pycurl.WRITEFUNCTION, html.write),
    (pycurl.FOLLOWLOCATION, 1),
    # Maximum number of redirects; protects against redirect traps.
    (pycurl.MAXREDIRS, 5),
)
for option, value in options:
    curl.setopt(option, value)

# Run the transfer; this blocks until the request has finished.
curl.perform()

# Prints e.g. "200 http://www.baidu.com": the HTTP status code followed
# by the URL that was effectively fetched.
status = curl.getinfo(pycurl.HTTP_CODE)
effective_url = curl.getinfo(pycurl.EFFECTIVE_URL)
print("%s %s" % (status, effective_url))

# To dump the HTML of the fetched page:
#print(html.getvalue())

然后看看多线程(并发下载)的用法: http://pycurl.cvs.sourceforge.net/pycurl/pycurl/tests/ 里有很多例子,还可以参考 CurlMulti 的文档 http://pycurl.sourceforge.net/doc/curlmultiobject.html



我自己改写了一个:)





#!/usr/bin/env python
#coding=utf-8

import threading
import pycurl
from cStringIO import StringIO


class UrlOpen(threading.Thread):
    """Download web pages asynchronously through one CurlMulti object.

    Handles are registered with add(); the thread's run() loop drives the
    transfers and invokes each handle's callback when it succeeds.

    NOTE: run() loops forever and has no exit condition, so join() on
    this thread never returns.
    NOTE(review): add() touches the CurlMulti object from the caller's
    thread; confirm this is safe if it is called after start().
    """

    def __init__(self):
        super(UrlOpen, self).__init__()
        self.opener = pycurl.CurlMulti()
        # Easy handles currently registered with self.opener.
        self.handle_list = []

    def add(self, url, recall, writer=None):
        """Register a download.

        Args:
            url: address to fetch.
            recall: callable invoked with the finished handle on success.
            writer: object whose write() receives the response body.  A
                fresh StringIO is created per call when omitted, so
                downloads never share a buffer.
        """
        if writer is None:
            writer = StringIO()

        c = pycurl.Curl()

        # Stored on the handle so the callback can reach them.
        c.url = url
        c.content = writer
        c.recall = recall
        c.setopt(c.URL, url)
        c.setopt(c.WRITEFUNCTION, c.content.write)

        self.handle_list.append(c)
        self.opener.add_handle(c)

    def _remove(self, c):
        """Detach handle c from the multi object, close it and forget it."""
        # Detach first, then close: the handle must leave the multi
        # object before it is cleaned up.
        self.opener.remove_handle(c)
        c.close()
        self.handle_list.remove(c)

    def _dispatch_finished(self):
        """Process every transfer that has completed so far."""
        while 1:
            num_queued, succeeded, failed = self.opener.info_read()
            for c in succeeded:
                # Read the status code before _remove() closes the handle.
                c.http_code = c.getinfo(c.HTTP_CODE)
                self._remove(c)
                c.recall(c)
            for c, error_no, desp in failed:
                # Failure entries are (handle, error number, message)
                # tuples, so only the handle itself is removed.
                # TODO: the failure should be recorded somewhere.
                self._remove(c)
            if num_queued == 0:
                break

    def run(self):
        """Drive all registered transfers; never returns."""
        while 1:
            ret = self.opener.select(10.0)
            if ret == -1:
                continue
            while 1:
                ret, _num_active = self.opener.perform()
                if ret != pycurl.E_CALL_MULTI_PERFORM:
                    break
            # Checked after every perform batch so a completed transfer
            # is reported even when the active count did not change.
            self._dispatch_finished()


# Module-wide downloader, created lazily by urlopen().
_opener = None


def urlopen(*arg, **key):
    """Queue a download on the shared UrlOpen thread, starting it if needed.

    Arguments are passed straight to UrlOpen.add().
    """
    global _opener
    if _opener is None:
        _opener = UrlOpen()
        # Register the first handle before the thread starts running.
        _opener.add(*arg, **key)
        _opener.start()
    else:
        _opener.add(*arg, **key)


def show(x):
    """Success callback: print the downloaded body."""
    print(x.content.getvalue())


if __name__ == "__main__":
    urlopen("http://www.baidu.com/", show)
    # Blocks indefinitely because UrlOpen.run() never returns.
    _opener.join()

又封装了一个异步打开网页的类和函数





#coding=utf-8
"""
Asyn open url
Author:[email protected]
2008-1-25 17:14
"""

import threading
import time
from cStringIO import StringIO

import pycurl


def _ignore_error(curl, error_no, desp):
    """Default failure callback: deliberately ignore the error."""
    #print "Error:%s - %s"%(error_no,desp)
    pass


class UrlOpen(threading.Thread):
    """Download web pages asynchronously through one CurlMulti object.

    add() may be called from any thread: it only queues the handle.  The
    worker thread moves queued handles into the CurlMulti object between
    polling rounds, so the multi object is only touched from run().

    NOTE: run() loops forever and has no exit condition, so join() on
    this thread never returns.
    """

    def __init__(self):
        super(UrlOpen, self).__init__()
        self.opener = pycurl.CurlMulti()
        # Easy handles currently registered with self.opener.
        self.handle_list = []
        # Handles queued by add() that the worker has not picked up yet.
        self.waiting = []
        # Guards self.waiting, which add() and _add() use from
        # different threads.
        self._waiting_lock = threading.Lock()

    def add(self, url, recall, catch=None, writer=None):
        """Queue a download.

        Args:
            url: address to fetch; unicode values are sent UTF-8 encoded.
            recall: callable invoked with the finished handle on success.
            catch: callable invoked as catch(handle, error_no, message)
                on failure; errors are ignored when omitted.
            writer: object whose write() receives the response body.  A
                fresh StringIO is created per call when omitted, so
                downloads never share a buffer.
        """
        if catch is None:
            catch = _ignore_error
        if writer is None:
            writer = StringIO()

        c = pycurl.Curl()

        # Stored on the handle so the callbacks can reach them.
        c.url = url
        c.content = writer
        c.recall = recall
        c.catch = catch
        if isinstance(url, unicode):
            c.setopt(c.URL, url.encode('utf-8'))
        else:
            c.setopt(c.URL, url)
        c.setopt(c.WRITEFUNCTION, c.content.write)

        self._waiting_lock.acquire()
        try:
            self.waiting.append(c)
        finally:
            self._waiting_lock.release()

    def _add(self):
        """Move every queued handle into the multi object."""
        # Swap the queue out under the lock so a concurrent add()
        # cannot append to a list that is about to be discarded.
        self._waiting_lock.acquire()
        try:
            waiting = self.waiting
            self.waiting = []
        finally:
            self._waiting_lock.release()

        for c in waiting:
            self.handle_list.append(c)
            self.opener.add_handle(c)

    def _remove(self, c):
        """Detach handle c from the multi object, close it and forget it."""
        # Detach first, then close: the handle must leave the multi
        # object before it is cleaned up.
        self.opener.remove_handle(c)
        c.close()
        self.handle_list.remove(c)

    def _dispatch_finished(self):
        """Process every transfer that has completed so far."""
        while 1:
            num_queued, succeeded, failed = self.opener.info_read()
            for c in succeeded:
                # Read the status code before _remove() closes the handle.
                c.http_code = c.getinfo(c.HTTP_CODE)
                self._remove(c)
                c.recall(c)
            for failure in failed:
                # Each entry is (handle, error number, message), e.g.
                # (<pycurl.Curl object at 0x00C04C80>, 6,
                #  'Could not resolve host: www.msn.com (Domain name not found)')
                failure[0].catch(*failure)
                self._remove(failure[0])
            if num_queued == 0:
                break

    def run(self):
        """Drive transfers and pick up newly queued handles; never returns."""
        while 1:
            if self.handle_list:
                ret = self.opener.select(1.0)
                if ret >= 0:
                    while 1:
                        ret, _num_active = self.opener.perform()
                        if ret != pycurl.E_CALL_MULTI_PERFORM:
                            break
                    # Checked after every perform batch: a handle that
                    # finishes just as another is added leaves the active
                    # count unchanged but must still be reported.
                    self._dispatch_finished()
            else:
                # Nothing in flight; wait before polling the queue again.
                time.sleep(1)
            self._add()


# Module-wide downloader, created lazily by urlopen().
_opener = None


def urlopen(*arg, **key):
    """Queue a download on the shared UrlOpen thread, starting it if needed.

    Arguments are passed straight to UrlOpen.add().
    """
    global _opener
    if _opener is None:
        _opener = UrlOpen()
        _opener.start()
    _opener.add(*arg, **key)


if __name__ == "__main__":
    def show(x):
        """Success callback: print the body followed by a separator."""
        print(x.content.getvalue())
        print('--' * 11)

    urlopen("http://www.baidu.com/", show)
    urlopen("http://www.google.com/", show)
    urlopen("http://www.sougou.com/", show)
    urlopen("http://www.yodao.com/", show)
    urlopen("http://www.yahoo.com/", show)
    urlopen("http://www.msn.com/", show)
    # Blocks indefinitely because UrlOpen.run() never returns.
    _opener.join()



1.1. 相关文献

PycURL简单学习 http://blog.donews.com/limodou/archive/2005/11/28/641257.aspx



python中的pycurl模块学习 https://forum.eviloctal.com/read.php?tid=27337

你可能感兴趣的:(ITeye)