Five ways to write a simple web crawler in Python

Method [1] to fetch HTML: using urllib


# -*- coding: UTF-8 -*-

import urllib

# fetch a web page's content and return it
def getWebPageContent(url):
    f = urllib.urlopen(url)
    data = f.read()
    f.close()
    return data

url = 'http://blog.csdn.net'
content = getWebPageContent(url)
print content
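
Besides the page body, the file-like object returned by urllib.urlopen also exposes the response headers and the final URL after redirects. A minimal sketch of using them (not part of the original five methods; Python 2, as above):

# -*- coding: UTF-8 -*-

import urllib

f = urllib.urlopen('http://blog.csdn.net')
print f.geturl()   # the URL actually fetched, after any redirects
print f.info()     # the HTTP response headers
data = f.read()
f.close()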


Method [2] to fetch HTML: using Pycurl

# Pycurl reference: http://pycurl.sourceforge.net/
# Pycurl download: http://pycurl.sourceforge.net/download/pycurl-7.18.1.tar.gz

# -*- coding: UTF-8 -*-

import pycurl
import StringIO

def getURLContent_pycurl(url):
    c = pycurl.Curl()
    c.setopt(pycurl.URL, url)
    b = StringIO.StringIO()
    c.setopt(pycurl.WRITEFUNCTION, b.write)
    c.setopt(pycurl.FOLLOWLOCATION, 1)
    c.setopt(pycurl.MAXREDIRS, 5)
    # proxy settings (optional)
    #c.setopt(pycurl.PROXY, 'http://11.11.11.11:8080')
    #c.setopt(pycurl.PROXYUSERPWD, 'aaa:aaa')
    c.perform()
    return b.getvalue()

url = 'http://blog.csdn.net'
content = getURLContent_pycurl(url)
print content
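
Pycurl exposes many more transfer options through setopt. The sketch below is not from the original post; it adds connection and transfer timeouts, a user agent string, and reads the HTTP status code after the transfer, all via standard libcurl options:

# -*- coding: UTF-8 -*-

import pycurl
import StringIO

def getURLContent_pycurl_with_timeouts(url):
    c = pycurl.Curl()
    b = StringIO.StringIO()
    c.setopt(pycurl.URL, url)
    c.setopt(pycurl.WRITEFUNCTION, b.write)
    c.setopt(pycurl.FOLLOWLOCATION, 1)
    c.setopt(pycurl.CONNECTTIMEOUT, 10)   # seconds to wait for the connection
    c.setopt(pycurl.TIMEOUT, 30)          # seconds allowed for the whole transfer
    c.setopt(pycurl.USERAGENT, 'Mozilla/5.0 (simple crawler)')
    c.perform()
    status = c.getinfo(pycurl.HTTP_CODE)  # e.g. 200, 404
    c.close()
    print 'HTTP status:', status
    return b.getvalue()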


Method [3] to fetch HTML: using cPAMIE

cPAMIE download: http://sourceforge.net/project/showfiles.php?group_id=103662

# -*- coding: UTF-8 -*-

import cPAMIE

def getURLContent_cPAMIE(url):
    g_ie = cPAMIE.PAMIE()
    g_ie.showDebugging = False
    g_ie.frameName = None
    g_ie.navigate(url)
    content = g_ie.pageGetText()
    g_ie.quit()
    return content

url = 'http://blog.csdn.net'
content = getURLContent_cPAMIE(url)
print content


Method [4] to fetch HTML: using urllib to download the page to a file

# -*- coding: UTF-8 -*-

import urllib

url = 'http://blog.csdn.net'
path = 'C:/temp/csdn.net.html'
urllib.urlretrieve(url, path)
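
urlretrieve also accepts an optional progress callback, which is handy for larger downloads. A minimal sketch (the reporthook is not in the original post):

# -*- coding: UTF-8 -*-

import urllib

# called by urlretrieve after every block: block count, block size, total size in bytes
def report(blocknum, blocksize, totalsize):
    downloaded = blocknum * blocksize
    if totalsize > 0:
        print '%d / %d bytes' % (min(downloaded, totalsize), totalsize)

urllib.urlretrieve('http://blog.csdn.net', 'C:/temp/csdn.net.html', report)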


Method [5] to fetch HTML: using client.getPage from the Twisted framework

# Twisted download: http://tmrc.mit.edu/mirror/twisted/Twisted/8.1/Twisted_NoDocs-8.1.0.win32-py2.5.exe

# -*- coding: UTF-8 -*-

from twisted.internet import reactor
from twisted.web import client

def result(content):
    print content
    reactor.stop()

deferred = client.getPage("http://blog.csdn.net")
deferred.addCallback(result)
reactor.run()
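
getPage returns a Deferred, so a failed fetch (DNS error, timeout, refused connection) should be handled with an errback; otherwise reactor.stop() is never reached and the script hangs. A minimal sketch (the error handler is not in the original post):

# -*- coding: UTF-8 -*-

from twisted.internet import reactor
from twisted.web import client

def result(content):
    print content
    reactor.stop()

def error(failure):
    # failure is a twisted.python.failure.Failure wrapping the underlying exception
    print 'fetch failed:', failure.getErrorMessage()
    reactor.stop()

deferred = client.getPage("http://blog.csdn.net")
deferred.addCallback(result)
deferred.addErrback(error)
reactor.run()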
