[Python3.x]网络爬虫(三):urllib.request抓资源的方式总结

转载自:http://blog.csdn.net/reymix/article/details/46869529
Python 3.X 要使用urllib.request 来抓取网络资源。

import urllib.request

# Fetch a page and decode the response body.  Using the response as a
# context manager guarantees the connection is closed even if read()
# or decode() raises, unlike a trailing manual close().
with urllib.request.urlopen('http://www.baidu.com') as response:
    html = response.read().decode('utf8')
print(html)

使用Request的方式:

import urllib.request

# Same fetch, but built from an explicit Request object, so headers or
# a body could be attached before sending.  The context manager closes
# the connection even when an exception is raised mid-read.
req = urllib.request.Request('http://www.lovejing.com')
with urllib.request.urlopen(req) as response:
    html = response.read().decode('utf8')
print(html)

这种方式同样可以用来处理其他URL,例如FTP:

import urllib.request

# urlopen() handles non-HTTP schemes too, e.g. FTP.  The context
# manager releases the connection even if reading fails.
req = urllib.request.Request('ftp://ftp.lovejing.com')
with urllib.request.urlopen(req) as response:
    html = response.read().decode('utf8')
print(html)

使用POST请求:

# Bug fix: the original had two statements fused together by a bad
# copy-paste ("import urllib.parseimport" / "urllib.requesturl = ..."),
# which is a SyntaxError.  Split back into proper imports and an
# assignment to `url`.
import urllib.parse
import urllib.request

url = 'http://www.somebody.com/cgi-bin/register.cgi'
values = {'name' : 'Michael Foord',
          'location' : 'Northampton',
          'language' : 'Python' }
# urlencode() returns a str, but in Python 3 the POST body passed to
# Request/urlopen must be bytes -- encode it explicitly or urlopen
# raises "TypeError: POST data should be bytes".
data = urllib.parse.urlencode(values).encode('utf8')
req = urllib.request.Request(url, data)
with urllib.request.urlopen(req) as response:
    the_page = response.read()

使用GET请求:

import urllib.request
import urllib.parse

# Build a GET query string from a dict and append it to the URL.
data = {}
data['name'] = 'Somebody Here'
data['location'] = 'Northampton'
data['language'] = 'Python'
url_values = urllib.parse.urlencode(data)
print(url_values)
# Example output: name=Somebody+Here&location=Northampton&language=Python
url = 'http://www.example.com/example.cgi'
full_url = url + '?' + url_values
# Bug fix: urllib.request has no open() -- the function is urlopen().
# Also bind the result to `response` instead of clobbering the `data`
# dict; what comes back is a response object, not data.
response = urllib.request.urlopen(full_url)

添加header:


import urllib.parse
import urllib.request

# POST with a custom User-Agent header (some servers reject the
# default Python-urllib agent string).
url = 'http://www.somebody.com/cgi-bin/register.cgi'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values = {'name' : 'Michael Foord',
          'location' : 'Northampton',
          'language' : 'Python' }
headers = { 'User-Agent' : user_agent }
# Bug fix: urlencode() returns a str; Python 3 requires the POST body
# to be bytes, otherwise Request/urlopen raises TypeError.
data = urllib.parse.urlencode(values).encode('utf8')
req = urllib.request.Request(url, data, headers)
with urllib.request.urlopen(req) as response:
    the_page = response.read()

错误处理:

import urllib.request
import urllib.error

# Demonstrate catching URLError for an unreachable host.
req = urllib.request.Request('http://www.pretend_server.org')
try:
    urllib.request.urlopen(req)
except urllib.error.URLError as e:
    # Bug fix: this print must be indented inside the except clause;
    # at top level it is an IndentationError after the one-line except.
    print(e.reason)

返回的错误代码:

# Table mapping HTTP response codes to messages; entries have the
# form {code: (short_message, long_message)}.
responses = {
    # 1xx: informational
    100: ('Continue', 'Request received, please continue'),
    101: ('Switching Protocols', 'Switching to new protocol; obey Upgrade header'),
    # 2xx: success
    200: ('OK', 'Request fulfilled, document follows'),
    201: ('Created', 'Document created, URL follows'),
    202: ('Accepted', 'Request accepted, processing continues off-line'),
    203: ('Non-Authoritative Information', 'Request fulfilled from cache'),
    204: ('No Content', 'Request fulfilled, nothing follows'),
    205: ('Reset Content', 'Clear input form for further input.'),
    206: ('Partial Content', 'Partial content follows.'),
    # 3xx: redirection
    300: ('Multiple Choices', 'Object has several resources -- see URI list'),
    301: ('Moved Permanently', 'Object moved permanently -- see URI list'),
    302: ('Found', 'Object moved temporarily -- see URI list'),
    303: ('See Other', 'Object moved -- see Method and URL list'),
    304: ('Not Modified', 'Document has not changed since given time'),
    305: ('Use Proxy', 'You must use proxy specified in Location to access this resource.'),
    307: ('Temporary Redirect', 'Object moved temporarily -- see URI list'),
    # 4xx: client error
    400: ('Bad Request', 'Bad request syntax or unsupported method'),
    401: ('Unauthorized', 'No permission -- see authorization schemes'),
    402: ('Payment Required', 'No payment -- see charging schemes'),
    403: ('Forbidden', 'Request forbidden -- authorization will not help'),
    404: ('Not Found', 'Nothing matches the given URI'),
    405: ('Method Not Allowed', 'Specified method is invalid for this server.'),
    406: ('Not Acceptable', 'URI not available in preferred format.'),
    407: ('Proxy Authentication Required',
          'You must authenticate with this proxy before proceeding.'),
    408: ('Request Timeout', 'Request timed out; try again later.'),
    409: ('Conflict', 'Request conflict.'),
    410: ('Gone', 'URI no longer exists and has been permanently removed.'),
    411: ('Length Required', 'Client must specify Content-Length.'),
    412: ('Precondition Failed', 'Precondition in headers is false.'),
    413: ('Request Entity Too Large', 'Entity is too large.'),
    414: ('Request-URI Too Long', 'URI is too long.'),
    415: ('Unsupported Media Type', 'Entity body in unsupported format.'),
    416: ('Requested Range Not Satisfiable', 'Cannot satisfy request range.'),
    417: ('Expectation Failed', 'Expect condition could not be satisfied.'),
    # 5xx: server error
    500: ('Internal Server Error', 'Server got itself in trouble'),
    501: ('Not Implemented', 'Server does not support this operation'),
    502: ('Bad Gateway', 'Invalid responses from another server/proxy.'),
    503: ('Service Unavailable', 'The server cannot process the request due to a high load'),
    504: ('Gateway Timeout', 'The gateway server did not receive a timely response'),
    505: ('HTTP Version Not Supported', 'Cannot fulfill request.'),
}

你可能感兴趣的:(python,python,网络爬虫)