【spider01】Urllib

Urllib

Urllib详解–

什么是urllib?
python内置的http请求库
urllib.request 请求模块
urllib.error 异常处理模块
urllib.parse url解析模块
urllib.robotparser robots.txt解析模块

urlopen

urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)

import urllib.request

# Fetch the Baidu home page and print its body decoded as UTF-8 text.
response = urllib.request.urlopen('http://www.baidu.com')
html = response.read().decode('utf-8')
print(html)


















































































































	

































































	

        

			        

	

			        

	

			        

	

			        

			    



	

        

			        

	

			        

	

			        

	

			        

			    































    
    
    
	
    
    
    
    
	
	
	
	
	
	
	
	
	
	
    
    百度一下,你就知道
    


















    








    



	
    
    
import urllib.parse 
import urllib.request

# Supplying ``data`` makes urlopen send the request as a POST; the
# url-encoded form has to be converted to bytes first.
form = {'world': 'Hello'}
data = urllib.parse.urlencode(form).encode('utf-8')
response = urllib.request.urlopen('http://httpbin.org/post', data=data)
body = response.read()
print(body)
b'{\n  "args": {}, \n  "data": "", \n  "files": {}, \n  "form": {\n    "world": "Hello"\n  }, \n  "headers": {\n    "Accept-Encoding": "identity", \n    "Connection": "close", \n    "Content-Length": "11", \n    "Content-Type": "application/x-www-form-urlencoded", \n    "Host": "httpbin.org", \n    "User-Agent": "Python-urllib/3.6"\n  }, \n  "json": null, \n  "origin": "117.184.110.250", \n  "url": "http://httpbin.org/post"\n}\n'
import urllib.request

# Same GET request, but with a one-second timeout on the blocking operations.
response = urllib.request.urlopen('http://httpbin.org/get', timeout=1)
payload = response.read()
print(payload)
b'{\n  "args": {}, \n  "headers": {\n    "Accept-Encoding": "identity", \n    "Connection": "close", \n    "Host": "httpbin.org", \n    "User-Agent": "Python-urllib/3.6"\n  }, \n  "origin": "117.184.110.250", \n  "url": "http://httpbin.org/get"\n}\n'
import socket
import urllib.request
import urllib.error

# The 0.1 s budget is short enough that this request times out in practice,
# which lets the snippet demonstrate how a timeout surfaces.
try:
    response = urllib.request.urlopen('http://httpbin.org/get', timeout=0.1)
except socket.timeout:
    # A timeout while waiting for the response is raised directly; urllib
    # only wraps errors that occur while sending the request.
    print("timeout")
except urllib.error.URLError as e:
    # A timeout while connecting/sending arrives wrapped in URLError, with
    # the underlying socket error available as e.reason.
    if isinstance(e.reason, socket.timeout):
        print("timeout")
timeout

响应

响应类型

import urllib.request

# Show what kind of object urlopen hands back. The body is never read here,
# so the context manager is what closes the underlying connection.
with urllib.request.urlopen('http://www.python.org') as response:
    print(type(response))  # <class 'http.client.HTTPResponse'>

状态码 响应头

import urllib.request

# Inspect the status line and headers only. The body is not read, so the
# context manager is used to close the connection explicitly.
with urllib.request.urlopen('http://www.python.org') as response:
    print(response.status)               # numeric HTTP status code
    print(response.getheaders())         # list of (name, value) tuples
    print(response.getheader('Server'))  # value of one named header
200
[('Server', 'nginx'), ('Content-Type', 'text/html; charset=utf-8'), ('X-Frame-Options', 'SAMEORIGIN'), ('x-xss-protection', '1; mode=block'), ('X-Clacks-Overhead', 'GNU Terry Pratchett'), ('Via', '1.1 varnish'), ('Content-Length', '49228'), ('Accept-Ranges', 'bytes'), ('Date', 'Sun, 08 Apr 2018 10:05:20 GMT'), ('Via', '1.1 varnish'), ('Age', '2762'), ('Connection', 'close'), ('X-Served-By', 'cache-iad2143-IAD, cache-hkg17927-HKG'), ('X-Cache', 'HIT, HIT'), ('X-Cache-Hits', '4, 9'), ('X-Timer', 'S1523181920.228242,VS0,VE0'), ('Vary', 'Cookie'), ('Strict-Transport-Security', 'max-age=63072000; includeSubDomains')]
nginx
import urllib.request

# Read the whole response body as bytes, then decode it to text for printing.
response = urllib.request.urlopen('http://www.python.org')
page = response.read()
print(str(page, 'utf-8'))




  


    
    

    

    
    
    
    
    

    
    
    
    
    

    

    
    
    

    

    
    
    
    
    
    
    

    
    
    
    

    Welcome to Python.org

    
    

    
    
    
    
    
    
    
    
    
    

    

    
    
    
    

    

    
    

    
    
    




    

Notice: While Javascript is not essential for this website, your interaction with the content will be limited. Please turn Javascript on for the full experience.

Get Started

Whether you're new to programming or an experienced developer, it's easy to learn and use Python.

Start with our Beginner’s Guide

Download

Python source code and installers are available for download for all versions! Not sure which version to use? Check here.

Latest: Python 3.6.5 - Python 2.7.14

Docs

Documentation for Python's standard library, along with tutorials and guides, are available online.

docs.python.org

Jobs

Looking for work or have a Python related position that you're trying to hire for? Our relaunched community-run job board is the place to go.

jobs.python.org

>>> Python Enhancement Proposals (PEPs): The future of Python is discussed here.

>>> Python Software Foundation

The mission of the Python Software Foundation is to promote, protect, and advance the Python programming language, and to support and facilitate the growth of a diverse and international community of Python programmers. Learn more

Become a Member Donate to the PSF

request

import urllib.request

# Build a Request object first; urlopen accepts it in place of a URL string.
request = urllib.request.Request(url='http://www.python.org')
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
print(content)




  


    
    

    

    
    
    
    
    

    
    
    
    
    

    

    
    
    

    

    
    
    
    
    
    
    

    
    
    
    

    Welcome to Python.org

    
    

    
    
    
    
    
    
    
    
    
    

    

    
    
    
    

    

    
    

    
    
    




    

Notice: While Javascript is not essential for this website, your interaction with the content will be limited. Please turn Javascript on for the full experience.

Get Started

Whether you're new to programming or an experienced developer, it's easy to learn and use Python.

Start with our Beginner’s Guide

Download

Python source code and installers are available for download for all versions! Not sure which version to use? Check here.

Latest: Python 3.6.5 - Python 2.7.14

Docs

Documentation for Python's standard library, along with tutorials and guides, are available online.

docs.python.org

Jobs

Looking for work or have a Python related position that you're trying to hire for? Our relaunched community-run job board is the place to go.

jobs.python.org

>>> Python Enhancement Proposals (PEPs): The future of Python is discussed here.

>>> Python Software Foundation

The mission of the Python Software Foundation is to promote, protect, and advance the Python programming language, and to support and facilitate the growth of a diverse and international community of Python programmers. Learn more

Become a Member Donate to the PSF

from urllib import request,parse

url = 'http://httpbin.org/post'
# Custom request headers; the echoed response shows this User-Agent instead
# of urllib's default one.
headers = {
    "User-Agent": 'Mozilla/4.0(compatible;MISIE 5.5;Window NT)',
    'Host': 'httpbin.org',
}
# Named ``form`` rather than ``dict`` so the built-in ``dict`` type is not
# shadowed for the rest of the session.
form = {
    'name': 'Germany',
}
# The POST body must be bytes, so the url-encoded form is encoded as UTF-8.
data = bytes(parse.urlencode(form), encoding='utf-8')
req = request.Request(url=url, data=data, headers=headers, method='POST')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {
    "name": "Germany"
  }, 
  "headers": {
    "Accept-Encoding": "identity", 
    "Connection": "close", 
    "Content-Length": "12", 
    "Content-Type": "application/x-www-form-urlencoded", 
    "Host": "httpbin.org", 
    "User-Agent": "Mozilla/4.0(compatible;MISIE 5.5;Window NT)"
  }, 
  "json": null, 
  "origin": "117.184.110.250", 
  "url": "http://httpbin.org/post"
}
from urllib import request,parse

url = 'http://httpbin.org/post'
# Named ``form`` rather than ``dict`` so the built-in ``dict`` type is not
# shadowed.
form = {
    'name': 'Germany',
}
# The POST body must be bytes, so the url-encoded form is encoded as UTF-8.
data = bytes(parse.urlencode(form), encoding='utf-8')
# No ``headers`` argument: this snippet defines no such dict, and the point
# here is to attach the header afterwards with add_header().
req = request.Request(url=url, data=data, method='POST')
req.add_header("User-Agent", 'Mozilla/4.0(compatible;MISIE 5.5;Window NT)')
response = request.urlopen(req)
print(response.read().decode('utf-8'))
{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {
    "name": "Germany"
  }, 
  "headers": {
    "Accept-Encoding": "identity", 
    "Connection": "close", 
    "Content-Length": "12", 
    "Content-Type": "application/x-www-form-urlencoded", 
    "Host": "httpbin.org", 
    "User-Agent": "Mozilla/4.0(compatible;MISIE 5.5;Window NT)"
  }, 
  "json": null, 
  "origin": "117.184.110.250", 
  "url": "http://httpbin.org/post"
}

Handler

—辅助工具

代理

import urllib.request

proxy_handler = urllib.request.ProxyHandler({  # maps URL scheme -> proxy address
    'http':'http://127.0.0.1:9743',
    'https':'https://127.0.0.1:9743'
})
opener = urllib.request.build_opener(proxy_handler)  # opener that sends requests via the proxy handler
response = opener.open('http://httpbin.org/get')  # raises URLError if nothing is listening on the proxy port
print(response.read())
---------------------------------------------------------------------------

ConnectionRefusedError                    Traceback (most recent call last)

I:\Anaconda3\lib\urllib\request.py in do_open(self, http_class, req, **http_conn_args)
   1317                 h.request(req.get_method(), req.selector, req.data, headers,
-> 1318                           encode_chunked=req.has_header('Transfer-encoding'))
   1319             except OSError as err: # timeout error


I:\Anaconda3\lib\http\client.py in request(self, method, url, body, headers, encode_chunked)
   1238         """Send a complete request to the server."""
-> 1239         self._send_request(method, url, body, headers, encode_chunked)
   1240 


I:\Anaconda3\lib\http\client.py in _send_request(self, method, url, body, headers, encode_chunked)
   1284             body = _encode(body, 'body')
-> 1285         self.endheaders(body, encode_chunked=encode_chunked)
   1286 


I:\Anaconda3\lib\http\client.py in endheaders(self, message_body, encode_chunked)
   1233             raise CannotSendHeader()
-> 1234         self._send_output(message_body, encode_chunked=encode_chunked)
   1235 


I:\Anaconda3\lib\http\client.py in _send_output(self, message_body, encode_chunked)
   1025         del self._buffer[:]
-> 1026         self.send(msg)
   1027 


I:\Anaconda3\lib\http\client.py in send(self, data)
    963             if self.auto_open:
--> 964                 self.connect()
    965             else:


I:\Anaconda3\lib\http\client.py in connect(self)
    935         self.sock = self._create_connection(
--> 936             (self.host,self.port), self.timeout, self.source_address)
    937         self.sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)


I:\Anaconda3\lib\socket.py in create_connection(address, timeout, source_address)
    723     if err is not None:
--> 724         raise err
    725     else:


I:\Anaconda3\lib\socket.py in create_connection(address, timeout, source_address)
    712                 sock.bind(source_address)
--> 713             sock.connect(sa)
    714             # Break explicitly a reference cycle


ConnectionRefusedError: [WinError 10061] 由于目标计算机积极拒绝,无法连接。


During handling of the above exception, another exception occurred:


URLError                                  Traceback (most recent call last)

 in ()
      6 })
      7 opener = urllib.request.build_opener(proxy_handler)
----> 8 response = opener.open('http://httpbin.org/get')
      9 print(response.read())


I:\Anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
    524             req = meth(req)
    525 
--> 526         response = self._open(req, data)
    527 
    528         # post-process response


I:\Anaconda3\lib\urllib\request.py in _open(self, req, data)
    542         protocol = req.type
    543         result = self._call_chain(self.handle_open, protocol, protocol +
--> 544                                   '_open', req)
    545         if result:
    546             return result


I:\Anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
    502         for handler in handlers:
    503             func = getattr(handler, meth_name)
--> 504             result = func(*args)
    505             if result is not None:
    506                 return result


I:\Anaconda3\lib\urllib\request.py in http_open(self, req)
   1344 
   1345     def http_open(self, req):
-> 1346         return self.do_open(http.client.HTTPConnection, req)
   1347 
   1348     http_request = AbstractHTTPHandler.do_request_


I:\Anaconda3\lib\urllib\request.py in do_open(self, http_class, req, **http_conn_args)
   1318                           encode_chunked=req.has_header('Transfer-encoding'))
   1319             except OSError as err: # timeout error
-> 1320                 raise URLError(err)
   1321             r = h.getresponse()
   1322         except:


URLError: <urlopen error [WinError 10061] 由于目标计算机积极拒绝,无法连接。>

cookie

—储存在用户本地终端上的数据(网站为了辨别用户身份)

import http.cookiejar,urllib.request

# Cookies set by responses that pass through this opener are collected in
# the jar, which can then be iterated.
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
for item in cookie:
    # A cookie's value may be None; letting print() do the formatting avoids
    # the TypeError that string concatenation would raise in that case.
    print(item.name, item.value, sep='=')
BAIDUID=053933EE20910CC428023D9BDBFDD91B:FG=1
BIDUPSID=053933EE20910CC428023D9BDBFDD91B
H_PS_PSSID=1427_21102_20718
PSTM=1523181945
BDSVRTM=0
BD_HOME=0
import http.cookiejar,urllib.request

# Collect the cookies from one request and write them to ``cookie.txt``
# using the MozillaCookieJar file format.
filename = "cookie.txt"
cookie = http.cookiejar.MozillaCookieJar(filename)
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
response = opener.open('http://www.baidu.com')
# Also keep cookies marked to be discarded and cookies that have expired.
cookie.save(ignore_expires=True, ignore_discard=True)
import http.cookiejar,urllib.request

# Same flow as saving with MozillaCookieJar, but the cookies are written to
# ``cookie.txt`` in the LWPCookieJar file format instead.
filename = "cookie.txt"
cookie = http.cookiejar.LWPCookieJar(filename)
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
response = opener.open('http://www.baidu.com')
# Also keep cookies marked to be discarded and cookies that have expired.
cookie.save(ignore_expires=True, ignore_discard=True)
import http.cookiejar,urllib.request

# Load previously saved cookies from ``cookie.txt`` into the jar, then make
# a request through an opener that uses that jar.
cookie = http.cookiejar.LWPCookieJar()
cookie.load('cookie.txt', ignore_expires=True, ignore_discard=True)
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
response = opener.open('http://www.baidu.com')
html = response.read().decode('utf-8')
print(html)


















































































































	

































































	

        

			        

	

			        

	

			        

	

			        

			    



	

        

			        

	

			        

	

			        

	

			        

			    































    
    
    
	
    
    
    
    
	
	
	
	
	
	
	
	
	
	
    
    百度一下,你就知道