设置 timeout 的方式有很多种,下面以 urllib 为例。
下面选取的网页是 Python 官网。
>>> import urllib.request
>>> response = urllib.request.urlopen('http://www.python.org')
>>>
情况一:timeout = 0.1
>>> response = urllib.request.urlopen('http://www.python.org',timeout = 0.1)
Traceback (most recent call last):
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 1254, in do_open
h.request(req.get_method(), req.selector, req.data, headers)
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 1107, in request
self._send_request(method, url, body, headers)
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 1152, in _send_request
self.endheaders(body)
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 1103, in endheaders
self._send_output(message_body)
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 934, in _send_output
self.send(msg)
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 877, in send
self.connect()
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 849, in connect
(self.host,self.port), self.timeout, self.source_address)
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\socket.py", line 712, in create_connection
raise err
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\socket.py", line 703, in create_connection
sock.connect(sa)
socket.timeout: timed out
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 163, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 466, in open
response = self._open(req, data)
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 484, in _open
'_open', req)
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 444, in _call_chain
result = func(*args)
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 1282, in http_open
return self.do_open(http.client.HTTPConnection, req)
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 1256, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error timed out>
情况二:timeout = 0.5
>>> response = urllib.request.urlopen('http://www.python.org',timeout = 0.5)
Traceback (most recent call last):
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 1254, in do_open
h.request(req.get_method(), req.selector, req.data, headers)
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 1107, in request
self._send_request(method, url, body, headers)
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 1152, in _send_request
self.endheaders(body)
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 1103, in endheaders
self._send_output(message_body)
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 934, in _send_output
self.send(msg)
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 877, in send
self.connect()
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\http\client.py", line 1261, in connect
server_hostname=server_hostname)
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\ssl.py", line 385, in wrap_socket
_context=self)
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\ssl.py", line 760, in __init__
self.do_handshake()
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\ssl.py", line 996, in do_handshake
self._sslobj.do_handshake()
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\ssl.py", line 641, in do_handshake
self._sslobj.do_handshake()
socket.timeout: _ssl.c:703: The handshake operation timed out
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 163, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 472, in open
response = meth(req, response)
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 582, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 504, in error
result = self._call_chain(*args)
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 444, in _call_chain
result = func(*args)
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 696, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 466, in open
response = self._open(req, data)
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 484, in _open
'_open', req)
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 444, in _call_chain
result = func(*args)
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 1297, in https_open
context=self._context, check_hostname=self._check_hostname)
File "C:\Users\lijy2\AppData\Local\Programs\Python\Python35-32\lib\urllib\request.py", line 1256, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error _ssl.c:703: The handshake operation timed out>
情况三:timeout = 1
>>> response = urllib.request.urlopen('http://www.python.org',timeout = 1)
>>>
从上面可以看到,设置了 timeout 之后,一旦超时就会抛出异常(urllib.error.URLError),这次任务也就随之结束了。但这样可以保证每个任务的耗时都是有上限的。
比如,我们做一个并发的爬虫(例如用多协程或者多线程实现)。如果不给请求设置 timeout,一旦某个子协程(或线程)一直在等待响应,其他线程或协程就可能被它拖着一起等待(虽然等待期间可以切换到其他线程或协程运行,但切换本身也是需要时间的)。如果把 timeout 的数值设置得比较小(在合理范围内的小),这个线程(或协程)只需要很短的时间就会结束本次抓取。如果失败了,就先记录下来,之后再对这些失败的请求统一处理。
可以采用分级的 timeout:请求每失败一次,就把它放到 timeout 更长的下一级队列当中。这样就可以用类似 MLFQ(多级反馈队列,Multi-Level Feedback Queue)的方式来调度这些爬虫任务。
在网络质量不是很稳定的情况下,用这种方法的爬虫效果会比较好,因为很多时候没有必要花那么长的时间去等待。