error1:
NewConnectionError('
: Failed to establish a new connection:[WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败。',))
解决办法:
# requests.Session has no `keep_alive` switch, so assigning session.keep_alive
# only creates an unused attribute. Ask for the connection to be closed after
# every response instead.
session.headers['Connection'] = 'close'
error2:
python hostname doesn't match either of facebookXXXXX
解决办法:
import ssl

# WARNING: this replaces the hostname check with a function that accepts every
# certificate/hostname pair, i.e. it turns TLS hostname verification off.
# Use it only for debugging against hosts you trust.
# NOTE(review): since Python 3.7 hostname matching is presumably done inside
# OpenSSL rather than through ssl.match_hostname (removed in 3.12), so this
# patch may have no effect on modern interpreters - confirm before relying on it.
ssl.match_hostname = lambda cert, hostname: True
多方查阅后发现了问题的原因:HTTP 连接太多且没有关闭。
解决办法:
1、增加重试连接次数
# Reassigning requests.adapters.DEFAULT_RETRIES after import does not raise the
# retry count: HTTPAdapter captured its max_retries default when the class was
# defined. Mount an adapter with an explicit retry count on a session instead.
session = requests.Session()
retry_adapter = requests.adapters.HTTPAdapter(max_retries=5)
session.mount('http://', retry_adapter)
session.mount('https://', retry_adapter)
2、关闭多余的连接
requests 底层使用了 urllib3 库,默认的 HTTP 连接是 keep-alive(长连接)的,需要在 session 上显式关闭。
s = requests.session()
# requests.Session has no `keep_alive` switch (assigning it is a no-op);
# a "Connection: close" header makes each connection close after its response.
s.headers['Connection'] = 'close'
3、只用session进行操作。即只创建一个连接,并设置最大连接数或者重试次数。
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Use one shared session for every request. Failed connection attempts are
# retried up to 3 times with a back-off factor of 0.5 between attempts.
session = requests.Session()
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
# The same adapter serves both plain and TLS URLs.
for scheme in ('http://', 'https://'):
    session.mount(scheme, adapter)
session.get(url)
import json

import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Session that retries failed connection attempts up to 5 times with a
# back-off factor of 1 between attempts.
s = requests.Session()
retry = Retry(connect=5, backoff_factor=1)
adapter = HTTPAdapter(max_retries=retry)
# Mount on both schemes: with only 'http://' mounted, an https:// host would
# keep the default adapter and get no retries.
s.mount('http://', adapter)
s.mount('https://', adapter)
# requests.Session has no `keep_alive` switch (assigning it is a no-op);
# a "Connection: close" header makes each connection close after its response.
s.headers['Connection'] = 'close'
# The credentials are sent as a JSON-encoded request body.
res = s.post(self.conn.host + '/sign-in', data=json.dumps({'name': "XXX", 'pwd': "XXX"}))
response = res.json()
但是在stackoverflow上有人给出了这样的解释。
4、安装 pyopenssl
pip install -U pyopenssl
5、在每次发送请求之间设定固定的睡眠时间
https://github.com/requests/requests/issues/4246#event
https://stackoverflow.com/questions/23013220/max-retries-exceeded-with-url
在爬取boss直聘时出现这种错误,总结如下:
1.http连接太多没有关闭导致的,解决方法:
import requests

s = requests.session()
# Raise the reconnect count. Reassigning requests.adapters.DEFAULT_RETRIES has
# no effect on the retry count, so mount an adapter with an explicit value.
retry_adapter = requests.adapters.HTTPAdapter(max_retries=5)
s.mount('http://', retry_adapter)
s.mount('https://', retry_adapter)
# Close surplus connections. Session has no `keep_alive` switch, so send a
# "Connection: close" header instead.
s.headers['Connection'] = 'close'
s.get(url)  # the URL you need
2.访问次数频繁,被禁止访问,解决方法:使用代理
import requests

# Target page and the session that will fetch it.
url = "https://mail.163.com/"
s = requests.session()
# Replace the session headers with the caller-supplied `header` mapping.
s.headers = header
# One proxy per URL scheme.
s.proxies = dict(https="47.100.104.247:8080", http="36.248.10.47:8080")
s.get(url)
查找代理的网址:http://ip.zdaye.com/shanghai_ip.html#Free
使用代理时需注意:1.代理分为http和https两种,不能混用,如果把http的代理用作https,也会报上面的错误;2.代理以字典格式传入,如上面的例子所示,地址可以写成“47.100.104.247:8080”这种格式,也可以写成“https://47.100.104.247:8080”这种格式;3.如果代理不可用,同样会报上面的错误。可以用以下方法判断代理是否可用:
import requests

s = requests.session()
url = "https://mail.163.com/"
s.proxies = {"https": "47.100.104.247:8080", "http": "36.248.10.47:8080"}
# Copy the caller-supplied headers so the extra header below does not leak
# into the `header` mapping itself.
s.headers = dict(header)
# requests.Session has no `keep_alive` switch (assigning it is a no-op);
# a "Connection: close" header makes each connection close after its response.
s.headers['Connection'] = 'close'
r = s.get(url)
print(r.status_code)  # a working proxy gives a normal response; a dead one raises the error above
升级
pip install --upgrade requests
如果同一ip访问次数过多也会被封,这时就需要使用代理(proxies)。在python中很简单,直接在请求中带上proxies参数即可:
# Send the request with the custom headers and cookies, routed through the
# proxy mapping.
request_options = {'headers': headers, 'cookies': cookies, 'proxies': proxies}
r = requests.get(url, **request_options)
代理ip的话,给大家推荐个网站
http://www.data5u.com/
最下方会有20个免费的,一般小爬虫够用了。使用代理时可能会遇到代理连接不通之类的问题,需要在程序中添加下面的代码,设置重试次数并关闭多余的连接:
s = requests.session()
# Reassigning requests.adapters.DEFAULT_RETRIES has no effect on the retry
# count, so mount an adapter with an explicit value on the session.
retry_adapter = requests.adapters.HTTPAdapter(max_retries=5)
s.mount('http://', retry_adapter)
s.mount('https://', retry_adapter)
# requests.Session has no `keep_alive` switch (assigning it is a no-op);
# a "Connection: close" header makes each connection close after its response.
s.headers['Connection'] = 'close'
# Scrape the review pages of a fixed list of dianping.com shops and dump the
# page title, reviewer name, star rating and review text of every review to a
# text file.
#
# Restored from a whitespace-mangled Python 2 listing: statements that had been
# swallowed by the commented-out debug prints are back on their own lines, and
# the Python 2-only constructs (reload(sys)/setdefaultencoding, xrange,
# unicode, "except (KeyError), e") were replaced so the script runs on Python 3.
import io
import json
import sys

import requests
from bs4 import BeautifulSoup

# Shop ids to crawl (values and order kept exactly as in the original listing).
shop_ids = [
    22711693, 24759450, 69761921, 69761921, 22743334, 66125712,
    22743270, 57496584, 75153221, 57641884, 66061653, 70669333,
    57279088, 24740739, 66126129, 75100027, 92667587, 92452007,
    72345827, 90004047, 90485109, 90546031, 83527455, 91070982,
    83527745, 94273474, 80246564, 83497073, 69027373, 96191554,
    96683472, 90500524, 92454863, 92272204, 70443082, 96076068,
    91656438, 75633029, 96571687, 97659144, 69253863, 98279207,
    90435377, 70669359, 96403354, 83618952, 81265224, 77365611,
    74592526, 90479676, 56540304, 37924067, 27496773, 56540319,
    32571869, 43611843, 58612870, 22743340, 67293664, 67292945,
    57641749, 75157068, 58934198, 75156610, 59081304, 75156647,
    75156702, 67293838,
]

# Collected rows: [title, name, star, words].
returnList = []

# Empty by default (no proxy); fill in entries like the example to use one.
proxies = {
    # "https": "http://14.215.177.73:80", "http": "http://202.108.2.42:80",
}
headers = {
    'Host': 'www.dianping.com',
    'Referer': 'http://www.dianping.com/shop/22711693',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/535.19',
    'Accept-Encoding': 'gzip'}
cookies = {
    '_lxsdk_cuid': '16146a366a7c8-08cd0a57dad51b-32637402-fa000-16146a366a7c8',
    'lxsdk': '16146a366a7c8-08cd0a57dad51b-32637402-fa000-16146a366a7c8',
    '_hc.v': 'ec20d90c-0104-0677-bf24-391bdf00e2d4.1517308569',
    's_ViewType': '10',
    'cy': '16',
    'cye': 'wuhan',
    '_lx_utm': 'utm_source%3DBaidu%26utm_medium%3Dorganic',
    '_lxsdk_s': '1614abc132e-f84-b9c-2bc%7C%7C34'}

# One session for every request. The original created a session but then sent
# everything through requests.get, so its retry/keep-alive setup was unused.
# It also relied on requests.adapters.DEFAULT_RETRIES and s.keep_alive, neither
# of which changes requests' behaviour; an explicit adapter and a
# "Connection: close" header do.
s = requests.session()
retry_adapter = requests.adapters.HTTPAdapter(max_retries=5)
s.mount('http://', retry_adapter)
s.mount('https://', retry_adapter)
s.headers['Connection'] = 'close'

for shop_id in shop_ids:
    url = "https://www.dianping.com/shop/%s/review_all" % shop_id
    r = s.get(url, headers=headers, cookies=cookies, proxies=proxies)
    soup = BeautifulSoup(r.text, 'lxml')
    # One more than the number of pagination links found on the first page.
    page_count = len(soup.find_all(class_='PageLink')) + 1
    for page in range(page_count):
        page_url = "http://www.dianping.com/shop/%s/review_all/p%s" % (shop_id, page)
        page_response = s.get(page_url, headers=headers, cookies=cookies, proxies=proxies)
        page_soup = BeautifulSoup(page_response.text, 'lxml')
        title = page_soup.title.string[0:15]
        for item in page_soup.select('.reviews-items li'):
            # Skip <li> elements whose first class is 'item'; elements without
            # a class attribute are processed.
            css_classes = item.get('class') or []
            if css_classes and css_classes[0] == 'item':
                continue
            name_node = item.select_one('.main-review .dper-info .name')
            star_node = item.select_one('.main-review .review-rank span')
            words_node = item.select_one('.main-review .review-words')
            if name_node is None or star_node is None or words_node is None:
                # Not a complete review entry; the original crashed here.
                continue
            name = name_node.get_text().strip()
            # The rating is the 8th character of the span's second CSS class.
            star = star_node['class'][1][7:8]
            words = words_node.get_text().strip()
            returnList.append([title, name, star, words])

# Write each row as a blank line followed by its four fields, one per line.
# The context manager closes the file, and the explicit encoding replaces the
# Python 2 setdefaultencoding hack.
with io.open("/Users/huojian/Desktop/store_shop.sql", "w", encoding="utf-8") as out:
    for row in returnList:
        out.write("\n" + "\n".join(str(field) for field in row) + "\n")