【Python】健壮的爬虫

还记得上次的发车(探索)吗?小伙伴们的爬虫程序有没有出现异常停止的情况?上次没上车的童靴可以点击传送门

那么问题来了

为啥我们的爬虫程序会中途死亡?要回答这个问题,我们需要先了解网站是如何反制爬虫的。
知道了他们的防守方式才能使我们的进攻更为有效!
通常很多网站不允许非浏览器访问,而且同一个 IP 频繁访问时,网站会短暂禁止该 IP 访问。所以我们需要做的事情就是将自己的爬虫程序伪装为浏览器访问,并且在 IP 被禁的时候使用代理 IP 来协助访问。

准备工作

使用 pip 工具下载 beautifulsoup4 、requests、lxml

pip install beautifulsoup4
pip install requests
pip install lxml

话不多说,直接来看我们的代码,这次我们换一个网址来测试(meizitu.com):

# -*- coding: UTF-8 -*-
import requests
import random
import time
from bs4 import BeautifulSoup

# Pool of browser User-Agent strings; one is picked at random per request so
# that the crawler's requests look like they come from a browser.
user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]

## Seed list of proxy addresses in "host:port" form, used when direct requests
## keep failing. The addresses could also be obtained from a proxy provider.
iplist = ['60.207.239.245:3128', '124.88.67.24:843', '210.101.131.231:8080', '124.192.106.247:3128', '202.106.16.36:3128', '220.248.229.45:3128', '61.185.137.126:3128', '183.77.250.45:3128', '218.17.252.34:3128', '58.9.99.41:3128', '124.207.132.242:3128', '60.160.34.4:3128', '61.153.145.202:25', '175.154.229.72:8998', '112.91.208.78:9999', '222.33.192.238:8118', '116.242.227.201:3128', '119.29.232.113:3128', '61.136.115.147:3128', '121.248.112.20:3128', '60.21.132.218:63000', '123.7.115.141:9999', '221.212.221.194:3128', '124.88.67.32:81', '111.1.3.36:8000', '120.194.18.90:81', '114.215.150.13:3128', '120.52.21.132:8082', '218.67.126.15:3128']
# Disabled example: download the proxy list from a provider page instead.
# NOTE(review): the re.findall line below appears truncated in this copy
# (unterminated pattern, missing arguments) -- restore it before re-enabling.
# html = request.urlopen("http://haoip.cc/tiqu.htm").read().decode('utf-8')
# iplistn= re.findall(r'r/>(.*?)
# for ip in iplistn:
#     i = re.sub('\n', '', ip)  ## re.sub replaces text; here every \n is removed
#     iplist.append(i.strip())  ## append to the list initialised above; strip() drops surrounding whitespace
# print(iplist)

#发送请求
def get(url, timeout=5, proxy=None, num_retries=10):
    """Fetch ``url`` disguised as a browser, falling back to proxies on failure.

    The attempts are made in three bounded phases:

    1. If ``proxy`` is None, request the URL directly, up to
       ``num_retries + 1`` times with a 3 second pause between attempts.
    2. Request the URL through a proxy, up to ``num_retries + 1`` times.
       A caller-supplied ``proxy`` is used for the first attempt; afterwards a
       random entry of the module-level ``iplist`` is used for each retry.
    3. Make one last direct attempt and let its exception propagate.

    The loop is iterative on purpose: the number of attempts is bounded and
    the call stack does not grow, so an unreachable URL ends with a clear
    ``requests`` exception instead of endless retries.

    Args:
        url: Address to download.
        timeout: Per-request timeout in seconds, applied to every attempt.
        proxy: Optional requests-style proxies mapping, e.g.
            ``{'http': 'http://1.2.3.4:3128'}``. ``None`` means start direct.
        num_retries: Number of retries per phase after the first attempt.

    Returns:
        The ``requests.Response`` of the first attempt that succeeds.

    Raises:
        requests.RequestException: If every attempt failed.
    """

    def fetch(proxies=None):
        # A fresh random User-Agent per attempt so the server sees what looks
        # like ordinary browser traffic.
        headers = {'User-Agent': random.choice(user_agent_list)}
        return requests.get(url, headers=headers, proxies=proxies, timeout=timeout)

    if proxy is None:
        for remaining in range(num_retries, -1, -1):
            try:
                return fetch()
            except requests.RequestException:
                # Only network-level failures are retried; KeyboardInterrupt
                # and programming errors are allowed to surface.
                if remaining > 0:
                    print(u'获取网页出错,3S后将获取倒数第:', remaining, u'次')
                    time.sleep(3)
        print(u'开始使用代理')
        time.sleep(3)
        proxy = _random_proxy()

    if proxy is not None:  # None here means iplist is empty: skip the proxy phase
        for remaining in range(num_retries, -1, -1):
            try:
                return fetch(proxy)
            except requests.RequestException:
                if remaining > 0:
                    # Rotate to another proxy; keep the current one if the
                    # pool has nothing to offer.
                    proxy = _random_proxy() or proxy
                    print(u'正在更换代理,3S后将重新获取倒数第', remaining, u'次')
                    print(u'当前代理是:', proxy)
                    time.sleep(3)

    print(u'代理也不好使了!取消代理')
    # Final direct attempt; a failure here is reported to the caller.
    return fetch()


def _random_proxy():
    """Return a requests proxies mapping for a random ``iplist`` entry, or None if the list is empty."""
    if not iplist:
        return None
    address = str(random.choice(iplist)).strip()
    if '://' not in address:
        # requests expects proxy URLs to carry a scheme.
        address = 'http://' + address
    # Route both http and https targets through the same HTTP proxy.
    return {'http': address, 'https': address}
##保存图片,默认保存在python程序所在目录
def save(name, img):
    """Write a downloaded image to ``<name>.jpg``.

    Args:
        name: Target path without extension; a bare name is created relative
            to the current working directory.
        img: Response-like object whose ``content`` attribute holds the raw
            image bytes.

    The file is opened with ``'wb'`` so that saving the same name again
    replaces the file instead of appending a second copy of the bytes to it,
    and the ``with`` block closes the handle even if the write fails.
    """
    with open(name + '.jpg', 'wb') as f:
        f.write(img.content)


# Walk the article pages from the newest id down to 1.
# 5510 is the highest article id, found by inspecting the site.
for i in range(5510, 0, -1):
    # Crawl entry point: article pages are numbered /a/<id>.html.
    url_src = "http://www.meizitu.com/a/" + str(i) + ".html"
    try:
        html = get(url_src)
    except requests.RequestException as exc:
        # One unreachable page must not stop the whole crawl.
        print(url_src, exc)
        continue

    # Decode the raw bytes as gb2312 directly. This gives the same text as
    # re-encoding html.text through latin1 when requests falls back to
    # ISO-8859-1, and cannot raise when requests picks another encoding.
    target = html.content.decode('gb2312', 'ignore')
    soup = BeautifulSoup(target, 'lxml')

    # Pages containing an <a id="eCode"> element are treated as error pages
    # and skipped.
    error = soup.find('a', id='eCode')
    print(error)
    if error is not None:
        continue

    picture_div = soup.find('div', id='picture')
    if picture_div is None:
        # No picture container on this page: nothing to download.
        print(i)
        continue

    for pic in picture_div.find_all('img'):
        pic_src = pic.attrs.get('src')
        pic_alt = pic.attrs.get('alt')
        if not pic_src or not pic_alt:
            # The alt text is used as the file name, so both are required.
            continue
        print(pic_src + "    " + pic_alt)  # progress output
        # Images are written to the current working directory via save().
        # To store them elsewhere, create the folder first (e.g. with the os
        # module) and prefix pic_alt with that path.
        try:
            res = get(pic_src)
        except requests.RequestException as exc:
            print(pic_src, exc)
            continue
        save(pic_alt, res)

    print(i)

print('over')  # the crawl has finished

你可能感兴趣的:(python,python,爬虫,浏览器)