Crawler Notes (6) -- Handling Anti-Scraping

Identity disguise

  1. Tell the server the request comes from a person (a browser)
    User-Agent: declares who is sending the request
  2. Human-like traits: where did the request come from
    Request headers
    Referer
  3. An "ID card" (cookies)
    Carries user information and site information
    In practice, add User-Agent, Referer and cookies together (a combined sketch follows this list)
  4. Request frequency or request volume too high -> blacklisted
    Workaround: use alternate accounts
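A minimal combined sketch of point 3 (the Referer value and the cookie name/value below are placeholders, not taken from a real site):

import requests
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
    "Referer": "http://www.httpbin.org/",
}  # pretend to be a browser that arrived from httpbin's front page
cookies = {"session_id": "placeholder-value"}  # hypothetical cookie name and value
url = "http://www.httpbin.org/headers"
res = requests.get(url, headers=headers, cookies=cookies).text
print(res)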

Adding headers

import requests
url = "http://www.httpbin.org/headers"
res = requests.get(url).text
print(res)

[Figure 1: httpbin /headers response showing the default python-requests User-Agent]
From the output we can see that the User-Agent of the request sent by the Python code is python-requests.
When visiting the same page from a browser:
[Figure 2: the request headers sent by the browser, including its own User-Agent]
Adding the headers manually:

import requests
headers={
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
}  # the User-Agent sent by a real browser
url = "http://www.httpbin.org/headers"
res = requests.get(url,headers=headers).text
print(res)

[Figure 3: httpbin /headers response now showing the browser User-Agent]

Adding cookies (three ways)

  • Use "Cookie" as the dict key and the whole cookie string as the value
import requests
cookie={
    "Cookie": "_uuid=EEC25884-F044-5E17-FBAC-414E3EA1540283053infoc ",
}  # in the raw cookie string, the text before '=' is the name and the text after is the value
url = "http://www.httpbin.org/cookies"
res = requests.get(url,cookies=cookie).text
print(res)

[Figure 4: httpbin /cookies response echoing the cookie that was sent]

  • Copy the cookie content shown in the screenshot above into cookie={ } as individual name/value pairs
  • Add the cookie inside headers; note that some sites do not accept a cookie passed directly in headers (a sketch of both follows below)
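A minimal sketch of the last two approaches (the cookie string is a placeholder based on the example above):

import requests
url = "http://www.httpbin.org/cookies"
cookie_string = "_uuid=EEC25884-F044-5E17-FBAC-414E3EA1540283053infoc; other=value"  # placeholder
# way 2: split the raw string into name/value pairs and pass them via cookies=
cookies = dict(item.split("=", 1) for item in cookie_string.split("; "))
print(requests.get(url, cookies=cookies).text)
# way 3: put the whole string into the Cookie request header (some sites reject this)
headers = {"Cookie": cookie_string}
print(requests.get(url, headers=headers).text)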

Setting a timeout

Assign a value (in seconds) to timeout; if the request takes longer, an exception is raised, so use it together with try/except.

import requests
url = "http://www.httpbin.org/ip"
ip = {"http":"http://192.168.2.123:8080"}
try:
    res = requests.get(url,proxies=ip,timeout=3).text
    print(res)
except:
    pass

Adding a proxy

requests proxy format: {"protocol": "protocol://ip:port"}, e.g. {"http": "http://127.0.0.1:80"}
Visiting http://www.httpbin.org/ip shows the public IP of the local machine; if the proxy works, the IP shown changes.

import requests
url = "http://www.httpbin.org/ip"
ip = {"http":"http://192.168.2.123:8080"}
res = requests.get(url,proxies=ip).text
print(res)

Collecting proxy IPs

import requests
from lxml import etree
page = 0
result = requests.get("http://www.httpbin.org/ip").text
# the local machine's public IP; a proxy works if the IP returned through it differs from this
while True:
    url = "https://www.7yip.cn/free/?action=china&page=" + str(page)
    page += 1
    res = requests.get(url).text
    e = etree.HTML(res)
    types = e.xpath("//td[@data-title='类型']/text()")
    ips = e.xpath("//td[@data-title='IP']/text()")
    port = e.xpath("//td[@data-title='PORT']/text()")
    proxy = { }
    for i in range(len(types)):
        # iterate over every row scraped from the page
        proxy[types[i].lower()] = types[i].lower()+"://"+ips[i]+":"+port[i]
        # key is the protocol (http/https); value joins protocol, ip and port
        print(proxy)
        try:
            res2 = requests.get("http://www.httpbin.org/ip",proxies=proxy,timeout=3).text
            # send a request through the candidate proxy
            if result not in res2:
                # the local IP does not appear in the response, so the proxy really forwarded the request
                print(proxy, "proxy works")
            proxy = {}
            # reset; otherwise the next row is added to the same dict and one line shows two proxies
        except:
            proxy = {}

IP proxy pool

Operations for managing IPs, kept separate from the actual scraping business logic
  • Container object (database)
    Methods: save, get and delete an IP, get the count, get all IPs; it does not judge availability
  • IP proxy pool
    Methods: check whether an IP works, crawl usable IPs (stop once the threshold is reached), store collected IPs in the container, re-check the stored IPs
import requests
import pymysql
from lxml import etree
conn = pymysql.Connect(
        db = 'spider',
        host = 'localhost',
        port = 3306,
        user = 'root',
        password= '123',
        charset="utf8"
)
cursor = conn.cursor()
check_url = "http://www.httpbin.org/ip"
my_ip = requests.get(check_url).text
class DB():
    def save(self,ip):
        sql = "insert into ips values (%s)"
        cursor.execute(sql,(ip,))
        # pymysql expects the query parameters as a tuple
        conn.commit()
    def get(self):
        sql = "select *  from ips"
        cursor.execute(sql)
        for i in cursor.fetchall():
            # fetchall returns every row as a tuple; return the first IP string
            return i[0]
    def delete_ip(self,ip):
        sql = "delete from ips where ip = %s"
        cursor.execute(sql,(ip,))
        conn.commit()
    def get_count(self):
        sql="select * from ips"
        cursor.execute(sql)
        return len(cursor.fetchall())
        # fetchall returns a tuple of rows; its length is the count
    def get_all(self):
        sql = "select *  from ips"
        cursor.execute(sql)
        return cursor.fetchall()
class Pool():
    def __init__(self,limit):
        self.limit = limit
        # threshold: target number of IPs to keep in the pool
        self.db = DB()
        # instantiate the database container object
    def check_ip(self, ip):
        # ip arrives as a dict from crawler() or as a str from the database, so only eval() strings
        if isinstance(ip, str):
            ip = eval(ip)
        try:
            # many free proxies are dead, so wrap the request in try
            target_ip = requests.get(check_url, proxies=ip, timeout=3).text
            if target_ip != my_ip:
                return True
        except:
            pass
        return False
    def crawler(self):
        page = 1
        while self.db.get_count() < self.limit:
            # keep collecting while the number of stored IPs is below the threshold
            url = "http://qinghuadaili.com/free/%s/" % str(page)
            page += 1
            html = etree.HTML(requests.get(url).text)
            ip = html.xpath("//tr/td[1]/text()")
            port = html.xpath("//tr/td[2]/text()")
            type = html.xpath("//tr/td[4]/text()")
            for i in range(len(type)):
                proxy = {}
                # build one fresh dict per row so http and https entries do not mix
                proxy[type[i].lower()] = type[i].lower() + "://" + ip[i] + ":" + port[i]
                if self.check_ip(proxy):
                    print("usable ip", proxy)
                    self.db.save(str(proxy))
    def get_ip(self):
        ip = self.db.get()
        self.db.delete_ip(ip)
        # remove the IP from the database once it has been handed out
        return ip
    def check_all(self):
        # re-check every stored IP and drop the ones that no longer work
        ips = self.db.get_all()
        for ip in ips:
            if not self.check_ip(ip[0]):
                self.db.delete_ip(ip[0])
        if self.db.get_count() < self.limit:
            self.crawler()
pool = Pool(200)
pool.crawler()
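
A brief usage sketch for the pool above (assuming the ips table exists and crawler() has filled it): get_ip() hands out one stored proxy and removes it from the container, check_all() purges dead entries and tops the pool back up.

proxy = eval(pool.get_ip())  # proxies are stored as str(dict), so eval() restores the dict
print(requests.get("http://www.httpbin.org/ip", proxies=proxy, timeout=3).text)
pool.check_all()  # re-check what is left and refill up to the limit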

CAPTCHA handling and a Renren example

Crawling approach: spread outward (start from one profile and follow its recent visitors)
CAPTCHA: handled via a third-party platform or AI-based recognition
import requests
import pymysql
from lxml import etree

from chaojiying import get_code

conn = pymysql.Connect(
        db = 'spider',
        host = 'localhost',
        port = 3306,
        user = 'root',
        password= '123',
        charset="utf8"
)
cursor = conn.cursor()
cookies = {
               "t":"6c384ab6618045d6fb2691f06c37d4c14",
    }
# usually only one or a few cookies carry the login state; for Renren it is the cookie named "t"
# Renren checks this login state; without it profile pages cannot be viewed
def view_detials(url):
    res = requests.get("http://www.renren.com/"+url+"/profile",cookies=cookies).text
    ele = etree.HTML(res)
    try:
        name = ele.xpath("//title/text()")[0]
        print(name)
        save_data(name)
        # xpath returns a list; [0] takes the page title string
        if name != "人人网 - 验证码":
            img_url = ele.xpath("//div[@id='footprint-box']//li/a/@namecard")
            # spread outward from the recent-visitor list; the id needed for spreading is the number in the namecard attribute
            save_url(img_url)
            # save the whole list of visitor ids
        else:
            img_url = ele.xpath("//div[@class='optional']/img/@src")[0]
            # url of the CAPTCHA image
            img = requests.get(img_url, cookies=cookies).content
            # get_code only accepts the raw binary content of the image
            check_code(get_code(img))
            # send the image to the third-party recognition platform
            # get_code returns a dict; the key 'pic_str' holds the recognized CAPTCHA string


    except:
        pass

def check_code(code):
    check_url = "http://www.renren.com/validateuser.do"
    # url that Renren's CAPTCHA form is posted to
    get_code = code['pic_str']
    data = {
        'id': '880792860',
        'icode': get_code,
        'submit': '继续浏览',
        'requestToken': '637455402',
        '_rtk': 'f1bfca84'
    }
    requests.post(check_url,data=data,cookies=cookies)
def get_url():
    sql = "select url from renren where status = %s"
    # the url column in the database is varchar
    cursor.execute(sql,("0",))
    # the status column is also varchar, so "0" is passed as a string
    url = cursor.fetchone()[0]
    new_sql = "update renren set status =1 where url = %s"
    cursor.execute(new_sql,(url,))
    conn.commit()
    view_detials(url)
def save_url(urls):
    for url in  urls:
        try:
        # the insert may fail (e.g. a CAPTCHA page or a duplicate url), so wrap it in try
            # iterate over the url list collected in view_detials
            sql = "insert into renren values (%s,%s)"
            cursor.execute(sql,(url,"0"))
            # newly inserted people get status "0" (not yet visited)
            conn.commit()
        except:
            pass
def save_data(data):
    sql = "insert into renren_data values (%s)"
    # the %s placeholder in the insert statement must be wrapped in parentheses
    cursor.execute(sql,(data,))
    conn.commit()
if __name__ == '__main__':
    while True:
        get_url()
# helper defined in chaojiying.py (sketch); Chaojiying_Client comes from the 超级鹰 demo client
def get_code(im):
    chaojiying = Chaojiying_Client('username', 'password', 'software_id')  # account placeholders
    return chaojiying.PostPic(im, 1902)  # 1902 is the CAPTCHA type code
