requests模块的cookie和代理操作

一、爬取某个用户登录后的个人主页信息(人人网,带验证码)

步骤:
1.获取验证码图片
2.调用getCode函数处理验证码图片得到相应的验证码数据(通过云打码平台)
3.创建一个session对象,该对象可以发起请求。该对象可以自动将请求后创建的cookie进行存储
4.基于session进行登录操作:session.post()-->cookie会存储到session
5.针对个人主页发起请求,此时session中存储了cookie信息,可以直接访问登陆之后的页面

import requests
from lxml import etree
#该函数可以将验证码图片进行识别,返回内容

import http.client, mimetypes, urllib, json, time, requests


class YDMHttp:
    """Small HTTP client for the Yundama captcha-recognition API.

    Every API call is a form POST to ``apiurl`` that carries the account
    credentials plus a ``method`` field; the reply body is parsed as JSON.
    Methods return the server's negative ``ret`` code on failure and
    ``-9001`` when the parsed reply is empty.
    """

    apiurl = 'http://api.yundama.com/api.php'
    username = ''
    password = ''
    appid = ''
    appkey = ''

    def __init__(self, username, password, appid, appkey):
        """Store the credentials; ``appid`` is normalised to a string."""
        self.username = username
        self.password = password
        self.appid = str(appid)
        self.appkey = appkey

    def _payload(self, method, **extra):
        """Build the form fields shared by every call, plus ``extra``."""
        data = {
            'method': method,
            'username': self.username,
            'password': self.password,
            'appid': self.appid,
            'appkey': self.appkey,
        }
        data.update(extra)
        return data

    @staticmethod
    def _ret_or_field(response, key):
        """Return the negative ``ret`` code, else ``response[key]``.

        An empty/falsy response yields ``-9001``.
        """
        if not response:
            return -9001
        ret = response.get('ret')
        if ret and ret < 0:
            return ret
        return response[key]

    def request(self, fields, files=None):
        """POST ``fields`` (and optional ``files``) and return parsed JSON.

        ``files`` maps form-field names to file paths. It defaults to
        ``None`` instead of a shared mutable list.
        """
        return json.loads(self.post_url(self.apiurl, fields, files))

    def balance(self):
        """Return the account balance, or a negative error code."""
        response = self.request(self._payload('balance'))
        return self._ret_or_field(response, 'balance')

    def login(self):
        """Log in and return the user id, or a negative error code."""
        response = self.request(self._payload('login'))
        return self._ret_or_field(response, 'uid')

    def upload(self, filename, codetype, timeout):
        """Upload the image at ``filename``; return its captcha id (cid).

        A negative value is an error code.
        """
        data = self._payload(
            'upload', codetype=str(codetype), timeout=str(timeout))
        response = self.request(data, {'file': filename})
        return self._ret_or_field(response, 'cid')

    def result(self, cid):
        """Return the recognised text for ``cid``, or ``''`` if none yet.

        A reply without a ``text`` field is treated as "no result yet"
        rather than raising ``KeyError``.
        """
        response = self.request(self._payload('result', cid=str(cid)))
        if not response:
            return ''
        return response.get('text') or ''

    def decode(self, filename, codetype, timeout):
        """Upload an image and poll once per second for its text.

        Returns ``(cid, text)`` on success, ``(-3003, '')`` when no text
        arrived within ``timeout`` polls, and ``(error_code, '')`` when
        the upload itself failed.
        """
        cid = self.upload(filename, codetype, timeout)
        if cid <= 0:
            return cid, ''
        for _ in range(timeout):
            text = self.result(cid)
            if text != '':
                return cid, text
            time.sleep(1)
        return -3003, ''

    def report(self, cid):
        """Report ``cid`` with ``flag='0'``; return the server ``ret``."""
        response = self.request(self._payload('report', cid=str(cid), flag='0'))
        if response:
            return response['ret']
        return -9001

    def post_url(self, url, fields, files=None):
        """POST ``fields`` and the files at the given paths; return body text.

        ``files`` maps form-field names to file paths. The files are
        opened here and always closed afterwards, and the caller's
        mapping is left untouched.
        """
        opened = {}
        try:
            for key, path in (files or {}).items():
                opened[key] = open(path, 'rb')
            res = requests.post(url, files=opened, data=fields)
            return res.text
        finally:
            for handle in opened.values():
                handle.close()


def getCode(imgPath):
    """Recognise the captcha image at ``imgPath`` via the Yundama service.

    Logs in, prints the user id and balance, then uploads the image and
    polls for the recognised text.

    Returns the recognised text (``''`` if recognition failed or timed
    out), or ``None`` when the placeholder credentials were not replaced.
    """
    # Regular (non-developer) account name and password.
    # NOTE(review): credentials are hard-coded here; replace with your own.
    username = 'bobo328410948'
    password = 'bobo328410948'
    # Software ID and key, required for the developer revenue share;
    # found under "My Software" in the developer console.
    appid = 6003
    appkey = '1f4b564483ae5c907a1d34f8e2f2776c'
    # Image file to recognise.
    filename = imgPath
    # Captcha type, e.g. 1004 means 4 alphanumeric characters. Types are
    # priced differently and a wrong type hurts the recognition rate.
    # All types are listed at http://www.yundama.com/price.html
    codetype = 2004
    # Timeout in seconds.
    timeout = 20

    # Guard against running with the untouched template value.
    if username == 'username':
        print('请设置好相关参数再测试')
        return None

    yundama = YDMHttp(username, password, appid, appkey)

    # Log in to Yundama.
    uid = yundama.login()
    print('uid: %s' % uid)

    # Query the remaining balance.
    balance = yundama.balance()
    print('balance: %s' % balance)

    # Run recognition: image path, captcha type id, timeout in seconds.
    cid, result = yundama.decode(filename, codetype, timeout)
    print('cid: %s, result: %s' % (cid, result))
    return result

# Browser-like User-Agent sent with every request in this script.
# The literal is split with implicit string concatenation so that no
# single-quoted string spans a line break.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
    ),
}

# Fetch the login page and extract the captcha image URL from it.
url = 'http://www.renren.com/'
page_text = requests.get(url=url,headers=headers).text
tree = etree.HTML(page_text)
# [0] raises IndexError if the page has no element with this id.
codeImg_url = tree.xpath('//*[@id="verifyPic_login"]/@src')[0]
# Request the captcha image URL and save the binary image data locally.
codeImg_data = requests.get(url=codeImg_url,headers=headers).content
with open('./code.jpg','wb') as fp:
    fp.write(codeImg_data)
# Recognise the text in the saved captcha image.
code_content = getCode('code.jpg')
print(code_content)

# Create a session object. It can send requests and automatically stores
# the cookies set by the responses it receives.
session = requests.session()

# 1. Log in: the server creates a cookie for the current user that holds
#    the user's state, including the identity marker.
# 2. Request the personal home page, carrying the cookie from step 1, to
#    get that user's page data.

# Perform the login.
url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2018921047557'

# Login form fields.
# NOTE(review): the "email" value looks like an e-mail-obfuscation
# placeholder, not a real address; replace it before running.
data = {
    "email":"[email protected]",
    "icode":code_content,
    "origURL":"http://www.renren.com/home",
    "domain":"renren.com",
    "key_id":"1",
    "captcha_type":"web_login",
    "password":"5bc148793949a14590c2a43e32204fe4e486b821cd4641abaecba424fa27336d",
    "rkey":"142a1b216077e29550fc37b2057fa297",
    "f":"http%3A%2F%2Fwww.renren.com%2F289676607%2Fprofile",
}
# Purpose: obtain the login cookie.
session.post(url=url,data=data,headers=headers)  # the cookie is stored on the session

# Request the personal home page through the same session.
person_url = 'http://www.renren.com/289676607/profile'
page_text = session.get(url=person_url,headers=headers).text

# Save the fetched page for inspection.
with open('./renren.html','w',encoding='utf-8') as fp:
    fp.write(page_text)

二、基于requests模块的代理操作

什么是代理

代理就是第三方代替本体处理相关事务。例如:生活中的代理:代购,中介,微商......

爬虫中为什么需要使用代理

一些网站会有相应的反爬虫措施,例如很多网站会检测某一段时间内某个IP的访问次数,如果访问频率太快以至于看起来不像正常访客,它可能就会禁止这个IP的访问。所以我们需要设置一些代理IP,每隔一段时间换一个代理IP,就算IP被禁止,依然可以换个IP继续爬取。

代理的分类:

正向代理:代理客户端获取数据。正向代理是为了保护客户端防止被追究责任。
反向代理:代理服务器提供数据。反向代理是为了保护服务器或负责负载均衡。

免费代理ip提供网站

1.http://www.goubanjia.com/
2.西祠代理
3.快代理

import requests
import random
if __name__ == "__main__":
    # Demo: fetch a page through a randomly chosen proxy with a randomly
    # chosen User-Agent, and save the response body to daili.html.

    # User-Agent headers of different browsers.
    header_list = [
        # Maxthon
        {"user-agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)"},
        # Firefox
        {"user-agent": "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"},
        # Chrome (split with implicit concatenation so that the literal
        # does not span a line break)
        {
            "user-agent": (
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 "
                "(KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
            ),
        },
    ]
    # Candidate proxy servers, keyed by URL scheme.
    proxy_list = [
        {"http": "112.115.57.20:3128"},
        {'http': '121.41.171.223:3128'},
    ]
    # Pick a random User-Agent and a random proxy.
    header = random.choice(header_list)
    proxy = random.choice(proxy_list)

    url = 'http://www.baidu.com/s?ie=UTF-8&wd=ip'
    # The third argument sets the proxy for this request.
    response = requests.get(url=url, headers=header, proxies=proxy)
    response.encoding = 'utf-8'

    # The raw bytes are written, so the encoding set above does not
    # affect the saved file.
    with open('daili.html', 'wb') as fp:
        fp.write(response.content)
    # Original intent: "switch back to the original IP".
    # NOTE(review): `proxies` is passed per request, so this call looks
    # unnecessary; it is kept to preserve the original behaviour.
    requests.get(url, proxies={"http": ""})

你可能感兴趣的:(requests模块的cookie和代理操作)