一、爬取基于某人个人用户的用户信息(人人网)带验证码
步骤:
1.获取验证码图片
2.调用getCode函数处理验证码图片得到相应的验证码数据(通过云打码平台)
3.创建一个session对象,该对象可以发起请求。该对象可以自动将请求后创建的cookie进行存储
4.基于session进行登录操作:session.post()-->cookie会存储到session
5.针对个人主页发起请求,此时session中存储了cookie信息,可以直接访问登录之后的页面
import requests
from lxml import etree
#该函数可以将验证码图片进行识别,返回内容
import http.client, mimetypes, urllib, json, time, requests
class YDMHttp:
    """Small HTTP client for the Yundama captcha-recognition API.

    Every API call is a form POST to ``apiurl`` whose JSON reply is decoded
    into a dict.  Status-style methods return the negative ``ret`` code from
    the reply when one is present, and -9001 when the reply is empty.
    """

    apiurl = 'http://api.yundama.com/api.php'
    # Class-level defaults; each instance overwrites them in __init__.
    username = ''
    password = ''
    appid = ''
    appkey = ''

    def __init__(self, username, password, appid, appkey):
        """Store the account and application credentials.

        ``appid`` is converted to ``str`` because it is sent as a form field.
        """
        self.username = username
        self.password = password
        self.appid = str(appid)
        self.appkey = appkey

    def _build_fields(self, method, **extra):
        """Return the form fields for ``method``: credentials plus ``extra``."""
        fields = {
            'method': method,
            'username': self.username,
            'password': self.password,
            'appid': self.appid,
            'appkey': self.appkey,
        }
        fields.update(extra)
        return fields

    @staticmethod
    def _value_or_error(response, key):
        """Return ``response[key]``, a negative ``ret`` code, or -9001 if empty."""
        if not response:
            return -9001
        ret = response['ret']
        if ret and ret < 0:
            return ret
        return response[key]

    def request(self, fields, files=None):
        """POST ``fields`` (and optional ``files``) and return the decoded JSON.

        ``files`` maps form-field names to file paths; ``None`` means no upload.
        """
        response = self.post_url(self.apiurl, fields, files)
        return json.loads(response)

    def balance(self):
        """Return the ``balance`` value of the reply, or an error code."""
        response = self.request(self._build_fields('balance'))
        return self._value_or_error(response, 'balance')

    def login(self):
        """Return the ``uid`` value of the reply, or an error code."""
        response = self.request(self._build_fields('login'))
        return self._value_or_error(response, 'uid')

    def upload(self, filename, codetype, timeout):
        """Upload the image at ``filename`` and return its ``cid``, or an error code."""
        data = self._build_fields(
            'upload', codetype=str(codetype), timeout=str(timeout))
        response = self.request(data, {'file': filename})
        return self._value_or_error(response, 'cid')

    def result(self, cid):
        """Return the ``text`` value for ``cid``, or ``''`` when it is empty/absent."""
        response = self.request(self._build_fields('result', cid=str(cid)))
        if not response:
            return ''
        return response['text'] or ''

    def decode(self, filename, codetype, timeout):
        """Upload an image and poll once per second for its recognised text.

        Returns ``(cid, text)`` on success, ``(cid, '')`` when the upload
        itself returned a non-positive code, and ``(-3003, '')`` when no text
        arrived within ``timeout`` polls.
        """
        cid = self.upload(filename, codetype, timeout)
        if cid <= 0:
            return cid, ''
        for _ in range(timeout):
            text = self.result(cid)
            if text != '':
                return cid, text
            time.sleep(1)
        return -3003, ''

    def report(self, cid):
        """Send a ``report`` call with flag ``'0'`` for ``cid``; return ``ret`` or -9001."""
        data = self._build_fields('report', cid=str(cid), flag='0')
        response = self.request(data)
        if not response:
            return -9001
        return response['ret']

    def post_url(self, url, fields, files=None):
        """POST ``fields`` to ``url``, attaching the files named in ``files``.

        ``files`` maps form-field names to paths.  The files are opened into a
        separate dict (the caller's mapping is not modified) and are always
        closed once the request has finished, even if it raises.
        """
        opened = {}
        try:
            for key, path in (files or {}).items():
                opened[key] = open(path, 'rb')
            res = requests.post(url, files=opened or None, data=fields)
            return res.text
        finally:
            for handle in opened.values():
                handle.close()
def getCode(imgPath, codetype=2004, timeout=20):
    """Recognise the captcha image at ``imgPath`` through the Yundama service.

    Logs in, prints the uid and account balance, then uploads the image and
    polls for the recognised text.

    Args:
        imgPath: Path of the captcha image file to upload.
        codetype: Yundama captcha type id (e.g. 1004 is documented by the
            service as 4 alphanumeric characters; price differs per type, see
            http://www.yundama.com/price.html).
        timeout: Timeout in seconds handed to ``YDMHttp.decode``.

    Returns:
        The recognised text, or ``''`` when recognition fails or the
        credentials below are still the placeholder values.
    """
    # NOTE(review): account credentials are hard-coded in source; consider
    # loading them from configuration or the environment instead.
    # Regular account user name and password.
    username = 'bobo328410948'
    password = 'bobo328410948'
    # Software id and key from the developer console ("My software");
    # required for developer revenue sharing.
    appid = 6003
    appkey = '1f4b564483ae5c907a1d34f8e2f2776c'

    # Sanity check: refuse to run with the placeholder user name.
    if username == 'username':
        print('请设置好相关参数再测试')
        # Return the same "no result" value decode() uses, so the name
        # `result` is never read while unbound.
        return ''

    yundama = YDMHttp(username, password, appid, appkey)

    # Log in to Yundama.
    uid = yundama.login()
    print('uid: %s' % uid)

    # Query the remaining balance.
    balance = yundama.balance()
    print('balance: %s' % balance)

    # Recognise: image path, captcha type id, timeout (seconds).
    cid, result = yundama.decode(imgPath, codetype, timeout)
    print('cid: %s, result: %s' % (cid, result))
    return result
# Browser-like User-Agent sent with every request below.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
    )
}

# Create the session first: it stores the cookies set by each response and
# sends them on later requests.  The login page and captcha image are fetched
# through the same session that performs the login, so all three requests
# share one set of cookies.
session = requests.session()

# Fetch the login page and locate the captcha image.
url = 'http://www.renren.com/'
page_text = session.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
# Raises IndexError if the page contains no captcha <img> element.
codeImg_url = tree.xpath('//*[@id="verifyPic_login"]/@src')[0]

# Download the captcha image bytes and store them locally.
codeImg_data = session.get(url=codeImg_url, headers=headers).content
with open('./code.jpg', 'wb') as fp:
    fp.write(codeImg_data)

# Recognise the text shown in the captcha image.
code_content = getCode('code.jpg')
print(code_content)

# Step 1: log in, so the server issues cookies identifying the current user.
# Step 2: request the profile page, carrying the cookies from step 1.
url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2018921047557'
data = {
    "email": "[email protected]",
    "icode": code_content,
    "origURL": "http://www.renren.com/home",
    "domain": "renren.com",
    "key_id": "1",
    "captcha_type": "web_login",
    "password": "5bc148793949a14590c2a43e32204fe4e486b821cd4641abaecba424fa27336d",
    "rkey": "142a1b216077e29550fc37b2057fa297",
    "f": "http%3A%2F%2Fwww.renren.com%2F289676607%2Fprofile",
}
# The response body is not needed; the request is made for its cookies,
# which the session stores.
session.post(url=url, data=data, headers=headers)

# Request the personal profile page with the logged-in session and save it.
person_url = 'http://www.renren.com/289676607/profile'
page_text = session.get(url=person_url, headers=headers).text
with open('./renren.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
二.基于requests模块的代理操作
什么是代理
代理就是第三方代替本体处理相关事务。例如:生活中的代理:代购,中介,微商......
爬虫中为什么需要使用代理
一些网站会有相应的反爬虫措施,例如很多网站会检测某一段时间内某个IP的访问次数,如果访问频率太快以至于看起来不像正常访客,它可能就会禁止这个IP的访问。所以我们需要设置一些代理IP,每隔一段时间换一个代理IP,就算IP被禁止,依然可以换个IP继续爬取。
代理的分类:
正向代理:代理客户端获取数据。正向代理是为了保护客户端防止被追究责任。
反向代理:代理服务器提供数据。反向代理是为了保护服务器或负责负载均衡。
免费代理ip提供网站
1.http://www.goubanjia.com/
2.西祠代理
3.快代理
import requests
import random
if __name__ == "__main__":
    # User-Agent headers imitating different browsers.
    header_list = [
        # Maxthon
        {"user-agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)"},
        # Firefox
        {"user-agent": "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1"},
        # Chrome
        {
            "user-agent": (
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 "
                "(KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11"
            )
        },
    ]
    # Candidate proxy servers.
    proxy_list = [
        {"http": "112.115.57.20:3128"},
        {'http': '121.41.171.223:3128'},
    ]
    # Pick one User-Agent and one proxy at random.
    header = random.choice(header_list)
    proxy = random.choice(proxy_list)

    url = 'http://www.baidu.com/s?ie=UTF-8&wd=ip'
    # The `proxies` argument routes this request through the chosen proxy.
    response = requests.get(url=url, headers=header, proxies=proxy)

    # Save the raw response bytes; no text decoding is involved, so no
    # response encoding needs to be set.
    with open('daili.html', 'wb') as fp:
        fp.write(response.content)

    # No "switch back" request is needed: `proxies` applies only to the call
    # it is passed to, so later requests without it do not use the proxy.