Web scraping -- using Selenium to log in to Douban through its CAPTCHA
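
This script drives Chrome with Selenium to log in to Douban: it types in the account and password, grabs the login CAPTCHA image, sends it (base64-encoded) to a CAPTCHA-recognition API on Alibaba Cloud Market, fills the recognised text into the form, submits it, and finally copies the browser cookies into a plain requests call to save a logged-in page.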

from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import requests
from lxml import etree
import base64

# Launch the browser and open the Douban login page
driver = webdriver.Chrome()
url = 'https://accounts.douban.com/login?alias=&redir=https%3A%2F%2Fwww.douban.com%2F&source=index_nav&error=1001'

driver.get(url)
time.sleep(1)
# Type in the account email
driver.find_element(By.ID, 'email').send_keys('[email protected]')
time.sleep(1)
# Type in the password
driver.find_element(By.ID, 'password').send_keys('13668678920hjy@')
time.sleep(1)
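
# Note: the fixed time.sleep() calls above work, but an explicit wait is more robust.
# A minimal sketch (assumes the standard Selenium wait helpers, not used below):
#   from selenium.webdriver.support.ui import WebDriverWait
#   from selenium.webdriver.support import expected_conditions as EC
#   WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, 'password')))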


# Pull the CAPTCHA details out of the rendered page source
html_str = driver.page_source
html_element = etree.HTML(html_str)
# print(html_element)
# Get the CAPTCHA image URL (the xpath raises IndexError if no CAPTCHA is shown)
try:
    image_url = html_element.xpath('//img[@id="captcha_image"]/@src')[0]
    print(image_url)
    # Download the CAPTCHA image
    response = requests.get(image_url)

    # Base64-encode the image bytes for the recognition API
    base64_str = base64.b64encode(response.content)
    v_type = 'cn'
    # Form data to POST to the CAPTCHA-recognition service
    form = {
        'v_pic': base64_str,
        'v_type': v_type,
    }
    # Authorization header required by the CAPTCHA-recognition service
    headers = {
        'Authorization': 'APPCODE 926e3a416dd34ef0be35a19809ade4c9',
    }
    # Ask the CAPTCHA-recognition service to read the image
    dmpt_url = 'http://yzmplus.market.alicloudapi.com/fzyzm'
    response = requests.post(dmpt_url, data=form, headers=headers)
    print(response.text)
    # captcha_value is the recognised CAPTCHA text
    captcha_value = response.json()['v_code']
    print(image_url)
    print(captcha_value)
    driver.find_element(By.ID, 'captcha_field').send_keys(captcha_value)
    time.sleep(1)
except IndexError:
    # No CAPTCHA image on the page this time, so just move on and submit
    pass
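
# Fallback sketch (an assumption, not part of the original flow): if the recognition
# service is unavailable, the CAPTCHA image could be saved and typed in by hand:
#   with open('captcha.jpg', 'wb') as f:
#       f.write(requests.get(image_url).content)
#   captcha_value = input('Enter the CAPTCHA shown in captcha.jpg: ')
#   driver.find_element(By.ID, 'captcha_field').send_keys(captcha_value)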

# Note on try...except...else: the statements in an else clause run only when no exception was raised.

# Click the login/submit button
driver.find_element(By.CLASS_NAME, 'btn-submit').click()
time.sleep(1)

# Collect all cookies from the logged-in browser session
cookies = driver.get_cookies()
cookie_list = []
# For each cookie dict, take its name and value and join them as name=value
for cookie_dict in cookies:
    cookie_str = cookie_dict['name'] + '=' + cookie_dict['value']
    cookie_list.append(cookie_str)
# Join all cookies into a single Cookie header string
header_cookie = ';'.join(cookie_list)
# print(header_cookie)
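
# Equivalent alternative (a sketch, not used below): load the Selenium cookies
# straight into a requests.Session instead of building the header string by hand.
session = requests.Session()
for cookie_dict in cookies:
    session.cookies.set(cookie_dict['name'], cookie_dict['value'])
# session.get(url) would then send these cookies automatically.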

headers = {
    'Cookie': header_cookie,
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
}

# Re-use the browser cookies in a plain requests call to fetch a logged-in page
another_url = 'https://www.douban.com/accounts/'
response = requests.get(another_url, headers=headers)

# Save the response so we can check that the login cookies worked
with open('db.html', 'wb') as f:
    f.write(response.content)
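
# Close the browser once the page has been saved
driver.quit()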
