豆瓣模拟登陆

1、模拟headers
豆瓣模拟登陆_第1张图片

2、登陆豆瓣,查看并构造表单
豆瓣模拟登陆_第2张图片

两种实现方式,一种普通,一种使用session

import requests
from bs4 import BeautifulSoup
from urllib.request import urlretrieve

login_url = 'https://accounts.douban.com/login'  # 登陆网址
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
    'Host':'accounts.douban.com',
    'Referce':'https://www.douban.com/',   
}  # 模拟headers

# s = requests.session()  # 创建session
# s.headers.update(headers)  # 之后会都会带上headers

res = requests.get(login_url, headers=headers)
# res = s.get(login_url)  # 不用再模拟headers

res.encoding = 'utf-8'
print(res.status_code)
soup = BeautifulSoup(res.text, 'lxml')
captcha = soup.find_all('div', class_='item item-captcha')
captcha_id = captcha[0].find_all('input')
captcha_id = captcha_id[1]['value']  # 得到captcha-id
if len(captcha):  # 如果存在验证码
    captcha_url = captcha[0].div.img['src']  # 得到验证码图片地址
    urlretrieve(captcha_url, 'd:/captcha.jpg')
    captcha_solution = input('Please input captcha:\n')
    form_data = {
    'source':'index_nav',
    'redir':'https://www.douban.com/',
    'form_email':'',
    'form_password':'',
    'captcha-solution':captcha_solution,
    'captcha-id':captcha_id,
    }  # 构造表单
else:
     form_data = {
    'source':'index_nav',
    'redir':'https://www.douban.com/',
    'form_email':'',
    'form_password':'',
    }
res_login = requests.post(login_url, headers=headers, data=form_data)
# res_login = s.post(login_url, data=form_data)

res_login.encoding = 'utf-8'
print(res.status_code)
soup = BeautifulSoup(res_login.text, 'lxml')
print(soup)

你可能感兴趣的:(python爬虫)