python模拟登陆人人网(selenium、cookies、表单)

下面以爬取包贝尔主页信息为例,用代码讲解三种模拟登陆方式

1、导入所需第三方库

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from lxml import etree
import time

2、用selenium模拟登陆

#登录selenium
def login(url, user, password):
    """Log in through the browser and return the HTML of the target page.

    Uses three module-level names that the caller must bind beforehand:
    ``driver`` (the WebDriver), ``wait`` (a WebDriverWait on that driver)
    and ``ul`` (the URL opened after the login form is submitted).

    Args:
        url: Address of the page that hosts the login form.
        user: Text typed into the element with id ``email``.
        password: Text typed into the element with id ``password``.

    Returns:
        ``driver.page_source`` after navigating to ``ul``.
    """
    def wait_for_field(element_id):
        # Block until an element with this id is present in the DOM.
        return wait.until(EC.presence_of_element_located((By.ID, element_id)))

    driver.get(url)
    driver.maximize_window()

    wait_for_field('email').send_keys(user)

    # Type the password, then press Enter to submit the form.
    password_box = wait_for_field('password')
    password_box.send_keys(password)
    password_box.send_keys(Keys.RETURN)

    # Fixed pause before leaving the page -- presumably to let the login
    # request finish; TODO confirm 5 seconds is enough on slow connections.
    time.sleep(5)
    driver.get(ul)
    return driver.page_source

3、用xpath提取网页信息

获得个人简介和代表作,如需爬取其他信息,修改即可

# 利用xpath获取网页信息
def parse_page(html):
    """Extract the profile title and the authentication text from a page.

    Args:
        html: Page source as a string. May be empty or ``None`` (for
            example when the download step failed and returned "").

    Returns:
        A ``(user, topic)`` tuple of stripped strings: the first text node
        of ``<h1 class="avatar_title">`` and of ``<p class="authentication">``.
        A field that is not found is returned as an empty string instead of
        raising ``IndexError``.
    """
    # Nothing to parse: avoid handing an empty document to lxml.
    if not html:
        return "", ""

    dom = etree.HTML(html)
    # Defensive: lxml can yield None when the input has no parsable markup.
    if dom is None:
        return "", ""

    user_list = dom.xpath('//h1[@class="avatar_title"]/text()')
    topic_list = dom.xpath('//p[@class="authentication"]/text()')

    # The page of a logged-out session may lack these nodes, so an empty
    # match list must not be indexed.
    user = user_list[0].strip() if user_list else ""
    topic = topic_list[0].strip() if topic_list else ""
    return user, topic

4、主函数

ul为包贝尔主页,想获取别的网页更改ul即可

if __name__ == '__main__':
    # login() reads `driver`, `wait` and `ul` as module-level globals, so
    # they have to be bound here under exactly these names.
    driver = webdriver.Chrome()
    try:
        wait = WebDriverWait(driver, 10)
        url = "http://www.renren.com"
        ul = "http://www.renren.com/880792860/profile"
        user = input("请输入用户名:")
        password = input("请输入密码:")
        html = login(url, user, password)
        data = parse_page(html)
        print('{}:{}'.format(data[0],data[1]))
    finally:
        # Always shut the browser down, even when login or parsing raises.
        # quit() ends the whole WebDriver session, whereas close() only
        # closes the current window.
        driver.quit()

5、cookies模拟登陆(相对简单)

只需修改上面的第2、4步即可,第3步提取信息的代码不变;第三方库改为导入requests和lxml

修改2

import requests
from lxml import etree

#获取登录后网页源代码
#首先手动登录一次,将登录后的cookie加到headers中
def get_page(url, cookie=None):
    """Download a page with a logged-in session cookie and return its HTML.

    Args:
        url: Address of the page to fetch.
        cookie: Raw ``Cookie`` header value copied from a logged-in browser
            session. When omitted, the sample cookie embedded below is used.

    Returns:
        The response body decoded as UTF-8, or "" if the request failed
        (the error is printed).
    """
    if cookie is None:
        # NOTE(review): this is a captured session cookie; it is presumably
        # expired by now and should be replaced with your own value.
        cookie = 'anonymid=k9ms9idkght7ea; depovince=GW; _r01_=1; JSESSIONID=abctH4Nu1Svub9nGWmlhx; ick_login=4ec1d30a-2073-4b56-957b-46a35291833a; taihe_bi_sdk_uid=84bc59a61b5ab331f13802d75d2ef4c1; taihe_bi_sdk_session=d5a1586e172ec99e162ebab493c1f30a; ick=4124d285-2c06-4b32-964c-eea910677987; XNESSESSIONID=1bfcb13bc2f6; first_login_flag=1; ln_uact=15832184726; ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; jebe_key=138cdf0b-ded6-44c4-b2ed-d0f783393b53%7C9dac2838a292687518bbf3c5313bd627%7C1588253329601%7C1%7C1588253328944; jebe_key=138cdf0b-ded6-44c4-b2ed-d0f783393b53%7C9dac2838a292687518bbf3c5313bd627%7C1588253329601%7C1%7C1588253328951; wp=1; wp_fold=1; jebecookies=e614abe6-69db-48ea-8042-749ef59cac89|||||; _de=3A994080B05890C8BC1347536B8CB062; p=1d07e84cd3e56341724179838d20b9a07; t=b946317273136234495912a975d405697; societyguester=b946317273136234495912a975d405697; id=974339327; xnsid=348fb2fb; loginfrom=syshome'
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
        'Cookie': cookie,
    }
    try:
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()
    except requests.RequestException as e:
        # Only network/HTTP failures are reported here; programming errors
        # are allowed to propagate instead of being hidden.
        print(e)
        return ""
    r.encoding = 'utf-8'
    return r.text

cookies获取:先手动登陆人人网,然后在开发者工具中找到请求头里的Cookie,把整段字符串复制到代码headers的'Cookie'字段中。如图
python模拟登陆人人网(selenium、cookies、表单)_第1张图片
使用开发者工具,右击点击检查,找到network,刷新,找到这个网页
python模拟登陆人人网(selenium、cookies、表单)_第2张图片
用这个cookies代替代码中的cookies,其余不用改

修改4
url为包贝尔主页网址,如需爬取其他网址,修改即可

if __name__ == '__main__':
    url = 'http://www.renren.com/880792860/profile'
    html = get_page(url)
    if not html:
        # get_page() has already printed the request error and returned "".
        # parse_page() indexes the first XPath match, so it must not be
        # given an empty document; stop with a non-zero exit status instead.
        raise SystemExit(1)
    data = parse_page(html)
    print('{}:{}'.format(data[0],data[1]))

6、表单登陆

修改2

import requests
from lxml import etree

def login(url):
    """Submit the login form through the shared session and return the reply.

    The request goes through the module-level session ``s``, which the
    caller must create first, so that later requests made with ``s`` reuse
    whatever cookies the server sets.

    Args:
        url: Address of the login endpoint the form is posted to.

    Returns:
        The response body decoded as UTF-8, or "" if the request failed
        (the error is printed).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36',
    }
    # NOTE(review): these values were captured from one browser login; the
    # password and rkey are presumably tied to that session -- replace them
    # with the values from your own captured request.
    formdata = {"email": "15832184726",
                # icode: (not sent)
                "origURL": "http://www.renren.com/home",
                "domain": "renren.com",
                "key_id": 1,
                "captcha_type": "web_login",
                "password": "89e63f831ebb68a47c50ade29ba604cabf1d1bdb11f1a5cd3a5fad1f34204c49",
                "rkey": "e85e89d5b673c4d6ba6c114a47c6a7f1",
                # NOTE(review): this value is already percent-encoded and
                # will be encoded again when the form is sent -- confirm it
                # matches the captured request.
                "f": "http%3A%2F%2Fwww.renren.com%2F974339327"
    }
    try:
        r = s.post(url, headers=headers, data=formdata, timeout=10)
        r.raise_for_status()
    except requests.RequestException as e:
        # Only network/HTTP failures are reported here; programming errors
        # are allowed to propagate instead of being hidden.
        print(e)
        return ""
    r.encoding = 'utf-8'
    return r.text
def get_page(url):
    """Fetch a page through the shared session and return its HTML.

    The request goes through the module-level session ``s``, which the
    caller must create (and log in with) first.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body decoded as UTF-8, or "" if the request failed
        (the error is printed).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
    }
    try:
        r = s.get(url, headers=headers, timeout=10)
        r.raise_for_status()
    except requests.RequestException as e:
        # Only network/HTTP failures are reported here; programming errors
        # are allowed to propagate instead of being hidden.
        print(e)
        return ""
    r.encoding = 'utf-8'
    return r.text

formdata获取
回到登陆界面,打开开发者工具,找到network,勾选preserve log,然后登陆
python模拟登陆人人网(selenium、cookies、表单)_第3张图片
python模拟登陆人人网(selenium、cookies、表单)_第4张图片
按网页数据修改代码中的formdata

修改4

if __name__ == '__main__':
    # Login endpoint
    login_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=202045949278'
    url = 'http://www.renren.com/880792860/profile'
    # login() and get_page() read the session from the module-level name
    # `s`, so it has to be bound under exactly this name. The `with` block
    # closes the session's connections even if a request raises.
    with requests.Session() as s:
        html_login = login(login_url)
        print(html_login)
        html = get_page(url)
    if not html:
        # get_page() has already printed the request error and returned "".
        # parse_page() indexes the first XPath match, so it must not be
        # given an empty document; stop with a non-zero exit status instead.
        raise SystemExit(1)
    data = parse_page(html)
    print('{}:{}'.format(data[0],data[1]))

你可能感兴趣的:(python模拟登陆人人网(selenium、cookies、表单))