Selenium+PhantomJS实现模拟京东登录

1.用Python抓取网页的过程中,很多网页都需要登录后才可以进行信息采集,而分析登录相关的页面链接有时太费时间,因此引入Selenium+PhantomJS来模拟登录,可以简单、方便地完成登录并拿到cookies

2.实现思路分析
a)访问京东登陆页面
https://passport.jd.com/new/login.aspx
b)输入用户名,密码,复杂的有各种形式验证码,比如淘宝登陆的滑动验证
c)登陆成功,cookies持久化到本地

3.使用技术:Selenium+ChromeDriver(或PhantomJS)

4.代码实现(模拟登陆,cookies持久化到本地)

# -*- coding: utf-8 -*-
import random,time,os
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains

from user_agents import agents


class LoginCookies(object):
    """Log in to JD.com through a Selenium-driven browser and cache the cookies.

    Cookies are cached in ``cookies.txt`` in the current working directory,
    one ``name:value`` pair per line. ``readcookies`` loads that cache into
    ``self.cookies`` and falls back to ``login`` when the file is missing.
    """

    loginurl = 'https://passport.jd.com/new/login.aspx'
    # Placeholder credentials; replace them with a real account before running.
    username ='username'
    pwd = 'pwd'
    headers ={
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Connection': 'keep-alive',
        'Host': 'passport.jd.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3514.0 Safari/537.36',
    }

    def __init__(self):
        """Pick a random user agent; the browser itself is started lazily."""
        self.cookies = {}
        # The browser is created on first access of ``self.driver`` so that
        # loading cookies from the cache does not spawn a process that is
        # never shut down.
        self._driver = None
        self.agent = random.choice(agents)

    @property
    def driver(self):
        """Return the browser session, starting PhantomJS on first access."""
        if getattr(self, '_driver', None) is None:
            # Another Selenium driver (e.g. webdriver.Chrome with '--no-sandbox',
            # or webdriver.Firefox) can be substituted here.
            self._driver = webdriver.PhantomJS()
        return self._driver

    @driver.setter
    def driver(self, value):
        """Allow callers to inject their own browser session."""
        self._driver = value

    def login(self):
        """Submit the login form, then store and persist the session cookies.

        Side effects: writes ``html.txt`` (login page source) and
        ``Screenshots/logindjd.png``, sets ``self.cookies`` to a
        ``{name: value}`` dict of every cookie the browser holds, writes that
        dict to ``cookies.txt`` when it is non-empty, and always quits the
        browser, even when a step fails.
        """
        driver = self.driver

        # Work on a copy so the class-level ``headers`` dict shared by all
        # instances is not modified.
        headers = dict(self.headers)
        headers["User-Agent"] = self.agent
        # NOTE(review): the browser session above is already running when
        # these capabilities are registered, so they appear to affect only
        # PhantomJS sessions created afterwards. Confirm whether they were
        # meant to be passed when the driver is constructed.
        for key, value in headers.items():
            webdriver.DesiredCapabilities.PHANTOMJS['phantomjs.page.customHeaders.{}'.format(key)] = value

        try:
            driver.get(self.loginurl)
            time.sleep(1)
            driver.maximize_window()  # maximize the browser window
            time.sleep(1)

            # Dump the page for debugging; an explicit encoding keeps the
            # write from failing on locales whose default codec is not UTF-8.
            with open('html.txt', 'w', encoding='utf-8') as f:
                f.write(driver.page_source)

            # Switch from the default login tab to the account/password form.
            driver.find_element_by_xpath("//a[text()='账户登录']").click()
            time.sleep(1)
            driver.find_element_by_name('loginname').send_keys(self.username)
            time.sleep(1)
            driver.find_element_by_name('nloginpwd').send_keys(self.pwd)
            time.sleep(1)
            driver.find_element_by_id("loginsubmit").click()
            time.sleep(5)  # give the post-login redirect time to finish

            if not os.path.isdir('Screenshots'):
                os.makedirs('Screenshots')
            driver.save_screenshot('Screenshots/logindjd.png')

            # Keep every cookie as name -> value instead of only the
            # attribute dict of the first cookie.
            jar = {c['name']: c['value'] for c in driver.get_cookies()}
            self.cookies = jar
            if jar:
                # An empty jar is not cached, so the next run logs in again.
                self.savecookies(jar)

            print('search',driver.find_element_by_id('J_searchbg').text)
        finally:
            # Shut the browser down on failure as well as on success.
            driver.quit()

    def savecookies(self,cookies):
        """Write *cookies* (a mapping) to ``cookies.txt`` as ``key:value`` lines.

        The file is created or truncated even when *cookies* is empty.
        """
        with open('cookies.txt', 'w', encoding='utf-8') as f:
            if cookies:
                for k, v in cookies.items():
                    f.write('{0}:{1}\n'.format(k, v))

    def readcookies(self):
        """Load ``cookies.txt`` into ``self.cookies``, or log in if it is missing."""
        if not os.path.exists('cookies.txt'):
            self.login()
            return

        with open('cookies.txt', encoding='utf-8') as f:
            for line in f:
                # Split on the first ':' only, because values may contain ':'.
                key, sep, value = line.rstrip('\r\n').partition(':')
                if not sep:
                    continue  # skip blank or malformed lines
                self.cookies[key] = value

if __name__ == '__main__':
    # Script entry point: fill the cookie dict via readcookies() and print it.
    session = LoginCookies()
    session.readcookies()
    print('logindcookies:', session.cookies)

你可能感兴趣的:(Scrapy,Python进阶基础)