Python 网络爬虫---四种方法模拟浏览器登录

Python 网络爬虫---四种方法模拟浏览器登录



#-----------------第一种情况:Selenium+PhantomJS+BeautifulSoup+requests---------------


#因为 requests 模块不能执行 JavaScript,所以它不能处理很多新式的跟踪软件生成的 cookie,比如 Google Analytics,只有当客户端脚本执行后才设置 cookie
#Selenium 的 WebDriver 还可以调用 delete_cookie()、add_cookie() 和 delete_all_cookies() 方法来处理 cookie。另外,还可以把 cookie 保存下来,以备其他网络爬虫使用。
#通过Selenium和PhantomJS,我们可以很好的处理一些需要事件执行后才能获得的cookie。


from selenium import webdriver
def login1(username, password, url):
    """Log in to Facebook through a Selenium-driven Firefox and print the cookies.

    Args:
        username: Text typed into the form field whose id is ``email``.
        password: Text typed into the form field whose id is ``pass``.
        url: Page to visit once logged in. Currently unused, because the
            navigation step below is still commented out.
    """
    driver = webdriver.Firefox()
    try:
        # An implicit wait only applies to element lookups made after it is
        # set, so configure it before the first find_element_* call.
        driver.implicitly_wait(30)
        driver.get('https://www.facebook.com')
        driver.find_element_by_id('email').send_keys(username)
        driver.find_element_by_id('pass').send_keys(password)
        driver.find_element_by_id('login_form').submit()
        print(driver.get_cookies())  # print the cookies
        # wait until the search box is available,
        # which means we have successfully logged in
        #search = driver.find_element_by_id('q')
        # now we are logged in so we can navigate to the page of interest
        #driver.get(url)
        # add code to scrape data of interest here
    finally:
        # Always end the WebDriver session, even when a step above raised;
        # quit() shuts the whole session down, close() only closes the window.
        driver.quit()


#-----------------第二种情况:requests.Session+cookielib  --------------------------------

#cookielib模块的主要作用是提供可存储cookie的对象,以便于requests模块配合使用来访问Internet资源。Cookielib模块非常强大,我们可以利用本模块的CookieJar类的对象来捕获cookie并在后续连接请求时重新发送,比如可以实现模拟登录功能。该模块主要的对象有CookieJar、FileCookieJar、MozillaCookieJar、LWPCookieJar。
#它们的关系:CookieJar —-派生—->FileCookieJar  —-派生—–>MozillaCookieJar和LWPCookieJar
#FileCookieJar 本身并没有实现 save 函数。
#而MozillaCookieJar或LWPCookieJar都已经实现了。
#所以可以用MozillaCookieJar或LWPCookieJar,去自动实现cookie的save。

#s = requests.Session()
#req = s.get(url=url,headers=headers)
#print(s.cookies)


import requests
import http.cookiejar
from bs4 import BeautifulSoup
# Session whose cookies live in an LWP-format cookie jar backed by the file "cookie".
session = requests.Session()
session.cookies = http.cookiejar.LWPCookieJar("cookie")
agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/5.1.2.3000 Chrome/55.0.2883.75 Safari/537.36'
headers = {
    "Host": "www.zhihu.com",
    "Origin":"https://www.zhihu.com/",
    "Referer":"http://www.zhihu.com/",
    'User-Agent':agent
}


postdata = {
    'password': '*******',  # fill in the password
    'account': '********',  # fill in the account
}
# Fetch the home page first: it carries the hidden "_xsrf" form field that
# has to be sent back together with the credentials.
response = session.get("https://www.zhihu.com", headers=headers)
soup = BeautifulSoup(response.content, "html.parser")
xsrf_input = soup.find('input', attrs={"name": "_xsrf"})
if xsrf_input is None:
    # soup.find() returns None when the tag is absent; fail with a clear
    # message instead of an AttributeError on None.
    raise RuntimeError('no <input name="_xsrf"> found on https://www.zhihu.com')
xsrf = xsrf_input.get("value")
postdata['_xsrf'] = xsrf
# Post over HTTPS (same scheme as the GET above) so the account and password
# are not sent in clear text.
result = session.post('https://www.zhihu.com/login/email', data=postdata, headers=headers)
# Persist the cookies, including session-only and expired ones, to the "cookie" file.
session.cookies.save(ignore_discard=True, ignore_expires=True)

#***********************************************************************

import requests
import http.cookiejar as cookielib
# Fresh session whose cookie jar is backed by the file "cookie".
session = requests.session()
session.cookies = cookielib.LWPCookieJar(filename='cookie')
try:
    # ignore_discard=True also restores cookies that were marked session-only.
    session.cookies.load(ignore_discard=True)
except (OSError, cookielib.LoadError):
    # OSError: the file is missing or unreadable; LoadError: the file is not
    # a valid LWP cookie file. Anything else (e.g. KeyboardInterrupt) is no
    # longer swallowed.
    print("Cookie 未能加载")
def isLogin():
    """Return True when the zhihu home page answers with HTTP 200.

    The request goes through the module-level ``session`` with the
    module-level ``headers`` and does not follow redirects, so any other
    status code (a redirect included) yields False.
    NOTE(review): presumably a logged-out session is redirected away from
    the home page -- confirm against the site.
    """
    home_page = "https://www.zhihu.com/"
    reply = session.get(home_page, headers=headers, allow_redirects=False)
    return reply.status_code == 200
        
if __name__ == '__main__':
    agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/5.1.2.3000 Chrome/55.0.2883.75 Safari/537.36'
    # isLogin() reads the module-level name `headers`, so bind it before the call.
    headers = {}
    headers["Host"] = "www.zhihu.com"
    headers["Origin"] = "https://www.zhihu.com/"
    headers["Referer"] = "http://www.zhihu.com/"
    headers['User-Agent'] = agent
    # Report only the logged-in case; nothing is printed otherwise.
    logged_in = isLogin()
    if logged_in:
        print('您已经登录')
#requests的Session可以自动保持cookie,不需要自己维护cookie内容
#备注 503错误解决办法    S = requests.Session()

#-----------------第三种情况:urllib.request+http.cookiejar  --------------------------------


# -*- coding: UTF-8 -*-
from urllib import request
from http import cookiejar


if __name__ == '__main__':
    # Create a CookieJar instance that will hold the received cookies
    cookie = cookiejar.CookieJar()
    # Build the cookie handler from urllib.request's HTTPCookieProcessor
    handler = request.HTTPCookieProcessor(cookie)
    # Build an opener that routes requests through that handler
    opener = request.build_opener(handler)
    # Open the page; the with-statement closes the response once we are done
    with opener.open('http://www.baidu.com') as response:
        # Print every cookie collected while opening the page
        for item in cookie:
            print('Name = %s' % item.name)
            print('Value = %s' % item.value)

#-----------------第四种情况:urllib2+cookielib(Python 2)  --------------------------------

        
import urllib
import urllib2
import cookielib
import lxml.html


# Credentials and endpoint used by login_cookies() below.
# NOTE(review): '[email protected]' looks like an e-mail-obfuscation artifact
# from the page this snippet was copied from -- replace with a real address.
LOGIN_EMAIL = '[email protected]'
LOGIN_PASSWORD = 'wu.com'
# Alternative login URL, currently disabled in favour of the local server below.
#LOGIN_URL = 'http://example.webscraping.com/user/login'
LOGIN_URL = 'http://127.0.0.1:8000/places/default/user/login'
        
def login_cookies():
    """Log in at LOGIN_URL with a cookie-aware opener and return that opener.

    Fetches the login page, takes the form fields produced by ``parse_form``,
    fills in LOGIN_EMAIL and LOGIN_PASSWORD, posts them back to LOGIN_URL and
    prints the URL of the final response.

    Relies on the Python 2 modules ``urllib2`` and ``cookielib`` imported at
    the top of this section.
    NOTE(review): ``parse_form`` is not defined in the visible part of the
    file; presumably it returns a dict mapping form input names to values --
    confirm.

    Returns:
        The opener built around the CookieJar used for the login requests.
    """
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    html = opener.open(LOGIN_URL).read()
    data = parse_form(html)
    data['email'] = LOGIN_EMAIL
    data['password'] = LOGIN_PASSWORD
    encoded_data = urllib.urlencode(data)
    # Named login_request so it does not shadow the `request` module that is
    # imported from urllib earlier in the file.
    login_request = urllib2.Request(LOGIN_URL, encoded_data)
    response = opener.open(login_request)
    # Parenthesised print is valid as a Python 2 statement and as a Python 3
    # call; the bare statement form is a SyntaxError on Python 3.
    print(response.geturl())
    return opener


    

你可能感兴趣的:(网络爬虫)