Python Web Scraping --- Four Ways to Simulate Browser Login
#-----------------Method 1: Selenium+PhantomJS+BeautifulSoup+requests---------------
#Because the requests module cannot execute JavaScript, it cannot handle cookies set by many
#modern tracking scripts such as Google Analytics, which are only set after client-side code runs.
#You can also call delete_cookie(), add_cookie() and delete_all_cookies() to manage cookies,
#and save the cookies for other crawlers to reuse.
#With Selenium and PhantomJS we can reliably capture cookies that only exist after page events have run.
from selenium import webdriver

def login1(username, password, url):
    # any WebDriver works the same way here; this example uses Firefox instead of PhantomJS
    driver = webdriver.Firefox()
    driver.get('https://www.facebook.com')
    driver.find_element_by_id('email').send_keys(username)
    driver.find_element_by_id('pass').send_keys(password)
    driver.find_element_by_id('login_form').submit()
    driver.implicitly_wait(30)
    print(driver.get_cookies())  # print the cookies set during login
    # wait until the search box is available,
    # which means we have successfully logged in
    #search = driver.find_element_by_id('q')
    # now we are logged in, so we can navigate to the page of interest
    #driver.get(url)
    # add code to scrape data of interest here
    driver.close()
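#As noted above, the captured cookies can be saved for other crawlers to reuse. A minimal
#sketch of that save/restore round trip (pickle and the file name cookies.pkl are assumptions
#for illustration, not part of the original code):
import pickle

def save_cookies(driver, path='cookies.pkl'):  # hypothetical file name
    # driver.get_cookies() returns a list of dicts, one per cookie
    with open(path, 'wb') as f:
        pickle.dump(driver.get_cookies(), f)

def load_cookies(driver, path='cookies.pkl'):
    # add_cookie() requires the driver to already be on the cookie's domain
    with open(path, 'rb') as f:
        for cookie in pickle.load(f):
            driver.add_cookie(cookie)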
#-----------------Method 2: requests.Session + cookielib --------------------------------
#The cookielib module (http.cookiejar in Python 3) provides objects that store cookies so that,
#used together with requests, the captured cookies can be resent on subsequent requests -- which
#is exactly what a simulated login needs. Its main classes are CookieJar, FileCookieJar,
#MozillaCookieJar and LWPCookieJar.
#Their relationship: CookieJar --derives--> FileCookieJar --derives--> MozillaCookieJar and LWPCookieJar
#FileCookieJar itself does not implement save(),
#while MozillaCookieJar and LWPCookieJar both do,
#so use MozillaCookieJar or LWPCookieJar to persist cookies to disk automatically.
#s = requests.Session()
#req = s.get(url=url,headers=headers)
#print(s.cookies)
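#For example, a minimal sketch of the save/load round trip with MozillaCookieJar
#(the file name cookies.txt is an assumption for illustration):
import requests
import http.cookiejar

s = requests.Session()
s.cookies = http.cookiejar.MozillaCookieJar('cookies.txt')  # Netscape cookies.txt format
s.get('http://www.baidu.com')
s.cookies.save(ignore_discard=True, ignore_expires=True)  # persist the captured cookies

#a later run (or another crawler) reloads them before making requests
s2 = requests.Session()
s2.cookies = http.cookiejar.MozillaCookieJar('cookies.txt')
s2.cookies.load(ignore_discard=True, ignore_expires=True)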
import requests
import http.cookiejar
from bs4 import BeautifulSoup

session = requests.Session()
session.cookies = http.cookiejar.LWPCookieJar("cookie")
agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/5.1.2.3000 Chrome/55.0.2883.75 Safari/537.36'
headers = {
    "Host": "www.zhihu.com",
    "Origin": "https://www.zhihu.com/",
    "Referer": "http://www.zhihu.com/",
    'User-Agent': agent
}
postdata = {
    'password': '*******',  # fill in your password
    'account': '********',  # fill in your account
}
# fetch the login page first to extract the hidden _xsrf CSRF token from the form
response = session.get("https://www.zhihu.com", headers=headers)
soup = BeautifulSoup(response.content, "html.parser")
xsrf = soup.find('input', attrs={"name": "_xsrf"}).get("value")
postdata['_xsrf'] = xsrf
result = session.post('http://www.zhihu.com/login/email', data=postdata, headers=headers)
# persist the session cookies to disk so later runs can skip the login
session.cookies.save(ignore_discard=True, ignore_expires=True)
#***********************************************************************
import requests
import http.cookiejar as cookielib

session = requests.session()
session.cookies = cookielib.LWPCookieJar(filename='cookie')
try:
    session.cookies.load(ignore_discard=True)
except Exception:
    print("Failed to load the cookie file")

def isLogin():
    # without redirects, a logged-out session gets a 3xx response instead of 200
    url = "https://www.zhihu.com/"
    login_code = session.get(url, headers=headers, allow_redirects=False).status_code
    return login_code == 200

if __name__ == '__main__':
    agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/5.1.2.3000 Chrome/55.0.2883.75 Safari/537.36'
    headers = {
        "Host": "www.zhihu.com",
        "Origin": "https://www.zhihu.com/",
        "Referer": "http://www.zhihu.com/",
        'User-Agent': agent
    }
    if isLogin():
        print('You are already logged in')
#requests' Session keeps cookies automatically, so there is no need to maintain cookie contents yourself
#Note: creating a Session (S = requests.Session()) instead of issuing bare requests also resolves 503 errors
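#A minimal sketch of that Session-vs-bare-request difference (httpbin.org is used here
#purely as a neutral test endpoint; it is not part of the original example):
import requests

# a bare requests.get() starts from scratch every time: the cookie set by the
# first call is not sent with any later call
requests.get('https://httpbin.org/cookies/set?k=v', allow_redirects=False)

# a Session resends every cookie it has collected on each subsequent request
s = requests.Session()
s.get('https://httpbin.org/cookies/set?k=v', allow_redirects=False)
print(s.get('https://httpbin.org/cookies').json())  # {'cookies': {'k': 'v'}}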
#-----------------Method 3: urllib.request + http.cookiejar --------------------------------
# -*- coding: UTF-8 -*-
from urllib import request
from http import cookiejar

if __name__ == '__main__':
    # create a CookieJar instance to hold the cookies
    cookie = cookiejar.CookieJar()
    # wrap it in urllib.request's HTTPCookieProcessor to get a cookie handler
    handler = request.HTTPCookieProcessor(cookie)
    # build an opener from the cookie handler
    opener = request.build_opener(handler)
    # the opener's open() method fetches the page
    response = opener.open('http://www.baidu.com')
    # print the cookie information
    for item in cookie:
        print('Name = %s' % item.name)
        print('Value = %s' % item.value)
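#The plain CookieJar above lives only in memory. A minimal sketch of the same opener with
#a persistent jar instead (the file name baidu_cookies.txt is an assumption for illustration):
from urllib import request
from http import cookiejar

jar = cookiejar.MozillaCookieJar('baidu_cookies.txt')  # hypothetical file name
opener = request.build_opener(request.HTTPCookieProcessor(jar))
opener.open('http://www.baidu.com')
# keep session cookies too, so the file is not empty after saving
jar.save(ignore_discard=True, ignore_expires=True)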
#-----------------Method 4: urllib2 + cookielib (Python 2) --------------------------------
import urllib
import urllib2
import cookielib
import lxml.html

LOGIN_EMAIL = '[email protected]'
LOGIN_PASSWORD = 'wu.com'
#LOGIN_URL = 'http://example.webscraping.com/user/login'
LOGIN_URL = 'http://127.0.0.1:8000/places/default/user/login'

def login_cookies():
    """Working login (Python 2).
    """
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    # fetch the login page first so the server sets its session cookie
    # and the hidden form fields (e.g. the CSRF token) can be read
    html = opener.open(LOGIN_URL).read()
    data = parse_form(html)
    data['email'] = LOGIN_EMAIL
    data['password'] = LOGIN_PASSWORD
    encoded_data = urllib.urlencode(data)
    request = urllib2.Request(LOGIN_URL, encoded_data)
    response = opener.open(request)
    print response.geturl()  # a redirect away from the login URL signals success
    return opener
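#The parse_form() helper called above is not defined in this snippet. A minimal sketch of
#what it needs to do, based on the lxml.html import (an assumption, not necessarily the
#original author's exact implementation):
def parse_form(html):
    """Collect every <input> name/value pair from the page, so that hidden
    fields such as the CSRF token are carried into the POST data."""
    tree = lxml.html.fromstring(html)
    data = {}
    for e in tree.xpath('//form//input'):
        if e.get('name'):
            data[e.get('name')] = e.get('value') or ''
    return data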