In this section we interact with web pages, submitting user input and retrieving the corresponding content.
A form has several important components: the action, enctype and method attributes of the <form> tag.
If action="#", the form is submitted to the same URL as the current page.
The enctype attribute sets the encoding used when the form data is submitted. The default type is application/x-www-form-urlencoded, in which every non-alphanumeric character is converted to a percent-escaped hexadecimal ASCII code; if the data contains many non-alphanumeric characters, this encoding becomes very inefficient. In that case multipart/form-data should be used instead: the data is not percent-encoded, but is sent as multiple parts using the MIME (Multipurpose Internet Mail Extensions) protocol, the same standard used for email transmission.
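As a quick illustration of the default encoding (a minimal standard-library sketch, not part of the book's example code), urllib.parse.urlencode shows how non-alphanumeric characters are turned into percent-escaped hexadecimal codes:

# -*- coding: utf-8 -*-
from urllib.parse import urlencode

# Sample form fields containing spaces, punctuation and non-ASCII text.
data = {'name': 'Zhang San', 'comment': 'hello world! 你好'}

# application/x-www-form-urlencoded: spaces become '+', and every other
# non-alphanumeric byte becomes a percent-escaped hex code such as %21,
# or a UTF-8 sequence like %E4%BD%A0 for a Chinese character.
print(urlencode(data))
# name=Zhang+San&comment=hello+world%21+%E4%BD%A0%E5%A5%BD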
Login forms can be submitted automatically with Python libraries such as urllib and requests.
# -*- coding: utf-8 -*-
import urllib.parse
import urllib.request
import glob
import os
import http.cookiejar as cookielib
import json
import time
import lxml.html

# Login email
LOGIN_EMAIL = '[email protected]'
# Login password
LOGIN_PASSWORD = 'example'
# Login URL
LOGIN_URL = 'http://example.webscraping.com/user/login'
# Country edit page, used by login_firefox() to verify the session
COUNTRY_URL = 'http://example.webscraping.com/edit/United-Kingdom-239'
def login_basic():
    """fails because not using formkey
    Fails because the hidden _formkey (the form ID) is not submitted,
    so the server redirects back to the login page.
    """
    data = {'email': LOGIN_EMAIL, 'password': LOGIN_PASSWORD}
    # Encode the form data as bytes
    encoded_data = urllib.parse.urlencode(data).encode('utf-8')
    # Build the request
    request = urllib.request.Request(LOGIN_URL, encoded_data)
    # Send the request and receive the response
    response = urllib.request.urlopen(request)
    # Print the response URL: if the login failed we were redirected
    # back to the login page URL
    print(response.geturl())
def login_formkey():
    """fails because not using cookies to match formkey
    Fails because cookies are not kept: the _formkey stored in the cookie
    must match the _formkey submitted with the login form data.
    """
    html = urllib.request.urlopen(LOGIN_URL).read()
    data = parse_form(html)
    data['email'] = LOGIN_EMAIL
    data['password'] = LOGIN_PASSWORD
    encoded_data = urllib.parse.urlencode(data).encode('utf-8')
    request = urllib.request.Request(LOGIN_URL, encoded_data)
    response = urllib.request.urlopen(request)
    # Print the response URL: if the login failed we were redirected
    # back to the login page URL
    print(response.geturl())
def login_cookies():
    """working login
    Normal, working login.
    """
    # Create a CookieJar object to store the cookies
    cj = cookielib.CookieJar()
    # Build an opener with an HTTPCookieProcessor cookie handler
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    # Fetch the HTML of the login page
    html = opener.open(LOGIN_URL).read()
    # Parse the form fields (including _formkey) and fill in the credentials
    data = parse_form(html)
    data['email'] = LOGIN_EMAIL
    data['password'] = LOGIN_PASSWORD
    # Encode the form data as bytes
    encoded_data = urllib.parse.urlencode(data).encode('utf-8')
    # Build the login request
    request = urllib.request.Request(LOGIN_URL, encoded_data)
    # Send the login request and receive the response
    response = opener.open(request)
    # Print the response URL: if the login failed we were redirected
    # back to the login page URL
    print(response.geturl())
    return opener
def login_firefox():
    """load cookies from firefox
    Load the session cookies from a Firefox profile on disk.
    """
    session_filename = find_ff_sessions()
    cj = load_ff_sessions(session_filename)
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))
    html = opener.open(COUNTRY_URL).read()
    tree = lxml.html.fromstring(html)
    print(tree.cssselect('ul#navbar li a')[0].text_content())
    return opener
def parse_form(html):
    """extract all input properties from the form
    Extract the name/value attributes of every <input> in the form.
    """
    tree = lxml.html.fromstring(html)
    data = {}
    for e in tree.cssselect('form input'):
        if e.get('name'):
            data[e.get('name')] = e.get('value')
    return data
def load_ff_sessions(session_filename):
    cj = cookielib.CookieJar()
    if session_filename and os.path.exists(session_filename):
        try:
            json_data = json.loads(open(session_filename, 'rb').read())
        except ValueError as e:
            print('Error parsing session JSON:', str(e))
        else:
            for window in json_data.get('windows', []):
                for cookie in window.get('cookies', []):
                    import pprint; pprint.pprint(cookie)
                    c = cookielib.Cookie(0, cookie.get('name', ''), cookie.get('value', ''),
                        None, False,
                        cookie.get('host', ''), cookie.get('host', '').startswith('.'), cookie.get('host', '').startswith('.'),
                        cookie.get('path', ''), False,
                        False, int(time.time()) + 3600 * 24 * 7, False,
                        None, None, {})
                    cj.set_cookie(c)
    else:
        print('Session filename does not exist:', session_filename)
    return cj
def find_ff_sessions():
    paths = [
        '~/.mozilla/firefox/*.default',
        '~/Library/Application Support/Firefox/Profiles/*.default',
        '%APPDATA%/Roaming/Mozilla/Firefox/Profiles/*.default'
    ]
    for path in paths:
        filename = os.path.join(path, 'sessionstore.js')
        matches = glob.glob(os.path.expanduser(filename))
        if matches:
            return matches[0]
def main():
    login_cookies()

if __name__ == '__main__':
    main()
For a solid understanding of cookies and sessions, the blog post 《python3下使用requests实现模拟用户登录 —— 基础篇(马蜂窝)》 (simulating a user login with requests in Python 3, basics, using Mafengwo as the example) is worth reading carefully; it explains the topic in plain terms.
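For comparison with the urllib version above, here is a minimal sketch (not the book's code; the form field names email, password and _formkey follow the example site above) of the same login flow using requests.Session, which keeps cookies across requests automatically so the _formkey check passes without a manual CookieJar:

# -*- coding: utf-8 -*-
import requests
import lxml.html

LOGIN_URL = 'http://example.webscraping.com/user/login'

def parse_form(html):
    # Extract the name/value pairs of every <input> in the form,
    # including the hidden _formkey field.
    tree = lxml.html.fromstring(html)
    return {e.get('name'): e.get('value')
            for e in tree.cssselect('form input') if e.get('name')}

def login_requests(email, password):
    # The Session object stores the cookies set by the login page,
    # so the _formkey in the cookie matches the one submitted below.
    session = requests.Session()
    data = parse_form(session.get(LOGIN_URL).text)
    data['email'] = email
    data['password'] = password
    response = session.post(LOGIN_URL, data=data)
    # On failure the server redirects back to the login page URL.
    print(response.url)
    return session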
# -*- coding: utf-8 -*-
import urllib.parse
import urllib.request
import mechanize
import login
COUNTRY_URL = 'http://example.webscraping.com/edit/United-Kingdom-239'
def edit_country():
    """
    Approach 1: modify content on the page by re-submitting the edit form with urllib.
    """
    opener = login.login_cookies()
    country_html = opener.open(COUNTRY_URL).read()
    data = login.parse_form(country_html)
    import pprint; pprint.pprint(data)
    print('Population before: ' + data['population'])
    data['population'] = int(data['population']) + 1
    encoded_data = urllib.parse.urlencode(data).encode('utf-8')
    request = urllib.request.Request(COUNTRY_URL, encoded_data)
    response = opener.open(request)
    country_html = opener.open(COUNTRY_URL).read()
    data = login.parse_form(country_html)
    print('Population after:', data['population'])
def mechanize_edit():
    """
    Approach 2: modify the same content using Mechanize.
    """
    # log in
    br = mechanize.Browser()
    br.open(login.LOGIN_URL)
    br.select_form(nr=0)
    print(br.form)
    br['email'] = login.LOGIN_EMAIL
    br['password'] = login.LOGIN_PASSWORD
    response = br.submit()

    # edit the country record
    br.open(COUNTRY_URL)
    br.select_form(nr=0)
    print('Population before:', br['population'])
    br['population'] = str(int(br['population']) + 1)
    br.submit()

    # check that the population increased
    br.open(COUNTRY_URL)
    br.select_form(nr=0)
    print('Population after:', br['population'])
if __name__ == '__main__':
    edit_country()
    mechanize_edit()
[1] 《用Python写网络爬虫》(Web Scraping with Python)