用Python写网络爬虫系列(三)表单处理

# --- Attempt 1: POST only the visible email/password fields -------------
import urllib
import urllib2

LOGIN_URL = r'http://example.webscraping.com/user/login'
LOGIN_EMAIL = '[email protected]'
LOGIN_PASSWORD = 'qq123456'

# urlencode percent-encodes each field so the pair list can travel in the
# POST body (non-ASCII characters become %-escaped byte sequences).
form_fields = {'email': LOGIN_EMAIL, 'password': LOGIN_PASSWORD}
encoded_data = urllib.urlencode(form_fields)
# Supplying a data argument makes urllib2 issue a POST instead of a GET.
request = urllib2.Request(LOGIN_URL, encoded_data)
response = urllib2.urlopen(request)
# On success the server redirects to the home page; on failure it returns
# to the login page, so the final URL reveals whether the login worked.
response.geturl()
# Besides the email and password, the form carries several hidden fields
# that must be submitted too; lxml lets us harvest all of them at once.
import lxml.html


def parse_form(html):
    """Return a {name: value} dict for every named <input> inside a <form>."""
    tree = lxml.html.fromstring(html)
    return {
        field.get('name'): field.get('value')
        for field in tree.cssselect('form input')
        if field.get('name')
    }
import pprint

# Dump the parsed form so the hidden fields become visible.
pprint.pprint(parse_form(urllib2.urlopen(LOGIN_URL).read()))
# --- Attempt 2: resubmit every field parsed from the login form ---------
# _formkey is a one-time token: the server issues a fresh value on every
# page load and uses it to reject forms that were already submitted.
html = urllib2.urlopen(LOGIN_URL).read()
data = parse_form(html)
data['email'] = LOGIN_EMAIL
data['password'] = LOGIN_PASSWORD

encoded_data = urllib.urlencode(data)
response = urllib2.urlopen(urllib2.Request(LOGIN_URL, encoded_data))
# Still fails: the session cookie paired with _formkey was never sent,
# so the server cannot validate the token.
response.geturl()
# --- Attempt 3: add cookie support so the _formkey token validates ------
import cookielib

cookie_jar = cookielib.CookieJar()
# Every request made through this opener stores and replays cookies
# from the jar, tying the submitted _formkey to its session.
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))

html = opener.open(LOGIN_URL).read()
data = parse_form(html)
data['email'] = LOGIN_EMAIL
data['password'] = LOGIN_PASSWORD

response = opener.open(urllib2.Request(LOGIN_URL, urllib.urlencode(data)))
# Redirects to the home page now -- the login finally succeeded.
response.geturl()
import mechanize  #自动化表单填写
br = mechanize.Browser()
br.open(LOGIN_URL)
br.select_form(nr=0)
br['email'] = LOGIN_EMAIL
br['password'] = LOGIN_PASSWORD
response = br.submit()
br.open(COUNTRY_URL = 'http://example.webscraping.com/edit/United-Kingdom-239')
br.select_form(nr = 0)
print 'Population before:', br['population']
br['population'] = str(int(br['population']) + 1)
br.submit()
br.open(COUNTRY_URL = 'http://example.webscraping.com/edit/United-Kingdom-239')
br.select_form(nr=0)
print 'Population after:', br['population']

你可能感兴趣的:(Python爬虫)