mechanize是非常合适的模拟浏览器的模块,它的特点主要有:
http://blog.chinaunix.net/uid-26722078-id-3507409.html mechanize模拟浏览器行为使用总结
1 支持HTTP、HTTPS等协议。
2 简单的HTML表单填写。
3 浏览器历史记录和重载。
4 正确添加HTTP Referer头(可选)。
5 自动遵守robots.txt规则。
6 自动处理HTTP-EQUIV和刷新。
所以你可以用mechanize来完成一些自动化浏览器想要做的事情,比如自动登录表单,自动填写表单等。
#!/usr/bin/env python
# Python 2 example: submit the first form on a login page with mechanize.
import mechanize
import cookielib
from bs4 import BeautifulSoup

browser = mechanize.Browser()
cookie_jar = cookielib.LWPCookieJar()
browser.set_cookiejar(cookie_jar)  # associate the cookie jar with the browser

# Handler options. We are imitating a client, so the usual client-side
# features (equiv, redirect, referer, refresh) are switched on here;
# gzip and robots handling are passed False.
browser.set_handle_equiv(True)
browser.set_handle_gzip(False)
browser.set_handle_redirect(True)
browser.set_handle_referer(True)
browser.set_handle_robots(False)
browser.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

# Debug output: shows the intermediate HTTP traffic, handy when debugging.
browser.set_debug_http(True)
#browser.set_debug_redirects(True)
#browser.set_debug_responses(True)

# Send a browser-like User-agent header.
browser.addheaders = [
    ('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1.11) Gecko/20100701 Firefox/3.5.11'),
]

# Replace with the URL you want to open.
response = browser.open('http://www.renren.com/PLogin.do')

# A page may hold several forms; print them all to find the right one.
for form in browser.forms():
    print(form)

browser.select_form(nr=0)  # pick the first form on the page
browser['email'] = 'xxx'
browser['password'] = 'xxx'
browser.submit()  # submit the form
print('success login')
登录163邮箱:
#!/usr/bin/env python
#coding=utf-8
# Python 2 example: open the 163 mail login page with mechanize, list its
# forms, then fill in and submit the first one.
import mechanize
import cookielib

# Cookie Jar
cj = cookielib.LWPCookieJar()

# Browser
br = mechanize.Browser()
br.set_cookiejar(cj)

# Browser options
br.set_handle_equiv(True)
#br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)

# Follows refresh 0 but not hangs on refresh > 0
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

# Want debugging messages?
#br.set_debug_http(True)
#br.set_debug_redirects(True)
#br.set_debug_responses(True)

# User-Agent (http header)
#br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/4.0.0')]
br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.1.11) Gecko/20100701 Firefox/3.5.11')]

# HTTP access and get response pack
r = br.open("http://reg.163.com/logins.jsp?type=1&product=mail163&url=http://entry.mail.163.com/coremail/fcg/ntesdoor2?lightweight%3D1%26verifycookie%3D1%26language%3D-1%26style%3D1")
#html = r.read()
#print br.response().read().decode("utf-8")
#print r.info()
#print r.geturl()

# Print every form on the page so the right one can be identified.
for f in br.forms():
    print(f)

# Select the first form, fill in the credentials and submit it.
br.select_form(nr=0)
br['username'] = 'xxx'
br['password'] = 'xxx'
br.submit()
print('sucess login')
#print br.response().read()
输出结果:
<fLogin POST https://reg.163.com/logins.jsp application/x-www-form-urlencoded <HiddenControl(url=http://entry.mail.163.com/coremail/fcg/ntesdoor2?lightweight=1&verifycookie=1&language=-1&style=1) (readonly)> <HiddenControl(product=mail163) (readonly)> <HiddenControl(savelogin=) (readonly)> <HiddenControl(outfoxer=) (readonly)> <HiddenControl(domains=) (readonly)> <HiddenControl(syscheckcode=b8bad95b10ac02219480122ee11b1c4cdc291e3c) (readonly)> <TextControl(username=)> <PasswordControl(password=)> <SubmitControl(Submit=) (readonly)>> sucess login
自动搜索"computer"
# -*- coding: utf-8 -*-
# Python 2 example built on the local `ie` helper module.
# NOTE(review): `ie` is not shown here; the comments below describe the
# helpers by their names only -- confirm against that module.
import ie

url = "http://ieeexplore.ieee.org/xpl/periodicals.jsp"

# Open the page and make the window visible (presumably an IE instance).
browser = ie.NewIE(url)
ie.Visible(browser)
page_body = ie.GetBody(browser)

# Find the keyword box by its id among the <input> nodes and type the query.
input_nodes = ie.GetNodes(page_body, "input")
print(input_nodes)
keyword_box = ie.NodeByAttr(input_nodes, "id", "browse_keyword")
print(keyword_box)
keyword_box.value = "computer"

# Find the header <div> that holds the search button.
div_nodes = ie.GetNodes(page_body, "div")
title_header = ie.NodeByAttr(div_nodes, "id", "browse-title-hdr")
print(title_header)

# The search button is the <input type="image"> inside that <div>; click it.
header_inputs = ie.GetNodes(title_header, "input")
search_button = ie.NodeByAttr(header_inputs, "type", "image")
print(search_button)
search_button.click()
输出结果:
python模拟登陆163邮箱并获取通讯录
#-*- coding:UTF-8 -*-
import urllib
import urllib2
import cookielib
import xml.etree.ElementTree as etree  # XML parsing


class Login163:
    """Log in to a 163.com mailbox and read its address book (Python 2).

    Creating an instance installs a process-wide urllib2 opener that keeps
    cookies in ``self.cookie``; ``login()`` and ``address_list()`` rely on it.
    """

    # Browser-like request header sent with every request.
    header = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; rv:30.0) Gecko/20100101 Firefox/30.0'}
    cookie = None  # cookie jar, created in __init__
    # Where the cookie jar is saved. Raw string so the backslash is taken
    # literally instead of relying on an invalid "\c" escape.
    cookiefile = r'D:\cookies.dat'
    user = ''  # part of the username before '@', set by login()

    def __init__(self, username, passwd):
        """Store the credentials and install a cookie-aware global opener."""
        self.username = username
        self.passwd = passwd
        # Cookie setup: every urllib2.urlopen() call shares this jar.
        self.cookie = cookielib.LWPCookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cookie))
        urllib2.install_opener(opener)

    def login(self):
        """Post the credentials to the login URL.

        Returns:
            ``True`` when the response contains the success marker text,
            otherwise a failure message string.
        """
        # Request parameters.
        postdata = {
            'username':self.username,
            'password':self.passwd,
            'type':1
        }
        postdata = urllib.urlencode(postdata)

        # Send the request.
        req = urllib2.Request(
            url='http://reg.163.com/logins.jsp?type=1&product=mail163&url=http://entry.mail.163.com/coremail/fcg/ntesdoor2?lightweight%3D1%26verifycookie%3D1%26language%3D-1%26style%3D1',
            data=postdata,       # request body
            headers=self.header  # request headers
        )
        result = str(urllib2.urlopen(req).read())

        self.user = self.username.split('@')[0]
        self.cookie.save(self.cookiefile)  # persist the cookies

        if '登录成功,正在跳转...' in result:
            return True
        return '%s 登陆163邮箱失败。'%(self.user)

    def _find_sid(self):
        """Return the value of the first cookie whose text contains 'sid'.

        Returns ``None`` when no such cookie (with a value) is in the jar.
        """
        for cookie in self.cookie:
            # Selection is done on the cookie's string form, as before; the
            # value is read from the attribute so that values containing '='
            # or spaces are not truncated.
            if 'sid' in str(cookie) and cookie.value is not None:
                return cookie.value
        return None

    @staticmethod
    def _parse_emails(xml_text):
        """Extract preferred e-mail addresses from the address-book XML.

        Looks at the first top-level ``<array>`` element, takes the last
        child of that array's first entry, and collects the text of every
        field named ``EMAIL;PREF`` found in its entries.

        Returns:
            A list of ``{'email': <text>}`` dicts; empty when the expected
            structure is missing.
        """
        root = etree.fromstring(xml_text)

        items = None
        for child in root:
            if child.tag == 'array':
                items = child
                break
        if items is None or len(items) == 0:
            return []

        # list(element) replaces the deprecated getchildren().
        first_result = list(items[0])
        if not first_result:
            return []
        contact_entries = first_result[-1]

        contacts = []
        for entry in contact_entries:
            for field in entry:
                if field.attrib.get('name') == 'EMAIL;PREF':
                    contacts.append({'email':field.text})
        return contacts

    def address_list(self):
        """Fetch the address book and return it as a list of dicts.

        Call ``login()`` first: this method uses ``self.user`` and the
        cookies collected by the shared opener.

        Returns:
            A list of ``{'email': <address>}`` dicts.

        Raises:
            RuntimeError: if no session id cookie is available.
        """
        # Hit the entry URL so the session id cookie gets set.
        auth = urllib2.Request(
            url='http://entry.mail.163.com/coremail/fcg/ntesdoor2?username='+self.user+'&lightweight=1&verifycookie=1&language=-1&style=1',
            headers=self.header
        )
        urllib2.urlopen(auth).read()

        sid = self._find_sid()
        self.cookie.save(self.cookiefile)
        if sid is None:
            # Previously the last cookie's text (or an unbound name) would
            # have been sent as the sid; fail explicitly instead.
            raise RuntimeError('no sid cookie found; login() must succeed first')

        # Request URL.
        url = 'http://twebmail.mail.163.com/js6/s?sid='+sid+'&func=global:sequential&showAd=false&userType=browser&uid='+self.username
        # Parameters (the 'var' entry is required; without it the reply only
        # contains something like <code>S_OK</code><messages/>).
        # These parameters were taken from a Firebug capture.
        postdata = {
            'func':'global:sequential',
            'showAd':'false',
            'sid':sid,
            'uid':self.username,
            'userType':'browser',
            'var':'<?xml version="1.0"?><object><array name="items"><object><string name="func">pab:searchContacts</string><object name="var"><array name="order"><object><string name="field">FN</string><boolean name="desc">false</boolean><boolean name="ignoreCase">true</boolean></object></array></object></object><object><string name="func">pab:getAllGroups</string></object></array></object>'
        }
        postdata = urllib.urlencode(postdata)

        # Build and send the request.
        req = urllib2.Request(
            url=url,
            data=postdata,
            headers=self.header
        )
        res = urllib2.urlopen(req).read()

        # The reply is XML; convert it to a plain list of dicts so the
        # caller can use it easily.
        return self._parse_emails(res)


#Demo
print("Requesting......\n\n")
login = Login163('xxx','xxx')
flag = login.login()
# login() returns True on success and a message string on failure.
if flag is True:
    print("Successful landing,Resolved contacts......\n\n")
    res = login.address_list()
    for x in res:
        print(x['email'])
else:
    print(flag)
附结果图: