对于表单隐藏在JS内的网页,需要先解析JS并动态加载页面。我找了很多组件,包括HttpUnit,最后用HtmlUnit试验通过。感觉HttpUnit还是比HtmlUnit差一些,所以建议用HtmlUnit来解析由JS动态加载的网页并自动提交表单。HtmlUnit作为模拟浏览器的组件,还有很多功能值得挖掘,只有深入了解才能验证它对不同网站解析的通用性。
重点是:部分HtmlUnit版本会报出SSLContext以及HttpClient版本不兼容等问题,我先后换了3个版本,最后确认2.18版本可以用。
HtmlUnit基础代码如下:
package gddx;

import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlButton;
import com.gargoylesoftware.htmlunit.html.HtmlForm;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.html.HtmlPasswordInput;
import com.gargoylesoftware.htmlunit.html.HtmlTextInput;
import com.gargoylesoftware.htmlunit.util.Cookie;

/**
 * Demonstrates using HtmlUnit to load a JavaScript-driven login page, fill in
 * the login form, submit it, and read back the resulting page and cookies.
 *
 * <p>The credentials below are placeholders; replace them before running.
 */
public class Email163Login {

    /** Page that hosts the login form. */
    private static final String LOGIN_URL = "http://mail.163.com/";

    /** Name of the login form on the page. */
    private static final String FORM_NAME = "login163";

    /** Id of the button that submits the login form. */
    private static final String LOGIN_BUTTON_ID = "loginBtn";

    /** Placeholder account name typed into the "username" field. */
    private static final String USERNAME = "sharpsword";

    /** Placeholder password typed into the "password" field. */
    private static final String PASSWORD = "xyz";

    /** Timeout handed to the web client options, in milliseconds. */
    private static final int CONNECTION_TIMEOUT_MS = 50000;

    /** How long to wait for background JavaScript after submitting, in milliseconds. */
    private static final long JS_WAIT_MS = 10000;

    /**
     * Logs in through the web form, prints the resulting page as XML and then
     * prints every cookie held by the client, one {@code name:value} per line.
     *
     * <p>Any failure is reported on standard error; the web client is closed
     * whether or not the login flow succeeds.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // try-with-resources guarantees the client is closed even when loading
        // the page or locating a form element throws.
        try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
            configure(webClient);

            // Load the login page.
            HtmlPage page = webClient.getPage(LOGIN_URL);

            // Look the form up by name; page.getForms().get(0) would select it by index instead.
            HtmlForm form = page.getFormByName(FORM_NAME);
            HtmlTextInput username = (HtmlTextInput) form.getInputByName("username");
            HtmlPasswordInput password = (HtmlPasswordInput) form.getInputByName("password");
            username.setValueAttribute(USERNAME);
            password.setValueAttribute(PASSWORD);

            HtmlButton button = (HtmlButton) page.getHtmlElementById(LOGIN_BUTTON_ID);
            HtmlPage retPage = (HtmlPage) button.click();

            // Give JavaScript-driven DOM updates time to finish before dumping the page.
            webClient.waitForBackgroundJavaScript(JS_WAIT_MS);

            // Print the page content.
            System.out.println(retPage.asXml());

            // Collect and print the cookies held by the client.
            Set<Cookie> cookies = webClient.getCookieManager().getCookies();
            Map<String, String> responseCookies = collectCookies(cookies);
            for (Cookie cookie : cookies) {
                // One cookie per line so consecutive entries do not run together.
                System.out.println(cookie.getName() + ":" + cookie.getValue());
            }
        } catch (Exception e) {
            System.err.println("Exception: " + e);
        }
    }

    /** Applies the client options used by this demo. */
    private static void configure(WebClient webClient) {
        // 1. Enable JavaScript.
        webClient.getOptions().setJavaScriptEnabled(true);
        // 2. Disable CSS, which avoids the follow-up requests made to fetch CSS for rendering.
        webClient.getOptions().setCssEnabled(false);
        // 3. Enable client-side redirects.
        webClient.getOptions().setRedirectEnabled(true);
        // 4. Do not throw when a script on the page fails.
        webClient.getOptions().setThrowExceptionOnScriptError(false);
        // 5. Set the timeout.
        webClient.getOptions().setTimeout(CONNECTION_TIMEOUT_MS);
        // 6./7. Optional settings left off in this demo: getOptions().setUseInsecureSSL(true)
        //       to ignore certificate problems, and setAjaxController(...) with a
        //       NicelyResynchronizingAjaxController for Ajax handling.
        // 8. Enable cookies.
        webClient.getCookieManager().setCookiesEnabled(true);
    }

    /** Copies the given cookies into a name-to-value map. */
    private static Map<String, String> collectCookies(Set<Cookie> cookies) {
        Map<String, String> byName = new HashMap<String, String>();
        for (Cookie cookie : cookies) {
            byName.put(cookie.getName(), cookie.getValue());
        }
        return byName;
    }
}