一、HttpClient模拟登录:通过post或get请求完成登录,登录后抓取到的是静态页面;需要抓取动态页面时请使用HtmlUnit
/**
 * Logs in with a form POST, then requests a second page with the session
 * cookie returned by the login and parses that page with Jsoup.
 *
 * @param args unused
 * @throws IOException if a request fails or the login response carries no
 *                     Set-Cookie header
 */
public static void main(String[] args) throws IOException {
    HttpClient httpclient = new DefaultHttpClient();
    try {
        String sCharSet = "utf-8";

        // Build the login form.
        HttpPost httpost = new HttpPost("http://localhost:8080/jeefh/vworkerC/login"); // login url
        List<BasicNameValuePair> nvp = new ArrayList<>();
        nvp.add(new BasicNameValuePair("loginname", "白斐"));
        nvp.add(new BasicNameValuePair("password", "123456"));
        nvp.add(new BasicNameValuePair("isRemember", "1"));
        httpost.setEntity(new UrlEncodedFormEntity(nvp, sCharSet));

        HttpResponse response = httpclient.execute(httpost);
        // Read the login response body completely before the client is reused.
        // sCharSet is the fallback used when the response declares no charset.
        EntityUtils.toString(response.getEntity(), sCharSet);

        // A login that sets no cookie cannot be continued; fail with a clear
        // message instead of a NullPointerException.
        if (!response.containsHeader("Set-Cookie")) {
            throw new IOException("Login response did not contain a Set-Cookie header: "
                    + response.getStatusLine());
        }
        String cookie = response.getLastHeader("Set-Cookie").getValue(); // session cookie

        HttpGet index = new HttpGet("http://localhost:8080/jeefh/main"); // data url
        index.setHeader("Cookie", cookie); // send back the cookie obtained at login
        index.setHeader("Content-Type", "application/json;charset=UTF-8");
        HttpResponse response1 = httpclient.execute(index);
        String str = EntityUtils.toString(response1.getEntity(), sCharSet); // fetched page
        System.out.println(str);

        // Parse the fetched HTML and look up the element by id.
        Document doc = Jsoup.parse(str);
        Element table = doc.getElementById("my_task_lisk"); // null when the id is absent
    } finally {
        // Release the connections held by the client even when a request fails.
        httpclient.getConnectionManager().shutdown();
    }
}
注:页面中通过iframe引入的url,可以在解析得到的doc里查看
maven:
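<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.2</version>
</dependency>
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpmime</artifactId>
    <version>4.5.2</version>
</dependency>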
效果:
二、HtmlUnit模拟登录
/**
 * Logs in through the login form with HtmlUnit, then loads a page that is
 * embedded as an iframe and extracts the cells of one table with Jsoup.
 *
 * @param args unused
 * @throws IOException if a page cannot be loaded
 * @throws IllegalStateException if the submit button or the target table is
 *                               not found
 */
public static void main(String[] args) throws IOException {
    // try-with-resources closes the client even when a step below throws.
    try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) { // emulated browser
        webClient.getOptions().setCssEnabled(false); // CSS on/off
        webClient.getOptions().setJavaScriptEnabled(true); // JavaScript on/off
        webClient.getOptions().setThrowExceptionOnScriptError(false); // throw on script errors or not
        webClient.setAjaxController(new NicelyResynchronizingAjaxController()); // ajax handling
        webClient.getOptions().setTimeout(10000);

        HtmlPage htmlPage = webClient.getPage("http://localhost:8080/jeefh/login"); // login page

        // Log in.
        HtmlInput ln = htmlPage.getHtmlElementById("username");
        HtmlInput pwd = htmlPage.getHtmlElementById("password");
        HtmlInput submit = htmlPage.getFirstByXPath("//form[@id='login_form']/ul/li[4]/input");
        if (submit == null) {
            throw new IllegalStateException("Submit button not found on the login page");
        }
        ln.setAttribute("value", "ceshi01");
        pwd.setAttribute("value", "123456");
        submit.click();
        // Wait only after a page has been loaded; before the first page there
        // are no background scripts to wait for.
        webClient.waitForBackgroundJavaScript(3000);

        // Logged in: load the url that the main page embeds as an iframe.
        HtmlPage page3 = webClient.getPage("http://localhost:8080/jeefh/home/home.jsp");
        webClient.waitForBackgroundJavaScript(3000);

        Document doc = Jsoup.parse(page3.asXml());
        Element table = doc.getElementById("my_task_lisk");
        if (table == null) {
            throw new IllegalStateException("Element with id 'my_task_lisk' not found");
        }
        Elements tds = table.select("td");
    }
}
注:getFirstByXPath方法的参数是XPath表达式
maven:
<dependency>
    <groupId>net.sourceforge.htmlunit</groupId>
    <artifactId>htmlunit</artifactId>
    <version>2.23</version>
</dependency>
效果:
三、HtmlUnit模拟登录引入iframe
/**
 * Logs in with HtmlUnit, follows a link from a listing page, and parses the
 * content of the first iframe of the resulting page with Jsoup.
 *
 * @param args unused
 * @throws IOException if a page cannot be loaded
 * @throws IllegalStateException if the login link, the listing link or the
 *                               iframe is not found
 */
public static void main(String[] args) throws IOException {
    // try-with-resources closes the client even when a step below throws.
    try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) { // emulated browser
        webClient.getOptions().setCssEnabled(false); // CSS on/off
        webClient.getOptions().setJavaScriptEnabled(true); // JavaScript on/off
        webClient.getOptions().setThrowExceptionOnScriptError(false); // throw on script errors or not
        webClient.setAjaxController(new NicelyResynchronizingAjaxController()); // ajax handling
        webClient.getOptions().setTimeout(10000);

        HtmlPage login = webClient.getPage("http://10.96.4.136:9300/p2pd/servlet/dispatch"); // login page

        // Log in.
        HtmlInput ln = login.getElementByName("CAMUsername");
        HtmlInput pwd = login.getElementByName("CAMPassword");
        HtmlAnchor submit = login.getFirstByXPath("//a[@href='javascript:doSubmit()']");
        if (submit == null) {
            throw new IllegalStateException("Login link not found on the login page");
        }
        ln.setAttribute("value", "f0000");
        pwd.setAttribute("value", "123456");
        submit.click();
        // Wait only after a page has been loaded; before the first page there
        // are no background scripts to wait for.
        webClient.waitForBackgroundJavaScript(3000);
        System.out.println("登录成功。。");

        HtmlPage index = webClient.getPage("http://10.96.4.136:9300/p2pd/servlet/dispatch?b_action=xts.run&m=portal/cc.xts&m_folder=iB3FE4EB598AC4ED894A69C6B28150532");
        HtmlAnchor temp = index.getFirstByXPath("//table[@class='cctable']/tbody/tr[3]/td[5]/a");
        if (temp == null) {
            throw new IllegalStateException("Target link not found in the 'cctable' table");
        }
        HtmlPage data = temp.click();
        webClient.waitForBackgroundJavaScript(3000);
        System.out.println("页面跳转成功。。");

        // Index 0 is the first iframe of the page. The frame list is used
        // directly so its element type is kept and getEnclosedPage() resolves.
        if (data.getFrames().isEmpty()) {
            throw new IllegalStateException("The page contains no iframe");
        }
        // Per the author's note: call the getEnclosedPage() that returns a Page, then cast.
        HtmlPage iframe = (HtmlPage) data.getFrames().get(0).getEnclosedPage();
        System.out.println("IFrame加载成功。。");

        String str = iframe.asXml();
        Document doc = Jsoup.parse(str);
        Elements tables = doc.getElementsByClass("tb");
        Element table = tables.first(); // null when no element has class "tb"
    }
}
注:1、getEnclosedPage方法有两个返回值类型不同的版本,应调用返回值为Page的那个,再强转为HtmlPage,否则拿到的页面还是原来的;
2、iframe有name属性时,调用HtmlPage.getFrameByName("classFrame").getEnclosedPage();
3、iframe没有name属性时,调用HtmlPage.getFrames().get(0).getEnclosedPage();参数0表示页面中出现的第1个iframe;