(二)爬虫 HtmlUnit 练习例子:模拟下单。

下载 HtmlUnit 所需的 jar 包:

https://download.csdn.net/download/final0402/12158044

 

import java.net.URL;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.Page;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebRequest;
import com.gargoylesoftware.htmlunit.html.DomElement;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

public class ExampleHtmlUnit {

    
    public  static void main(String args[]) throws Exception{
        
        
        System.out.println("-----------------------------------------开始执行代码----------------------------------------------");
        final WebClient webClient = new WebClient(BrowserVersion.CHROME);
        webClient.getOptions().setThrowExceptionOnScriptError(false);//当JS执行出错的时候是否抛出异常, 
        webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);//当HTTP的状态非200时是否抛出异常, 
        webClient.getOptions().setActiveXNative(false);
        webClient.getOptions().setCssEnabled(false);//是否启用CSS, 因为不需要展现页面, 
        webClient.getOptions().setJavaScriptEnabled(true); //启用JS
        webClient.setAjaxController(new NicelyResynchronizingAjaxController());//设置支持AJAX
        System.out.println("-----------------------------------------初始化浏览器对象完成----------------------------------------------");
                


        //下单网页地址,
        URL link=new URL("https://xxz.xxxx.xxx/ssss146.html"); 
        WebRequest request=new WebRequest(link); 

 

 

      //通过F12查看,进行设置,有多少,加多少
        request.setAdditionalHeader("Referer", "https://passport.jd.com/new/login.aspx?ReturnUrl=https%3A%2F%2Fitem.jd.com%2F100011385146.html");//设置请求报文头里的refer字段
        设置请求报文头里的User-Agent字段
        request.setAdditionalHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3314.0 Safari/537.36 SE 2.X MetaSr 1.0");
        request.setAdditionalHeader("Connection", "keep-alive");
        request.setAdditionalHeader("upgrade-insecure-requests", "1");
        request.setAdditionalHeader("accept-language", "zh-CN,zh;q=0.9");
        request.setAdditionalHeader("accept-encoding", "gzip, deflate, br");
        request.setAdditionalHeader("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
        request.setAdditionalHeader("authority", "item.jd.com");
        request.setAdditionalHeader("method", "GET");
        request.setAdditionalHeader("path", "/100ddd.html");
        request.setAdditionalHeader("scheme", "https");
        
        
        
        //设置个人用户cookie游览器参数,可以登录,F12查看,整体拷贝就行了。
        request.setAdditionalHeader("Cookie", 
                "shshshfpb=; "
                + " __jdu=; "
                + " shshshfpa=xxx-xx-xx-xx-xxx-xx;"
                + " areaId=1; "
                + " pinId=xxxxxxxx-x-f3wj7;"
                + " pin=jd_4xxxxxxx;"
                + " unick=jd_xxxxxxx;"
                + " _tp=vhxxhrLR2BQCBLcNTxxjA%3D;"
                + " _pst=jd_4xx9;"
                + " user-key=xx-b482-xx-xx-xx;"
                + " ipLoc-djd=x-2901-x-x;"
                + " ipLocation=%xx%xx;"
                + " cn=31; "
                + " unpl=V2_ZzNtbxxxxxwUB3MRWANhUBQNclRCFnxxxxcRxBFCEdkexhdBxxxECdSbDVkAyJdQxxx9GlQHbgMQVEVXQxN2C0NRSylbNVczxxxVjChReSlVxxgYiXHJU;"
                + "  __jdv=xx|123.sogou.com|t_1000003625_sogoumz|tuiguang|xxxxxx|xxx; "
                + " PCSYCityID=CN_110000_110100_110114;"
                + " __jda=122270672.1581587293813861172244.1581587294.1581601331.1581777054.3;"
                + " __jdc=234234; "
                + " shshshfp=234234234; wlfstk_smdl=234234234; "
                + " TrackID=234234-WleEb0cj2GzLIeO-234234234234; "
                + " thor=234234234234; "
                + " ceshi3.com=201; __jdb=23423|23423423;"
                + " shshshsID=234234234234; "
                + " 3AB9D23F7A4B3C9B=xxxxxx");


        
        
        System.out.println("-----------------------------------------设置浏览器 参数----------------------------------------------");
        HtmlPage page = null;
        try {
            //page = webClient.getPage("https://item.jd.com/100011385146.html");//尝试加载上面图片例子给出的网页
            page = webClient.getPage(request);
        } catch (Exception e) {
            //e.printStackTrace();
        }finally {
            webClient.close();
        }
        
        webClient.waitForBackgroundJavaScript(30000);//异步JS执行需要耗时,所以这里线程要阻塞30秒,等待异步JS执行结束
     
        System.err.println("-----------------------------------------发出客户端请求----------------------------------------------"+page.getWebResponse().getLoadTime());      
        
        //assertEquals("HtmlUnit - Welcome to HtmlUnit", page.getTitleText());  
        //System.err.println(  page.getDocumentURI() );
        
        System.err.println(  page.getTitleText() );
        //System.err.println(  page.getHead() );
        //System.err.println(  page.getLocalName() );
        //System.err.println(  page.asText() );
        //System.err.println(  page.querySelector("") );
        //System.err.println(  page.getInputEncoding() );
        //System.err.println(  page.getBody() );
        
        HtmlPage pageResult = page;
        //System.err.println(  pageResult.getElementsByIdAndOrName("btn-reservation").c );
        
        
        //获得  等待预约  对象
        DomElement onclick = pageResult.getElementById("btn-reservation");
        System.err.println(   onclick.getTextContent()   );
        
        
        //模拟用户点击预约对象
        Page p = onclick.click();
        System.err.println( p.getWebResponse().getLoadTime() );
        System.err.println( p.getWebResponse().getStatusMessage() );
        
    }

 

}

你可能感兴趣的:(爬虫,java,htmlunit)