java爬取携程酒店的评价信息以及eleven参数获取

本帖介绍的方法已经失效。需要爬取数据的读者可以改用携程 app 端的接口:该接口未加密,可以直接请求获取数据。下面给出一个简单的思路:

访问页面https://m.ctrip.com/webapp/hotel/hoteldetail/dianping/7500321.html?&fr=detail&atime=20190510&days=1

java爬取携程酒店的评价信息以及eleven参数获取_第1张图片

请求地址:https://m.ctrip.com/restapi/soa2/16765/gethotelcomment?_fxpcqlniredt=09031069110301833861 

请求时带上如下 JSON 参数,具体参数可以自己去看一下:

String json = "{\"hotelId\":" + hotelId + ",\"pageIndex\":" + pageIndex + ",\"tagId\":0,\"pageSize\":10,\"groupTypeBitMap\":2,\"needStatisticInfo\":0,\"order\":0,\"basicRoomName\":\"\",\"travelType\":-1,\"head\":{\"cid\":\"09031121310402803767\",\"ctok\":\"\",\"cver\":\"1.0\",\"lang\":\"01\",\"sid\":\"8888\",\"syscode\":\"09\",\"auth\":\"\",\"extension\":[]}}";

只需要传入酒店 id(hotelId)和页码(pageIndex)即可。

用 HttpClient 循环请求该接口,返回结果中有这样一个字段:java爬取携程酒店的评价信息以及eleven参数获取_第2张图片

当此值为 1 时,说明已经是最后一页。爬取时注意让程序休眠一会儿,否则可能会被限流甚至封 IP。

以下方法已经不可用

源码可以在我的资源中下载,谢谢!https://download.csdn.net/download/qq_39477018/10764634

java爬取携程酒店的评价信息以及eleven参数获取_第3张图片

java爬取携程酒店的评价信息以及eleven参数获取_第4张图片

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.http.*;
import org.springframework.http.HttpMethod;

import javax.script.Invocable;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import javax.script.ScriptException;
import javax.swing.plaf.metal.OceanTheme;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.regex.Matcher;

/**
 * Created by Administrator on 2018/9/28.
 */
public class GetHotelJudge {
    private static BlockingQueue insertqueuej= new LinkedBlockingQueue<>();//评价

    /**
     * Program entry point.
     * <p>
     * Hands the insert statement and the shared review queue to {@code URLDemo2.insert}
     * (defined outside this file; presumably it drains the queue into the database —
     * TODO confirm) and then starts the crawl via {@link #getHotel()}.
     *
     * @param args command-line arguments (not used)
     * @throws IOException declared by the original signature
     */
    public static void main(String[] args) throws IOException {
        // Persistence step: adapt the statement to your own database schema.
        final String insertStatement = "insert into hoteljude(HOTELNAME,USERIMG,USERNAME,SCORE,GGSCORE,HOTELTYPE,CHECKINTIME,BADTYPE,JUDGETIME,JUDGEINTRO,JUDGEREPLAY,JUDGEIMG) values(?,?,?,?,?,?,?,?,?,?,?,?)";
        URLDemo2.insert(insertStatement, insertqueuej);

        // Crawl the reviews; every parsed review is added to insertqueuej.
        getHotel();
    }

    /**
     * Crawls the reviews of one hard-coded hotel.
     * <p>
     * Loads the hotel detail page, reads the hotel name and the cookies of that response,
     * builds the request headers, obtains an {@code eleven} token and then starts the
     * review crawl at page 0.
     * <p>
     * The method stops with a message on {@code System.err} when the hotel name cannot be
     * found in the page (layout changed or request blocked) or when no token could be
     * obtained, instead of failing with a {@link NullPointerException} or sending
     * {@code eleven=null} to the server.
     */
    public static void   getHotel(){

        try {
            String hotelid="441351"; // hotel id
            String uri="http://hotels.ctrip.com/hotel/"+hotelid+".html";
            HttpClient h=new JavaHttpClient();
            HttpResponse  s = h.doGet(uri,null); // any GET implementation works here; this one is the author's
            String hotel=s.getResponseString();
            Document doc=Jsoup.parse(hotel); // hotel detail page
            Element e=doc.selectFirst("#J_htl_info > div.name > h2.cn_n");
            if (e == null) {
                // selectFirst returns null when nothing matches; without the name there is nothing to store.
                System.err.println("getHotel: hotel name element not found for hotel " + hotelid);
                return;
            }
            String hotelname=e.text();
            Map cookies=s.getCookies();
            Map head=gethead(hotelid);
            String even=oceanball(hotelid,head,cookies); // eleven token
            if (even == null) {
                // oceanball returns null when the token request or script evaluation failed.
                System.err.println("getHotel: could not obtain an eleven token for hotel " + hotelid);
                return;
            }
            int currentPage=0;
            getEiinfo(currentPage,hotelid,"",head,even,cookies,hotelname); // crawl the reviews

        } catch (IOException e) {
            e.printStackTrace();
        }

    }


    /**
     * Crawls the review list of one hotel page by page and queues each review for insertion.
     * <p>
     * Starting at {@code currentPage}, the method requests {@code AjaxHotelCommentList.aspx},
     * converts every review block of the response into a 12-element row and adds it to
     * {@code insertqueuej}. The number of the next page is read from the pager in the
     * response, and a fresh token is requested through {@code oceanball} before every
     * further request.
     * <p>
     * When a response does not contain the review list, the same page is retried with a new
     * token, but only a limited number of times in a row, so a blocked crawler stops instead
     * of retrying forever. Pages are processed in a loop rather than by recursion, so the
     * call stack does not grow with the number of pages.
     *
     * @param currentPage page number to start with
     * @param hotelid     hotel id used in the request URL
     * @param sub         unused; kept so existing callers keep compiling
     * @param headMap     request headers passed to every request
     * @param eleven      token for the first request
     * @param cookies     cookies passed to every request
     * @param hotelname   hotel name stored in the first column of every row
     */
    public static void getEiinfo(int currentPage,String hotelid,String sub,Map headMap,String eleven,Map cookies, String hotelname){
        final int maxConsecutiveFailures = 5;
        int page = currentPage;
        String token = eleven;
        int failures = 0;
        try {
            HttpClient client = new JavaHttpClient();
            while (failures < maxConsecutiveFailures) {
                if (token == null) {
                    // oceanball() produced no token; ask again rather than sending "eleven=null".
                    failures++;
                    token = oceanball(hotelid, headMap, cookies);
                    continue;
                }
                String pl = "http://hotels.ctrip.com/Domestic/tool/AjaxHotelCommentList.aspx?" +
                        "MasterHotelID=" + hotelid + "&hotel=" + hotelid + "&NewOpenCount=0&AutoExpiredCount=0&RecordCount=2365&OpenDate=&keywordPress=1&card=-1&property=-1" +
                        "&UserType=&productcode=&keyword=&roomName=&orderBy=2&currentPage="+page+"&viewVersion=c&contyped=0" +
                        "&eleven="+token+"&callback="+getcallback(15)+"&_="+System.currentTimeMillis();
                HttpResponse r = client.doGet2(pl, headMap, cookies);
                Document doc = Jsoup.parse(r.getResponseString());
                // Container div holding all review blocks of this page.
                Element list = doc.selectFirst("#divCtripComment > div.comment_detail_list");
                if (list == null) {
                    // No review list in the response: retry the same page with a new token.
                    failures++;
                    token = oceanball(hotelid, headMap, cookies);
                    continue;
                }
                failures = 0;
                for (Element comment : list.getElementsByClass("comment_block J_asyncCmt")) {
                    insertqueuej.add(buildCommentRow(comment, hotelname)); // queued for insertion
                }
                int next = resolveNextPage(doc);
                if (next <= page) {
                    // Last page reached, pager unreadable, or the server did not advance.
                    return;
                }
                page = next;
                token = oceanball(hotelid, headMap, cookies);
            }
            System.err.println("getEiinfo: giving up on hotel " + hotelid + " at page " + page
                    + " after " + failures + " consecutive failed attempts");
        } catch (IOException e1) {
            e1.printStackTrace();
        }
    }

    /** Converts one review block into the 12-element row expected by the insert statement. */
    private static Object[] buildCommentRow(Element comment, String hotelname) {
        String img = selectAttr(comment, "div.user_info > p.head > span.img > img", "src"); // user avatar
        String name = selectText(comment, "div.user_info > p.name > span", null); // user name
        String score = selectText(comment, "div.comment_main > p > span.score > span", 0+""); // overall score
        String ggscore = selectAttr(comment, "div.comment_main > p > span.small_c", "data-value"); // per-dimension scores
        String type = selectText(comment, "div.comment_main > p > span.type", null); // travel type
        String time = selectText(comment, "div.comment_main > p > span.date", null); // travel date
        String bad = selectText(comment, "div.comment_main > p > a", null); // room type
        String date = selectText(comment, "div.comment_main > div.comment_txt > div.comment_bar > p > span", null); // review date
        if (date != null) {
            date = date.replace("发表于","");
        }
        String intro = selectText(comment, "div.comment_main > div.comment_txt > div.J_commentDetail", null); // review text
        System.out.println("intro:"+intro);
        String replay = selectText(comment, "div.comment_main > div.htl_reply > p.text", null); // hotel reply
        String purl = collectPictureUrls(comment); // review pictures

        return new Object[] {hotelname, img, name, score, ggscore, type, time, bad, date, intro, replay, purl};
    }

    /** Returns the text of the first element matching {@code css} below {@code root}, or {@code fallback} if none matches. */
    private static String selectText(Element root, String css, String fallback) {
        Element match = root.selectFirst(css);
        return match == null ? fallback : match.text();
    }

    /** Returns attribute {@code attribute} of the first element matching {@code css} below {@code root}, or null if none matches. */
    private static String selectAttr(Element root, String css, String attribute) {
        Element match = root.selectFirst(css);
        return match == null ? null : match.attr(attribute);
    }

    /** Joins the picture URLs of a review, each followed by ';'; pictures without an image element are skipped. */
    private static String collectPictureUrls(Element comment) {
        Element gallery = comment.selectFirst("div.comment_main > div > div.comment_pic");
        if (gallery == null) {
            return "";
        }
        StringBuilder urls = new StringBuilder();
        for (Element pic : gallery.getElementsByClass("pic")) {
            Element image = pic.selectFirst("img.p");
            if (image != null) {
                urls.append(image.attr("src")).append(';');
            }
        }
        return urls.toString();
    }

    /** Reads the pager and returns the number of the page after the current one, or -1 if there is none or the pager cannot be read. */
    private static int resolveNextPage(Document doc) {
        // Pager links; the text of the last link is taken as the total page count.
        Elements links = doc.select("#divCtripComment > div.c_page_box > div > div.c_page_list.layoutfix > a");
        Element current = doc.selectFirst("#divCtripComment > div.c_page_box > div > div.c_page_list.layoutfix > a.current");
        if (links.isEmpty() || current == null) {
            return -1;
        }
        try {
            int total = Integer.parseInt(links.last().text());
            int shown = Integer.parseInt(current.text());
            return shown + 1 <= total ? shown + 1 : -1;
        } catch (NumberFormatException notANumber) {
            return -1;
        }
    }

    /**
     * Builds the request headers sent with every request to {@code hotels.ctrip.com}.
     * <p>
     * The {@code Referer} header points at the detail page of the given hotel; all other
     * headers are fixed values.
     *
     * @param hotelid hotel id inserted into the {@code Referer} URL
     * @return a new, mutable map of header name to header value
     */
    public static  Map<String, String> gethead(String hotelid){
        Map<String, String> map = new HashMap<String, String>();
        map.put("Host", "hotels.ctrip.com");

        map.put("Accept", "*/*");
        map.put("Cache-Control", "max-age=0");
        map.put("If-Modified-Since", "Thu, 01 Jan 1970 00:00:00 GMT");
        map.put("Content-Type","application/x-javascript; charset=utf-8");
        map.put("Accept-Language", "zh-CN,zh;q=0.8");
        map.put("Referer", "http://hotels.ctrip.com/hotel/" + hotelid + ".html");
        map.put("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36");
        return  map;
    }

    /**
     * Obtains the {@code eleven} token needed by the review request.
     * <p>
     * Downloads the script served by {@code /domestic/cas/oceanball} for a random callback
     * name, unpacks it with a first script engine, and runs the unpacked code in a second
     * engine that has stub {@code window}, {@code document}, {@code navigator} and
     * {@code location} objects. The script is expected to call the callback, which stores
     * its argument in {@code getEleven}; that value is then invoked as a function and its
     * result returned.
     * <p>
     * NOTE(review): the downloaded script is evaluated as-is; it is remote code and should
     * only be run if the source is trusted.
     *
     * @param hotelid hotel id exposed to the script as {@code hotel_id}
     * @param headMap request headers for the token request
     * @param cookies cookies for the token request
     * @return the token; an empty string if the engine cannot invoke functions; {@code null}
     *         if no JavaScript engine is available or the request or script evaluation failed
     */
    public static String oceanball(String hotelid,Map headMap,Map cookies){

        try {
            ScriptEngineManager manager = new ScriptEngineManager();
            ScriptEngine engine = manager.getEngineByName("javascript");
            ScriptEngineManager manager1 = new ScriptEngineManager();
            ScriptEngine engine1 = manager1.getEngineByName("javascript");
            if (engine == null || engine1 == null) {
                // getEngineByName returns null when no matching engine is installed.
                System.err.println("oceanball: no JavaScript script engine available in this Java runtime");
                return null;
            }

            String callback = getcallback(15);
            HttpClient h=new JavaHttpClient();
            long currtime = System.currentTimeMillis();
            String oceanball = "http://hotels.ctrip.com/domestic/cas/oceanball?callback="+callback+"&_="+currtime+"";
            HttpResponse res=h.doGet2(oceanball,headMap,cookies);
            String ocean = res.getResponseString();
            // Turn the outer eval(...) into JSON.stringify(...) so the packed code comes back as text.
            ocean = ocean.replace("eval","JSON.stringify");
            ocean = String.valueOf( engine.eval(ocean));
            ocean = ocean.replace(callback,"var eleven=" + callback);
            ocean = String.valueOf(engine.eval(new StringReader(ocean)));

            // Stub objects for the second engine, plus the callback that captures the token function.
            engine1.eval("var hotel_id = \""+hotelid+"\"; var site = {}; site.getUserAgent = function(){}; var Image = function(){}; var window = {}; window.document = window.document = {body:{innerHTML:\"1\"}, documentElement:{attributes:{webdriver:\"1\"}}, createElement:function(x){return {innerHTML:\"1\"}}}; var document = window.document;window.navigator = {\"appCodeName\":\"Mozilla\", \"appName\":\"Netscape\", \"language\":\"zh-CN\", \"platform\":\"Win\"}; window.navigator.userAgent = site.getUserAgent(); var navigator = window.navigator; window.location = {}; window.location.href = \"http://hotels.ctrip.com/hotel/\"+hotel_id+\".html\"; var location = window.location;" +
                    " var navigator = {userAgent:{indexOf: function(x){return \"1\"}}, geolocation:\"1\"};var getEleven = 'zgs';  " );
            engine1.eval("var "+callback+" = function(a){getEleven = a;};");
            engine1.eval(ocean);
            String eleven = "";
            // The function lives in engine1, so that is the engine that must be Invocable.
            if (engine1 instanceof Invocable) {
                Invocable invocable = (Invocable) engine1;
                // invokeFunction: first argument is the function name, the rest are its arguments.
                Object result = invocable.invokeFunction("getEleven");
                eleven = result == null ? null : String.valueOf(result);
            }
            return eleven;
        } catch (IOException | NoSuchMethodException | ScriptException e) {
            e.printStackTrace();
        }

        return null;
    }

    /**
     * callback参数获取
     * @param number
     * @return
     */
    public  static String getcallback(int number){
        String s[]={"A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"};
        String cal="CAS";
        for(int i=0;i

GET 或 POST 请求的代码如下,分成了好几个类,写得比较复杂,可以直接复制使用。

import javax.net.ssl.*;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.net.URL;
import java.security.KeyManagementException;
import java.security.NoSuchAlgorithmException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * @Author: liaog
 * @Date: 2018-08-01 13:28
 * @description:
 */
public class JavaHttpClient extends AbstractHttpClient implements HttpClient{

    protected String defaultCharset = "utf-8";

    protected int timeout = 20000;

    protected boolean useProxy;

    protected String hostname;

    protected int port;

    public HttpResponse doRequest(HttpMethod method, String url, Map userHeaders, InputStream data,Map cookies) throws IOException {

        URL urlObject = new URL(url);
        HttpURLConnection urlConnection = openConnection(urlObject);

        if (urlObject.getProtocol().equalsIgnoreCase("https")) {
            HttpsURLConnection httpsURLConnection = (HttpsURLConnection) urlConnection;
            prepareForHttps(httpsURLConnection);
        }
        String sessionid="";
        if (cookies != null) {
            for (Map.Entry entry : cookies.entrySet()) {
                //urlConnection.addRequestProperty(entry.getKey(), entry.getValue());
                sessionid=sessionid+entry.getKey()+"="+entry.getValue()+";";
            }
            urlConnection.addRequestProperty("Cookie", sessionid);
        }

        urlConnection.setRequestMethod(method.name());
        urlConnection.setConnectTimeout(timeout);
        urlConnection.setReadTimeout(timeout);
        urlConnection.setInstanceFollowRedirects(false);
        if (userHeaders != null) {
            for (Map.Entry entry : userHeaders.entrySet()) {
                urlConnection.addRequestProperty(entry.getKey(), entry.getValue());
            }
        }
        if (data != null) {
            int len = data.available();
            urlConnection.addRequestProperty("Content-Length", String.valueOf(len));
            urlConnection.setDoInput(true);
            urlConnection.setDoOutput(true);
            OutputStream outputStream = urlConnection.getOutputStream();
            IOUtil.copyAndClose(data, outputStream);
        }
        urlConnection.setInstanceFollowRedirects( false );
        InputStream responseInputStream = urlConnection.getInputStream();
        int responseCode = urlConnection.getResponseCode();
        ByteArrayOutputStream bos = new ByteArrayOutputStream(64);
        IOUtil.copyAndClose(responseInputStream, bos);

        Map> headers = new HashMap>(urlConnection.getHeaderFields());
        String sessionId = "";
        String cookieVal = "";
        String key = null;
        Map map=new HashMap();
        for(int i = 1; (key = urlConnection.getHeaderFieldKey(i)) != null; i++){//获取cookies
            if(key.equalsIgnoreCase("set-cookie")){
                cookieVal = urlConnection.getHeaderField(i);
                cookieVal = cookieVal.substring(0, cookieVal.indexOf(";"));
                String s[]=cookieVal.split("=");
                map.put(s[0],s[1]);
                sessionId = sessionId + cookieVal + ";";
                System.out.println("==="+cookieVal);
            }
        }
       // System.out.println("session"+sessionId);
        String redirect=urlConnection.getHeaderField( "location" );//获得302转发地址
        //System.out.println("location:"+redirect);
        /*if(redirect!=null){
            doRequest(HttpMethod.GET,redirect,userHeaders,null,map);
        }*/

        return new HttpResponse(defaultCharset, responseCode, headers, bos.toByteArray(),map);
    }

    private HttpURLConnection openConnection(URL url) throws IOException {
        if (isUseProxy()) {
            return (HttpURLConnection)url.openConnection(new Proxy(Proxy.Type.HTTP, new InetSocketAddress(getHostname(), getPort())));
        } else {
            return (HttpURLConnection)url.openConnection();
        }
    }

    private void prepareForHttps(HttpsURLConnection httpsURLConnection) {
        try {
            SSLContext sslContext = SSLContext.getInstance("SSL");
            sslContext.init(null, new TrustManager[]{new AbstractHttpClient.TrustAnyTrustManager()}, secureRandom);
            httpsURLConnection.setSSLSocketFactory(sslContext.getSocketFactory());
            httpsURLConnection.setHostnameVerifier(new TrustAnyHostnameVerifier());
        } catch (NoSuchAlgorithmException e) {
            throw new RuntimeException(e);
        } catch (KeyManagementException e) {
            throw new RuntimeException(e);
        }
    }

    public String getDefaultCharset() {
        return defaultCharset;
    }

    public void setDefaultCharset(String defaultCharset) {
        this.defaultCharset = defaultCharset;
    }

    public int getTimeout() {
        return timeout;
    }

    public void setTimeout(int timeout) {
        this.timeout = timeout;
    }

    public boolean isUseProxy() {
        return useProxy;
    }

    public void setUseProxy(boolean useProxy) {
        this.useProxy = useProxy;
    }

    public String getHostname() {
        return hostname;
    }

    public void setHostname(String hostname) {
        this.hostname = hostname;
    }

    public int getPort() {
        return port;
    }

    public void setPort(int port) {
        this.port = port;
    }
}

 

 

import java.io.IOException;
import java.io.InputStream;
import java.util.Map;

/**
 * Minimal HTTP client abstraction used by the crawler.
 * <p>
 * Header and cookie maps are name-to-value string maps; {@code null} means "none".
 *
 * @Author: liaogk
 * @Date: 2018-08-01 13:18
 */
public interface HttpClient {

    /**
     * Sends a request with the given method.
     *
     * @param method      request method
     * @param url         absolute request URL
     * @param userHeaders request headers, may be null
     * @param data        request body, may be null
     * @param cookies     cookies to send, may be null
     * @return the response
     * @throws IOException if the request fails
     */
    HttpResponse doRequest(HttpMethod method, String url, Map<String, String> userHeaders, InputStream data, Map<String, String> cookies)throws IOException;

    /**
     * Sends a GET request without extra headers or cookies.
     *
     * @return the raw response body
     * @throws IOException if the request fails
     */
    byte[] doGet(String url) throws IOException;

    /**
     * Sends a GET request with the given headers.
     *
     * @throws IOException if the request fails
     */
    HttpResponse doGet(String url, Map<String, String> headers) throws IOException;

    /**
     * Sends a GET request with the given headers and cookies.
     *
     * @throws IOException if the request fails
     */
    HttpResponse doGet2(String url, Map<String, String> headers, Map<String, String> cookies) throws IOException;

    /**
     * Sends a POST request with the given headers, body and cookies.
     *
     * @throws IOException if the request fails
     */
    HttpResponse dopost(String url, Map<String, String> headers, InputStream data, Map<String, String> cookies) throws IOException;

}

 

 

/**
 * HTTP request methods understood by {@code HttpClient}.
 * <p>
 * The constant name is what gets sent as the request method, so it must match the
 * method name of the HTTP specification exactly.
 *
 * @Author: liaogk
 * @Date: 2018-08-01 13:22
 */
public enum  HttpMethod {
    POST,
    GET,
    PUT,
    DELETE,
    /**
     * Not a valid HTTP method name; kept only so existing references keep compiling.
     *
     * @deprecated use {@link #HEAD}
     */
    @Deprecated
    HEADER,
    OPTIONS,
    /** The HTTP {@code HEAD} method; appended last so existing ordinals are unchanged. */
    HEAD;
}
import java.io.UnsupportedEncodingException;
import java.util.List;
import java.util.Map;

/**
 * In-memory HTTP response: status code, headers, raw body and the cookies set by the server.
 * <p>
 * Header names are looked up case-insensitively, because servers and proxies do not agree
 * on the capitalisation of names such as {@code Content-Type}.
 *
 * @Author: liaog
 * @Date: 2018-08-01 13:24
 */
public class HttpResponse {

    private final int responseCode;
    private final Map<String, List<String>> headers;
    private final byte[] responseData;
    private final String defaultCharset;
    private Map<String, String> cookies;

    /**
     * @param defaultCharset charset used to decode the body when the response declares none
     * @param responseCode   HTTP status code
     * @param headers        response headers (name to values), may be null
     * @param responseData   raw response body
     * @param cookies        cookies set by the response (name to value)
     */
    public HttpResponse(String defaultCharset, int responseCode, Map<String, List<String>> headers, byte[] responseData, Map<String, String> cookies) {
        this.defaultCharset = defaultCharset;
        this.responseCode = responseCode;
        this.headers = headers;
        this.responseData = responseData;
        this.cookies = cookies;
    }

    public int getResponseCode() {
        return responseCode;
    }

    public byte[] getResponseData() {
        return responseData;
    }

    /**
     * Decodes the body with the charset from {@link #getResponseCharset()}.
     *
     * @return the body as text
     * @throws RuntimeException if that charset is not supported by this Java runtime
     */
    public String getResponseString() {
        try {
            return new String(responseData, getResponseCharset());
        } catch (UnsupportedEncodingException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Returns all values of a header.
     *
     * @param name header name, matched exactly first and then ignoring case
     * @return the values, or null if the header is absent
     */
    public List<String> getHeaders(String name) {
        if (headers == null) {
            return null;
        }
        List<String> exact = headers.get(name);
        if (exact != null || name == null) {
            return exact;
        }
        for (Map.Entry<String, List<String>> entry : headers.entrySet()) {
            // equalsIgnoreCase(null) is false, so a null key (status line) is skipped.
            if (name.equalsIgnoreCase(entry.getKey())) {
                return entry.getValue();
            }
        }
        return null;
    }

    /**
     * Returns the first value of a header.
     *
     * @param name header name, case-insensitive
     * @return the first value, or null if the header is absent or has no values
     */
    public String getHeader(String name) {
        List<String> theHeaders = getHeaders(name);
        if (theHeaders == null || theHeaders.isEmpty()) {
            return null;
        }
        return theHeaders.get(0);
    }


    /**
     * Determines the charset of the body from the {@code charset} parameter of the
     * {@code Content-Type} header. The parameter name is matched ignoring case and
     * surrounding double quotes around the value are removed.
     *
     * @return the declared charset, or the default charset if none is declared
     */
    public String getResponseCharset() {
        String contentType = getHeader("Content-Type");
        if (contentType == null || contentType.length() == 0) {
            return defaultCharset;
        }
        String[] parts = contentType.split(";");
        for (String part : parts) {
            String[] kvParts = part.trim().split("=");
            if (kvParts.length < 2) {
                continue;
            }
            String key = kvParts[0].trim();
            String value = kvParts[1].trim();
            if (!key.equalsIgnoreCase("charset")) {
                continue;
            }
            if (value.length() >= 2 && value.startsWith("\"") && value.endsWith("\"")) {
                value = value.substring(1, value.length() - 1).trim();
            }
            if (value.length() > 0) {
                return value;
            }
        }
        return defaultCharset;
    }

    public Map<String, String> getCookies() {
        return cookies;
    }

    public void setCookies(Map<String, String> cookies) {
        this.cookies = cookies;
    }
}

 

import java.io.*;

/**
 * Small stream and file helpers.
 *
 * @Author: liaog
 * @Date: 2018-08-01 13:39
 */
public class IOUtil {

    private static final int BUF_LEN = 1024 * 8;
    /**
     * Per-thread copy buffer, cached in a ThreadLocal so repeated copies on the same
     * thread do not allocate a new buffer each time.
     */
    private static final ThreadLocal<byte[]> bufTl = new ThreadLocal<byte[]>() {
        @Override
        protected byte[] initialValue() {
            return new byte[BUF_LEN];
        }
    };

    private static byte[] getBuf() {
        return bufTl.get();
    }

    /**
     * Copies everything from {@code is} to {@code os} and closes both streams.
     * Both streams are closed even when reading or writing fails.
     *
     * @param is source stream
     * @param os target stream
     * @throws IOException if reading, writing or flushing fails
     */
    public static void copyAndClose(InputStream is, OutputStream os) throws IOException {
        try {
            byte[] buf = getBuf();
            int len;
            while ((len = is.read(buf)) >= 0) {
                os.write(buf, 0, len);
            }
            // Flush here so a failing flush is reported; close() below swallows errors.
            os.flush();
        } finally {
            close(is);
            close(os);
        }
    }

    /**
     * Reads a whole file into memory.
     *
     * @param file file to read
     * @return the file content
     * @throws IOException if the file cannot be opened or read
     */
    public static byte[] readAsBytes(File file) throws IOException {
        ByteArrayOutputStream bos = new ByteArrayOutputStream(BUF_LEN);
        copyAndClose(readAsStream(file), bos);
        return bos.toByteArray();
    }

    /**
     * Writes {@code data} to {@code file}, replacing any existing content.
     *
     * @throws IOException if the file cannot be opened or written
     */
    public static void writeBytesToFile(File file, byte[] data) throws IOException{
        FileOutputStream fos = new FileOutputStream(file);
        copyAndClose(new ByteArrayInputStream(data), fos);
    }

    /**
     * Opens a file for reading; the caller must close the returned stream.
     *
     * @throws IOException if the file cannot be opened
     */
    public static InputStream readAsStream(File file) throws IOException {
        return new FileInputStream(file);
    }

    /** Closes {@code c} if it is not null, ignoring any exception thrown by {@code close()}. */
    public static void close(Closeable c) {
        if (c != null) {
            try {
                c.close();
            } catch (Exception ignored) {
                // best-effort close: nothing useful can be done here
            }
        }
    }

    /**
     * Returns the suffix of a file name including the dot, e.g. {@code ".txt"}.
     *
     * @param name file name, may be null
     * @return the text from the last '.' on, or null if {@code name} is null or has no '.'
     */
    public static String getFileNameSuffix(String name) {
        if (name == null) {
            return null;
        }
        int pos = name.lastIndexOf('.');
        if (pos < 0) {
            return null;
        }
        return name.substring(pos);
    }

}

 

import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.SSLSession;
import javax.net.ssl.X509TrustManager;
import java.io.IOException;
import java.io.InputStream;
import java.security.SecureRandom;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.Map;

/**
 * Base class for {@link HttpClient} implementations.
 * <p>
 * Subclasses implement {@link #doRequest}; the GET/POST convenience methods are expressed
 * in terms of it. The class also provides a trust manager and a host name verifier that
 * accept everything, plus a random source subclasses can use for their TLS setup.
 * <p>
 * NOTE(review): the accept-everything helpers switch off certificate and host name
 * checking for whoever installs them.
 *
 * @Author: liaogk
 * @Date: 2018-08-01 13:19
 */
public abstract class AbstractHttpClient implements HttpClient{

    /** Random source available to subclasses. */
    protected final SecureRandom secureRandom = new SecureRandom();

    /** Performs the actual request; every convenience method below ends up here. */
    @Override
    public abstract HttpResponse doRequest(HttpMethod method, String url, Map userHeaders, InputStream data ,Map cookies) throws IOException;

    /** GET without headers or cookies; returns only the raw response body. */
    @Override
    public  byte[] doGet(String url) throws IOException {
        HttpResponse response = doGet(url, null);
        return response.getResponseData();
    }

    /** GET with headers, without body or cookies. */
    @Override
    public HttpResponse doGet(String url, Map headers) throws IOException {
        return doRequest(HttpMethod.GET, url, headers, null, null);
    }

    /** GET with headers and cookies, without body. */
    @Override
    public HttpResponse doGet2(String url, Map headers,Map cookies) throws IOException {
        return doRequest(HttpMethod.GET, url, headers, null, cookies);
    }

    /** POST with headers, body and cookies. */
    @Override
    public HttpResponse dopost(String url, Map headers,InputStream data, Map  cookies) throws IOException {
        return doRequest(HttpMethod.POST, url, headers, data, cookies);
    }

    /** Host name verifier that accepts every host name. */
    protected static class TrustAnyHostnameVerifier implements HostnameVerifier {
        @Override
        public boolean verify(String hostname, SSLSession session) {
            return true;
        }
    }

    /** Trust manager whose checks never reject a certificate chain. */
    protected static class TrustAnyTrustManager implements X509TrustManager {
        @Override
        public void checkClientTrusted(X509Certificate[] chain, String authType) throws CertificateException {
            // intentionally empty: nothing is rejected
        }

        @Override
        public void checkServerTrusted(X509Certificate[] chain, String authType) throws CertificateException {
            // intentionally empty: nothing is rejected
        }

        @Override
        public X509Certificate[] getAcceptedIssuers() {
            return new X509Certificate[0];
        }
    }
}

 

 

 

你可能感兴趣的:(Java爬虫)