Java爬虫技术之绕过百度云防护抓取网站内容

大家好,我是Coody


最近做文章采集,碰到一个有经过百度云加速的网站,由于打开浏览器需要安全检查,所以针对相关机制做了一下研究,故此封装了一个HTTP工具。


本文已发布至开源中国,由于CSDN用户量巨大且易于被搜索引擎收录,故此也在这里分享出来,希望对有需要的朋友有所帮助。


直接贴代码,copy下来可以直接使用






如图:


输入图片说明


首先需要一个Http工具类:HttpHandle


package org.coody.robot.util;


import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URI;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;




/**
 * Small HTTP client built on {@link HttpURLConnection}.
 *
 * <p>An instance carries a mutable {@link HttpConfig} (headers, cookie,
 * charset, timeout, redirect policy). Cookies received from the server are
 * merged back into the config after every request, so a single instance
 * behaves like a very small cookie-aware session. Instances are not
 * synchronized.
 */
public class HttpHandle {

    public final static String POST = "POST";
    public final static String GET = "GET";
    public final static String HEAD = "HEAD";
    public final static String PUT = "PUT";
    public final static String CONNECT = "CONNECT";
    public final static String OPTIONS = "OPTIONS";
    public final static String DELETE = "DELETE";
    public final static String PATCH = "PATCH";
    public final static String PROPFIND = "PROPFIND";
    public final static String PROPPATCH = "PROPPATCH";
    public final static String MKCOL = "MKCOL";
    public final static String COPY = "COPY";
    public final static String MOVE = "MOVE";
    public final static String LOCK = "LOCK";
    public final static String UNLOCK = "UNLOCK";
    public final static String TRACE = "TRACE";

    /** Request profile that sends form-encoded bodies (the default). */
    public final static String HTTP_GENERAL = "HTTP_GENERAL";

    /** Request profile that sends JSON bodies. */
    public final static String HTTP_JSON = "HTTP_JSON";

    private static final String DEFAULT_ACCEPT =
            "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*";

    private static final String DEFAULT_USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36";

    /** Kept public because existing callers access {@code http.config} directly. */
    public HttpConfig config = new HttpConfig();

    public HttpConfig getConfig() {
        return config;
    }

    public void setConfig(HttpConfig config) {
        this.config = config;
    }

    /** Per-client request settings. */
    public static class HttpConfig {

        private boolean allowRedirects = true;

        private String cookie = "";

        private String encode = "UTF-8";

        /** Connect/read timeout in seconds. */
        private int timeOut = 15;

        private String httpModule = HTTP_GENERAL;

        private Map<String, String> headerMap = new HashMap<String, String>();

        /** Sets the charset used for request bodies and reported on responses. */
        public void setEncode(String encode) {
            this.encode = encode;
        }

        /** Sets the timeout in seconds. */
        public void setTimeOut(int timeOut) {
            this.timeOut = timeOut;
        }

        /** Sets the {@code Cookie} header value sent with every request. */
        public void setCookie(String cookie) {
            this.cookie = cookie;
        }

        /** Replaces all custom headers; {@code null} clears them. */
        public void setHeaderMap(Map<String, String> headerMap) {
            this.headerMap = headerMap != null ? headerMap : new HashMap<String, String>();
        }

        /**
         * Selects the request profile, {@link HttpHandle#HTTP_GENERAL} or
         * {@link HttpHandle#HTTP_JSON}. Any other value makes requests return
         * {@code null}.
         */
        public void setHttpModule(String httpModule) {
            this.httpModule = httpModule;
        }

        /** Sets (or overwrites) one request header. */
        public void setRequestProperty(String fieldName, String value) {
            headerMap.put(fieldName, value);
        }

        /**
         * Enables or disables gzip transfer. Only gzip is advertised because
         * it is the only encoding this client can decode.
         */
        public void setGzip(boolean isGzip) {
            headerMap.put("Accept-Encoding", isGzip ? "gzip" : "identity");
        }

        /** Chooses between {@code Connection: keep-alive} and {@code close}. */
        public void setKeepAlive(boolean keepAlive) {
            headerMap.put("Connection", keepAlive ? "keep-alive" : "close");
        }

        /** Enables or disables automatic following of redirects. */
        public void allowRedirects(boolean allowRedirects) {
            this.allowRedirects = allowRedirects;
        }
    }

    /** Opens a connection carrying the browser-like default headers. */
    private HttpURLConnection createConnection(String url, String contentType) {
        try {
            HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection();
            String referer = getDomain(url);
            if (referer != null) {
                conn.addRequestProperty("Referer", referer);
            }
            conn.addRequestProperty("Accept", DEFAULT_ACCEPT);
            conn.addRequestProperty("Content-type", contentType);
            conn.addRequestProperty("User-Agent", DEFAULT_USER_AGENT);
            return conn;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    private HttpURLConnection createConnectionGeneral(String url) {
        return createConnection(url, "application/x-www-form-urlencoded");
    }

    private HttpURLConnection createConnectionJson(String url) {
        return createConnection(url, "application/json");
    }

    /**
     * Returns the origin of a URL, always ending in a slash, e.g.
     * {@code http://host:8080/}. The scheme's default port is omitted.
     *
     * @param urlStr absolute URL
     * @return the origin, or {@code null} if the URL cannot be parsed or has
     *         no scheme/host
     */
    public static String getDomain(String urlStr) {
        try {
            URI uri = new URI(urlStr);
            String scheme = uri.getScheme();
            String host = uri.getHost();
            if (scheme == null || host == null) {
                return null;
            }
            StringBuilder origin = new StringBuilder();
            origin.append(scheme).append("://").append(host);
            int port = uri.getPort();
            boolean defaultPort = ("http".equalsIgnoreCase(scheme) && port == 80)
                    || ("https".equalsIgnoreCase(scheme) && port == 443);
            if (port > 0 && !defaultPort) {
                // A port is separated from the host by ':', not '/'.
                origin.append(':').append(port);
            }
            origin.append('/');
            return origin.toString();
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Merges two {@code name=value;name=value} cookie strings; entries from
     * {@code newCookie} win. The result uses the form {@code a=1;b=2;}.
     */
    private static String mergeCookie(String oldCookie, String newCookie) {
        if (newCookie == null) {
            return oldCookie;
        }
        // Insertion-ordered so the produced header is deterministic.
        Map<String, String> merged = new java.util.LinkedHashMap<String, String>();
        collectCookies(oldCookie, merged);
        collectCookies(newCookie, merged);
        StringBuilder header = new StringBuilder();
        for (Map.Entry<String, String> entry : merged.entrySet()) {
            header.append(entry.getKey()).append('=').append(entry.getValue()).append(';');
        }
        return header.toString();
    }

    /** Parses one cookie string into {@code target}, skipping blank names/values. */
    private static void collectCookies(String cookies, Map<String, String> target) {
        if (cookies == null || cookies.length() == 0) {
            return;
        }
        for (String pair : cookies.split(";")) {
            // Split on the first '=' only, so values such as base64 padding survive.
            int eq = pair.indexOf('=');
            if (eq <= 0) {
                continue;
            }
            String name = pair.substring(0, eq).trim();
            String value = pair.substring(eq + 1).trim();
            if (name.length() == 0 || value.length() == 0) {
                continue;
            }
            target.put(name, value);
        }
    }

    /** Extracts the leading {@code name=value} pair of a Set-Cookie header. */
    private static String cookiePair(String setCookieValue) {
        if (setCookieValue == null) {
            return "";
        }
        int end = setCookieValue.indexOf(';');
        String pair = end < 0 ? setCookieValue : setCookieValue.substring(0, end);
        return pair.trim();
    }

    private HttpURLConnection getConnection(String url) {
        if (HTTP_GENERAL.equals(config.httpModule)) {
            return createConnectionGeneral(url);
        }
        if (HTTP_JSON.equals(config.httpModule)) {
            return createConnectionJson(url);
        }
        return null;
    }

    /** Performs a GET request; see {@link #Conn(String, String, String)}. */
    public HttpEntity Get(String url) {
        return Conn(url, GET, null);
    }

    /** Performs a POST request; see {@link #Conn(String, String, String)}. */
    public HttpEntity Post(String url, String data) {
        return Conn(url, POST, data);
    }

    /**
     * Sends a request and reads the complete response.
     *
     * @param url      target URL; spaces are percent-encoded
     * @param method   HTTP method name in upper case
     * @param postData request body for POST/PUT, {@code null} meaning empty
     * @return the response; {@code null} if no connection could be created;
     *         an entity with code {@code -1} if the exchange failed
     */
    public HttpEntity Conn(String url, String method, String postData) {
        if (url == null || method == null) {
            return null;
        }
        url = url.replace(" ", "%20");
        HttpURLConnection conn = getConnection(url);
        if (conn == null) {
            return null;
        }
        for (Map.Entry<String, String> header : config.headerMap.entrySet()) {
            conn.setRequestProperty(header.getKey(), header.getValue());
        }
        if (!config.allowRedirects) {
            conn.setInstanceFollowRedirects(false);
        }
        if (config.cookie != null && config.cookie.length() > 0) {
            conn.setRequestProperty("Cookie", config.cookie);
        }
        // Timeouts only take effect when set before the connection is opened.
        conn.setConnectTimeout(config.timeOut * 1000);
        conn.setReadTimeout(config.timeOut * 1000);

        HttpEntity hEntity = new HttpEntity();
        InputStream ins = null;
        try {
            conn.setRequestMethod(method);
            if (POST.equalsIgnoreCase(method) || PUT.equalsIgnoreCase(method)) {
                conn.setDoOutput(true);
                byte[] payload = (postData == null ? "" : postData).getBytes(config.encode);
                java.io.OutputStream out = conn.getOutputStream();
                try {
                    out.write(payload);
                    out.flush();
                } finally {
                    out.close();
                }
            }

            int status = conn.getResponseCode();
            hEntity.setCode(status);
            // getErrorStream() only carries a body for 4xx/5xx responses.
            ins = status >= HttpURLConnection.HTTP_BAD_REQUEST
                    ? conn.getErrorStream()
                    : conn.getInputStream();

            Map<String, String> headMap = new HashMap<String, String>();
            StringBuilder received = new StringBuilder();
            String key;
            for (int i = 1; (key = conn.getHeaderFieldKey(i)) != null; i++) {
                headMap.put(key, conn.getHeaderField(key));
                if (key.equalsIgnoreCase("set-cookie")) {
                    // Keep only name=value; attributes are not sent back to the server.
                    String pair = cookiePair(conn.getHeaderField(i));
                    if (pair.length() > 0) {
                        received.append(pair).append(';');
                    }
                }
            }
            config.cookie = mergeCookie(config.cookie, received.toString());

            byte[] body = toByte(ins);
            // Decode only when the server says the body is gzip-encoded.
            String contentEncoding = conn.getContentEncoding();
            if (body != null && contentEncoding != null
                    && contentEncoding.toLowerCase(java.util.Locale.ENGLISH).contains("gzip")) {
                body = GZIPUtils.uncompress(body);
            }
            hEntity.setEncode(config.encode);
            hEntity.setBye(body);
            hEntity.setCookie(config.cookie);
            hEntity.setHeadMap(headMap);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (ins != null) {
                try {
                    ins.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
        return hEntity;
    }

    /** Reads a stream to the end; returns {@code null} for a null stream or on I/O failure. */
    private byte[] toByte(InputStream ins) {
        if (ins == null) {
            return null;
        }
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        try {
            byte[] chunk = new byte[4096];
            int read;
            while ((read = ins.read(chunk)) != -1) {
                buffer.write(chunk, 0, read);
            }
            return buffer.toByteArray();
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }

}


其次需要一个Http响应对象类:HttpEntity


package org.coody.robot.util;


import java.util.HashMap;
import java.util.Map;


/**
 * Holds one HTTP response: status code, headers, cookies and the raw body,
 * with helpers to decode the body as text.
 */
public class HttpEntity {

    /** Decoded body, cached by the first successful {@link #getHtml()} call. */
    private String html;
    /** Raw response body. */
    private byte[] bye;
    /** Cookies in {@code name=value;name=value} form. */
    private String cookie;
    /** HTTP status code; {@code -1} until a response has been received. */
    private Integer code = -1;
    private Map<String, String> headMap;
    /** Charset name used to decode the body. */
    private String encode = "UTF-8";

    public Map<String, String> getHeadMap() {
        return headMap;
    }

    public void setHeadMap(Map<String, String> headMap) {
        this.headMap = headMap;
    }

    /**
     * Returns the body decoded with the configured charset.
     *
     * @return the text, or {@code null} if there is no body or the charset
     *         is unsupported
     */
    public String getHtml() {
        if (html != null) {
            return html;
        }
        if (bye == null) {
            return null;
        }
        try {
            html = new String(bye, encode);
            return html;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Returns the body as text, gunzipping it first when requested.
     *
     * @param isGzip {@code true} if the stored bytes are gzip-compressed;
     *               {@code false} behaves like {@link #getHtml()}
     * @return the text, or {@code null} if there is no body or decoding fails
     */
    public String getHtml(boolean isGzip) {
        if (!isGzip) {
            return getHtml();
        }
        if (bye == null) {
            return null;
        }
        try {
            byte[] plain = GZIPUtils.uncompress(bye);
            if (plain == null) {
                return null;
            }
            return new String(plain, encode);
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    public String getEncode() {
        return encode;
    }

    public void setEncode(String encode) {
        this.encode = encode;
    }

    public void setHtml(String html) {
        this.html = html;
    }

    public Integer getCode() {
        return code;
    }

    public void setCode(Integer code) {
        this.code = code;
    }

    public String getCookie() {
        return cookie;
    }

    public void setCookie(String cookie) {
        this.cookie = cookie;
    }

    public byte[] getBye() {
        return bye;
    }

    public void setBye(byte[] bye) {
        this.bye = bye;
    }

    /**
     * Parses the cookie string into a name-to-value map. Names and values
     * are trimmed; entries without {@code '='} or with an empty value are
     * skipped.
     *
     * @return the cookies, or {@code null} if no cookie string is set
     */
    public Map<String, String> getCookieMap() {
        if (cookie == null) {
            return null;
        }
        Map<String, String> cookieMap = new HashMap<String, String>();
        for (String pair : cookie.split(";")) {
            // Split on the first '=' only so that values ending in '='
            // (for example base64 padding) are kept intact.
            int eq = pair.indexOf('=');
            if (eq < 0 || eq == pair.length() - 1) {
                continue;
            }
            cookieMap.put(pair.substring(0, eq).trim(), pair.substring(eq + 1).trim());
        }
        return cookieMap;
    }
}


某些网站启用了Gzip压缩,因此还需要一个Gzip压缩/解压工具类:GZIPUtils


package org.coody.robot.util;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;  
  
  
  
/**
 * GZIP compression and decompression helpers working on in-memory data.
 *
 * @author wenqi5
 */
public class GZIPUtils {

    /** Charset used when compressing a {@link String}. */
    public static final String GZIP_ENCODE_UTF_8 = "UTF-8";

    /**
     * Compresses the UTF-8 bytes of a string.
     *
     * @param str text to compress
     * @return the GZIP data, or {@code null} if {@code str} is null or empty
     */
    public static byte[] compress(String str) {
        if (str == null) {
            return null;
        }
        try {
            return compress(str.getBytes(GZIP_ENCODE_UTF_8));
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Compresses a byte array.
     *
     * @param data bytes to compress
     * @return the GZIP data, or {@code null} if {@code data} is null or empty
     */
    public static byte[] compress(byte[] data) {
        if (data == null || data.length == 0) {
            return null;
        }
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        GZIPOutputStream gzip = null;
        try {
            gzip = new GZIPOutputStream(out);
            gzip.write(data);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Closing writes the GZIP trailer, so it must happen before toByteArray().
            closeQuietly(gzip);
        }
        return out.toByteArray();
    }

    /**
     * Decompresses GZIP data.
     *
     * <p>This is deliberately best-effort: if the input is not valid GZIP,
     * or is truncated, whatever could be decoded so far is returned (an
     * empty array when nothing could be decoded) and no exception is raised.
     *
     * @param bytes GZIP data
     * @return the decompressed bytes, or {@code null} if {@code bytes} is
     *         null or empty
     */
    public static byte[] uncompress(byte[] bytes) {
        if (bytes == null || bytes.length == 0) {
            return null;
        }
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        GZIPInputStream ungzip = null;
        try {
            ungzip = new GZIPInputStream(new ByteArrayInputStream(bytes));
            byte[] buffer = new byte[4096];
            int n;
            while ((n = ungzip.read(buffer)) >= 0) {
                out.write(buffer, 0, n);
            }
        } catch (IOException ignored) {
            // Best effort by contract: fall through and return what was decoded.
        } finally {
            // Releases the inflater held by the stream.
            closeQuietly(ungzip);
        }
        return out.toByteArray();
    }

    /** Closes a stream, ignoring {@code null} and close failures. */
    private static void closeQuietly(java.io.Closeable stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException ignored) {
            // In-memory streams; nothing to recover.
        }
    }
}
以上类均依赖一个StringUtil,笔者比较懒,也没有拆分出来


package org.coody.robot.util;


import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


//import oracle.sql.CLOB;


/**
 * Static helpers for strings, simple collections, number parsing and
 * weighted random selection.
 *
 * <p>Most methods are lenient: invalid input yields {@code null}, an empty
 * string or {@code -1} (as documented per method) instead of an exception.
 */
public class StringUtil {

    private static final Pattern BLANK = Pattern.compile("\\s*|\t|\r|\n");
    private static final Pattern MOBILE =
            Pattern.compile("^((13[0-9])|(15[^4,\\D])|(17[^4,\\D])|(18[0,5-9]))\\d{8}$");
    private static final Pattern LEGAL = Pattern.compile("[A-Za-z0-9_]{3,16}");
    private static final Pattern EMAIL = Pattern.compile(
            "^([a-zA-Z0-9_\\-\\.]+)@((\\[[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.)|(([a-zA-Z0-9\\-]+\\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\\]?)$");
    private static final Pattern MD5 = Pattern.compile("[A-Za-z0-9_]{16,40}");

    /**
     * Converts each element to an Integer via its string form.
     *
     * @return an array of the same length; slots whose element cannot be
     *         parsed stay {@code null}. {@code null} if the input is empty.
     */
    public static Integer[] getIntegerParas(Object[] objs) {
        if (isNullOrEmpty(objs)) {
            return null;
        }
        Integer[] ints = new Integer[objs.length];
        for (int i = 0; i < objs.length; i++) {
            try {
                ints[i] = Integer.valueOf(objs[i].toString());
            } catch (Exception ignored) {
                // Unparseable or null element: leave the slot null.
            }
        }
        return ints;
    }

    /**
     * Repeats {@code baseStr} {@code size} times, joined by {@code mosaicChr}.
     *
     * @return the joined string, or {@code null} if nothing was produced
     */
    public static String getByMosaicChr(String baseStr, String mosaicChr, Integer size) {
        List<String> list = new ArrayList<String>();
        if (!isNullOrEmpty(baseStr)) {
            for (int i = 0; i < size; i++) {
                list.add(baseStr);
            }
        }
        return collectionMosaic(list, mosaicChr);
    }

    /**
     * Splits a string on a literal separator, keeping empty segments
     * (including a trailing one).
     *
     * @param src       source string
     * @param separator literal separator of any length; if empty, the source
     *                  is returned as the single element
     * @return the segments
     */
    public static String[] splitToStringArray(String src, String separator) {
        if (separator.length() == 0) {
            return new String[] { src };
        }
        List<String> parts = new ArrayList<String>();
        int from = 0;
        while (from <= src.length()) {
            int to = src.indexOf(separator, from);
            if (to < 0) {
                to = src.length();
            }
            parts.add(src.substring(from, to));
            // Skip the whole separator, not just its first character.
            from = to + separator.length();
        }
        return parts.toArray(new String[parts.size()]);
    }

    /**
     * Splits a string on a literal separator and parses every segment.
     *
     * @throws NumberFormatException if a segment is not an integer
     */
    public static Integer[] splitToIntgArray(String src, String separator) {
        String[] arr = splitToStringArray(src, separator);
        Integer[] intArr = new Integer[arr.length];
        for (int i = 0; i < arr.length; i++) {
            intArr[i] = Integer.valueOf(arr[i]);
        }
        return intArr;
    }

    /**
     * Splits a string on a literal separator and parses every segment.
     *
     * @throws NumberFormatException if a segment is not an integer
     */
    public static int[] splitToIntArray(String src, String separator) {
        String[] arr = splitToStringArray(src, separator);
        int[] intArr = new int[arr.length];
        for (int i = 0; i < arr.length; i++) {
            intArr[i] = Integer.parseInt(arr[i]);
        }
        return intArr;
    }

    /** Builds {@code "?,?,...,?"} with {@code size} placeholders. */
    public static String getInPara(Integer size) {
        return getByMosaicChr("?", ",", size);
    }

    /**
     * Returns the text between the first {@code firstTxt} and the next
     * {@code lastTxt} after it.
     *
     * @return the enclosed text, or {@code ""} if either marker is missing
     *         or an argument is null
     */
    public static String textCutCenter(String allTxt, String firstTxt, String lastTxt) {
        try {
            int start = allTxt.indexOf(firstTxt);
            if (start == -1) {
                return "";
            }
            String rest = allTxt.substring(start + firstTxt.length());
            int end = rest.indexOf(lastTxt);
            if (end == -1) {
                return "";
            }
            return rest.substring(0, end);
        } catch (Exception e) {
            return "";
        }
    }

    /**
     * Returns every text enclosed by a {@code firstTxt} ... {@code lastTxt}
     * pair, scanning left to right.
     *
     * @return the enclosed texts in order, or {@code null} if an argument
     *         is null
     */
    public static List<String> textCutCenters(String allTxt, String firstTxt, String lastTxt) {
        try {
            List<String> results = new ArrayList<String>();
            if (firstTxt.length() == 0 && lastTxt.length() == 0) {
                // Two empty markers would never consume any input.
                return results;
            }
            while (allTxt.contains(firstTxt)) {
                int n = allTxt.indexOf(firstTxt);
                allTxt = allTxt.substring(n + firstTxt.length());
                n = allTxt.indexOf(lastTxt);
                if (n == -1) {
                    return results;
                }
                results.add(allTxt.substring(0, n));
                // Continue right after the closing marker.
                allTxt = allTxt.substring(n + lastTxt.length());
            }
            return results;
        } catch (Exception e) {
            return null;
        }
    }

    /**
     * Renders every code point as {@code &#<decimal>}. Note that no
     * terminating semicolon is emitted.
     */
    public static String convertToUnicode(String source) {
        StringBuilder result = new StringBuilder();
        for (int i = 0; i < source.length();) {
            // Step by code point so surrogate pairs are emitted once.
            int codePoint = source.codePointAt(i);
            result.append("&#").append(codePoint);
            i += Character.charCount(codePoint);
        }
        return result.toString();
    }

    /** @return the parsed value, or {@code null} if empty or not an integer */
    public static Integer toInteger(Object obj) {
        if (isNullOrEmpty(obj)) {
            return null;
        }
        try {
            return Integer.valueOf(obj.toString());
        } catch (Exception e) {
            return null;
        }
    }

    /** @return {@code obj.toString()}, or {@code null} if the object is empty */
    public static String toString(Object obj) {
        if (isNullOrEmpty(obj)) {
            return null;
        }
        try {
            return obj.toString();
        } catch (Exception e) {
            return null;
        }
    }

    /** @return the parsed value, or {@code null} if empty or not a number */
    public static Double toDouble(Object obj) {
        if (isNullOrEmpty(obj)) {
            return null;
        }
        try {
            return Double.valueOf(obj.toString());
        } catch (Exception e) {
            return null;
        }
    }

    /** @return the parsed value, or {@code null} if empty or not a number */
    public static Float toFloat(Object obj) {
        if (isNullOrEmpty(obj)) {
            return null;
        }
        try {
            return Float.valueOf(obj.toString());
        } catch (Exception e) {
            return null;
        }
    }

    /** @return the parsed value, or {@code null} if empty or not a long */
    public static Long toLong(Object obj) {
        if (isNullOrEmpty(obj)) {
            return null;
        }
        try {
            return Long.valueOf(obj.toString());
        } catch (Exception e) {
            return null;
        }
    }

    /** Returns a pseudo-random integer in {@code [start, end]}. */
    public static Integer getRanDom(int start, int end) {
        return (int) (Math.random() * (end - start + 1)) + start;
    }

    /**
     * Returns a pseudo-random float between {@code start} and {@code end},
     * with as many decimal places as the more precise bound.
     */
    public static float getRanDom(Float start, Float end) {
        int digits = Math.max(fractionDigits(start), fractionDigits(end));
        double factor = Math.pow(10, digits);
        Float scaledStart = (float) (start * factor);
        Float scaledEnd = (float) (end * factor);
        return (float) (getRanDom(scaledStart.intValue(), scaledEnd.intValue()) / factor);
    }

    /** Number of characters after the decimal point in the float's string form. */
    private static int fractionDigits(Float value) {
        String[] parts = String.valueOf(value).split("\\.");
        return parts.length == 2 ? parts[1].length() : 1;
    }

    /** Removes all whitespace; returns {@code ""} for {@code null}. */
    public static String replaceBlank(String str) {
        if (str == null) {
            return "";
        }
        return BLANK.matcher(str).replaceAll("");
    }

    /** Tests whether the whole of {@code val} matches the regex {@code matcher}. */
    public static Boolean isMatcher(String val, String matcher) {
        return Pattern.compile(matcher).matcher(val).matches();
    }

    /** Tests the value against this class's mobile-number pattern. */
    public static boolean isMobile(String mobile) {
        return !isNullOrEmpty(mobile) && MOBILE.matcher(mobile).matches();
    }

    /** Tests for 3-16 characters out of letters, digits and underscore. */
    public static boolean isLegal(String str) {
        return !isNullOrEmpty(str) && LEGAL.matcher(str).matches();
    }

    /** Tests the value against this class's e-mail pattern. */
    public static boolean isEmail(String email) {
        return !isNullOrEmpty(email) && EMAIL.matcher(email).matches();
    }

    /** Tests for 16-40 characters out of letters, digits and underscore. */
    public static boolean isMd5(String md5) {
        return !isNullOrEmpty(md5) && MD5.matcher(md5).matches();
    }

    /** @return {@code true} if there are no arguments or all are empty */
    public static boolean isAllNull(Object... obj) {
        if (obj == null || obj.length == 0) {
            return true;
        }
        for (Object element : obj) {
            if (!isNullOrEmpty(element)) {
                return false;
            }
        }
        return true;
    }

    /** @return {@code true} if the list is null or all its elements are empty */
    public static boolean isAllNull(List<?> objs) {
        if (objs == null) {
            return true;
        }
        return isAllNull(objs.toArray());
    }

    /**
     * Joins the non-empty elements of an array.
     *
     * @return the joined string, or {@code null} if the array is empty
     */
    public static String collectionMosaic(Object[] objs, String mosaicChr) {
        if (isNullOrEmpty(objs)) {
            return null;
        }
        return collectionMosaic(Arrays.asList(objs), mosaicChr);
    }

    /** Joins the values of an int array. */
    public static String collectionMosaic(int[] intObjs, String mosaicChr) {
        Object[] objs = new Object[intObjs.length];
        for (int i = 0; i < intObjs.length; i++) {
            objs[i] = String.valueOf(intObjs[i]);
        }
        return collectionMosaic(objs, mosaicChr);
    }

    /**
     * Joins the non-empty arguments.
     *
     * @return the joined string, or {@code null} if there are no arguments
     */
    public static String collectionMosaic(String mosaicChr, Object... objs) {
        if (objs == null) {
            return null;
        }
        return collectionMosaic(Arrays.asList(objs), mosaicChr);
    }

    /**
     * Joins the non-empty elements of a list. Empty elements are skipped
     * and never produce a stray separator.
     *
     * @return the joined string ({@code ""} if every element is empty), or
     *         {@code null} if the list is null or has no elements
     */
    public static String collectionMosaic(List<?> objs, String mosaicChr) {
        if (objs == null || objs.isEmpty()) {
            return null;
        }
        StringBuilder sb = new StringBuilder();
        boolean first = true;
        for (Object obj : objs) {
            if (isNullOrEmpty(obj)) {
                continue;
            }
            if (!first) {
                sb.append(mosaicChr);
            }
            sb.append(obj);
            first = false;
        }
        return sb.toString();
    }

    /** Same as {@link #getByMosaicChr(String, String, Integer)}. */
    public static String getStringSByMosaicChr(String baseStr, String mosaicChr, Integer size) {
        return getByMosaicChr(baseStr, mosaicChr, size);
    }

    /**
     * Splits text and drops empty segments.
     *
     * @param text     source text
     * @param mosaiChr separator; it is passed to {@link String#split(String)}
     *                 and is therefore interpreted as a regular expression
     * @return the non-empty segments, or {@code null} if an argument is null
     */
    public static List<String> splitByMosaic(String text, String mosaiChr) {
        if (text == null || mosaiChr == null) {
            return null;
        }
        List<String> list = new ArrayList<String>();
        for (String part : text.split(mosaiChr)) {
            if (!isNullOrEmpty(part)) {
                list.add(part);
            }
        }
        return list;
    }

    /**
     * Splits text (separator is a regular expression) and parses the
     * segments; empty or unparseable segments are dropped.
     *
     * @return the parsed integers, or {@code null} if an argument is null
     */
    public static List<Integer> splitByMosaicInteger(String text, String mosaiChr) {
        if (text == null || mosaiChr == null) {
            return null;
        }
        List<Integer> list = new ArrayList<Integer>();
        for (String part : text.split(mosaiChr)) {
            if (isNullOrEmpty(part)) {
                continue;
            }
            try {
                list.add(Integer.valueOf(part));
            } catch (NumberFormatException ignored) {
                // Not a number: skip this segment.
            }
        }
        return list;
    }

    /**
     * Splits text (separator is a regular expression) and parses the
     * segments positionally.
     *
     * @return one slot per segment; empty or unparseable segments are
     *         {@code null}. {@code null} if an argument is null.
     */
    public static Integer[] splitByMosaicIntegers(String text, String mosaiChr) {
        if (text == null || mosaiChr == null) {
            return null;
        }
        String[] tab = text.split(mosaiChr);
        Integer[] list = new Integer[tab.length];
        for (int i = 0; i < tab.length; i++) {
            if (isNullOrEmpty(tab[i])) {
                continue;
            }
            try {
                list[i] = Integer.valueOf(tab[i]);
            } catch (NumberFormatException ignored) {
                // Not a number: leave the slot null.
            }
        }
        return list;
    }

    /**
     * Collects every non-empty match of a regex (compiled with
     * {@link Pattern#DOTALL}).
     *
     * @return the matches in order, or {@code null} if the pattern is
     *         invalid or an argument is null
     */
    public static List<String> doMatcher(String context, String pat) {
        try {
            List<String> matches = new ArrayList<String>();
            Matcher matcher = Pattern.compile(pat, Pattern.DOTALL).matcher(context);
            // find() always advances, even past empty matches, so this terminates.
            while (matcher.find()) {
                String hit = matcher.group(0);
                if (!isNullOrEmpty(hit)) {
                    matches.add(hit);
                }
            }
            return matches;
        } catch (Exception e) {
            return null;
        }
    }

    /** @return the first non-empty match, or {@code null} if there is none */
    public static String doMatcherFirst(String context, String pat) {
        List<String> strs = doMatcher(context, pat);
        if (isNullOrEmpty(strs)) {
            return null;
        }
        return strs.get(0);
    }

    /**
     * Tests for "no content": {@code null}, an empty CharSequence,
     * Collection or Map, or an object array that is empty or contains only
     * empty elements. Any other object is considered non-empty.
     */
    public static boolean isNullOrEmpty(Object obj) {
        try {
            if (obj == null) {
                return true;
            }
            if (obj instanceof CharSequence) {
                return ((CharSequence) obj).length() == 0;
            }
            if (obj instanceof Collection) {
                return ((Collection<?>) obj).isEmpty();
            }
            if (obj instanceof Map) {
                return ((Map<?, ?>) obj).isEmpty();
            }
            if (obj instanceof Object[]) {
                for (Object element : (Object[]) obj) {
                    if (!isNullOrEmpty(element)) {
                        return false;
                    }
                }
                return true;
            }
            return false;
        } catch (Exception e) {
            return true;
        }
    }

    /**
     * @return the index of the first empty argument, {@code 0} if all
     *         arguments are empty, or {@code -1} if none is
     */
    public static Integer findNull(Object... objs) {
        if (isNullOrEmpty(objs)) {
            return 0;
        }
        for (int i = 0; i < objs.length; i++) {
            if (isNullOrEmpty(objs[i])) {
                return i;
            }
        }
        return -1;
    }

    /** @return {@code true} if any argument is empty */
    public static boolean hasNull(Object... objs) {
        return findNull(objs) > -1;
    }

    /** Tests whether the string parses as an {@code int}. */
    public static Boolean isNumber(String str) {
        if (isNullOrEmpty(str)) {
            return false;
        }
        try {
            Integer.valueOf(str);
            return true;
        } catch (Exception e) {
            return false;
        }
    }

    /** Concatenates the strings without a separator. */
    public static String argsToString(String[] args) {
        StringBuilder sb = new StringBuilder();
        for (String tmp : args) {
            sb.append(tmp);
        }
        return sb.toString();
    }

    /** Splits a string into one-character strings; {@code null} if empty. */
    public static String[] splitString(String str) {
        if (isNullOrEmpty(str)) {
            return null;
        }
        String[] finalStrs = new String[str.length()];
        for (int i = 0; i < str.length(); i++) {
            finalStrs[i] = str.substring(i, i + 1);
        }
        return finalStrs;
    }

    /**
     * Concatenates the arguments, writing {@code "null"} in place of each
     * empty one.
     *
     * @return the concatenation, or {@code ""} if all arguments are empty
     */
    public static String getString(Object... objs) {
        if (isNullOrEmpty(objs)) {
            return "";
        }
        StringBuilder sb = new StringBuilder();
        for (Object obj : objs) {
            if (isNullOrEmpty(obj)) {
                sb.append("null");
                continue;
            }
            sb.append(String.valueOf(obj));
        }
        return sb.toString();
    }

    /** Sorts the characters of a string; {@code ""} if empty. */
    public static String stringSort(String str) {
        if (isNullOrEmpty(str)) {
            return "";
        }
        String[] strs = splitString(str);
        Arrays.sort(strs);
        return argsToString(strs);
    }

    /**
     * Finds the required elements that are missing.
     *
     * @param needList   elements that are required
     * @param actualList elements actually present
     * @return the elements of {@code needList} absent from
     *         {@code actualList}, or {@code null} if none is missing
     */
    public static <T> List<T> collisionList(List<T> needList, List<?> actualList) {
        List<T> list = new ArrayList<T>();
        for (T o : needList) {
            if (!actualList.contains(o)) {
                list.add(o);
            }
        }
        if (isNullOrEmpty(list)) {
            return null;
        }
        return list;
    }

    /** Widens a list of Integers to Longs; {@code null} if the list is empty. */
    public static List<Long> integerListToLong(List<Integer> ids) {
        if (isNullOrEmpty(ids)) {
            return null;
        }
        List<Long> list = new ArrayList<Long>(ids.size());
        for (Integer id : ids) {
            list.add(Long.valueOf(id));
        }
        return list;
    }

    /**
     * Null-tolerant variant of {@link #collisionList(List, List)}.
     *
     * @param allList      elements expected to appear
     * @param conflictList elements that actually appeared
     * @return the missing elements; {@code allList} itself if nothing
     *         appeared; {@code null} if {@code allList} is empty or nothing
     *         is missing
     */
    public static <T> List<T> listConflict(List<T> allList, List<?> conflictList) {
        if (isNullOrEmpty(allList)) {
            return null;
        }
        if (isNullOrEmpty(conflictList)) {
            return allList;
        }
        List<T> list = new ArrayList<T>();
        for (T obj : allList) {
            if (!conflictList.contains(obj)) {
                list.add(obj);
            }
        }
        if (isNullOrEmpty(list)) {
            return null;
        }
        return list;
    }

    /**
     * Picks an index at random, weighted by the given values.
     *
     * @return the chosen index, or {@code 0} if no weight matched
     */
    public static Integer bambooParse(Integer... prs) {
        Integer prSum = 0;
        for (Integer pr : prs) {
            prSum += pr;
        }
        Integer random = getRanDom(1, prSum);
        prSum = 0;
        for (int i = 0; i < prs.length; i++) {
            prSum += prs[i];
            if (random <= prSum) {
                return i;
            }
        }
        return 0;
    }

    /** @return the sum of the arguments, or {@code -1} if there are none */
    public static Integer SumInteger(Integer... sums) {
        if (isNullOrEmpty(sums)) {
            return -1;
        }
        Integer total = 0;
        for (Integer tmp : sums) {
            total += tmp;
        }
        return total;
    }

    /**
     * Picks an index at random, weighted by the given values.
     *
     * @param chances weight of each member
     * @return the chosen index, or {@code -1} if there are no weights or
     *         they sum to zero
     */
    public static Integer getBambooIndex(Integer... chances) {
        if (isNullOrEmpty(chances)) {
            return -1;
        }
        Integer total = SumInteger(chances);
        Integer random = getRanDom(1, total);
        total = 0;
        for (int i = 0; i < chances.length; i++) {
            total += chances[i];
            if (random <= total) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Copies the non-empty elements of a list.
     *
     * @return the non-empty elements, or {@code null} if there are none
     */
    public static <T> List<T> removeEmpty(List<T> list) {
        if (isNullOrEmpty(list)) {
            return null;
        }
        List<T> newList = new ArrayList<T>(list.size());
        for (T obj : list) {
            if (!isNullOrEmpty(obj)) {
                newList.add(obj);
            }
        }
        if (isNullOrEmpty(newList)) {
            return null;
        }
        return newList;
    }

    /**
     * Picks an index at random, weighted by fractional values.
     *
     * <p>All weights are scaled by the same power of ten, chosen from the
     * most precise weight, so their ratios are preserved when they are
     * converted to integers. Null, NaN and infinite weights count as zero.
     *
     * @return the chosen index, or {@code -1} if there are no weights or
     *         they sum to zero
     */
    public static Integer getBambooIndex(Float... chanceSources) {
        if (isNullOrEmpty(chanceSources)) {
            return -1;
        }
        int scale = 0;
        for (Float chance : chanceSources) {
            scale = Math.max(scale, toDecimal(chance).scale());
        }
        Integer[] weights = new Integer[chanceSources.length];
        for (int i = 0; i < chanceSources.length; i++) {
            // Decimal arithmetic avoids float rounding such as 0.29f * 100 = 28.999998.
            weights[i] = toDecimal(chanceSources[i]).movePointRight(scale).intValue();
        }
        return getBambooIndex(weights);
    }

    /** Exact decimal value of the float's shortest string form; zero if unusable. */
    private static BigDecimal toDecimal(Float value) {
        if (value == null || value.isNaN() || value.isInfinite()) {
            return BigDecimal.ZERO;
        }
        return new BigDecimal(Float.toString(value));
    }

    /** Subtracts {@code f2} from {@code f1} using decimal arithmetic. */
    public static Float floatCut(Float f1, Float f2) {
        BigDecimal b1 = new BigDecimal(Float.toString(f1));
        BigDecimal b2 = new BigDecimal(Float.toString(f2));
        return b1.subtract(b2).floatValue();
    }

    /**
     * Returns the text after the last dot of a URL.
     *
     * @return the suffix, or {@code ""} if the URL is empty or has no dot
     */
    public static String getSuffix(String url) {
        if (isNullOrEmpty(url)) {
            return "";
        }
        String[] tab = url.split("\\.");
        if (tab.length > 1) {
            return tab[tab.length - 1];
        }
        return "";
    }

}


为了方便我们使用,特意为百度云防护的网站封装了一个工具类RobotHttpHandle,维护了Cookie机制


package org.coody.robot.rote;


import java.util.Date;


import javax.script.Invocable;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import javax.script.ScriptException;


import org.coody.robot.util.HttpEntity;
import org.coody.robot.util.HttpHandle;
import org.coody.robot.util.StringUtil;




/**
 * HTTP GET helper for sites that show a JavaScript "browser check" page
 * before serving content.
 *
 * <p>It fetches the challenge page, evaluates the embedded script with a
 * {@code javax.script} JavaScript engine, submits the answer and keeps the
 * resulting clearance cookie in {@link #cookie} for later requests.
 *
 * <p>The parsing relies on the exact layout of the challenge page (fixed
 * line positions inside the script); if the page differs, solving fails and
 * the last response is returned unchanged.
 */
public class RobotHttpHandle {

    /** Upper bound on challenge attempts per {@link #loadCookie(String)} call. */
    private static final int MAX_CHALLENGE_ATTEMPTS = 3;

    /**
     * Clearance cookie shared by all instances. It is not keyed by host and
     * not synchronized.
     */
    public static String cookie = "";

    public RobotHttpHandle() {

    }

    /** Creates a client carrying the browser-like headers used for every request. */
    private HttpHandle newHandle() {
        HttpHandle http = new HttpHandle();
        http.config.setRequestProperty("If-Modified-Since", new Date().toString());
        http.config.setRequestProperty("Cache-Control", "max-age=0");
        http.config.setRequestProperty("Upgrade-Insecure-Requests", "1");
        http.config.setKeepAlive(true);
        return http;
    }

    /**
     * Runs one challenge attempt for the site that {@code url} belongs to.
     *
     * @return the verification response if it carried a {@code cf_clearance}
     *         cookie, otherwise {@code null}
     */
    private HttpEntity initCookie(String url) {
        try {
            String baseURL = HttpHandle.getDomain(url);
            if (baseURL == null) {
                return null;
            }
            HttpHandle http = newHandle();
            HttpEntity entity = http.Get(baseURL);
            if (entity == null) {
                return null;
            }
            System.out.println(entity.getCookie());
            String html = entity.getHtml();
            if (html == null) {
                return null;
            }
            String temp = html.replace(" ", "");
            String jschl_vc = StringUtil.textCutCenter(temp, "jschl_vc\"value=\"", "\"");
            String pass = StringUtil.textCutCenter(temp, "pass\"value=\"", "\"");

            String funcCode = StringUtil.textCutCenter(html, "setTimeout(function(){", "f.submit();");
            funcCode = funcCode.replace("a.value", "a");
            funcCode = funcCode.replace("  ", " ");
            String[] tabs = funcCode.split("\n");
            if (tabs.length < 9) {
                // Not the expected challenge page; lines 1 and 8 are needed below.
                return null;
            }
            funcCode = tabs[1];
            funcCode += "\r\nt=\"" + baseURL + "\";";
            funcCode += "\r\nr = t.match(/https?:\\/\\//)[0];";
            funcCode += "\r\nt = t.substr(r.length);";
            funcCode += "\r\nt = t.substr(0, t.length - 1);";
            funcCode += tabs[8];
            funcCode += "\r\n return a;";

            funcCode = "function jschl_answer(){\r\n" + funcCode + "\r\n}";

            // NOTE(review): this evaluates script text received from the remote site.
            ScriptEngineManager manager = new ScriptEngineManager();
            ScriptEngine engine = manager.getEngineByName("js");
            if (engine == null) {
                throw new IllegalStateException("No JavaScript script engine is available in this JVM");
            }
            engine.eval(funcCode);
            Invocable invocable = (Invocable) engine;
            // Engines may return Integer or Double for a numeric result.
            Object answer = invocable.invokeFunction("jschl_answer");
            if (!(answer instanceof Number)) {
                return null;
            }
            int jschl_answer = ((Number) answer).intValue();

            String root = baseURL.endsWith("/") ? baseURL : baseURL + "/";
            url = root + "cdn-cgi/l/chk_jschl?jschl_vc=" + jschl_vc + "&pass=" + pass
                    + "&jschl_answer=" + jschl_answer;
            http.config.allowRedirects(false);
            System.out.println(url);
            // Delay before submitting the answer, as in the original flow.
            Thread.sleep(3800l);
            http.config.setGzip(true);
            entity = http.Get(url);
            if (entity == null) {
                return null;
            }
            cookie = entity.getCookie();
            if (cookie == null || !cookie.contains("cf_clearance")) {
                return null;
            }
            return entity;
        } catch (InterruptedException e) {
            // Preserve the interrupt so callers can stop retrying.
            Thread.currentThread().interrupt();
            return null;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Fetches a URL, solving the challenge only when needed.
     *
     * <p>The cached cookie is tried first. Only if the response is missing
     * or not 200 is the challenge solved again and the request repeated once.
     *
     * @return the last response received; may be {@code null} if no
     *         connection could be created
     */
    public HttpEntity Get(String url) {
        HttpHandle http = newHandle();
        http.config.setCookie(cookie);
        HttpEntity entity = http.Get(url);
        if (entity == null || entity.getCode() != 200) {
            loadCookie(url);
            http.config.setCookie(cookie);
            entity = http.Get(url);
        }
        return entity;
    }

    /**
     * Discards the cached cookie and tries to obtain a new one, giving up
     * after a fixed number of attempts or when the thread is interrupted.
     */
    public void loadCookie(String url) {
        cookie = null;
        for (int attempt = 0; attempt < MAX_CHALLENGE_ATTEMPTS; attempt++) {
            if (initCookie(url) != null || Thread.currentThread().isInterrupted()) {
                return;
            }
        }
    }

    public static void main(String[] args) throws NoSuchMethodException, ScriptException, InterruptedException {
        HttpEntity entity = new RobotHttpHandle().Get("http://www.myexception.cn/");
        if (entity != null) {
            System.out.println(entity.getHtml());
        }
    }
}


使用方式:


HttpEntity entity=new RobotHttpHandle().Get("http://www.myexception.cn/");
System.out.println(entity.getHtml());
如图:



输入图片说明

你可能感兴趣的:(JAVA)