设计思路:基于 bean + spring 配置文件的方式配置多个项目的属性,实现各项目的自动登录;解析逻辑通过实现通用接口或继承抽象类来自定义,最后由 url 参数指定解析类,通过反射实例化对象,从而实现方法的通用。
import java.util.Map; import org.apache.http.client.HttpClient; /** * * 类功能描述:远程登录项目属性类 * * @author mengqingyu * @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp $ * Create: 2014-2-26 下午01:49:02 */ public class HttpAttributes { //发请求对象 private HttpClient httpClient; //解析当前域下网页类的包路径 private String packPath; //域名或IP地址 private String domain = ""; //登陆提交表单全路径地址 如http://www.iteye.com/login.jsp private String loginUrl = ""; //登陆失败之后的请求地址 如/error.jsp private String errorUrl = ""; //端口号 private int port = 80; //登陆参数 private Mapparams; //验证方式 private String scheme; public HttpClient getHttpClient() { return httpClient; } public void setHttpClient(HttpClient httpClient) { this.httpClient = httpClient; } public String getPackPath() { return packPath; } public void setPackPath(String packPath) { this.packPath = packPath; } public String getDomain() { return domain; } public void setDomain(String domain) { this.domain = domain; } public String getLoginUrl() { return loginUrl; } public void setLoginUrl(String loginUrl) { this.loginUrl = loginUrl; } public String getErrorUrl() { return errorUrl; } public void setErrorUrl(String errorUrl) { this.errorUrl = errorUrl; } public int getPort() { return port; } public void setPort(int port) { this.port = port; } public Map getParams() { return params; } public void setParams(Map params) { this.params = params; } public String getScheme() { return scheme; } public void setScheme(String scheme) { this.scheme = scheme; } } import java.util.Map; /** * * 类功能描述:解析统一接口 * * @author mengqingyu * @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp @param $ * Create: 2014-2-26 下午01:53:10 */ public interface IParse { /** * * @function:url中以m_开头的自定义参数 * @param params * @return * @author: mengqingyu 2014-3-4 上午09:32:54 */ abstract T process(Map params); } import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; /** * * 
类功能描述:解析html网页抽象类,解析html可以继承扩展此类,如有通用方法可以写到此类中,进一步完善 * * @author mengqingyu * @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp $ * Create: 2014-2-19 下午01:53:53 * @param */ public abstract class HtmlParse implements IParse { protected Log log = LogFactory.getLog(HtmlParse.class); protected Document doc; public HtmlParse(String doc) { this.doc = Jsoup.parse(doc); } } import net.sf.json.JSONObject; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; /** * * 类功能描述:解析html网页抽象类,解析html可以继承扩展此类,如有通用方法可以写到此类中,进一步完善 * * @author mengqingyu * @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp $ * Create: 2014-2-19 下午01:53:53 * @param */ public abstract class JsonParse implements IParse { protected Log log = LogFactory.getLog(JsonParse.class); protected JSONObject doc; public JsonParse(String doc) { this.doc = JSONObject.fromObject(doc); } } package com.berheley.bi.grp.fetch.parse; import java.util.Map; /** * * 类功能描述:解析统一接口 * * @author mengqingyu * @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp @param $ * Create: 2014-2-26 下午01:53:10 */ public interface IParse { /** * * @function:url中以m_开头的自定义参数 * @param params * @return * @author: mengqingyu 2014-3-4 上午09:32:54 */ abstract T process(Map params); } import java.util.Map; import net.sf.json.JSONObject; import com.berheley.bi.grp.fetch.parse.HtmlParse; public class FyxxInfoHtmlParse extends HtmlParse { public FyxxInfoHtmlParse(String doc) { super(doc); } @Override public String process(Map params) { JSONObject jsonObj = new JSONObject(); //价位无 String tfj_rentcost = doc.getElementById("tfj_rentcost").val(); //租金 String tfj_buildingarea = doc.getElementById("tfj_buildingarea")==null?"":doc.getElementById("tfj_buildingarea").val();//面积 String tfj_standardstorey = doc.getElementById("tfj_standardstorey").val();// 标准层高 String tfj_floorloading = doc.getElementById("tfj_floorloading_d").val();//楼面承重 tfj_floorloading_d String tfj_phone 
= doc.getElementById("tfj_phone").val();//业主单位联系方式 String tfj_propertycost = doc.getElementById("tfj_propertycost").val();//物业 String tfj_watercost = doc.getElementById("tfj_watercost").val();//水 String tfj_eleccost = doc.getElementById("tfj_eleccost").val();//电 jsonObj.put("rentcost", tfj_rentcost); jsonObj.put("buildingarea", tfj_buildingarea); jsonObj.put("standardstorey", tfj_standardstorey); jsonObj.put("floorloading", tfj_floorloading); jsonObj.put("phone", tfj_phone); jsonObj.put("propertycost", tfj_propertycost); jsonObj.put("watercost", tfj_watercost); jsonObj.put("eleccost", tfj_eleccost); jsonObj.put("success", true); return jsonObj.toString(); } } /** * * 类功能描述:常量类 * * @author mengqingyu * @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp $ * Create: 2014-2-28 下午02:37:32 */ public final class HttpConstant { public static final String POST = "POST"; public static final String URL = "m_url"; public static final String PARSE = "m_parse"; public static final String GBK = "gbk"; public static final String UTF8 = "UTF-8"; } import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; import java.lang.reflect.Constructor; import java.net.URLEncoder; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.List; import java.util.Map; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.http.HttpEntity; import org.apache.http.HttpHost; import org.apache.http.HttpResponse; import org.apache.http.NameValuePair; import org.apache.http.client.ClientProtocolException; import org.apache.http.client.HttpClient; import org.apache.http.client.entity.UrlEncodedFormEntity; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpPost; import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.entity.ContentType; import 
org.apache.http.message.BasicNameValuePair; import org.apache.http.protocol.ExecutionContext; import org.apache.http.protocol.HttpContext; import org.apache.http.util.EntityUtils; import com.berheley.bi.basic.exp.BusinessException; import com.berheley.bi.grp.fetch.parse.IParse; import com.berheley.bi.grp.fetch.pojo.HttpAttributes; /** * * 类功能描述:请求工具类 * * @author mengqingyu * @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp $ * Create: 2014-2-19 下午01:53:18 */ public final class HttpUtils { private static Log log = LogFactory.getLog(HttpUtils.class); /** * * @function:get请求 * @param httpclient * @param url * @return * @author: mengqingyu 2014-2-19 下午01:50:58 */ public static HttpResponse httpGet(HttpClient httpclient, String url) { HttpResponse response = null; HttpGet httpget = new HttpGet(url); try { response = httpclient.execute(httpget); } catch (ClientProtocolException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } log.info("get status: " + response.getStatusLine()); return response; } /** * get请求 * @param httpclient * @param url * @param handler * @param context new BasicHttpContext() 可取到请求后url * @return */ public static HttpResponse httpGet(HttpClient httpclient, String url, HttpContext context) { HttpResponse response = null; HttpGet httpget = new HttpGet(url); try { response = httpclient.execute(httpget, context); } catch (ClientProtocolException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } log.info("get status: " + response.getStatusLine()); return response; } /** * get请求 包含判断是否需要登录的POST * @param httpclient * @param url * @param handler * @param context new BasicHttpContext() 可取到请求后url * @return */ public static HttpResponse httpGetByScheme(HttpClient httpclient, String url, HttpContext context, HttpAttributes attributes) { HttpResponse response = httpGet(httpclient, url, context); HttpUriRequest req = (HttpUriRequest) context.getAttribute(ExecutionContext.HTTP_REQUEST); 
log.info("get请求跳转地址: " + req.getURI()); if(HttpConstant.POST.equalsIgnoreCase(attributes.getScheme())&&attributes.getErrorUrl().equalsIgnoreCase(req.getURI().toString())){ httpPost(httpclient, attributes.getLoginUrl(), getPairs(attributes.getParams())); response = httpGet(httpclient, url, context); } log.info("get status: " + response.getStatusLine()); return response; } /** * * @function:post提交 * @param httpclient * @param url * @param params * @return * @author: mengqingyu 2014-2-19 下午01:51:38 */ public static HttpResponse httpPost(HttpClient httpclient, String url, List params) { HttpResponse response = null; HttpPost httpost = new HttpPost(url); httpost.setEntity(new UrlEncodedFormEntity(params, Charset.forName(HttpConstant.GBK))); // httpost.getParams().setBooleanParameter(CoreProtocolPNames.USE_EXPECT_CONTINUE,false); try { response = httpclient.execute(httpost); } catch (ClientProtocolException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } log.info("post status: " + response.getStatusLine()); return response; } /** * * @function:主机地址 * @param context * @return * @author: mengqingyu 2014-2-19 下午01:51:57 */ public static HttpHost getHttpHost(HttpContext context) { return (HttpHost) context.getAttribute(ExecutionContext.HTTP_TARGET_HOST); } /** * * @function:子地址 * @param context * @return * @author: mengqingyu 2014-2-19 下午01:52:05 */ public static HttpUriRequest getHttpUriRequest(HttpContext context) { return (HttpUriRequest) context.getAttribute(ExecutionContext.HTTP_REQUEST); } /** * * @function:表单参数转换 * @param params * @return * @author: mengqingyu 2014-2-19 下午01:52:24 */ public static List getPairs(Map, ?> params) { List nameValuePairs = new ArrayList (); if (params != null) { for (Map.Entry, ?> entry : params.entrySet()) { nameValuePairs.add(new BasicNameValuePair(entry.getKey().toString(), entry.getValue().toString())); } } return nameValuePairs; } /** * * @function:实体类转换html文本 * @param response * @return * @author: mengqingyu 
2014-2-19 下午01:52:40 */ public static String entityToString(HttpResponse response) { HttpEntity entity = response.getEntity(); InputStream is = null; BufferedReader br = null; StringBuilder sb = null; ContentType contentType = ContentType.getOrDefault(entity); Charset charset = contentType.getCharset(); if(charset==null) charset = Charset.forName(HttpConstant.GBK); try { is = entity.getContent(); br = new BufferedReader(new InputStreamReader(is, charset)); sb = new StringBuilder(); String line = null; while ((line = br.readLine()) != null) { sb.append(line); } } catch (Exception e) { e.printStackTrace(); } finally { try { is.close(); EntityUtils.consume(entity); } catch (IOException e) { e.printStackTrace(); } } return sb.toString(); } /** * * @function:反射生成解析策略类 * @param parseBean * @param html * @return * @throws BusinessException * @author: mengqingyu 2014-2-26 下午04:31:25 */ @SuppressWarnings({ "rawtypes", "unchecked" }) public static IParse newInstance(String packPath, String parseBean, String text) throws BusinessException{ IParse parse = null; try { Class clazz = Class.forName(packPath+"."+parseBean); Constructor constructor = clazz.getConstructor(String.class); parse = (IParse) constructor.newInstance(text); } catch (Exception e) { throw new BusinessException("网页解析类初始化错误 "+e.getMessage(), e); } return parse; } /** * * @function:通过url获取域名 * @param url * @return * @author: mengqingyu 2014-2-26 下午04:30:49 */ public static String initParams(Map params) { String url = params.get(HttpConstant.URL).toString(); int index = url.indexOf("?"); if(index==-1) return url; String urlPath = url.substring(0, url.indexOf("?")+1); String paramStr = url.substring(url.indexOf("?")+1); String[] urlArray = paramStr.split("&"); for (int i = 0; i < urlArray.length; i++) { String[] paramArray = null; if(urlArray[i].startsWith("m_")) { paramArray = urlArray[i].split("="); params.put(paramArray[0], paramArray[1]); paramStr = paramStr.replaceAll("(\\?|&)"+urlArray[i], ""); } } paramStr 
= urlEncoder(paramStr); paramStr = paramStr.replace("%3D", "=").replace("%26", "&"); return urlPath+paramStr; } /** * * @function:url 编码 * @param paramStr * @return * @author: mengqingyu 2014-2-28 下午02:58:59 */ public static String urlEncoder(String paramStr) { try { paramStr = URLEncoder.encode(paramStr,HttpConstant.UTF8); } catch (UnsupportedEncodingException e) { log.error("url编码错误", e); } return paramStr; } } import java.io.IOException; import jcifs.ntlmssp.NtlmFlags; import jcifs.ntlmssp.Type1Message; import jcifs.ntlmssp.Type2Message; import jcifs.ntlmssp.Type3Message; import jcifs.util.Base64; import org.apache.http.impl.auth.NTLMEngine; import org.apache.http.impl.auth.NTLMEngineException; /** * * 类功能描述:JCIFS实现NTLM windows域验证 * * @author mengqingyu * @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp $ * Create: 2014-2-26 下午01:55:31 */ public final class JCIFSEngine implements NTLMEngine { private static final int TYPE_1_FLAGS = NtlmFlags.NTLMSSP_NEGOTIATE_56 | NtlmFlags.NTLMSSP_NEGOTIATE_128 | NtlmFlags.NTLMSSP_NEGOTIATE_NTLM2 | NtlmFlags.NTLMSSP_NEGOTIATE_ALWAYS_SIGN | NtlmFlags.NTLMSSP_REQUEST_TARGET; public String generateType1Msg(final String domain, final String workstation) throws NTLMEngineException { final Type1Message type1Message = new Type1Message(TYPE_1_FLAGS, domain, workstation); return Base64.encode(type1Message.toByteArray()); } public String generateType3Msg(final String username, final String password, final String domain, final String workstation, final String challenge) throws NTLMEngineException { Type2Message type2Message; try { type2Message = new Type2Message(Base64.decode(challenge)); } catch (final IOException exception) { throw new NTLMEngineException("Invalid NTLM type 2 message", exception); } final int type2Flags = type2Message.getFlags(); final int type3Flags = type2Flags & (0xffffffff ^ (NtlmFlags.NTLMSSP_TARGET_TYPE_DOMAIN | NtlmFlags.NTLMSSP_TARGET_TYPE_SERVER)); final Type3Message type3Message = new 
Type3Message(type2Message, password, domain, username, workstation, type3Flags); return Base64.encode(type3Message.toByteArray()); } } import org.apache.http.auth.AuthScheme; import org.apache.http.auth.AuthSchemeFactory; import org.apache.http.impl.auth.NTLMScheme; import org.apache.http.params.HttpParams; /** * * 类功能描述:NTLM windows域验证 * * @author mengqingyu * @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp $ * Create: 2014-2-26 下午01:54:40 */ public class NTLMSchemeFactory implements AuthSchemeFactory { public AuthScheme newInstance(final HttpParams params) { return new NTLMScheme(new JCIFSEngine()); } } import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.Map.Entry; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.http.auth.AuthScope; import org.apache.http.auth.NTCredentials; import org.apache.http.auth.UsernamePasswordCredentials; import org.apache.http.auth.params.AuthPNames; import org.apache.http.client.HttpClient; import org.apache.http.client.params.AuthPolicy; import org.apache.http.conn.ClientConnectionManager; import org.apache.http.conn.scheme.PlainSocketFactory; import org.apache.http.conn.scheme.Scheme; import org.apache.http.conn.scheme.SchemeRegistry; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.impl.conn.PoolingClientConnectionManager; import org.apache.http.params.BasicHttpParams; import org.apache.http.params.CoreConnectionPNames; import org.apache.http.params.HttpParams; import com.berheley.bi.grp.fetch.ntlm.NTLMSchemeFactory; import com.berheley.bi.grp.fetch.pojo.HttpAttributes; import com.berheley.bi.grp.fetch.util.HttpConstant; /** * * 类功能描述:远程登录处理类 * * @author mengqingyu * @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp $ * Create: 2014-2-26 下午01:49:45 */ public class HttpHandler { private Log log = LogFactory.getLog(HttpHandler.class); // 创建socket的上线 private int 
maxTotal = 400; // 对每个指定连接的服务器(指定的ip)可以创建的并发数 private int maxRoute = 200; // 连接超时时间 private int cnTimeOut = 60000; // 数据传输超时 private int soTimeOut = 60000; //连接对象 private HttpClient httpClient; //连接属性设置 private HttpParams httpParams; //多线程连接管理 private ClientConnectionManager connectionManager; // key:IP地址,value:每个项目的属性 private Map attributes; public HttpHandler() { httpParams = this.getHp(); connectionManager = this.getCm(); httpClient = new DefaultHttpClient(connectionManager, httpParams); } public int getMaxTotal() { return maxTotal; } public void setMaxTotal(int maxTotal) { this.maxTotal = maxTotal; } public int getMaxRoute() { return maxRoute; } public void setMaxRoute(int maxRoute) { this.maxRoute = maxRoute; } public int getCnTimeOut() { return cnTimeOut; } public void setCnTimeOut(int cnTimeOut) { this.cnTimeOut = cnTimeOut; } public int getSoTimeOut() { return soTimeOut; } public void setSoTimeOut(int soTimeOut) { this.soTimeOut = soTimeOut; } public HttpParams getHttpParams() { return httpParams; } public void setHttpParams(HttpParams httpParams) { this.httpParams = httpParams; } public ClientConnectionManager getConnectionManager() { return connectionManager; } public void setConnectionManager(ClientConnectionManager connectionManager) { this.connectionManager = connectionManager; } public Map getAttributes() { return attributes; } public void setAttributes(Map attributes) { this.attributes = attributes; } /** * * @function:初始化 HttpClient * @author: mengqingyu 2014-2-26 下午02:57:09 */ public void init() { for (Entry entry : attributes.entrySet()) { HttpAttributes attributes = entry.getValue(); String scheme = attributes.getScheme(); DefaultHttpClient httpClient = null; if (AuthPolicy.NTLM.equalsIgnoreCase(scheme)) { httpClient = new DefaultHttpClient(connectionManager, httpParams); List authpref = new ArrayList (); authpref.add(AuthPolicy.NTLM); httpClient.getParams().setParameter(AuthPNames.TARGET_AUTH_PREF, authpref); // 
httpClient.getParams().setParameter(ClientPNames.COOKIE_POLICY,CookiePolicy.BEST_MATCH); httpClient.getAuthSchemes().register(AuthPolicy.NTLM, new NTLMSchemeFactory()); NTCredentials creds = new NTCredentials(attributes.getParams().get("username"), attributes.getParams().get("password"), "", ""); httpClient.getCredentialsProvider().setCredentials(AuthScope.ANY, creds); attributes.setHttpClient(httpClient); } else if (AuthPolicy.BASIC.equalsIgnoreCase(scheme)) { httpClient = new DefaultHttpClient(connectionManager, httpParams); httpClient.getCredentialsProvider().setCredentials(new AuthScope(attributes.getDomain(), attributes.getPort()), new UsernamePasswordCredentials(attributes.getParams().get("username"), attributes.getParams().get("password"))); attributes.setHttpClient(httpClient); } else if (HttpConstant.POST.equalsIgnoreCase(scheme)) { attributes.setHttpClient(this.httpClient); } } log.info("初始化 HttpClient"); } /** * * @function:连接属性设置 * @return * @author: mengqingyu 2014-2-26 下午02:56:49 */ private HttpParams getHp() { HttpParams params = new BasicHttpParams(); params.setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, cnTimeOut); params.setParameter(CoreConnectionPNames.SO_TIMEOUT, soTimeOut); return params; } /** * * @function:多线程连接设置 * @return * @author: mengqingyu 2014-2-26 下午02:56:49 */ private ClientConnectionManager getCm() { SchemeRegistry schemeRegistry = new SchemeRegistry(); schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory())); schemeRegistry.register(new Scheme("https", 433, PlainSocketFactory.getSocketFactory())); PoolingClientConnectionManager cm = new PoolingClientConnectionManager(schemeRegistry); cm.setMaxTotal(maxTotal); cm.setDefaultMaxPerRoute(maxRoute); return cm; } /** * * @function:获得项目配置 * @param url * @return * @author: mengqingyu 2014-2-27 上午09:52:53 */ public HttpAttributes getHttpAttributes(String url) { url = url.substring(url.indexOf("://") + 3); url = url.substring(0, url.indexOf("/")); return 
attributes.get(url); } } import java.util.Map; import com.berheley.bi.basic.exp.BusinessException; /** * * 类功能描述:抓取网站业务类 * * @author mengqingyu * @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp $ * Create: 2014-2-26 下午01:56:24 */ public interface IFetchService { /** * * @function:抓取并解析数据 * @param params 包含以下 * @param 包含key为:m_url必传参数 每次请求全路径包含参数 在参数内的地址后需要包含参数m_parse * @return * @throws BusinessException * @author: mengqingyu 2014-2-26 下午01:56:38 */ public String findDate(Map params) throws BusinessException; } import java.util.Map; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.http.HttpResponse; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.protocol.BasicHttpContext; import org.apache.http.protocol.HttpContext; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; import com.berheley.bi.basic.exp.BusinessException; import com.berheley.bi.grp.fetch.handler.HttpHandler; import com.berheley.bi.grp.fetch.parse.IParse; import com.berheley.bi.grp.fetch.pojo.HttpAttributes; import com.berheley.bi.grp.fetch.util.HttpConstant; import com.berheley.bi.grp.fetch.util.HttpUtils; /** * * 类功能描述:抓取解析业务实现类 * * @author mengqingyu * @version $Id: codetemplates.xml,v 1.1 2009/03/06 01:13:01 mengqingyu Exp $ * Create: 2014-2-26 下午01:56:57 */ @Service public class FetchService implements IFetchService { private Log log = LogFactory.getLog(FetchService.class); @Autowired private HttpHandler httpHandler; @Override public String findDate(Map params) throws BusinessException { String url = HttpUtils.initParams(params); HttpAttributes attributes = httpHandler.getHttpAttributes(url); DefaultHttpClient httpclient = (DefaultHttpClient) attributes.getHttpClient(); HttpContext localContext = new BasicHttpContext(); HttpResponse response = HttpUtils.httpGetByScheme(httpclient, url, localContext, attributes); String result = 
HttpUtils.entityToString(response); IParse parse = HttpUtils.newInstance(attributes.getPackPath(), params.get(HttpConstant.PARSE).toString(), result); String json = parse.process(params); log.info(json); return json; } }