2019独角兽企业重金招聘Python工程师标准>>>
抓取页面:https://reports.ingenuity.com/rs/report/function?id=ING%3A48osn
主要代码
package com.ninemax.ak.html.v3;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.math.BigInteger;
import java.security.GeneralSecurityException;
import java.security.MessageDigest;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLException;
import javax.net.ssl.SSLSession;
import javax.net.ssl.SSLSocket;
import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLContextBuilder;
import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.conn.ssl.X509HostnameVerifier;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger;
import com.ninemax.ak.base.SpringContextUtil;
import com.ninemax.ak.dao.CommonDao;
import com.ninemax.ak.html.HtmlParser;
/**
* HTTP 请求工具类
*
*/
public class HtmlHttpClient{
public static Logger log =Logger.getLogger(HtmlHttpClient.class);
public CommonDao commonDao = SpringContextUtil.getBean("commonDao");
HtmlParser parser = new HtmlParser();
private static PoolingHttpClientConnectionManager connMgr;
private static RequestConfig requestConfig;
private static final int MAX_TIMEOUT = 700000;
static {
    // Shared connection pool: at most 100 connections overall, and a single
    // route is allowed to use the whole pool.
    PoolingHttpClientConnectionManager pool = new PoolingHttpClientConnectionManager();
    pool.setMaxTotal(100);
    pool.setDefaultMaxPerRoute(pool.getMaxTotal());
    connMgr = pool;

    // Default request configuration: MAX_TIMEOUT is applied to connecting,
    // to socket reads, and to waiting for a connection from the pool; the
    // stale-connection check is enabled so a connection is tested before
    // a request is submitted on it.
    requestConfig = RequestConfig.custom()
            .setConnectTimeout(MAX_TIMEOUT)
            .setSocketTimeout(MAX_TIMEOUT)
            .setConnectionRequestTimeout(MAX_TIMEOUT)
            .setStaleConnectionCheckEnabled(true)
            .build();
}
/**
* @param args
*/
public static void main(String[] args) {
Long start = System.currentTimeMillis();
HtmlHttpClient util = new HtmlHttpClient();
// 主页面链接
String url = "https://reports.ingenuity.com/rs/report/function?id=ING%3A48osn";
// 获取HTML
String html = doPostSSL(url);
Map map = new HashMap();
map.put("urlname", "DiseaseOrFunction");
map.put("originalurl", "/rs/report/function?id=ING%3A48osn");
map.put("wholeurl", url);
map.put("mdurl", md5Encode(url));
map.put("rank", 1);
List
HTML解析类
package com.ninemax.ak.html;
import java.util.ArrayList;
import java.util.List;
import org.apache.log4j.Logger;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
/**
 * Thin convenience wrapper around the org.htmlparser library for pulling
 * tags, attributes and link text out of an HTML string.
 *
 * <p>Every lookup is best-effort: a {@code null}/blank document, a parse
 * failure, or an absent tag yields an empty string or an empty list instead
 * of an exception. Parse failures are reported through {@link #log}.
 */
public class HtmlParser {
    public static Logger log = Logger.getLogger(HtmlParser.class);

    /**
     * Returns the HTML of the first tag with the given name.
     *
     * @param html        HTML text to search
     * @param TagNodeName tag name to look for
     * @return the HTML of the first matching tag, or {@code ""} if none matches
     */
    public String get_TagNode_Html(String html, String TagNodeName) {
        return firstNodeHtml(findNodes(html, new TagNameFilter(TagNodeName)));
    }

    /**
     * Returns the HTML of the first tag with the given name that also carries
     * the given attribute value.
     *
     * @param html         HTML text to search
     * @param TagNodeName  tag name to look for
     * @param setAttrName  attribute name the tag must have
     * @param setAttrValue attribute value the tag must have
     * @return the HTML of the first matching tag, or {@code ""} if none matches
     */
    public String get_TagNode_Html(String html, String TagNodeName, String setAttrName, String setAttrValue) {
        return firstNodeHtml(findNodes(html, tagWithAttribute(TagNodeName, setAttrName, setAttrValue)));
    }

    /**
     * Returns the HTML of every tag with the given name.
     *
     * @param html        HTML text to search
     * @param TagNodeName tag name to look for
     * @return the HTML of each matching tag in document order, skipping entries
     *         that {@code HtmlUtil.isEmptyTrim} reports as empty; never {@code null}
     */
    public List<String> get_TagNode_HtmlList(String html, String TagNodeName) {
        return allNodeHtml(findNodes(html, new TagNameFilter(TagNodeName)));
    }

    /**
     * Returns the HTML of every tag with the given name that also carries the
     * given attribute value.
     *
     * @param html         HTML text to search
     * @param TagNodeName  tag name to look for
     * @param setAttrName  attribute name the tag must have
     * @param setAttrValue attribute value the tag must have
     * @return the HTML of each matching tag in document order, skipping entries
     *         that {@code HtmlUtil.isEmptyTrim} reports as empty; never {@code null}
     */
    public List<String> get_TagNode_HtmlList(String html, String TagNodeName, String setAttrName, String setAttrValue) {
        return allNodeHtml(findNodes(html, tagWithAttribute(TagNodeName, setAttrName, setAttrValue)));
    }

    /**
     * Returns an attribute of the first tag with the given name.
     *
     * @param html        HTML text to search
     * @param TagNodeName tag name to look for
     * @param getAttrName name of the attribute to read
     * @return the attribute value, or {@code ""} when either the tag or the
     *         attribute is absent
     */
    public String get_TagNode_attr(String html, String TagNodeName, String getAttrName) {
        NodeList nodes = findNodes(html, new TagNameFilter(TagNodeName));
        if (nodes.size() == 0) {
            return "";
        }
        String value = ((TagNode) nodes.elementAt(0)).getAttribute(getAttrName);
        // A missing attribute used to leak null while a missing tag gave "";
        // both "not found" cases now share the same empty-string result.
        return value != null ? value : "";
    }

    /**
     * Returns the trimmed text of the first tag with the given name.
     *
     * @param html        HTML text to search
     * @param TagNodeName tag name to look for; intended to be {@code "A"}
     * @return the trimmed link text of the first match, or {@code ""} if none matches
     */
    public String get_LinkTag_text(String html, String TagNodeName) {
        NodeList nodes = findNodes(html, new TagNameFilter(TagNodeName));
        if (nodes.size() == 0) {
            return "";
        }
        TagNode first = (TagNode) nodes.elementAt(0);
        String text;
        if (first instanceof LinkTag) {
            text = ((LinkTag) first).getLinkText();
        } else {
            // Any tag name other than A does not produce a LinkTag; use the
            // node's plain text rather than failing on the cast.
            text = first.toPlainTextString();
        }
        return text != null ? text.trim() : "";
    }

    /** Builds a filter matching tags named {@code tagName} whose {@code attrName} equals {@code attrValue}. */
    private static NodeFilter tagWithAttribute(String tagName, String attrName, String attrValue) {
        return new AndFilter(new TagNameFilter(tagName), new HasAttributeFilter(attrName, attrValue));
    }

    /** Parses {@code html} and returns all nodes accepted by {@code filter}; an empty list on blank input or parse failure. */
    private static NodeList findNodes(String html, NodeFilter filter) {
        if (html == null || html.trim().isEmpty()) {
            return new NodeList();
        }
        try {
            // createParser always treats its argument as markup. The
            // Parser(String) constructor instead tries to open the string as a
            // URL or file when it does not start with '<', which is wrong for
            // scraped fragments. A null charset keeps the library default.
            Parser parser = Parser.createParser(html, null);
            NodeList found = parser.extractAllNodesThatMatch(filter);
            return found != null ? found : new NodeList();
        } catch (ParserException e) {
            // Log the size only: scraped pages can be very large.
            log.error("Failed to parse HTML (" + html.length() + " chars)", e);
            return new NodeList();
        }
    }

    /** Returns the HTML of the first node in {@code nodes}, or {@code ""} if the list is empty. */
    private static String firstNodeHtml(NodeList nodes) {
        if (nodes.size() == 0) {
            return "";
        }
        return ((TagNode) nodes.elementAt(0)).toHtml();
    }

    /** Returns the HTML of every node in {@code nodes}, dropping entries that are empty after trimming. */
    private static List<String> allNodeHtml(NodeList nodes) {
        List<String> result = new ArrayList<String>(nodes.size());
        for (int i = 0; i < nodes.size(); i++) {
            String nodeHtml = ((TagNode) nodes.elementAt(i)).toHtml();
            if (!HtmlUtil.isEmptyTrim(nodeHtml)) {
                result.add(nodeHtml);
            }
        }
        return result;
    }
}
Remark:QQ交流群:260052172