最近因为项目需求,抓取了大大小小多个网站的新闻。刚开始用的是jsoup解析页面,每个站点都要写一套解析方案,效率较慢;后来改用xpath解析,开发速度有了很大的提升,在一周内完成了一百多个站点的新闻抓取。
下面是我的一个简单示例。博主刚毕业,还是个技术小白,如有写得不对或不妥的地方,请评论指出来,大家共同进步。下图是测试效果,不同的网站只需要更改xpath即可。
为了帮助有需要的朋友,下面贴上我写的代码模型。由于新闻网站一般没有反爬,所以demo中没有反爬的相关策略。一般的爬虫项目由下载器、调度器、解析器组成,本demo中没有实现调度器。
1、项目是基于maven搭建的,首先引入相关依赖
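<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <httpclient_version>4.5.1</httpclient_version>
    <htmlcleaner_version>2.16</htmlcleaner_version>
    <log4j_version>1.2.17</log4j_version>
</properties>
<dependencies>
    <dependency>
        <groupId>log4j</groupId>
        <artifactId>log4j</artifactId>
        <version>${log4j_version}</version>
    </dependency>
    <dependency>
        <groupId>com.google.guava</groupId>
        <artifactId>guava</artifactId>
        <version>19.0</version>
    </dependency>
    <dependency>
        <groupId>commons-io</groupId>
        <artifactId>commons-io</artifactId>
        <version>2.4</version>
    </dependency>
    <dependency>
        <groupId>commons-beanutils</groupId>
        <artifactId>commons-beanutils</artifactId>
        <version>1.9.2</version>
    </dependency>
    <dependency>
        <groupId>commons-lang</groupId>
        <artifactId>commons-lang</artifactId>
        <version>2.6</version>
    </dependency>
    <dependency>
        <groupId>net.sourceforge.htmlcleaner</groupId>
        <artifactId>htmlcleaner</artifactId>
        <version>${htmlcleaner_version}</version>
    </dependency>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>3.8.1</version>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>fluent-hc</artifactId>
        <version>${httpclient_version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>${httpclient_version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpcore</artifactId>
        <version>4.4.3</version>
    </dependency>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpmime</artifactId>
        <version>${httpclient_version}</version>
    </dependency>
</dependencies>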
2.1 HttpClient连接池
package com.zhb.ims.utils.httpclient;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.SSLContext;
import org.apache.http.client.HttpRequestRetryHandler;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.ssl.SSLContexts;
import org.apache.http.ssl.TrustStrategy;
/**
 * Lazily created, process-wide holder of one pooled HttpClient connection manager.
 *
 * <p>Obtain the instance with {@link #newInstance()} and build clients with
 * {@link #getClient()}; every client returned shares the same connection pool and
 * retry handler. After {@link #destory()} the next {@link #getClient()} call
 * rebuilds the pool.
 *
 * <p>SECURITY NOTE: the SSL setup below trusts every certificate and skips hostname
 * verification. That is what this crawler demo was written to do, but it gives no
 * protection against man-in-the-middle attacks.
 */
public class HttpClientManger {
    /** Guards singleton creation, pool (re)initialisation and shutdown. */
    private static final Lock lock = new ReentrantLock();
    private static HttpClientManger httpClientManger;

    private PoolingHttpClientConnectionManager connectionManager;
    private HttpRequestRetryHandler httpRequestRetryHandler;
    /** {@code true} while there is no usable connection pool. */
    private final AtomicBoolean isShutDown = new AtomicBoolean(true);

    private HttpClientManger() {
        super();
        init();
    }

    /**
     * Builds the connection pool (800 connections in total, 20 per route) and the
     * retry handler, then marks the manager as usable. On failure the stack trace is
     * printed and the manager stays in the shut-down state, so the next
     * {@link #getClient()} call tries again.
     */
    public void init() {
        try {
            SSLContext sslContext = SSLContexts.custom().loadTrustMaterial(null, new DefaultTrustStrategy()).build();
            @SuppressWarnings("deprecation")
            HostnameVerifier hostnameVerifier = SSLConnectionSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER;
            SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(sslContext, hostnameVerifier);
            Registry<ConnectionSocketFactory> socketFactoryRegistry = RegistryBuilder.<ConnectionSocketFactory>create()
                    .register("http", PlainConnectionSocketFactory.getSocketFactory())
                    .register("https", sslsf)
                    .build();
            connectionManager = new PoolingHttpClientConnectionManager(socketFactoryRegistry);
            connectionManager.setMaxTotal(800);
            connectionManager.setDefaultMaxPerRoute(20);
            httpRequestRetryHandler = new DefaultRequestRetryHandler();
            isShutDown.set(false);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns the shared manager, creating it on first use.
     *
     * @return the single {@code HttpClientManger} instance
     */
    public static HttpClientManger newInstance() {
        lock.lock();
        try {
            if (httpClientManger == null) {
                httpClientManger = new HttpClientManger();
            }
            return httpClientManger;
        } finally {
            lock.unlock();
        }
    }

    /**
     * Builds a client backed by the shared connection pool, re-initialising the pool
     * first if it has been shut down (or was never built successfully).
     *
     * <p>The shut-down flag is only read here. The previous implementation flipped it
     * with {@code compareAndSet(false, true)}, which made every second call rebuild
     * the pool and abandon the old one without closing it.
     *
     * <p>NOTE(review): the client is built without
     * {@code setConnectionManagerShared(true)}, so closing a returned client
     * presumably shuts the shared pool down as well - confirm how callers dispose of
     * clients.
     *
     * @return a new client using the pooled connection manager and retry handler
     */
    public CloseableHttpClient getClient() {
        lock.lock();
        try {
            if (isShutDown.get()) {
                init();
            }
            return HttpClients.custom()
                    .setConnectionManager(connectionManager)
                    .setRetryHandler(httpRequestRetryHandler)
                    .build();
        } finally {
            lock.unlock();
        }
    }

    /**
     * Shuts the connection pool down if it is currently active. Calling it again, or
     * calling it when initialisation never succeeded, does nothing.
     */
    public void destory() {
        lock.lock();
        try {
            if (isShutDown.compareAndSet(false, true)) {
                connectionManager.shutdown();
            }
        } finally {
            lock.unlock();
        }
    }

    /** Trust strategy that accepts every certificate chain (see the class-level security note). */
    static class DefaultTrustStrategy implements TrustStrategy {
        @Override
        public boolean isTrusted(X509Certificate[] arg0, String arg1) throws CertificateException {
            return true;
        }
    }
}
2.2默认的重连策略
package com.zhb.ims.utils.httpclient;
import java.io.IOException;
import java.io.InterruptedIOException;
import java.net.UnknownHostException;
import java.util.Iterator;
import java.util.List;
import javax.net.ssl.SSLException;
import javax.net.ssl.SSLHandshakeException;
import org.apache.http.NoHttpResponseException;
import org.apache.http.client.HttpRequestRetryHandler;
import org.apache.http.conn.ConnectTimeoutException;
import org.apache.http.protocol.HttpContext;
import com.google.common.collect.Lists;
public class DefaultRequestRetryHandler implements HttpRequestRetryHandler {
private int executionCount;
List> ignoreException;
List> dealException;
public DefaultRequestRetryHandler() {
super();
Init();
}
@SuppressWarnings("unchecked")
public DefaultRequestRetryHandler(int executionCount) {
super();
this.executionCount = executionCount;
ignoreException = ignoreException.isEmpty()? Lists.newArrayList(ConnectTimeoutException.class,SSLException.class,UnknownHostException.class
,InterruptedIOException.class,SSLHandshakeException.class): ignoreException;
dealException = dealException.isEmpty()? Lists.newArrayList(NoHttpResponseException.class): dealException;
}
@SuppressWarnings("unchecked")
public DefaultRequestRetryHandler(int executionCount, List> ignoreException) {
super();
this.executionCount = executionCount;
this.ignoreException = ignoreException;
dealException = dealException.isEmpty()? Lists.newArrayList(NoHttpResponseException.class): dealException;
}
public DefaultRequestRetryHandler(int executionCount, List> ignoreException,
List> dealException) {
super();
this.executionCount = executionCount;
this.ignoreException = ignoreException;
this.dealException = dealException;
}
@SuppressWarnings("unchecked")
private void Init() {
executionCount = executionCount <= 0 ? 5 : executionCount;
ignoreException = (ignoreException == null ||ignoreException.isEmpty())? Lists.newArrayList(ConnectTimeoutException.class,SSLException.class,UnknownHostException.class
,InterruptedIOException.class,SSLHandshakeException.class): ignoreException;
dealException = (dealException ==null || dealException.isEmpty())? Lists.newArrayList(NoHttpResponseException.class): dealException;
}
@Override
public boolean retryRequest(IOException exception, int executionCount, HttpContext context) {
if (executionCount >= this.executionCount) {
return false;
}
for (Iterator> iterator = ignoreException.iterator(); iterator.hasNext();) {
Class extends Exception> clazz = (Class extends Exception>) iterator.next();
if (exception.getClass().isAssignableFrom(clazz)) {
return false;
}
}
for (Iterator> iterator = dealException.iterator(); iterator.hasNext();) {
Class extends Exception> clazz = (Class extends Exception>) iterator.next();
if (exception.getClass().isAssignableFrom(clazz)) {
return true;
}
}
exception.printStackTrace();
return false;
}
}
2.3工具类
package com.zhb.ims.utils.httpclient;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.imageio.ImageIO;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.http.Header;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
public class ClientMethodUtils {
private static final String DefaultCharSet = "utf-8";
/**
 * Sets every entry of {@code map} as a header on the given request.
 *
 * <p>A header that cannot be set is reported on standard output and skipped; the
 * remaining headers are still applied.
 *
 * @param <T> concrete request type, e.g. {@code HttpGet} or {@code HttpPost}
 * @param t   request to add the headers to
 * @param map header name/value pairs
 * @return the same request instance {@code t}; returned unchanged (after printing a
 *         notice) when {@code t} is {@code null} or {@code map} is {@code null} or empty
 */
public static <T extends HttpRequestBase> T addHeader(T t, Map<String, String> map) {
    if (t == null || map == null || map.isEmpty()) {
        System.out.println("Parama Is Illegal!");
        return t;
    }
    for (Entry<String, String> entry : map.entrySet()) {
        try {
            t.setHeader(entry.getKey(), entry.getValue());
        } catch (Exception e) {
            // The caught exception is never null; fall back to the fixed text only
            // when the exception carries no message of its own.
            String detail = e.getMessage();
            System.out.println(detail == null ? "HttpRequestBase Add Params Error!" : detail);
        }
    }
    return t;
}
/**
 * Attaches the given parameters to a POST request as a URL-encoded form body.
 *
 * <p>Entries with a {@code null} key are skipped and {@code null} values are sent as
 * empty strings. If the form entity cannot be built (for example because
 * {@code charSet} names an unsupported encoding) the failure is reported on standard
 * output and the request is returned without a new entity.
 *
 * @param post      request to attach the form body to
 * @param paramsMap form field name/value pairs
 * @param charSet   name of the charset used to encode the form
 * @return the same {@code post} instance; returned unchanged (after printing a
 *         notice) when {@code post} is {@code null} or {@code paramsMap} is
 *         {@code null} or empty
 */
public static HttpPost addPostWithParams(HttpPost post, Map<String, String> paramsMap, String charSet) {
    if (post == null || paramsMap == null || paramsMap.isEmpty()) {
        System.out.println("Params Is Illegal!");
        return post;
    }
    try {
        List<BasicNameValuePair> nvps = new ArrayList<>(paramsMap.size());
        for (Entry<String, String> entry : paramsMap.entrySet()) {
            String key = entry.getKey();
            if (key == null) {
                continue;
            }
            String value = entry.getValue();
            nvps.add(new BasicNameValuePair(key, value == null ? "" : value));
        }
        post.setEntity(new UrlEncodedFormEntity(nvps, charSet));
    } catch (Exception e) {
        // Keep the best-effort contract, but say what actually went wrong.
        System.out.println("Add Params Error! " + e);
    }
    return post;
}
/**
* 从页面中解析字体编码
* @param htmlPage
* @return
*/
private static String getCharSet(final String htmlPage) {
String regex1 = "
2.4简单下载器
package com.lhh.request;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import com.zhb.ims.utils.httpclient.ClientMethodUtils;
import com.zhb.ims.utils.httpclient.HttpClientManger;
/**
 * Minimal downloader: issues one HTTP request through the shared
 * {@link HttpClientManger} client and hands the response to
 * {@link ClientMethodUtils#getContent}.
 */
public class BaseTestRequest {

    /**
     * Downloads {@code url} with an HTTP GET.
     *
     * @param url address to fetch
     * @return whatever {@code ClientMethodUtils.getContent} returns for the request
     */
    public static String getContent(String url) {
        // Build the request before asking for a client, as the original did.
        final HttpGet request = new HttpGet(url);
        return ClientMethodUtils.getContent(HttpClientManger.newInstance().getClient(), request);
    }

    /**
     * Downloads {@code url} with an HTTP POST that carries no body.
     *
     * @param url address to post to
     * @return whatever {@code ClientMethodUtils.getContent} returns for the request
     */
    public static String postContent(String url) {
        final HttpPost request = new HttpPost(url);
        return ClientMethodUtils.getContent(HttpClientManger.newInstance().getClient(), request);
    }
}
package com.lhh.parse;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import javax.xml.namespace.QName;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
import org.apache.commons.lang.StringUtils;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import com.lhh.util.LoggerUtils;
import com.lhh.util.ObjectUtils;
public class BaseParse {
/**
 * Joins the values of all nodes in an XPath node-set result into one string.
 *
 * <p>Each node value is trimmed, stripped of line feeds and followed by a single
 * space, so callers can split the result on {@code " "}. Nodes without a value
 * (for example element nodes, whose {@code getNodeValue()} is {@code null}) are
 * skipped instead of causing a {@code NullPointerException}.
 *
 * @param result XPath evaluation result; expected to be a {@link NodeList}
 * @return the joined values (empty string for an empty list), or {@code null} when
 *         {@code result} is {@code null} or not a {@code NodeList}
 */
public static String getNodeValue(final Object result) {
    // The LoggerUtils calls stay in this method: LoggerUtils reports the method that
    // calls it directly as the log location.
    if (result == null) {
        LoggerUtils.warn("Result Is Null");
        return null;
    }
    if (!(result instanceof NodeList)) {
        LoggerUtils.warn("Result Is Not A Node Or NodeList");
        return null;
    }
    final NodeList nodeList = (NodeList) result;
    final StringBuilder joined = new StringBuilder();
    for (int i = 0; i < nodeList.getLength(); i++) {
        final Node node = nodeList.item(i);
        final String value = node == null ? null : node.getNodeValue();
        if (value == null) {
            continue;
        }
        joined.append(value.trim().replace("\n", "")).append(' ');
    }
    return joined.toString();
}
public static T parseObject(Class clazz,String htmlPage, Map itemMap) throws Exception {
HtmlCleaner hcCleaner = new HtmlCleaner();
TagNode tagNode = hcCleaner.clean(htmlPage);
Document dom = new DomSerializer(new CleanerProperties()).createDOM(tagNode);
Iterator> iterator = itemMap.entrySet().iterator();
XPath xPath = XPathFactory.newInstance().newXPath();
Map
4、用到的工具类
4.1、Logger工具类
package com.lhh.util;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.log4j.Logger;
//import org.apache.log4j.Logger;
/**
*
* @author liuhang
*
*/
public class LoggerUtils {
static class LoggerWapper {
private Logger logger;
private StackTraceElement stackTraceElement;
private String methodName;
private int lineNum;
private Object message;
private Object wapperMessage;
private Class> clazz;
public Logger getLogger() {
return logger;
}
public void setLogger(Logger logger) {
this.logger = logger;
}
public StackTraceElement getStackTraceElement() {
return stackTraceElement;
}
public void setStackTraceElement(StackTraceElement stackTraceElement) {
this.stackTraceElement = stackTraceElement;
}
public String getMethodName() {
return methodName;
}
public void setMethodName(String methodName) {
this.methodName = methodName;
}
public int getLineNum() {
return lineNum;
}
public void setLineNum(int lineNum) {
this.lineNum = lineNum;
}
public Object getMessage() {
return message;
}
public void setMessage(Object message) {
this.message = message;
}
public Object getWapperMessage() {
return wapperMessage;
}
public void setWapperMessage(Object wapperMessage) {
this.wapperMessage = wapperMessage;
}
public Class> getClazz() {
return clazz;
}
public void setClazz(Class> clazz) {
this.clazz = clazz;
}
public LoggerWapper(Object message) {
super();
this.message = message;
}
@Override
public String toString() {
return "LoggerWapper [logger=" + logger + ", stackTraceElement=" + stackTraceElement + ", methodName="
+ methodName + ", lineNum=" + lineNum + ", message=" + message + ", wapperMessage=" + wapperMessage
+ ", clazz=" + clazz + "]";
}
}
private static Class> getInvokeClass(StackTraceElement stackTraceElement) {
if (stackTraceElement != null) {
Class> clazz;
try {
clazz = Class.forName(stackTraceElement.getClassName());
return clazz;
} catch (ClassNotFoundException e) {
e.printStackTrace();
}
}
return null;
}
private static String getInvokeMethodName(StackTraceElement stackTraceElement) {
if (stackTraceElement != null) {
String methodName = null;
methodName = stackTraceElement.getMethodName();
return methodName;
}
return null;
}
private static Object msgWapper(Object message, StackTraceElement stackTraceElement) {
if (stackTraceElement != null) {
StringBuffer stringBuffer = new StringBuffer("");
int lineNum = getInvokeLineNum(stackTraceElement);
String methodName = getInvokeMethodName(stackTraceElement);
Class> clazz = getInvokeClass(stackTraceElement);
if (lineNum > 0) {
stringBuffer.append(
clazz.getName() + "." + methodName + "(" + clazz.getSimpleName() + ".java:" + lineNum + ")");
stringBuffer.append(" - " + message);
}
return stringBuffer.toString();
}
return message;
}
private static int getInvokeLineNum(StackTraceElement stackTraceElement) {
int num = 0;
if (stackTraceElement != null) {
num = stackTraceElement.getLineNumber();
}
return num;
}
private static StackTraceElement getInvokeInfo(int num) {
if (num > -1) {
Lock lock = new ReentrantLock();
lock.lock();
StackTraceElement[] stackTraceElements = Thread.currentThread().getStackTrace();
lock.unlock();
if (stackTraceElements != null && stackTraceElements.length > num) {
StackTraceElement stackTraceElement = stackTraceElements[num];
return stackTraceElement;
}
}
return null;
}
private static LoggerWapper getLoggerWapper(Object message) {
LoggerWapper loggerWapper = new LoggerWapper(message);
StackTraceElement stackTraceElement = getInvokeInfo(4);
loggerWapper.setStackTraceElement(stackTraceElement);
Class> clazz = getInvokeClass(stackTraceElement);
loggerWapper.setClazz(clazz);
Logger logger = Logger.getLogger(clazz);
loggerWapper.setLogger(logger);
String methodName = getInvokeMethodName(stackTraceElement);
loggerWapper.setMethodName(methodName);
int lineNum = getInvokeLineNum(stackTraceElement);
loggerWapper.setLineNum(lineNum);
Object wapperMessage = msgWapper(message, loggerWapper.getStackTraceElement());
loggerWapper.setWapperMessage(wapperMessage);;
return loggerWapper;
}
public static void debug(Object message) {
LoggerWapper loggerWapper = getLoggerWapper(message);
loggerWapper.getLogger().debug(loggerWapper.getWapperMessage());
}
public static void debug(Object message, Throwable t) {
LoggerWapper loggerWapper = getLoggerWapper(message);
loggerWapper.getLogger().debug(loggerWapper.getWapperMessage(), t);
}
public static void error(Object message) {
LoggerWapper loggerWapper = getLoggerWapper(message);
loggerWapper.getLogger().error(loggerWapper.getWapperMessage());
}
public static void error(Object message, Throwable t) {
LoggerWapper loggerWapper = getLoggerWapper(message);
loggerWapper.getLogger().error(loggerWapper.getWapperMessage(), t);
}
public static void fatal(Object message) {
LoggerWapper loggerWapper = getLoggerWapper(message);
loggerWapper.getLogger().fatal(loggerWapper.getWapperMessage());
}
public static void fatal(Object message, Throwable t) {
LoggerWapper loggerWapper = getLoggerWapper(message);
loggerWapper.getLogger().fatal(loggerWapper.getWapperMessage(), t);
}
public static void info(Object message) {
LoggerWapper loggerWapper = getLoggerWapper(message);
loggerWapper.getLogger().info(loggerWapper.getWapperMessage());
}
public static void info(Object message, Throwable t) {
LoggerWapper loggerWapper = getLoggerWapper(message);
loggerWapper.getLogger().info(loggerWapper.getWapperMessage(), t);
}
public static void warn(Object message) {
LoggerWapper loggerWapper = getLoggerWapper(message);
loggerWapper.getLogger().warn(loggerWapper.getWapperMessage());
}
public static void warn(Object message, Throwable t) {
LoggerWapper loggerWapper = getLoggerWapper(message);
loggerWapper.getLogger().warn(loggerWapper.getWapperMessage(), t);
}
}
4.2、字符串工具类
package com.lhh.util;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Project string helpers layered on top of commons-lang {@code StringUtils}.
 */
public class StringUtils extends org.apache.commons.lang.StringUtils {

    /**
     * Extracts one capture group from the first match of {@code regex} in {@code str}.
     *
     * @param str   text to search
     * @param regex regular expression containing capture groups
     * @param index 1-based capture group number
     * @return the group's text from the first match; {@code ""} when the pattern has
     *         fewer than {@code index} groups; {@code null} when either string is
     *         blank, {@code index} is below 1, or nothing matches
     */
    public static String getRegexIndex(final String str, final String regex, final int index) {
        if (isBlank(regex) || isBlank(str)) {
            LoggerUtils.warn("Str Or Regex Is Blank!");
            return null;
        }
        if (index < 1) {
            LoggerUtils.warn("Index Is Illegal!");
            return null;
        }
        final Matcher firstMatch = Pattern.compile(regex).matcher(str);
        if (!firstMatch.find()) {
            return null;
        }
        // Only the first match is ever inspected.
        if (index > firstMatch.groupCount()) {
            LoggerUtils.warn("Index Is OutOfBounds!");
            return "";
        }
        return firstMatch.group(index);
    }
}
4.3、对象工具类
package com.lhh.util;
import java.lang.reflect.Field;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
public class ObjectUtils {
/**
 * Copies the value of every field declared on {@code r}'s class into the field of
 * the same name declared on {@code t}'s class.
 *
 * <p>Only fields declared directly on the two classes are considered (inherited
 * fields are not). A field is skipped, and the copy continues with the next one,
 * when the target has no field of that name, the types are incompatible, or the
 * field cannot be made accessible. Does nothing when either argument is
 * {@code null}.
 *
 * @param <T> target type
 * @param <R> source type
 * @param t   object that receives the values
 * @param r   object the values are read from
 */
public static <T, R> void copy(final T t, final R r) {
    if (t == null || r == null) {
        return;
    }
    final Class<?> targetType = t.getClass();
    for (Field source : r.getClass().getDeclaredFields()) {
        try {
            // Direct lookup by name replaces the nested scan over all target fields.
            Field target = targetType.getDeclaredField(source.getName());
            source.setAccessible(true);
            target.setAccessible(true);
            target.set(t, source.get(r));
        } catch (ReflectiveOperationException | RuntimeException e) {
            // Best effort: one field that cannot be copied must not abort the rest.
            continue;
        }
    }
}
/**
* 把map对象的key-value拷贝到t对象中
* @param t
* @param r
*/
public static void copyWithMap(final T t, final Map
5、模型对象
package com.lhh.model;
/**
 * Mutable value object holding the text fields extracted for one news article:
 * title, body content, publication time and source.
 */
public class NewModel {
    private String title;
    private String content;
    private String time;
    private String source;

    /** @return the article title, or {@code null} if not set */
    public String getTitle() {
        return this.title;
    }

    /** @return the article body text, or {@code null} if not set */
    public String getContent() {
        return this.content;
    }

    /** @return the publication time as extracted text, or {@code null} if not set */
    public String getTime() {
        return this.time;
    }

    /** @return the article source, or {@code null} if not set */
    public String getSource() {
        return this.source;
    }

    /** @param title the article title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @param content the article body text */
    public void setContent(String content) {
        this.content = content;
    }

    /** @param time the publication time as extracted text */
    public void setTime(String time) {
        this.time = time;
    }

    /** @param source the article source */
    public void setSource(String source) {
        this.source = source;
    }

    /** Renders all four fields in the form {@code NewModel [title=..., content=..., time=..., source=...]}. */
    @Override
    public String toString() {
        final StringBuilder text = new StringBuilder();
        text.append("NewModel [title=").append(title);
        text.append(", content=").append(content);
        text.append(", time=").append(time);
        text.append(", source=").append(source);
        text.append("]");
        return text.toString();
    }
}
6、测试
package com.lhh.test;
import java.util.HashMap;
import java.util.Map;
import javax.xml.xpath.XPathConstants;
import org.apache.http.client.methods.HttpGet;
import com.lhh.model.NewModel;
import com.lhh.parse.BaseParse;
import com.lhh.request.BaseTestRequest;
import com.lhh.util.LoggerUtils;
import com.zhb.ims.utils.httpclient.ClientMethodUtils;
import com.zhb.ims.utils.httpclient.HttpClientManger;
public class Test {
public static void main(String[] args) throws Exception {
//新闻列表url
String newListUrl = "http://roll.news.sina.com.cn/s/channel.php?ch=01#col=89&spec=&type=&ch=01&k=&offset_page=0&offset_num=0&num=60&asc=&page=1";
String newListPage = BaseTestRequest.getContent(newListUrl);
//获取新闻列表页面上新闻Url
String xpath = "//div[@id='d_list']/ul/li/span[@class='c_tit']/a/@href";
Object result = BaseParse.parse(newListPage, xpath, XPathConstants.NODESET);
String urlList = BaseParse.getNodeValue(result);
String [] urlArray = urlList.split(" ");
for (int i = 0; i < urlArray.length; i++) {
Map map = new HashMap<>();
//配置新闻标题的xpath
map.put("title", "//*[@id='main_title']/text() | //*[@id='artibodyTitle']/text()");
//配置新闻发布时间的xpath
map.put("time", "//*[@id='page-tools']/span/span[@class='titer']/text() | //*[@id='navtimeSource']/text()");
//配置新闻正文内容的xpath
map.put("content", "//*[@id='artibody']/p/text()");
HttpGet get = new HttpGet(urlArray[i]);
//使用下载器下载页面元素
String page = ClientMethodUtils.getContent(HttpClientManger.newInstance().getClient(), get);
//调用解析取解析页面数据
NewModel weatherPojo = BaseParse.parseObject(NewModel.class, page, map);
LoggerUtils.error(weatherPojo.toString());
}
}
}