聚焦网络爬虫之Xpath+HttpClient快速实现页面新闻抓取

最近因为项目需求,抓取了大大小小多个网站的新闻,刚开始写用的是jsoup解析页面,每个站点都要写一套解析方案,效率较慢,后来利用xpath解析,开发速度有了很大的提升,在一周内完成了一百多个站点的新闻抓取。

下面是我写的一个简单示例,博主刚毕业,还是个技术小白,如有写得不对或不妥的地方,请评论指出来,大家共同进步。下图是测试效果,不同的网站只需要更改xpath即可。

聚焦网络爬虫之Xpath+HttpClient快速实现页面新闻抓取_第1张图片

为了帮助有需要的朋友,下面贴上我写的代码模型。由于新闻网站一般没有反爬,所以demo中没有反爬的相关策略。一般的爬虫项目由下载器、调度器、解析器组成,本demo中没有实现调度器。

1、项目是基于maven搭建的,首先引入相关依赖


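	<properties>
		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
		<!-- httpclient -->
		<httpclient_version>4.5.1</httpclient_version>
		<!-- htmlcleaner -->
		<htmlcleaner_version>2.16</htmlcleaner_version>
		<!-- log4j -->
		<log4j_version>1.2.17</log4j_version>
	</properties>

	<dependencies>
		<dependency>
			<groupId>log4j</groupId>
			<artifactId>log4j</artifactId>
			<version>${log4j_version}</version>
		</dependency>
		<dependency>
			<groupId>com.google.guava</groupId>
			<artifactId>guava</artifactId>
			<version>19.0</version>
		</dependency>
		<dependency>
			<groupId>commons-io</groupId>
			<artifactId>commons-io</artifactId>
			<version>2.4</version>
		</dependency>
		<dependency>
			<groupId>commons-beanutils</groupId>
			<artifactId>commons-beanutils</artifactId>
			<version>1.9.2</version>
		</dependency>
		<dependency>
			<groupId>commons-lang</groupId>
			<artifactId>commons-lang</artifactId>
			<version>2.6</version>
		</dependency>
		<dependency>
			<groupId>net.sourceforge.htmlcleaner</groupId>
			<artifactId>htmlcleaner</artifactId>
			<version>${htmlcleaner_version}</version>
		</dependency>
		<dependency>
			<groupId>junit</groupId>
			<artifactId>junit</artifactId>
			<version>3.8.1</version>
			<scope>test</scope>
		</dependency>
		<dependency>
			<groupId>org.apache.httpcomponents</groupId>
			<artifactId>fluent-hc</artifactId>
			<version>${httpclient_version}</version>
		</dependency>
		<dependency>
			<groupId>org.apache.httpcomponents</groupId>
			<artifactId>httpclient</artifactId>
			<version>${httpclient_version}</version>
		</dependency>
		<dependency>
			<groupId>org.apache.httpcomponents</groupId>
			<artifactId>httpcore</artifactId>
			<version>4.4.3</version>
		</dependency>
		<dependency>
			<groupId>org.apache.httpcomponents</groupId>
			<artifactId>httpmime</artifactId>
			<version>${httpclient_version}</version>
		</dependency>
	</dependencies>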

2、下载器使用的是apache的开源项目httpclient,包含了httpclient连接池,工具类等

2.1 HttpClient连接池

package com.zhb.ims.utils.httpclient;

import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.SSLContext;
import org.apache.http.client.HttpRequestRetryHandler;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.ssl.SSLContexts;
import org.apache.http.ssl.TrustStrategy;

/**
 * Lazily created singleton that owns one pooled HTTP connection manager and
 * hands out {@link CloseableHttpClient} instances backed by that pool.
 *
 * <p>All state changes (creation, {@link #init()}, {@link #getClient()},
 * {@link #destory()}) are serialised through one static lock.
 */
public class HttpClientManger {
	/** Upper bound on simultaneously open connections across all routes. */
	private static final int MAX_TOTAL_CONNECTIONS = 800;
	/** Upper bound on simultaneously open connections per route. */
	private static final int MAX_CONNECTIONS_PER_ROUTE = 20;

	private static HttpClientManger httpClientManger;
	private static final Lock lock = new ReentrantLock();

	private PoolingHttpClientConnectionManager connectionManager;
	private HttpRequestRetryHandler httpRequestRetryHandler;
	/** {@code true} while no usable pool exists (before init or after destory). */
	private final AtomicBoolean isShutDown = new AtomicBoolean(true);

	/**
	 * Builds a fresh connection pool and retry handler and marks the manager as
	 * running. If a pool already exists it is replaced and shut down so its
	 * sockets are not leaked.
	 *
	 * <p>Failures are reported on stderr and leave the manager marked as shut
	 * down, so the next {@link #getClient()} call tries again.
	 */
	public void init() {
		lock.lock();
		try {
			// SECURITY: this trusts every certificate and every host name, i.e. TLS
			// verification is disabled. Do not reuse this for anything that sends
			// credentials or other sensitive data.
			SSLContext sslContext = SSLContexts.custom().loadTrustMaterial(null, new DefaultTrustStrategy()).build();
			@SuppressWarnings("deprecation")
			HostnameVerifier hostnameVerifier = SSLConnectionSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER;
			SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(sslContext, hostnameVerifier);
			Registry<ConnectionSocketFactory> socketFactoryRegistry = RegistryBuilder.<ConnectionSocketFactory>create()
					.register("http", PlainConnectionSocketFactory.getSocketFactory())
					.register("https", sslsf)
					.build();

			PoolingHttpClientConnectionManager freshManager = new PoolingHttpClientConnectionManager(
					socketFactoryRegistry);
			freshManager.setMaxTotal(MAX_TOTAL_CONNECTIONS);
			freshManager.setDefaultMaxPerRoute(MAX_CONNECTIONS_PER_ROUTE);

			PoolingHttpClientConnectionManager previous = connectionManager;
			connectionManager = freshManager;
			httpRequestRetryHandler = new DefaultRequestRetryHandler();
			isShutDown.set(false);
			if (previous != null) {
				previous.shutdown();
			}
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			lock.unlock();
		}
	}

	private HttpClientManger() {
		super();
		init();
	}

	/**
	 * Returns the shared manager, creating it on first use.
	 *
	 * @return the single {@code HttpClientManger} instance
	 */
	public static HttpClientManger newInstance() {
		lock.lock();
		try {
			if (httpClientManger == null) {
				httpClientManger = new HttpClientManger();
			}
			return httpClientManger;
		} finally {
			lock.unlock();
		}
	}

	/**
	 * Builds a client on top of the shared pool, re-creating the pool first if it
	 * has been shut down.
	 *
	 * <p>The pool is registered as shared, so closing the returned client does
	 * not shut the pool down for other clients.
	 *
	 * @return a new client bound to the shared connection pool
	 */
	public CloseableHttpClient getClient() {
		lock.lock();
		try {
			// Only rebuild when the pool is really gone; handing out a client must
			// not change the running/shut-down state.
			if (isShutDown.get()) {
				init();
			}
			return HttpClients.custom()
					.setConnectionManager(connectionManager)
					.setConnectionManagerShared(true)
					.setRetryHandler(httpRequestRetryHandler)
					.build();
		} finally {
			lock.unlock();
		}
	}

	/**
	 * Shuts the connection pool down. Calling it again, or before any pool
	 * exists, does nothing. A later {@link #getClient()} builds a new pool.
	 */
	public void destory() {
		lock.lock();
		try {
			if (isShutDown.compareAndSet(false, true) && connectionManager != null) {
				connectionManager.shutdown();
			}
		} finally {
			lock.unlock();
		}
	}

	/** Trust strategy that accepts every certificate chain without inspection. */
	class DefaultTrustStrategy implements TrustStrategy{
		@Override
		public boolean isTrusted(X509Certificate[] arg0, String arg1) throws CertificateException {
			return true;
		}

	}
}
2.2默认的重连策略

package com.zhb.ims.utils.httpclient;

import java.io.IOException;
import java.io.InterruptedIOException;
import java.net.UnknownHostException;
import java.util.Iterator;
import java.util.List;
import javax.net.ssl.SSLException;
import javax.net.ssl.SSLHandshakeException;
import org.apache.http.NoHttpResponseException;
import org.apache.http.client.HttpRequestRetryHandler;
import org.apache.http.conn.ConnectTimeoutException;
import org.apache.http.protocol.HttpContext;
import com.google.common.collect.Lists;

public class DefaultRequestRetryHandler implements HttpRequestRetryHandler {

	private int executionCount;
	List> ignoreException;
	List> dealException;
	public DefaultRequestRetryHandler() {
		super();
		Init();
	}

	@SuppressWarnings("unchecked")
	public DefaultRequestRetryHandler(int executionCount) {
		super();
		this.executionCount = executionCount;
		ignoreException = ignoreException.isEmpty()? Lists.newArrayList(ConnectTimeoutException.class,SSLException.class,UnknownHostException.class
				,InterruptedIOException.class,SSLHandshakeException.class): ignoreException;
		dealException = dealException.isEmpty()? Lists.newArrayList(NoHttpResponseException.class): dealException;
	}

	@SuppressWarnings("unchecked")
	public DefaultRequestRetryHandler(int executionCount, List> ignoreException) {
		super();
		this.executionCount = executionCount;
		this.ignoreException = ignoreException;
		dealException = dealException.isEmpty()? Lists.newArrayList(NoHttpResponseException.class): dealException;
	}

	public DefaultRequestRetryHandler(int executionCount, List> ignoreException,
			List> dealException) {
		super();
		this.executionCount = executionCount;
		this.ignoreException = ignoreException;
		this.dealException = dealException;
	}

	@SuppressWarnings("unchecked")
	private void Init() {
		executionCount = executionCount <= 0 ? 5 : executionCount;
		ignoreException = (ignoreException == null ||ignoreException.isEmpty())? Lists.newArrayList(ConnectTimeoutException.class,SSLException.class,UnknownHostException.class
				,InterruptedIOException.class,SSLHandshakeException.class): ignoreException;
		dealException = (dealException ==null || dealException.isEmpty())? Lists.newArrayList(NoHttpResponseException.class): dealException;
	}

	@Override
	public boolean retryRequest(IOException exception, int executionCount, HttpContext context) {
		if (executionCount >= this.executionCount) {
			return false;
		}
		for (Iterator> iterator = ignoreException.iterator(); iterator.hasNext();) {
			Class clazz = (Class) iterator.next();
			if (exception.getClass().isAssignableFrom(clazz)) {
				return false;
			}
		}
		for (Iterator> iterator = dealException.iterator(); iterator.hasNext();) {
			Class clazz = (Class) iterator.next();
			if (exception.getClass().isAssignableFrom(clazz)) {
				return true;
			}
		}
		exception.printStackTrace();
		return false;
	}
}
2.3工具类
package com.zhb.ims.utils.httpclient;

import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.imageio.ImageIO;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.http.Header;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

public class ClientMethodUtils {

	private static final String DefaultCharSet = "utf-8";

	/**
	 * Sets every entry of {@code map} as a header on the given request.
	 *
	 * <p>A header that cannot be set is reported on stdout and skipped; the
	 * remaining entries are still applied. When the request is {@code null} or
	 * the map is {@code null}/empty, a notice is printed and nothing is changed.
	 *
	 * @param t
	 *            request (e.g. HttpPost/HttpGet) to add the headers to
	 * @param map
	 *            header name to header value
	 * @return the same request instance that was passed in (may be {@code null})
	 */
	public static <T extends HttpRequestBase> T addHeader(T t, Map<String, String> map) {
		if (t == null || map == null || map.isEmpty()) {
			System.out.println("Parama Is Illegal!");
			return t;
		}
		for (Entry<String, String> entry : map.entrySet()) {
			try {
				t.setHeader(entry.getKey(), entry.getValue());
			} catch (Exception e) {
				// A caught exception is never null; it is its message that may be missing.
				String reason = e.getMessage();
				System.out.println(reason == null ? "HttpRequestBase Add Params Error!" : reason);
			}
		}
		return t;
	}

	/**
	 * Attaches the given parameters to a POST request as a URL-encoded form body.
	 *
	 * <p>Entries with a {@code null} key are skipped and {@code null} values are
	 * sent as empty strings. If the body cannot be built, a notice is printed and
	 * the request is returned without a new entity. When the request is
	 * {@code null} or the map is {@code null}/empty, a notice is printed and
	 * nothing is changed.
	 *
	 * @param post
	 *            request to attach the form body to
	 * @param paramsMap
	 *            form field name to form field value
	 * @param charSet
	 *            name of the charset used to encode the form body
	 * @return the same request instance that was passed in (may be {@code null})
	 */
	public static HttpPost addPostWithParams(HttpPost post, Map<String, String> paramsMap, String charSet) {
		if (post == null || paramsMap == null || paramsMap.isEmpty()) {
			System.out.println("Params Is Illegal!");
			return post;
		}
		List<BasicNameValuePair> nvps = new ArrayList<>();
		try {
			for (Entry<String, String> entry : paramsMap.entrySet()) {
				String key = entry.getKey();
				if (key == null) {
					continue;
				}
				String value = entry.getValue();
				nvps.add(new BasicNameValuePair(key, value == null ? "" : value));
			}
			post.setEntity(new UrlEncodedFormEntity(nvps, charSet));
		} catch (Exception e) {
			System.out.println("Add Params Error!");
		}
		return post;
	}
	
	/**
	 * 从页面中解析字体编码
	 * @param htmlPage
	 * @return
	 */
	private static String getCharSet(final String htmlPage) {
		String regex1 = "

2.4简单下载器

package com.lhh.request;


import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;


import com.zhb.ims.utils.httpclient.ClientMethodUtils;
import com.zhb.ims.utils.httpclient.HttpClientManger;


/**
 * Minimal downloader: builds a request for a URL and runs it through
 * {@code ClientMethodUtils.getContent} with a client from the shared
 * {@code HttpClientManger}.
 */
public class BaseTestRequest {

	/**
	 * Issues a GET request for {@code url}.
	 *
	 * @param url address to request
	 * @return whatever {@code ClientMethodUtils.getContent} returns for the request
	 */
	public static String getContent(String url) {
		// The request is created before the client is obtained, as before.
		HttpGet request = new HttpGet(url);
		return ClientMethodUtils.getContent(HttpClientManger.newInstance().getClient(), request);
	}

	/**
	 * Issues a POST request (without a body) for {@code url}.
	 *
	 * @param url address to request
	 * @return whatever {@code ClientMethodUtils.getContent} returns for the request
	 */
	public static String postContent(String url) {
		HttpPost request = new HttpPost(url);
		return ClientMethodUtils.getContent(HttpClientManger.newInstance().getClient(), request);
	}

}


3、解析器代码

package com.lhh.parse;

import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import javax.xml.namespace.QName;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
import org.apache.commons.lang.StringUtils;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import com.lhh.util.LoggerUtils;
import com.lhh.util.ObjectUtils;

/**
 * XPath based extraction helpers: HTML is cleaned with HtmlCleaner, converted
 * to a W3C DOM and then queried with the JDK XPath engine.
 */
public class BaseParse {

	/**
	 * Joins the values of all nodes in an XPath node-set result.
	 *
	 * <p>Each value is trimmed, has its line feeds removed and is followed by a
	 * single space, so callers can split the result on spaces. For nodes without
	 * a node value (e.g. element nodes) the text content is used instead; nodes
	 * with neither are skipped.
	 *
	 * @param result XPath evaluation result, expected to be a {@link NodeList}
	 * @return the joined values, or {@code null} when {@code result} is
	 *         {@code null} or not a {@code NodeList}
	 */
	public static String getNodeValue(final Object result) {
		if (result == null) {
			LoggerUtils.warn("Result Is Null");
			return null;
		}
		if (!(result instanceof NodeList)) {
			LoggerUtils.warn("Result Is Not A Node Or NodeList");
			return null;
		}
		NodeList nodeList = (NodeList) result;
		StringBuilder joined = new StringBuilder();
		for (int i = 0; i < nodeList.getLength(); i++) {
			Node node = nodeList.item(i);
			if (node == null) {
				continue;
			}
			String value = node.getNodeValue();
			if (value == null) {
				// Element nodes carry no node value of their own.
				value = node.getTextContent();
			}
			if (value == null) {
				continue;
			}
			joined.append(value.trim().replaceAll("\n", "")).append(' ');
		}
		return joined.toString();
	}

	/**
	 * Creates an instance of {@code clazz} and fills its fields from the page.
	 *
	 * <p>Each entry of {@code itemMap} maps a field name to the XPath whose
	 * joined node values (see {@link #getNodeValue(Object)}) are copied into the
	 * field of that name. Entries with a blank key or XPath are skipped with a
	 * warning. If the page is blank or {@code itemMap} is {@code null}, the
	 * instance is returned unpopulated.
	 *
	 * @param clazz    type to instantiate through its no-argument constructor
	 * @param htmlPage HTML source of the page
	 * @param itemMap  field name to XPath expression
	 * @return the populated instance
	 * @throws Exception if the DOM cannot be built, an XPath is invalid or the
	 *                   instance cannot be created
	 */
	public static <T> T parseObject(Class<T> clazz, String htmlPage, Map<String, String> itemMap) throws Exception {
		Map<String, String> resultMap = new HashMap<>();
		if (StringUtils.isBlank(htmlPage) || itemMap == null) {
			LoggerUtils.warn("Html Page Is Blank Or Item Map Is Null!");
		} else {
			Document dom = toDom(htmlPage);
			XPath xPath = XPathFactory.newInstance().newXPath();
			for (Entry<String, String> entry : itemMap.entrySet()) {
				String key = entry.getKey();
				String xpathStr = entry.getValue();
				if (StringUtils.isNotBlank(key) && StringUtils.isNotBlank(xpathStr)) {
					Object result = xPath.evaluate(xpathStr, dom, XPathConstants.NODESET);
					resultMap.put(key, getNodeValue(result));
				} else {
					LoggerUtils.warn("Key Or Xpath Is Blank!");
				}
			}
		}
		T t = clazz.getDeclaredConstructor().newInstance();
		ObjectUtils.copyWithMap(t, resultMap);
		return t;
	}

	/**
	 * Evaluates one XPath expression against the page.
	 *
	 * @param htmlPage HTML source of the page
	 * @param xPathStr XPath expression to evaluate
	 * @param qName    desired result type, e.g. {@code XPathConstants.NODESET}
	 * @return the evaluation result, or {@code null} when the page or the
	 *         expression is blank
	 * @throws Exception if the DOM cannot be built or the XPath is invalid
	 */
	public static Object parse(String htmlPage,String xPathStr,QName qName) throws Exception{
		if (StringUtils.isBlank(htmlPage) || StringUtils.isBlank(xPathStr)) {
			LoggerUtils.warn("Html Page Or Xpath Is Blank!");
			return null;
		}
		XPath xPath = XPathFactory.newInstance().newXPath();
		return xPath.evaluate(xPathStr, toDom(htmlPage), qName);
	}

	/** Cleans the HTML and converts it to a W3C DOM document. */
	private static Document toDom(String htmlPage) throws Exception {
		TagNode tagNode = new HtmlCleaner().clean(htmlPage);
		return new DomSerializer(new CleanerProperties()).createDOM(tagNode);
	}

}
4、用到的工具类

4.1、Logger工具类

package com.lhh.util;

import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;

import org.apache.log4j.Logger;


//import org.apache.log4j.Logger;

/**
 * Static logging facade over log4j that looks up the calling class, method
 * and line from the current stack trace, logs through the logger named after
 * the calling class and prefixes each message with that location.
 *
 * <p>The caller is found at a fixed stack depth, so every public logging
 * method must call {@code getLoggerWapper} directly.
 *
 * @author liuhang
 */
public class LoggerUtils {

	/**
	 * Index of the caller's frame in {@code Thread.getStackTrace()}:
	 * 0 = getStackTrace, 1 = getInvokeInfo, 2 = getLoggerWapper,
	 * 3 = the public logging method, 4 = the code that called it.
	 */
	private static final int CALLER_DEPTH = 4;

	/** Holder for everything resolved about one log call. */
	static class LoggerWapper {

		private Logger logger;

		private StackTraceElement stackTraceElement;

		private String methodName;

		private int lineNum;

		private Object message;

		private Object wapperMessage;

		private Class<?> clazz;

		public Logger getLogger() {
			return logger;
		}

		public void setLogger(Logger logger) {
			this.logger = logger;
		}

		public StackTraceElement getStackTraceElement() {
			return stackTraceElement;
		}

		public void setStackTraceElement(StackTraceElement stackTraceElement) {
			this.stackTraceElement = stackTraceElement;
		}

		public String getMethodName() {
			return methodName;
		}

		public void setMethodName(String methodName) {
			this.methodName = methodName;
		}

		public int getLineNum() {
			return lineNum;
		}

		public void setLineNum(int lineNum) {
			this.lineNum = lineNum;
		}

		public Object getMessage() {
			return message;
		}

		public void setMessage(Object message) {
			this.message = message;
		}

		public Object getWapperMessage() {
			return wapperMessage;
		}

		public void setWapperMessage(Object wapperMessage) {
			this.wapperMessage = wapperMessage;
		}

		public Class<?> getClazz() {
			return clazz;
		}

		public void setClazz(Class<?> clazz) {
			this.clazz = clazz;
		}

		public LoggerWapper(Object message) {
			super();
			this.message = message;
		}

		@Override
		public String toString() {
			return "LoggerWapper [logger=" + logger + ", stackTraceElement=" + stackTraceElement + ", methodName="
					+ methodName + ", lineNum=" + lineNum + ", message=" + message + ", wapperMessage=" + wapperMessage
					+ ", clazz=" + clazz + "]";
		}

	}

	/** Loads the class named by the frame, or returns {@code null} if that is not possible. */
	private static Class<?> getInvokeClass(StackTraceElement stackTraceElement) {
		if (stackTraceElement == null) {
			return null;
		}
		try {
			return Class.forName(stackTraceElement.getClassName());
		} catch (ClassNotFoundException e) {
			// The caller may live in a class loader this class cannot see; callers
			// of this helper fall back to the class name from the frame.
			return null;
		}
	}

	/** Returns the frame's method name, or {@code null} without a frame. */
	private static String getInvokeMethodName(StackTraceElement stackTraceElement) {
		return stackTraceElement == null ? null : stackTraceElement.getMethodName();
	}

	/** Returns the frame's line number, or 0 without a frame. */
	private static int getInvokeLineNum(StackTraceElement stackTraceElement) {
		return stackTraceElement == null ? 0 : stackTraceElement.getLineNumber();
	}

	/**
	 * Prefixes the message with {@code Class.method(Simple.java:line) -  }.
	 * When no frame or no line number is available the message is returned
	 * unchanged, so it is never lost.
	 */
	private static Object msgWapper(Object message, StackTraceElement stackTraceElement, Class<?> clazz) {
		if (stackTraceElement == null) {
			return message;
		}
		int lineNum = getInvokeLineNum(stackTraceElement);
		if (lineNum <= 0) {
			return message;
		}
		String className = stackTraceElement.getClassName();
		String simpleName = clazz != null ? clazz.getSimpleName()
				: className.substring(className.lastIndexOf('.') + 1);
		return className + "." + getInvokeMethodName(stackTraceElement) + "(" + simpleName + ".java:" + lineNum
				+ ")" + " -  " + message;
	}

	/** Returns the frame at index {@code num} of the current thread's stack trace, if present. */
	private static StackTraceElement getInvokeInfo(int num) {
		if (num < 0) {
			return null;
		}
		// A thread's own stack trace needs no locking.
		StackTraceElement[] stackTraceElements = Thread.currentThread().getStackTrace();
		if (stackTraceElements != null && stackTraceElements.length > num) {
			return stackTraceElements[num];
		}
		return null;
	}

	/**
	 * Resolves the caller of the public logging method and bundles logger,
	 * location and decorated message. Must be called directly from a public
	 * logging method, otherwise {@link #CALLER_DEPTH} points at the wrong frame.
	 */
	private static LoggerWapper getLoggerWapper(Object message) {
		LoggerWapper loggerWapper = new LoggerWapper(message);
		StackTraceElement stackTraceElement = getInvokeInfo(CALLER_DEPTH);
		Class<?> clazz = getInvokeClass(stackTraceElement);
		loggerWapper.setStackTraceElement(stackTraceElement);
		loggerWapper.setClazz(clazz);
		// Name the logger after the caller even when its class could not be loaded;
		// fall back to this class only when there is no frame at all.
		String loggerName = stackTraceElement != null ? stackTraceElement.getClassName()
				: LoggerUtils.class.getName();
		loggerWapper.setLogger(Logger.getLogger(loggerName));
		loggerWapper.setMethodName(getInvokeMethodName(stackTraceElement));
		loggerWapper.setLineNum(getInvokeLineNum(stackTraceElement));
		loggerWapper.setWapperMessage(msgWapper(message, stackTraceElement, clazz));
		return loggerWapper;
	}

	/** Logs {@code message} at DEBUG level on the caller's logger. */
	public static void debug(Object message) {
		LoggerWapper loggerWapper = getLoggerWapper(message);
		loggerWapper.getLogger().debug(loggerWapper.getWapperMessage());
	}

	/** Logs {@code message} and {@code t} at DEBUG level on the caller's logger. */
	public static void debug(Object message, Throwable t) {
		LoggerWapper loggerWapper = getLoggerWapper(message);
		loggerWapper.getLogger().debug(loggerWapper.getWapperMessage(), t);
	}

	/** Logs {@code message} at ERROR level on the caller's logger. */
	public static void error(Object message) {
		LoggerWapper loggerWapper = getLoggerWapper(message);
		loggerWapper.getLogger().error(loggerWapper.getWapperMessage());
	}

	/** Logs {@code message} and {@code t} at ERROR level on the caller's logger. */
	public static void error(Object message, Throwable t) {
		LoggerWapper loggerWapper = getLoggerWapper(message);
		loggerWapper.getLogger().error(loggerWapper.getWapperMessage(), t);
	}

	/** Logs {@code message} at FATAL level on the caller's logger. */
	public static void fatal(Object message) {
		LoggerWapper loggerWapper = getLoggerWapper(message);
		loggerWapper.getLogger().fatal(loggerWapper.getWapperMessage());
	}

	/** Logs {@code message} and {@code t} at FATAL level on the caller's logger. */
	public static void fatal(Object message, Throwable t) {
		LoggerWapper loggerWapper = getLoggerWapper(message);
		loggerWapper.getLogger().fatal(loggerWapper.getWapperMessage(), t);
	}

	/** Logs {@code message} at INFO level on the caller's logger. */
	public static void info(Object message) {
		LoggerWapper loggerWapper = getLoggerWapper(message);
		loggerWapper.getLogger().info(loggerWapper.getWapperMessage());
	}

	/** Logs {@code message} and {@code t} at INFO level on the caller's logger. */
	public static void info(Object message, Throwable t) {
		LoggerWapper loggerWapper = getLoggerWapper(message);
		loggerWapper.getLogger().info(loggerWapper.getWapperMessage(), t);
	}

	/** Logs {@code message} at WARN level on the caller's logger. */
	public static void warn(Object message) {
		LoggerWapper loggerWapper = getLoggerWapper(message);
		loggerWapper.getLogger().warn(loggerWapper.getWapperMessage());
	}

	/** Logs {@code message} and {@code t} at WARN level on the caller's logger. */
	public static void warn(Object message, Throwable t) {
		LoggerWapper loggerWapper = getLoggerWapper(message);
		loggerWapper.getLogger().warn(loggerWapper.getWapperMessage(), t);
	}

}
4.2、字符串工具类

package com.lhh.util;


import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * Adds a regex helper on top of commons-lang {@code StringUtils}.
 */
public class StringUtils extends org.apache.commons.lang.StringUtils {


	/**
	 * Returns capture group {@code index} of the first match of {@code regex}
	 * in {@code str}.
	 *
	 * @param str   text to search
	 * @param regex regular expression containing the capture group
	 * @param index 1-based capture group number
	 * @return the group's text from the first match; {@code ""} when the first
	 *         match has fewer than {@code index} groups; {@code null} when an
	 *         input is blank, {@code index} is below 1 or nothing matches
	 */
	public static String getRegexIndex(final String str, final String regex, final int index) {
		if (isBlank(regex) || isBlank(str)) {
			LoggerUtils.warn("Str Or Regex Is Blank!");
			return null;
		}
		if (index < 1) {
			LoggerUtils.warn("Index Is Illegal!");
			return null;
		}
		Matcher firstMatch = Pattern.compile(regex).matcher(str);
		if (!firstMatch.find()) {
			return null;
		}
		// Only the first match is ever inspected.
		if (firstMatch.groupCount() >= index) {
			return firstMatch.group(index);
		}
		LoggerUtils.warn("Index Is OutOfBounds!");
		return "";
	}
}
4.3、对象工具类

package com.lhh.util;


import java.lang.reflect.Field;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;


/**
 * Reflection helpers that copy values into the fields an object's own class
 * declares (inherited fields are not considered), matching by field name.
 */
public class ObjectUtils {


	/**
	 * Copies every field declared by {@code r}'s class into the field of the
	 * same name declared by {@code t}'s class.
	 *
	 * <p>This is best effort: a field that has no counterpart, or whose value
	 * cannot be assigned (type mismatch, final field, access denied), is skipped
	 * silently. Does nothing when either argument is {@code null}.
	 *
	 * @param t target object that receives the values
	 * @param r source object the values are read from
	 */
	public static <T, R> void copy(final T t, final R r) {
		if (t == null || r == null) {
			return;
		}
		Field[] targetFields = t.getClass().getDeclaredFields();
		for (Field source : r.getClass().getDeclaredFields()) {
			Field target = findField(targetFields, source.getName());
			if (target == null) {
				continue;
			}
			try {
				source.setAccessible(true);
				assign(t, target, source.get(r));
			} catch (Exception ignored) {
				// Best effort: an unreadable source field is skipped.
			}
		}
	}
	/**
	 * Copies each map value into the field of {@code t}'s class whose name
	 * equals the key's {@code toString()}.
	 *
	 * <p>This is best effort: entries with a {@code null} key, keys without a
	 * matching field and values that cannot be assigned are skipped silently.
	 * Does nothing when either argument is {@code null}.
	 *
	 * @param t      target object that receives the values
	 * @param resMap field name to value
	 */
	public static <T> void copyWithMap(final T t, final Map<?, ?> resMap) {
		if (t == null || resMap == null) {
			return;
		}
		Field[] targetFields = t.getClass().getDeclaredFields();
		for (Entry<?, ?> entry : resMap.entrySet()) {
			if (entry == null || entry.getKey() == null) {
				continue;
			}
			Field target = findField(targetFields, entry.getKey().toString());
			if (target != null) {
				assign(t, target, entry.getValue());
			}
		}
	}

	/** Returns the field called {@code name}, or {@code null} if there is none. */
	private static Field findField(Field[] fields, String name) {
		for (Field field : fields) {
			if (field.getName().equals(name)) {
				return field;
			}
		}
		return null;
	}

	/** Writes {@code value} into {@code field} of {@code target}, ignoring failures. */
	private static void assign(Object target, Field field, Object value) {
		try {
			field.setAccessible(true);
			field.set(target, value);
		} catch (Exception ignored) {
			// Best effort: incompatible, final or inaccessible fields keep their value.
		}
	}


}
5、模型对象

package com.lhh.model;


/**
 * Plain data holder for one scraped news article. The field names are matched
 * by name when the parser fills an instance reflectively, so they must not be
 * renamed.
 */
public class NewModel {

	/** Headline of the article. */
	private String title;

	/** Body text of the article. */
	private String content;

	/** Publication time as it appears on the page. */
	private String time;

	/** Source of the article. */
	private String source;

	/** @return the headline, or {@code null} if not set */
	public String getTitle() {
		return this.title;
	}

	/** @param title the headline */
	public void setTitle(String title) {
		this.title = title;
	}

	/** @return the body text, or {@code null} if not set */
	public String getContent() {
		return this.content;
	}

	/** @param content the body text */
	public void setContent(String content) {
		this.content = content;
	}

	/** @return the publication time text, or {@code null} if not set */
	public String getTime() {
		return this.time;
	}

	/** @param time the publication time text */
	public void setTime(String time) {
		this.time = time;
	}

	/** @return the source, or {@code null} if not set */
	public String getSource() {
		return this.source;
	}

	/** @param source the source */
	public void setSource(String source) {
		this.source = source;
	}

	/** Renders all four fields as {@code NewModel [title=..., content=..., time=..., source=...]}. */
	@Override
	public String toString() {
		StringBuilder text = new StringBuilder("NewModel [title=");
		text.append(this.title);
		text.append(", content=").append(this.content);
		text.append(", time=").append(this.time);
		text.append(", source=").append(this.source);
		text.append("]");
		return text.toString();
	}
}
6、测试

package com.lhh.test;

import java.util.HashMap;
import java.util.Map;

import javax.xml.xpath.XPathConstants;

import org.apache.http.client.methods.HttpGet;
import com.lhh.model.NewModel;
import com.lhh.parse.BaseParse;
import com.lhh.request.BaseTestRequest;
import com.lhh.util.LoggerUtils;
import com.zhb.ims.utils.httpclient.ClientMethodUtils;
import com.zhb.ims.utils.httpclient.HttpClientManger;

public class Test {
	
	public static void main(String[] args) throws Exception {
		//新闻列表url
		String newListUrl = "http://roll.news.sina.com.cn/s/channel.php?ch=01#col=89&spec=&type=&ch=01&k=&offset_page=0&offset_num=0&num=60&asc=&page=1";
		String newListPage = BaseTestRequest.getContent(newListUrl);
		//获取新闻列表页面上新闻Url
		String xpath = "//div[@id='d_list']/ul/li/span[@class='c_tit']/a/@href";
		Object result = BaseParse.parse(newListPage, xpath, XPathConstants.NODESET);
		String urlList = BaseParse.getNodeValue(result);
		String [] urlArray = urlList.split(" ");
		for (int i = 0; i < urlArray.length; i++) {
			Map map = new HashMap<>();
			//配置新闻标题的xpath
			map.put("title", "//*[@id='main_title']/text() | //*[@id='artibodyTitle']/text()");
			//配置新闻发布时间的xpath
			map.put("time", "//*[@id='page-tools']/span/span[@class='titer']/text() | //*[@id='navtimeSource']/text()");
			//配置新闻正文内容的xpath
			map.put("content", "//*[@id='artibody']/p/text()");
			HttpGet get = new HttpGet(urlArray[i]);
			//使用下载器下载页面元素
			String page = ClientMethodUtils.getContent(HttpClientManger.newInstance().getClient(), get);
			//调用解析取解析页面数据
			NewModel weatherPojo = BaseParse.parseObject(NewModel.class, page, map);
			LoggerUtils.error(weatherPojo.toString());
		}
		
	}
}










你可能感兴趣的:(聚焦网络爬虫之Xpath+HttpClient快速实现页面新闻抓取)