java-httpclient: crawling related web page links from Baidu by title


The approach is:

1. Construct the Baidu search URL (a small sketch follows this list).


2. Initialize an HttpClient object.


3. Send the request with HttpClient and get back the page content (the page encoding has to be detected). A GET request is used here; for POST, see the article "HTTPClient模块的HttpGet和HttpPost".


4. Extract the links from the result page (either a regular expression or jsoup will do; a jsoup sketch appears after the code below).
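
For step 1, it is safer to URL-encode the query terms before appending them to the search URL, otherwise spaces or Chinese characters in the keyword may produce a malformed request. A minimal sketch (assuming www.baidu.com accepts a UTF-8-encoded wd parameter; the class and method names are just for illustration):

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;

public class BaiduQueryBuilder {

	// Build the search URL; the terms are encoded separately and joined with '+',
	// which stands for a space in a query string.
	public static String buildSearchUrl(String keyword, String titleKeyword) throws UnsupportedEncodingException {
		String wd = URLEncoder.encode(keyword, "UTF-8") + "+" + URLEncoder.encode(titleKeyword, "UTF-8");
		return "http://www.baidu.com/s?wd=" + wd;
	}
}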


Download link for the required jars:

http://download.csdn.net/detail/q383965374/5960953


Create a new project, add the two classes below, and reference the required jars; the resulting project structure looks like this:

(Figure 1: screenshot of the project structure)


The complete code is as follows:

CrawbaiduLink_test.java

package CrawbaiduLink;

import info.monitorenter.cpdetector.io.ASCIIDetector;
import info.monitorenter.cpdetector.io.ByteOrderMarkDetector;
import info.monitorenter.cpdetector.io.CodepageDetectorProxy;
import info.monitorenter.cpdetector.io.JChardetFacade;
import info.monitorenter.cpdetector.io.UnicodeDetector;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.routing.HttpRoute;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.impl.client.DecompressingHttpClient;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.PoolingClientConnectionManager;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.params.HttpParams;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;


/**
 * Requires commons-logging.jar and commons-lang.jar to run.
 */

public class CrawbaiduLink_test {
	
	final private static String URL= "http://www.baidu.com/s?wd=";
	
	
	/**
	 * HttpClient connection management
	 */
	/**
	 * Maximum total number of connections
	 */
	public final static int MAX_TOTAL_CONNECTIONS = 800;
	/**
	 * Maximum time to wait for a connection from the pool
	 */
	public final static int WAIT_TIMEOUT = 60000;
	/**
	 * Maximum number of connections per route
	 */
	public final static int MAX_ROUTE_CONNECTIONS = 400;
	/**
	 * Connection timeout (ms)
	 */
	public final static int CONNECT_TIMEOUT = 10000;
	/**
	 * Socket read timeout (ms)
	 */
	public final static int READ_TIMEOUT = 60000;
	
	private static HttpClient httpClient;

	private static DecompressingHttpClient decompressHttpClient;
	
/**
 * Initialize the HttpClient; requires httpclient-4.2.5.jar and httpcore-4.2.4.jar
 */
public static void  initHttpClient(){
	
	HttpParams params = new BasicHttpParams();
	SchemeRegistry schemeRegistry = new SchemeRegistry();
	schemeRegistry.register(new Scheme("http", 80, PlainSocketFactory.getSocketFactory()));
	schemeRegistry.register(new Scheme("https",443,SSLSocketFactory.getSocketFactory()));
	PoolingClientConnectionManager cm = new PoolingClientConnectionManager(schemeRegistry);
	httpClient = new DefaultHttpClient(cm, params);
	decompressHttpClient = new DecompressingHttpClient(httpClient);
	cm.setMaxTotal(MAX_TOTAL_CONNECTIONS);
	cm.setDefaultMaxPerRoute(MAX_ROUTE_CONNECTIONS);
	HttpHost localhost = new HttpHost("localhost", 80);
	cm.setMaxPerRoute(new HttpRoute(localhost), 50);
	httpClient.getParams().setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, CONNECT_TIMEOUT);
	httpClient.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT, READ_TIMEOUT);
	
}

/**
 * Initialize the request URI and headers
 * @param httpGet
 * @param url
 * @throws URISyntaxException
 */
public static void initHeader(HttpGet httpGet,String url) throws URISyntaxException{
	httpGet.setURI(new URI(url));
	httpGet.addHeader("Accept-Language", "en-us");
//	httpGet.addHeader("Accept-Encoding", "gzip,deflate");
}


/**
 * Fetch the full content of a web page
 * @param httpClient
 * @param url
 * @return the page content, or null if the request fails
 */
public static String crawlPageContent(HttpClient httpClient, String url){
	HttpGet httpGet = new HttpGet();
	InputStream inputStream = null;
	try {
		initHeader(httpGet,url);
		HttpResponse response = httpClient.execute(httpGet);
		HttpEntity entity = response.getEntity();
		String encode = getEncoding(url);
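		 // pages cpdetector reports as windows-1252 are assumed to really be GBK (a common misdetection for Chinese pages)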
		 if(encode.equals("windows-1252")){
			 encode = "GBK";
		 }
		if (entity != null) {
			inputStream = entity.getContent();
			String content = EntityUtils.toString(entity,encode);
			return content;
		}
		return null;
	} catch (ClientProtocolException e) {
		e.printStackTrace();
	} catch (IOException e) {
		e.printStackTrace();
	} catch (URISyntaxException e) {
		e.printStackTrace();
	} finally {
		if (inputStream != null) {
			try {
				inputStream.close();
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
	}
	return null;
}
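
/**
 * Optional sketch: a lighter-weight first attempt at the encoding is to read it from the
 * response's Content-Type header (ContentType ships with httpcore 4.2) and only fall back
 * to the cpdetector-based getEncoding(url) below when the header names no charset.
 * The method name here is illustrative and not part of the original flow.
 */
public static String getEncodingFromHeader(HttpEntity entity) {
	org.apache.http.entity.ContentType contentType = org.apache.http.entity.ContentType.getOrDefault(entity);
	java.nio.charset.Charset charset = contentType.getCharset();
	return charset == null ? null : charset.name();
}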


/**
 * Detect the page encoding; uses cpdetector.jar and chardet.jar
 */

private static CodepageDetectorProxy detector;

public static String getEncoding(File document) {
	
	CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
	detector.add(new ByteOrderMarkDetector());
	detector.add(ASCIIDetector.getInstance());
	detector.add(UnicodeDetector.getInstance());
	detector.add(JChardetFacade.getInstance());
	java.nio.charset.Charset charset = null;
	try {
		charset = detector.detectCodepage(document.toURI().toURL());
	} catch (MalformedURLException e1) {
		e1.printStackTrace();
	} catch (IOException e1) {
		e1.printStackTrace();
	}
	if (charset == null) {
		return "utf-8";
	}
	return charset.toString();
}

public static String getEncoding(String url) {
	java.nio.charset.Charset charset = null;
	detector = CodepageDetectorProxy.getInstance();
	detector.add(new ByteOrderMarkDetector());
	detector.add(ASCIIDetector.getInstance());
	detector.add(UnicodeDetector.getInstance());
	detector.add(JChardetFacade.getInstance());
	try {
		charset = detector.detectCodepage(new URL(url));
	} catch (MalformedURLException e1) {
		e1.printStackTrace();
	} catch (IOException e1) {
		e1.printStackTrace();
	}
	if (charset == null) {
		return "utf-8";
	}
	return charset.toString();
}

private final static Pattern regBaidu = Pattern
.compile("(?:(?:site:([^']+))?'\\s*}\"\\s*href=\")(http://www\\.baidu\\.com/link\\?url=[^\"]+)");


/**
 * Parse the Baidu search result page and extract the result links
 * 
 * @param content
 * @return
 */
public static List<Link> parseBaiduSearchLinks(String content) {
	List<Link> rst = new ArrayList<Link>();
	Matcher mt = regBaidu.matcher(content);
	while (mt.find()) {
		Link tlink = new Link();
		tlink.setDepth(0);
		tlink.setParent(initPrimiryLink("www.baidu.com"));
		if (mt.group(1) != null) {
			tlink.setSource(mt.group(1));
		}
		if (mt.group(2) != null) {
			tlink.setUrl(mt.group(2));
			rst.add(tlink);
		}
	}
	return rst;
}

private static Link initPrimiryLink(String url){
	Link link = new Link();
	link.setDepth(0);
	link.setParent(null);
	link.setUrl(url);
	return link;
}

public static void main(String[] args) {
	
	String keyword="httpclient"; //the keyword to search for
	String Title="+博客园";//the title keyword the target pages should contain
	String url = URL + keyword + Title;
	
	initHttpClient();
	
	String content =crawlPageContent(httpClient,url);
	
	List<Link> links = parseBaiduSearchLinks(content);
	
	for(Link l : links ){
		String pageContent = crawlPageContent(httpClient,l.getUrl());
		if (pageContent == null) { // skip pages that could not be fetched
			continue;
		}
		Document doc = Jsoup.parse(pageContent);
		String title = doc.title();
		System.out.println(l.getUrl() + "  " + title);
	}
	
	
}


}
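
Step 4 mentioned that jsoup can be used instead of the regular expression. A rough alternative to parseBaiduSearchLinks (a sketch only, relying on the same assumption as regBaidu above: result hrefs begin with http://www.baidu.com/link?url=) could look like this:

package CrawbaiduLink;

import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class JsoupBaiduLinkParser {

	public static List<Link> parseBaiduSearchLinks(String content) {
		List<Link> rst = new ArrayList<Link>();
		Document doc = Jsoup.parse(content);
		// attribute-prefix selector: every anchor whose href starts with the Baidu redirect prefix
		for (Element a : doc.select("a[href^=http://www.baidu.com/link?url=]")) {
			Link tlink = new Link();
			tlink.setDepth(0);
			tlink.setUrl(a.attr("href"));
			tlink.setSource(a.text()); // the anchor text, usually the result title
			rst.add(tlink);
		}
		return rst;
	}
}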

Link.java defines the structure of a crawled link

package CrawbaiduLink;
import java.util.Date;



public class Link {
	private String uuid;
	private String url; 
	private Date lastAccess; //time of the last access
	private Link parent; //parent link
	private String source;
	private int depth;
	
	public int getDepth() {
		return depth;
	}
	public void setDepth(int depth) {
		this.depth = depth;
	}
	public String getUuid() {
		return uuid;
	}
	public void setUuid(String uuid) {
		this.uuid = uuid;
	}
	public String getUrl() {
		return url;
	}
	public void setUrl(String url) {
		this.url = url;
	}
	public Date getLastAccess() {
		return lastAccess;
	}
	public void setLastAccess(Date lastAccess) {
		this.lastAccess = lastAccess;
	}
	public Link getParent() {
		return parent;
	}
	public void setParent(Link parent) {
		this.parent = parent;
	}
	public String getSource() {
		return source;
	}
	public void setSource(String source) {
		this.source = source;
	}
}



The output looks like this:

(Figure 2: screenshot of the sample output)

Apart from a few pages that refuse access, the relevant pages can basically all be fetched.

If you need one specific page, you can filter the results by title, or make the title keyword more specific, as sketched below.


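For example, a filter on the fetched <title> could be applied inside the loop in main (a sketch; the wantedTitle variable is just illustrative):

	String wantedTitle = "博客园"; // keep only pages whose <title> contains this
	for (Link l : links) {
		String pageContent = crawlPageContent(httpClient, l.getUrl());
		if (pageContent == null) {
			continue; // e.g. pages that refuse the request
		}
		String title = Jsoup.parse(pageContent).title();
		if (title.contains(wantedTitle)) {
			System.out.println(l.getUrl() + "  " + title);
		}
	}
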

PS: for this kind of title-based relevance search, Google tends to give somewhat better and more accurate results; Baidu's results may be skewed by site ranking.


