正则表达式解析出页面所有链接,并得到链接的内容

Main1类的main方法得到所有链接,此方法是带链接状态的
package com.logistics;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpVersion;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.ClientConnectionManager;
import org.apache.http.conn.params.ConnManagerParams;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.HttpParams;
import org.apache.http.params.HttpProtocolParams;

/**
 * Crawler entry point: downloads the index page at {@link #BASE_URL}, extracts
 * every {@code *.html} name found in it with a regular expression, and fetches
 * each resulting link on its own {@link SpiderThread}.
 */
public class Main1 {

	/** Page that is scanned for links; also the prefix prepended to every link found. */
	private static final String BASE_URL = "http://localhost:8080/docs/";

	/** Matches relative page names such as {@code index.html}. Compiled once. */
	private static final Pattern LINK_PATTERN = Pattern.compile("\\w+\\.html");

	/**
	 * Fetches the index page, collects its links and downloads them concurrently.
	 *
	 * @param args unused
	 * @throws IOException             if reading the index page fails
	 * @throws ClientProtocolException on an HTTP protocol error for the index page
	 * @throws InterruptedException    if interrupted while waiting for the workers
	 */
	public static void main(String[] args) throws Exception {
		// Create and initialize HTTP parameters
		HttpParams params = new BasicHttpParams();
		ConnManagerParams.setMaxTotalConnections(params, 10);
		HttpProtocolParams.setVersion(params, HttpVersion.HTTP_1_1);

		// Create and initialize scheme registry
		SchemeRegistry schemeRegistry = new SchemeRegistry();
		schemeRegistry.register(
				new Scheme("http", PlainSocketFactory.getSocketFactory(), 80));

		// One shared client backed by a thread-safe connection manager, so the
		// worker threads below can issue requests through it at the same time.
		ClientConnectionManager cm = new ThreadSafeClientConnManager(params, schemeRegistry);
		try {
			HttpClient client = new DefaultHttpClient(cm, params);
			HttpResponse response = client.execute(new HttpGet(BASE_URL));
			ArrayList<String> links = extractLinks(readBody(response.getEntity()));

			// start() (not run()) so that every link is fetched on its own thread.
			ArrayList<SpiderThread> workers = new ArrayList<SpiderThread>();
			for (int i = 0; i < links.size(); i++) {
				SpiderThread worker = new SpiderThread(client, new HttpGet(links.get(i)), i + 1);
				workers.add(worker);
				worker.start();
			}
			// Wait for all workers before the connection manager is shut down.
			for (SpiderThread worker : workers) {
				worker.join();
			}
		} finally {
			cm.shutdown();
		}
	}

	/**
	 * Reads the whole entity body and decodes it as UTF-8.
	 *
	 * @param entity the response entity, may be {@code null}
	 * @return the decoded body, or an empty string when there is no entity
	 * @throws IOException if the content stream cannot be read
	 */
	private static String readBody(HttpEntity entity) throws IOException {
		ByteArrayOutputStream body = new ByteArrayOutputStream();
		if (entity != null) {
			InputStream in = entity.getContent();
			try {
				byte[] buffer = new byte[1024];
				int read;
				while ((read = in.read(buffer)) != -1) {
					// Only the bytes actually read are valid; the rest of the
					// buffer still holds data from a previous iteration.
					body.write(buffer, 0, read);
				}
			} finally {
				in.close();
			}
		}
		return body.toString("utf-8");
	}

	/**
	 * Collects an absolute URL for every {@link #LINK_PATTERN} match in the page,
	 * in order of appearance (repeated names are kept).
	 *
	 * @param html the page text to scan
	 * @return the absolute URLs built from {@link #BASE_URL} and each match
	 */
	private static ArrayList<String> extractLinks(String html) {
		ArrayList<String> links = new ArrayList<String>();
		Matcher matcher = LINK_PATTERN.matcher(html);
		while (matcher.find()) {
			links.add(BASE_URL + matcher.group());
		}
		return links;
	}
}


然后使用线程得到链接内容
package com.logistics;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.HttpContext;
import org.apache.http.util.EntityUtils;

/**
 * Worker thread that executes a single GET request through a shared
 * {@link HttpClient} and prints the requested path and the elapsed time.
 */
public class SpiderThread extends Thread {
	private final HttpClient httpClient;
	/** Execution context owned by this thread and passed to the request. */
	private final HttpContext context;
	private final HttpGet httpGet;
	/** Number used to label this worker's output lines. */
	private final int id;

	/**
	 * @param httpClient client used to execute the request
	 * @param httpGet    the request this thread executes
	 * @param id         label printed with this thread's output
	 */
	public SpiderThread(HttpClient httpClient, HttpGet httpGet, int id) {
		this.httpClient = httpClient;
		this.context = new BasicHttpContext();
		this.httpGet = httpGet;
		this.id = id;
	}

	/**
	 * Executes the GET request, reads the response body and prints the request
	 * path followed by the elapsed time in milliseconds. Any failure aborts the
	 * request and is reported on standard output instead of being rethrown.
	 */
	@Override
	public void run() {
		long start = System.currentTimeMillis();
		try {
			// Execute with this thread's own context rather than the client default.
			HttpResponse response = httpClient.execute(httpGet, context);
			HttpEntity entity = response.getEntity();
			if (entity != null) {
				// Read the body to the end; the content itself is not used here.
				EntityUtils.toByteArray(entity);
				System.out.println(httpGet.getURI().getPath());
			}
		} catch (Exception e) {
			httpGet.abort();
			System.out.println(id + " - error: " + e);
		}
		long end = System.currentTimeMillis();
		System.out.println(id + "  --  用时:" + (end - start));
	}
}

你可能感兴趣的:(httpclient,thread,Pattern,Matcher)