java下载网页并读取内容

网页下载回来之后，还需要读取其中的内容。示例代码如下：

package com.core.crawl;

import java.io.IOException;

import com.util.file.Files;

public class Crawl {

    /**
     * Demo entry point: downloads one URL with a {@link WebSpider} running on its
     * own thread, waits for it to finish, and prints the elapsed wall-clock time.
     *
     * @param args unused
     * @throws IOException          declared for compatibility with the spider API
     * @throws InterruptedException if the main thread is interrupted while joining
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        final long startMillis = System.currentTimeMillis();

        // Configure a single spider; additional spiders can be created the same
        // way and run concurrently on their own threads.
        WebSpider spider = new WebSpider();
        spider.setWebAddress("http://www.w3c.org/robots.txt");
        spider.setDestFile(Files.getSysPath() + "/" + "robots.");

        Thread worker = new Thread(spider);
        worker.start();
        worker.join(); // block until the download completes

        System.out.println("the end");
        System.out.println(System.currentTimeMillis() - startMillis);
    }
}
 
package com.core.crawl;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

import com.core.http.Http;

public class WebSpider implements Runnable {

    // Helper that maps an HTTP Content-Type header to a file extension
    // (declared in com.core.http; not visible in this file).
    private final Http http = new Http();

    private String webAddress = "";
    private String destFile = "";

    /** Sets the URL to download. */
    public void setWebAddress(String webAddress) {
        this.webAddress = webAddress;
    }

    /**
     * Sets the destination file prefix; the extension derived from the response
     * Content-Type is appended to it.
     */
    public void setDestFile(String destFile) {
        this.destFile = destFile;
    }

    /**
     * Downloads the page at {@code webAddress}, echoes each line to stdout and
     * saves the content to {@code destFile + <extension>}.
     *
     * <p>Fixes over the original: the stray {@code in.read()} that silently
     * discarded the first byte of the response is gone; the streams are closed
     * via try-with-resources (they leaked before, and the output file was
     * created but never written); {@code disconnect()} is guarded against the
     * NPE that occurred when the connection failed to open; and the method now
     * reports failure instead of always returning {@code true}.
     *
     * @return {@code true} if the download completed, {@code false} on failure
     * @throws IOException          kept for interface compatibility with callers
     * @throws InterruptedException kept for interface compatibility with callers
     */
    public boolean download() throws IOException, InterruptedException {
        HttpURLConnection httpConn = null;
        try {
            URL url = new URL(webAddress);
            httpConn = (HttpURLConnection) url.openConnection();
            httpConn.setRequestMethod("GET");
            httpConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.14) Gecko/20080404 Firefox/2.0.0.14");

            String fileType = http.fileType(httpConn.getContentType());
            System.out.println(fileType);

            // try-with-resources guarantees both streams are closed even if
            // reading or writing throws mid-transfer.
            try (BufferedReader reader = new BufferedReader(
                     new InputStreamReader(httpConn.getInputStream()));
                 FileOutputStream out = new FileOutputStream(new File(destFile + fileType))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    System.out.println(line);
                    out.write(line.getBytes());
                    out.write('\n');
                }
            }
            return true;
        } catch (Exception ex) {
            // Best-effort crawler: report and signal failure rather than abort.
            System.out.println(ex.toString());
            return false;
        } finally {
            if (httpConn != null) {
                httpConn.disconnect();
            }
        }
    }

    /** Runnable entry point: logs the worker thread name and runs the download. */
    public void run() {
        try {
            System.out.println(Thread.currentThread().getName());
            download();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Restore the interrupt flag so the owning thread can observe it.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
    }
}

 

package com.util.file;

public class Files {

    /** Utility class: not instantiable. */
    private Files() {
    }

    /**
     * Returns the application's working directory (the JVM's {@code user.dir}
     * system property), used as the root for downloaded files.
     *
     * @return absolute path of the current working directory
     */
    public static String getSysPath() {
        return System.getProperty("user.dir");
    }
}
 
results:

Thread-0
html

 

# robots.txt for http://www.w3.org/
#
# $Id: robots.txt,v 1.50 2007/12/13 17:09:37 ted Exp $
#

 

# For use by search.w3.org
User-agent: W3C-gsa
Disallow: /Out-Of-Date

 

User-agent: W3T_SE
Disallow: /Out-Of-Date

 

User-agent: Mozilla/4.0 (compatible; MSIE 6.0; Windows NT; MS Search 4.0 Robot)
Disallow: /

 

# W3C Link checker
User-agent: W3C-checklink
Disallow:

 

# exclude some access-controlled areas
User-agent: *
Disallow: /2004/ontaria/basic
Disallow: /Team
Disallow: /Project
Disallow: /Web
Disallow: /Systems
Disallow: /History
Disallow: /Out-Of-Date
Disallow: /2002/02/mid
Disallow: /mid/
Disallow: /2004/08/W3CTalks
Disallow: /2007/11/Talks/search
Disallow: /People/all/
Disallow: /RDF/Validator/ARPServlet
Disallow: /2003/03/Translations/byLanguage
Disallow: /2003/03/Translations/byTechnology
Disallow: /2005/11/Translations/Query
Disallow: /2003/glossary/subglossary/
#Disallow: /2005/06/blog/
#Disallow: /2001/07/pubrules-checker
#shouldnt get transparent proxies but will ml links of things like pubrules
Disallow: /2000/06/webdata/xslt
Disallow: /2000/09/webdata/xslt
Disallow: /2005/08/online_xslt/xslt
Disallow: /Bugs/
Disallow: /Search/Mail/Public/
Disallow: /2006/02/chartergen
the end
10485

 

如果想抓取整个页面而不是 robots.txt，可以把设置改为：

 spider1.setWebAddress("http://www.w3c.org/");
 spider1.setDestFile(Files.getSysPath() + "/" + "w3c.");

这样的配置读者可以自行测试。

你可能感兴趣的:(java,thread,windows,exception,String,download)