A Simple Crawler

I have an information-retrieval assignment coming up and happened to have some free time today, so I wrote a simple crawler, about as simple as it gets. For now it just fetches 1,000 pages and keeps them on disk for later processing.

The interface: WebPage.java

import java.io.File;
import java.net.MalformedURLException;

/*
 * Defines the basic operations of a WebPage object
 */
public interface WebPage
{
	/** Downloads the page at this URL and turns it into a local file */
	public File getPageFile();
	
	/** Parses the content of the page
	 * @throws MalformedURLException */
	public void parse() throws MalformedURLException;
}
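
The interface also pins down the order the calls have to go in: getPageFile() runs first, because parse() works on the page contents that the download fills in. Here is a minimal usage sketch; the seed URL is made up, and HTMLPage is the implementation shown below.

import java.net.MalformedURLException;
import java.net.URL;

public class WebPageDemo
{
	public static void main(String[] args) throws MalformedURLException
	{
		// crawl a single page: download it first, then pull the links out of it
		WebPage page = new HTMLPage(new URL("http://www.example.com/"));
		page.getPageFile();
		page.parse();
	}
}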

The implementation and test class: HTMLPage.java

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * A simple spider. It throws a lot of exceptions when run; ignore them for now.
 * @author gbk
 *
 */
public class HTMLPage extends Thread implements WebPage
{
	private static int pageId = 0;
	private static final int MAX_PAGENUM = 1000;
	// URLs that have already been processed, so that no page is crawled twice
	private static Set<String> urls = new HashSet<String>();
	private File localFile;
	private StringBuffer contents;
	private URL url;

	public HTMLPage(URL url)
	{
		this.url = url;
	}
	
	/**
	 * Downloads the page to a local file for later analysis
	 */
	public File getPageFile()
	{
		int ch = 0;
		contents = new StringBuffer();
		pageId++;
		localFile = new File("d:/html/"+pageId+".txt");
		try
		{
			InputStream inputStream = url.openStream();
			InputStreamReader inputStreamReader = new InputStreamReader(inputStream);
			FileOutputStream fileOutputStream = new FileOutputStream(localFile);
			OutputStreamWriter outputStreamWriter = new OutputStreamWriter(fileOutputStream);
			while((ch = inputStreamReader.read()) != -1) 
			{ 
				contents.append((char)ch);
				outputStreamWriter.write(ch); 
			} 
			outputStreamWriter.close();
			fileOutputStream.close();
			inputStreamReader.close();
			inputStream.close();
		} catch (FileNotFoundException e)
		{
			e.printStackTrace();
		} catch (IOException e)
		{
			e.printStackTrace();
		}
		return localFile;
	}
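	
	// An alternative download routine; this method is my addition, not part of the
	// original. try-with-resources closes the streams even if the copy fails
	// part-way through, and the character set is named explicitly (UTF-8 here is
	// only an assumption; the pages being crawled may well use something else).
	public File getPageFileWithCharset()
	{
		contents = new StringBuffer();
		pageId++;
		localFile = new File("d:/html/" + pageId + ".txt");
		try (InputStreamReader reader = new InputStreamReader(url.openStream(), "UTF-8");
		     OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(localFile), "UTF-8"))
		{
			int ch;
			while ((ch = reader.read()) != -1)
			{
				contents.append((char) ch);
				writer.write(ch);
			}
		} catch (IOException e)
		{
			e.printStackTrace();
		}
		return localFile;
	}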
	
    /**
     * Parses the page and adds every URL that has not been seen before to the candidate set
     */
	public void parse() throws MalformedURLException
	{
        // site-internal links, i.e. ones without a full http address, are not handled
		String regex = "<a.*?>";
        Pattern pt = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
        Matcher mt = pt.matcher(contents);
        while(mt.find())
        {
            // pull the link target out of the anchor tag
            Matcher myurl = Pattern.compile("href=.*?>").matcher(mt.group());
            while(myurl.find())
            {
            	String url = myurl.group().replaceAll("href=|\"|>", "");
            	// urls and pageId are not synchronized across threads, so a few
            	// extra files end up being written at the very end of the crawl
            	if(!urls.contains(url) && pageId < MAX_PAGENUM)
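            	{
            		// Everything from here on is my reconstruction: the original
            		// listing is cut off at this point. The guess is that each new
            		// URL is recorded and crawled in its own thread, that run()
            		// simply downloads and then parses, and that main() starts the
            		// crawl from a single seed URL (the address below is made up).
            		urls.add(url);
            		new HTMLPage(new URL(url)).start();
            	}
            }
        }
	}

	public void run()
	{
		try
		{
			getPageFile();
			parse();
		} catch (MalformedURLException e)
		{
			e.printStackTrace();
		}
	}

	public static void main(String[] args) throws MalformedURLException
	{
		new HTMLPage(new URL("http://www.example.com/")).start();
	}
}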
 
