A Simple BFS Crawler in Java

The idea: start from a given URL, use the HtmlParser open-source library to extract the links on that page, push them onto a queue, then dequeue the next URL and repeat.

You will need the HttpClient and HtmlParser packages (Apache Commons HttpClient 3.x and the org.htmlparser library).

The full source of each .java file is given below.

There are still a few problems in it; I don't want to look at them today, so they can all wait until tomorrow.

Update: made some changes today. Saving to a file still doesn't work, but there shouldn't be any other problems. The main changes: the URL frontier is now managed with a Queue, and since extracted URLs could be invalid, a check that they contain "http" was added.
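(As an aside, contains("http") is a fairly loose test; a stricter check based on java.net.URL is one possible refinement. The helper below is a hypothetical sketch, not part of the files that follow.)

class UrlCheck {
    //hypothetical helper: accept only well-formed absolute http/https URLs
    static boolean isValidHttpUrl(String s) {
        try {
            String protocol = new java.net.URL(s).getProtocol();
            return "http".equals(protocol) || "https".equals(protocol);
        } catch (java.net.MalformedURLException e) {
            return false; //not a well-formed absolute URL
        }
    }
}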

-------------------------------------------------------------------------------------MyCrawler.java---------------------------------------------------------------------------------

package MyCrawler;

import java.util.Set;

/**
 *
 * @author xkey
 */
public class MyCrawler {

    /**
     * Initialize the URL frontier with the seed URLs.
     * @param seeds the seed URLs
     */
    private static void initCrawlerWithSeeds(String[] seeds)
    {
        for (String seed : seeds)
        {
            LinkQueue.addUnVisitedUrl(seed);
        }
    }
    
    /**
     * The crawl loop: BFS over the URL frontier, stopping after 1000 pages.
     * @param seeds the seed URLs
     */
    public static void crawling (String[] seeds)
    {
        initCrawlerWithSeeds(seeds);
        while(!LinkQueue.unVisitedUrlIsEmpty() && LinkQueue.getVisitedUrlNum() < 1000)
        {
            //take the URL at the head of the queue
            String visitUrl = LinkQueue.unVisitedUrlDequeue();
            if(visitUrl == null) continue;
            System.out.println(visitUrl);
            DownLoaderFile downLoader = new DownLoaderFile();
            downLoader.downLoaderFile(visitUrl);
            LinkQueue.addVisitedUrl(visitUrl);
            //extract the URLs from the downloaded page
            Set<String> links = HtmlParserTool.extracLinks(visitUrl);
            //enqueue the new, unvisited URLs
            for(String link : links)
            {
                System.out.println("xkey: " + link);
                LinkQueue.addUnVisitedUrl(link);
            }
            
        }
    }
    public static void main(String[] args)
    {
        crawling(new String[]{"http://www.baidu.com"});
    }
}
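Note: extracLinks (defined in HtmlParserTool below) takes no caller-supplied filter, so every extracted link is enqueued. If you want per-crawl filtering (for example, staying on one site), a small callback interface is a common pattern. A hypothetical sketch, not part of the code in this post:

//hypothetical LinkFilter interface; extracLinks would take it as a second argument
public interface LinkFilter {
    boolean accept(String url);
}

//example (inside crawling()): only follow links on the seed's own host
LinkFilter filter = new LinkFilter() {
    public boolean accept(String url) {
        return url.startsWith("http://www.baidu.com");
    }
};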


-------------------------------------------------------------------------------------LinkQueue.java------------------------------------------------------------------------------------

import java.util.HashSet;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ConcurrentLinkedQueue;

public class LinkQueue {
    //URLs that have already been visited
    private static Set<String> visitedUrl = new HashSet<String>();
    //URLs waiting to be visited (the frontier)
    private static Queue<String> unVisitedUrl = new ConcurrentLinkedQueue<String>();
    //get the frontier queue
    public static Queue<String> getUnVisitedUrl()
    {
        return unVisitedUrl;
    }
    //record a URL as visited
    public static void addVisitedUrl(String url)
    {
        visitedUrl.add(url);
    }
    //remove a URL from the visited set
    public static void removeVisitedUrl(String url)
    {
        visitedUrl.remove(url);
    }
    //dequeue the next unvisited URL
    public static String unVisitedUrlDequeue()
    {
        return unVisitedUrl.poll();
    }
    //enqueue a URL, making sure each URL is visited at most once
    public static void addUnVisitedUrl(String url)
    {
        if(url != null && !url.trim().equals("") && !visitedUrl.contains(url) && !unVisitedUrl.contains(url))
            unVisitedUrl.add(url);
    }
    //number of URLs visited so far
    public static int getVisitedUrlNum()
    {
        return visitedUrl.size();
    }
    //check whether the frontier is empty
    public static boolean unVisitedUrlIsEmpty()
    {
        return unVisitedUrl.isEmpty();
    }
}
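A design note: ConcurrentLinkedQueue.contains is a linear scan, so the dedup check in addUnVisitedUrl costs O(n) per enqueue. For larger crawls, a common alternative is to remember every URL ever enqueued in a HashSet and test that instead. A minimal sketch (the seenUrl field is hypothetical, not in the class above):

    //hypothetical O(1) dedup: remember every URL ever enqueued
    private static Set<String> seenUrl = new HashSet<String>();

    public static void addUnVisitedUrl(String url)
    {
        //Set.add returns false if the element was already present
        if(url != null && !url.trim().equals("") && seenUrl.add(url))
            unVisitedUrl.add(url);
    }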




-------------------------------------------------------------------------------------DownLoaderFile.java---------------------------------------------------------------------------------

import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;

import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;

public class DownLoaderFile {
    public String getFileNameByUrl(String url, String contentType)
    {
        //strip the leading "http://"
        url = url.substring(7);
        //text/html content
        if(contentType.indexOf("html") != -1)
        {
            url = url.replaceAll("[\\?/:*|<>\"]", "_") + ".html";
            return url;
        }
        //other content types, e.g. application/pdf
        else
        {
            return url.replaceAll("[\\?/:*|<>\"]", "_") + "." + contentType.substring(contentType.lastIndexOf("/") + 1);
        }
    }
    /**
     * Save the page's byte array to a local file; filePath is the relative path of the file to save.
     */
    private void saveToLocal(byte[] data, String filePath)
    {
        try{
            DataOutputStream out = new DataOutputStream(new FileOutputStream(new File(filePath)));
            out.write(data); //write the whole buffer at once
            out.flush();
            out.close();
        }catch(IOException e)
        {
            e.printStackTrace();
        }
    }
    //download the page the URL points to
    public String downLoaderFile(String url)
    {
        String filePath = null;
        //1. create the HttpClient instance and set its parameters
        HttpClient httpClient = new HttpClient();
        httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(5000);
        //2. create the GetMethod instance and set its parameters
        GetMethod getMethod = new GetMethod(url);
        //5-second socket timeout for the GET request
        getMethod.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, 5000);
        getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler());
        //3. execute the HTTP GET request
        try{
            int statusCode = httpClient.executeMethod(getMethod);
            if(statusCode != HttpStatus.SC_OK)
            {
                System.err.println("Method failed: " + getMethod.getStatusLine());
                return null; //do not read the body of a failed request
            }
            //4. handle the HTTP response body
            byte[] responseBody = getMethod.getResponseBody();
            //derive a local file name from the page URL (saving is still disabled, see the note at the top)
            //filePath = "D:\\xkey\\" + getFileNameByUrl(url, getMethod.getResponseHeader("Content-Type").getValue());
            //saveToLocal(responseBody, filePath);
        }catch(HttpException e){
            System.out.println("Please check your provided http address!");
            e.printStackTrace();
        }catch (IOException e)
        {
            e.printStackTrace();
        }finally {
            getMethod.releaseConnection();
        }
        return filePath;
    }
}
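About the save-to-file failure mentioned at the top: one likely cause is that FileOutputStream does not create missing directories, so writing under D:\xkey\ throws FileNotFoundException unless that folder already exists. A sketch of a guard that could replace the commented-out save calls above (same hard-coded D:\xkey target; this is a guess at the fix, not verified):

            File dir = new File("D:\\xkey");
            if(!dir.exists()) {
                dir.mkdirs(); //create the target directory before saving
            }
            filePath = "D:\\xkey\\" + getFileNameByUrl(url, getMethod.getResponseHeader("Content-Type").getValue());
            saveToLocal(responseBody, filePath);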


-------------------------------------------------------------------------------------HtmlParserTool.java--------------------------------------------------------------------------------------------------------------

import java.util.HashSet;
import java.util.Set;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class HtmlParserTool {
    //extract the URLs on a page: <a href> links plus <frame src> links
    public static Set<String> extracLinks(String url)
    {
        Set<String> links = new HashSet<String>();
        try{
            Parser parser = new Parser(url);
            parser.setEncoding("utf-8");
            //matches <frame src=...> nodes, which the LinkTag filter misses
            NodeFilter frameFilter = new NodeFilter(){
                public boolean accept(Node node)
                {
                    return node.getText().startsWith("frame src=");
                }
            };

            OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
            NodeList list = parser.extractAllNodesThatMatch(linkFilter);
            
            for(int i = 0; i < list.size(); i++)
            {
                Node tag = list.elementAt(i);
                if(tag instanceof LinkTag) //<a> tag
                {
                    LinkTag link = (LinkTag) tag;
                    String linkUrl = link.getLink(); //the href URL
                    //crude check added so only absolute http(s) URLs are kept
                    if(linkUrl.contains("http"))
                        links.add(linkUrl);
                }else{ //<frame> tag
                    //pull the link out of the src attribute, e.g. <frame src="test.html">
                    String frame = tag.getText();
                    int start = frame.indexOf("src=");
                    frame = frame.substring(start);
                    int end = frame.indexOf(" ");
                    if(end == -1) end = frame.indexOf(">");
                    String frameUrl = frame.substring(5, end - 1);
                    if(frameUrl.contains("http"))
                        links.add(frameUrl);
                }
            }
            }
        }catch(ParserException e)
        {
            e.printStackTrace();
        }
        return links;
    }
}
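A quick usage sketch for the parser on its own, assuming the HtmlParser jar is on the classpath (the seed URL is just an example):

import java.util.Set;

public class ExtractDemo {
    public static void main(String[] args) {
        //extract and print every absolute link found on the seed page
        Set<String> links = HtmlParserTool.extracLinks("http://www.baidu.com");
        for (String link : links) {
            System.out.println(link);
        }
    }
}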

