Breadth-First Crawler - Crawler Learning (Part 4)

  When we send a crawler out to traverse the Internet, we can view the web as a directed graph: pages are the nodes and hyperlinks are the directed edges. That means we can apply standard graph-traversal algorithms to walk this enormous graph, and graph traversal comes in two flavors: breadth-first and depth-first. Depth-first traversal can descend too deep along a single chain of links and waste resources, so here we use breadth-first traversal.
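  To recall the underlying idea, here is a minimal sketch of breadth-first traversal over a directed graph, using an adjacency map and a FIFO queue. The Map-based graph representation and the BfsSketch class are assumptions for illustration only; the crawler below replaces nodes with pages and edges with extracted links.

import java.util.*;

public class BfsSketch {
    /**
     * Visits every node reachable from start in breadth-first order.
     * graph maps each node to the targets of its outgoing edges.
     */
    public static List<String> bfs(Map<String, List<String>> graph, String start) {
        List<String> order = new ArrayList<String>();
        Set<String> visited = new HashSet<String>();
        Deque<String> queue = new ArrayDeque<String>();
        visited.add(start);
        queue.addLast(start);
        while (!queue.isEmpty()) {
            String node = queue.removeFirst(); // take the oldest discovered node
            order.add(node);
            List<String> neighbors = graph.get(node);
            if (neighbors == null)
                continue;
            for (String next : neighbors) {
                if (visited.add(next))         // true only the first time we see next
                    queue.addLast(next);
            }
        }
        return order;
    }
}

  The crawler below follows exactly this shape: the visited set and the URL queue become the LinkQueue class, and "expanding a node" becomes downloading a page and parsing out its links.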

  Below is a simple breadth-first crawler implemented in Java, using two open-source libraries: HttpClient (the 4.x API, with DefaultHttpClient) and HtmlParser.

First, a simple queue that holds the URLs waiting to be visited:

import java.util.LinkedList;

/**
 * A queue of the URLs waiting to be visited.
 * @author Guo
 */
public class Queue {

    /** Backed by a LinkedList, which stores the unvisited URLs in FIFO order. */
    private LinkedList<String> queue = new LinkedList<String>();

    /**
     * Appends a URL to the tail of the queue.
     * @param url the URL to enqueue
     */
    public void in(String url) {
        queue.addLast(url);
    }

    /**
     * Removes and returns the URL at the head of the queue.
     * @return the next URL to visit
     */
    public String out() {
        return queue.removeFirst();
    }

    /**
     * Checks whether the queue already contains the given URL.
     * @param url the URL to look for
     * @return true if the URL is already queued
     */
    public boolean contains(String url) {
        return queue.contains(url);
    }

    /**
     * @return true if the queue is empty
     */
    public boolean isEmpty() {
        return queue.isEmpty();
    }
}
Next, LinkQueue tracks which URLs have been visited and which are still pending:

import java.util.HashSet;
import java.util.Set;

/**
 * Manages the visited and unvisited URL collections.
 * @author Guo
 */
public class LinkQueue {

    /** The set of URLs that have already been visited, backed by a HashSet. */
    private static Set<String> visitedURL = new HashSet<String>();

    /** The queue of URLs that have not been visited yet. */
    private static Queue unVisitedURL = new Queue();

    /**
     * Marks a URL as visited.
     * @param url
     */
    public static void addVisitedURL(String url) {
        visitedURL.add(url);
    }

    /**
     * Removes a URL from the visited set.
     * @param url
     */
    public static void removeVisitedURL(String url) {
        visitedURL.remove(url);
    }

    /**
     * Takes the next unvisited URL off the queue.
     * @return the next URL to visit
     */
    public static String unVisitedURLOutQueue() {
        return unVisitedURL.out();
    }

    /**
     * Enqueues a URL, skipping blanks, URLs already visited,
     * and URLs already waiting in the queue.
     * @param url
     */
    public static void addUnVisitedURL(String url) {
        if (url != null && !url.trim().equals("")
                && !visitedURL.contains(url) && !unVisitedURL.contains(url)) {
            unVisitedURL.in(url);
        }
    }

    /**
     * @return the number of URLs visited so far
     */
    public static int getVisitedURLNumber() {
        return visitedURL.size();
    }

    public static Set<String> getVisitedURL() {
        return visitedURL;
    }

    public static void setVisitedURL(Set<String> visitedURL) {
        LinkQueue.visitedURL = visitedURL;
    }

    public static Queue getUnVisitedURL() {
        return unVisitedURL;
    }

    public static void setUnVisitedURL(Queue unVisitedURL) {
        LinkQueue.unVisitedURL = unVisitedURL;
    }
}
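One design note: LinkQueue keeps its state in plain static fields, which is fine for this single-threaded example but not safe if several download threads share it. As a rough sketch of a thread-safe variant (not part of the original example; the ConcurrentLinkQueue name is hypothetical, and ConcurrentHashMap.newKeySet requires Java 8+), the internals could be swapped for concurrent collections:

import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;

public class ConcurrentLinkQueue {
    /** Concurrent set of URLs that have already been visited. */
    private static final Set<String> visitedURL = ConcurrentHashMap.newKeySet();
    /** Concurrent FIFO queue of URLs waiting to be visited. */
    private static final ConcurrentLinkedQueue<String> unVisitedURL =
            new ConcurrentLinkedQueue<String>();

    public static void addVisitedURL(String url) {
        visitedURL.add(url);
    }

    /** @return the next URL, or null if the queue is currently empty */
    public static String unVisitedURLOutQueue() {
        return unVisitedURL.poll();
    }

    public static void addUnVisitedURL(String url) {
        // contains() on a ConcurrentLinkedQueue is O(n), and this
        // check-then-offer pair is not atomic, so a URL can occasionally be
        // queued twice; it is still fetched at most once if workers re-check
        // visitedURL before downloading.
        if (url != null && !url.trim().equals("")
                && !visitedURL.contains(url) && !unVisitedURL.contains(url)) {
            unVisitedURL.offer(url);
        }
    }
}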
DownloadFile fetches a page over HTTP and saves it to disk:

import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;

/**
 * Downloads a page over HTTP and stores it on the local disk.
 * @author Guo
 */
public class DownloadFile {

    /**
     * Copies the stream to a local file.
     * @param input the response body
     * @param filePath where to store the file
     */
    public void saveToLocal(InputStream input, String filePath) {
        try {
            DataOutputStream out = new DataOutputStream(new FileOutputStream(new File(filePath)));
            byte[] data = new byte[1024];
            int l;
            while ((l = input.read(data)) != -1) {
                out.write(data, 0, l); // write only the bytes actually read
            }
            out.flush();
            out.close();
            input.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Downloads the page at the given URL.
     * @param url
     * @return the local path the page was saved to, or null on failure
     */
    public String downloadFile(String url) {
        String filePath = null;
        try {
            HttpClient httpClient = new DefaultHttpClient();
            HttpGet httpGet = new HttpGet(url);
            HttpResponse response = httpClient.execute(httpGet);
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                InputStream input = entity.getContent();
                // Derive the file name from the URL itself, replacing
                // characters that are illegal in Windows file names.
                filePath = "G:/temp/" + url.replaceAll("[\\\\/:*?\"<>|]", "_") + ".html";
                saveToLocal(input, filePath);
            }
            return filePath;
        } catch (Exception e) {
            e.printStackTrace();
            return filePath;
        }
    }
}
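A quick way to try the class on its own (the URL here is just the example seed used later):

DownloadFile downloader = new DownloadFile();
String savedPath = downloader.downloadFile("http://www.baidu.com");
System.out.println("Saved to: " + savedPath); // null if the download failed

Note that the G:/temp/ directory must already exist, since FileOutputStream does not create missing directories.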
A small interface lets callers decide which extracted links to keep:

/**
 * A filter that decides whether an extracted link should be kept.
 * @author Guo
 */
public interface LinkFilter {
    public boolean accept(String url);
}
HtmlParserTool does the HTML processing, extracting both ordinary links and frame sources from a page:

import java.util.HashSet;
import java.util.Set;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;

/**
 * Parses HTML pages and extracts their links.
 * @author Guo
 */
public class HtmlParserTool {

    /**
     * Fetches the page at the given URL and collects every link
     * that the filter accepts.
     * @param url the page to parse
     * @param filter decides which links to keep
     * @return the accepted links
     */
    public static Set<String> getLinks(String url, LinkFilter filter) {
        Set<String> links = new HashSet<String>();
        try {
            Parser parser = new Parser(url);
            parser.setEncoding("utf-8");
            // Matches <frame src=...> tags, which HtmlParser does not expose as LinkTag.
            NodeFilter frameFilter = new NodeFilter() {
                public boolean accept(Node node) {
                    return node.getText().startsWith("frame src=");
                }
            };
            // Accept ordinary <a> tags as well as frame tags.
            OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
            NodeList list = parser.extractAllNodesThatMatch(linkFilter);
            for (int i = 0; i < list.size(); i++) {
                Node tag = list.elementAt(i);
                if (tag instanceof LinkTag) {
                    // An ordinary <a href="..."> link.
                    LinkTag link = (LinkTag) tag;
                    String linkUrl = link.getLink();
                    if (filter.accept(linkUrl))
                        links.add(linkUrl);
                } else {
                    // A frame tag: pull the URL out of src="..." by hand.
                    String frame = tag.getText();
                    int start = frame.indexOf("src=");
                    frame = frame.substring(start);
                    int end = frame.indexOf(" ");
                    if (end == -1)
                        end = frame.indexOf(">");
                    // Skip the leading src=" and drop the trailing quote.
                    String frameUrl = frame.substring(5, end - 1);
                    if (filter.accept(frameUrl))
                        links.add(frameUrl);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return links;
    }
}
Finally, MyCrawler ties everything together into the breadth-first loop:

import java.util.Set;

/**
 * The breadth-first crawler itself.
 * @author Guo
 */
public class MyCrawler {

    /**
     * Seeds the LinkQueue with the starting URLs.
     * @param seeds
     */
    private void initCrawlerWithSeeds(String[] seeds) {
        for (String seed : seeds)
            LinkQueue.addUnVisitedURL(seed);
    }

    /**
     * Runs the actual crawl.
     * @param seeds the starting URLs
     */
    public void crawling(String[] seeds) {
        // Only follow links that stay on www.baidu.com.
        LinkFilter filter = new LinkFilter() {
            public boolean accept(String url) {
                return url.startsWith("http://www.baidu.com");
            }
        };

        initCrawlerWithSeeds(seeds);

        // Crawl until the queue empties or roughly 1000 pages have been visited.
        while (!LinkQueue.getUnVisitedURL().isEmpty() && LinkQueue.getVisitedURLNumber() <= 1000) {
            String visitUrl = LinkQueue.unVisitedURLOutQueue();
            if (visitUrl == null)
                continue;
            // Download the page, then mark it as visited.
            DownloadFile downloader = new DownloadFile();
            downloader.downloadFile(visitUrl);
            LinkQueue.addVisitedURL(visitUrl);
            // Extract the page's links and enqueue the ones not seen before.
            Set<String> links = HtmlParserTool.getLinks(visitUrl, filter);
            for (String link : links)
                LinkQueue.addUnVisitedURL(link);
        }
    }

    public static void main(String[] args) {
        MyCrawler crawler = new MyCrawler();
        crawler.crawling(new String[] { "http://www.baidu.com" });
    }
}