网页爬虫抓取URL简单实现


关键字:网页爬虫抓取URL的简单实现。

//开始......

package com.ogilvy.sayes.util; 
 
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.List;
import java.util.Set;
 
/**
 * Minimal web crawler: prints the links found in a page and follows them
 * recursively up to a fixed depth.
 *
 * @author long.tang
 */
public class SearchCrawler {

    /** Attribute prefix that introduces a link target. */
    private static final String HREF = "href=";

    /** Value returned by {@link #myGetHttpFile2(String)} when a page cannot be read. */
    private static final String NO_CONTENT = "nothing";

    /**
     * Downloads the resource at {@code url} and decodes its bytes as GBK.
     *
     * @param url location to read; anything {@link URL} can open
     * @return the decoded content, or the literal string {@code "nothing"}
     *         when the URL is malformed or the download fails
     */
    public String myGetHttpFile2(String url) {
        // try-with-resources closes the stream even when a read fails midway.
        try (InputStream in = new URL(url).openStream()) {
            ByteArrayOutputStream bytes = new ByteArrayOutputStream();
            byte[] buffer = new byte[8192];
            int read;
            while ((read = in.read(buffer)) != -1) {
                bytes.write(buffer, 0, read);
            }
            return bytes.toString("GBK");
        } catch (Exception e) {
            // Deliberately broad: crawled URLs are untrusted input and the
            // crawl is best-effort, so any failure degrades to "nothing".
            System.out.println("error>>>>");
            e.printStackTrace();
            return NO_CONTENT;
        }
    }

    /**
     * Prints every crawlable link found in {@code content} and follows each one
     * while the depth budget lasts. Each URL is printed and followed at most
     * once per call, so duplicate and cyclic links do not cause repeated work.
     *
     * @param content HTML text to scan; {@code null} is treated as empty
     * @param depth   remaining budget; a value of 1 or less does nothing,
     *                2 scans {@code content} only, 3 also scans the pages it
     *                links to, and so on
     * @throws Exception declared for compatibility with existing callers
     */
    public void doit(String content, int depth) throws Exception {
        crawl(content, depth, new HashSet<String>());
    }

    /** Depth-first worker behind {@link #doit(String, int)}; {@code visited} is shared by the whole crawl. */
    private void crawl(String content, int depth, Set<String> visited) {
        depth--;
        if (depth < 1 || content == null) {
            return;
        }
        for (String url : extractLinks(content)) {
            if (!visited.add(url)) {
                continue; // already reported earlier in this crawl
            }
            System.out.println(url);
            // The next level returns immediately once its budget drops below 1,
            // so only download pages that will actually be scanned.
            if (depth > 1) {
                crawl(myGetHttpFile2(url), depth, visited);
            }
        }
    }

    /** Returns the quoted {@code href} values in {@code content} that pass {@link #isCrawlable(String)}, in document order. */
    private static List<String> extractLinks(String content) {
        List<String> links = new ArrayList<String>();
        int from = 0;
        while (true) {
            int attr = content.indexOf(HREF, from);
            if (attr < 0) {
                break;
            }
            from = attr + 1;
            int quoteAt = attr + HREF.length();
            if (quoteAt >= content.length()) {
                break; // "href=" is the last thing in the text
            }
            char quote = content.charAt(quoteAt);
            if (quote != '"' && quote != '\'') {
                continue; // unquoted value: no reliable end marker
            }
            int valueEnd = content.indexOf(quote, quoteAt + 1);
            if (valueEnd < 0) {
                continue; // unterminated value: skip instead of failing
            }
            String url = content.substring(quoteAt + 1, valueEnd);
            if (isCrawlable(url)) {
                links.add(url);
            }
            from = valueEnd + 1;
        }
        return links;
    }

    /** Accepts targets that mention {@code http} and are not stylesheets, icons or executables. */
    private static boolean isCrawlable(String url) {
        return url.contains("http")
                && !url.contains(".css")
                && !url.contains(".ico")
                && !url.contains(".exe");
    }

    /**
     * Crawls a fixed start page with a depth budget of 3.
     *
     * @param arg ignored
     */
    public static void main(String[] arg) {
        SearchCrawler search = new SearchCrawler();
        try {
            search.doit(search.myGetHttpFile2("http://www.2345.com/"), 3);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
 




//结束.....



你可能感兴趣的:(url)