Java 爬虫爬取酷我(kuwo.cn)歌手数据

记录一下,防止忘记。

包:

jsoup-1.4.1 html解析

httpcore-4.0.1_1

httpclient-4.0.1

代码:

已经访问的url队列

/**
 * Set of URLs that have already been visited.
 *
 * <p>All accessor methods are {@code static synchronized}, so they lock on
 * the class object. The backing set is a public field; code that touches it
 * directly bypasses that locking.
 */
public class VisitedUrlQueue {
    /** Backing set of visited URLs; a set, so adding a URL twice is a no-op. */
    public static HashSet<String> visitedUrlQueue = new HashSet<String>();

    /**
     * Records a URL as visited.
     *
     * @param url the URL to record
     */
    public synchronized static void addElem(String url) {
        visitedUrlQueue.add(url);
    }

    /**
     * Tells whether a URL has been recorded as visited.
     *
     * @param url the URL to look up
     * @return {@code true} if the URL is in the visited set
     */
    public synchronized static boolean isContains(String url) {
        return visitedUrlQueue.contains(url);
    }

    /**
     * @return the number of distinct URLs recorded so far
     */
    public synchronized static int size() {
        return visitedUrlQueue.size();
    }
}
未访问的队列

/**
 * FIFO queue of URLs that have not been visited yet.
 *
 * <p>Every accessor is {@code static synchronized}, so all of them lock on
 * the class object. The backing list is a public field; code that touches it
 * directly bypasses that locking.
 */
public class UrlQueue {
    /** Pending URLs, in insertion order. */
    public static LinkedList<String> urlQueue = new LinkedList<String>();

    /**
     * Intended maximum number of queued links. This class does not enforce
     * the limit itself; callers must check it if they want a bound.
     */
    public static final int MAX_SIZE = 10000;

    /**
     * Appends a URL to the tail of the queue.
     *
     * @param url the URL to enqueue
     */
    public synchronized static void addElem(String url) {
        urlQueue.add(url);
    }

    /**
     * Removes and returns the URL at the head of the queue.
     *
     * @return the oldest queued URL
     * @throws java.util.NoSuchElementException if the queue is empty
     */
    public synchronized static String outElem() {
        // The element type must be String: with a raw LinkedList this call
        // returns Object and the method does not compile.
        return urlQueue.removeFirst();
    }

    /**
     * @return {@code true} if no URL is queued
     */
    public synchronized static boolean isEmpty() {
        return urlQueue.isEmpty();
    }

    /**
     * @return the number of URLs currently queued
     */
    public synchronized static int size() {
        return urlQueue.size();
    }

    /**
     * Tells whether a URL is currently waiting in the queue.
     *
     * @param url the URL to look up
     * @return {@code true} if the URL is queued
     */
    public synchronized static boolean isContains(String url) {
        return urlQueue.contains(url);
    }
}
通过url得到页面html代码

/**
 * Downloads the body of a page over HTTP.
 */
public class DownloadPage {

    /**
     * Issues a GET request for {@code url} and returns the response body as text.
     *
     * <p>The URL is added to {@link VisitedUrlQueue} once the request has
     * returned a response; if the request itself throws, the URL is not
     * recorded as visited.
     *
     * @param url the address to fetch
     * @return the response body, or {@code null} if the response carried no
     *         entity or an I/O error occurred (the error is printed, not rethrown)
     * @throws Exception declared for callers; I/O failures are handled internally
     */
    public static String getContentFormUrl(String url) throws Exception {
        HttpClient client = new DefaultHttpClient();
        HttpGet getHttp = new HttpGet(url);

        String content = null;

        try {
            /* Execute the request and obtain the response carrier. */
            HttpResponse response = client.execute(getHttp);
            HttpEntity entity = response.getEntity();

            // Mark the URL as visited now that a response came back.
            VisitedUrlQueue.addElem(url);

            if (entity != null) {
                /*
                 * Convert the body to text. UTF-8 is only the fallback used
                 * when the response does not declare a charset; without it
                 * the library falls back to ISO-8859-1 and non-Latin pages
                 * come out garbled.
                 */
                content = EntityUtils.toString(entity, "UTF-8");
            }

        } catch (IOException e) {
            // ClientProtocolException is an IOException, so this covers both
            // failure kinds; best-effort: report and return null.
            e.printStackTrace();
        } finally {
            // Always release the connections held by this one-shot client.
            client.getConnectionManager().shutdown();
        }

        return content;
    }

}

页面解析

/**
 * Extracts links and singer data from downloaded page HTML.
 */
public class ParseOfPage {

    /**
     * Collects album links from the anchors in {@code content} and enqueues
     * the ones not already known.
     *
     * <p>Hrefs starting with {@code /album} are made absolute against
     * {@code http://www.kuwo.cn}. Only links starting with
     * {@code http://www.kuwo.cn/album} are kept. Spaces are replaced with
     * {@code %20} before the duplicate check, so the value that is compared
     * is the same value that gets stored in {@link UrlQueue}.
     *
     * @param content HTML source of a page
     * @throws Exception declared for callers
     */
    public static void getHrefOfContent(String content) throws Exception {
        Document doc = Jsoup.parse(content);
        for (Element anchor : doc.getElementsByTag("a")) {
            String linkHref = anchor.attr("href");
            if (linkHref.startsWith("/album")) {
                // Site-relative album link: prepend the host.
                linkHref = "http://www.kuwo.cn" + linkHref;
            }
            if (!linkHref.startsWith("http://www.kuwo.cn/album")) {
                continue; // not an album link
            }

            // Normalise first, then test for duplicates. Checking the raw
            // href while storing the encoded one lets a link containing
            // spaces be enqueued again every time it is seen.
            String urlNew = linkHref.replace(" ", "%20");
            if (!UrlQueue.isContains(urlNew)
                    && !VisitedUrlQueue.isContains(urlNew)) {
                UrlQueue.addElem(urlNew);
            }
        }
    }

    /**
     * Custom parsing of a singer page: reads the {@code data-src} attribute
     * of the second child node of each {@code artistTop} element into a
     * {@code SingerPo} as its photo URL.
     *
     * <p>NOTE(review): the populated {@code SingerPo} is neither returned nor
     * stored, so the parsed value is currently discarded; the method looks
     * unfinished.
     *
     * @param content HTML source of a singer page
     * @throws Exception declared for callers
     */
    public static void getDataOfContentForSinger(String content) throws Exception {
        SingerPo po = new SingerPo();

        Document doc = Jsoup.parse(content);
        for (Element top : doc.getElementsByClass("artistTop")) {
            // Skip elements without a second child node instead of failing
            // with an index error on unexpected markup.
            if (top.childNodes().size() > 1) {
                po.setPhotourl(top.childNode(1).attr("data-src")); // photo URL
            }
        }
    }
}




你可能感兴趣的:(javaEE)