// Java crawler example: fetch a web page and extract the URLs it links to.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class WebSpider {

    /**
     * Downloads the page at {@code urlStr} and returns its source as a single
     * string. Lines are concatenated without separators, matching the original
     * line-by-line read (so matches spanning a line break are not found).
     *
     * @param urlStr      the URL to fetch
     * @param charsetName name of the charset used to decode the response body
     * @return the page source, or whatever was read so far (possibly empty)
     *         if an I/O error occurred
     */
    public static String getURLContext(String urlStr, String charsetName) {
        // StringBuilder: no synchronization needed for a local accumulator.
        StringBuilder sb = new StringBuilder();
        try {
            URL url = new URL(urlStr);
            // try-with-resources guarantees the reader (and the underlying
            // stream) is closed — fixes the resource leak in the original.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(url.openStream(), Charset.forName(charsetName)))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    sb.append(line);
                }
            }
        } catch (IOException e) {
            // MalformedURLException is a subtype of IOException, so a single
            // catch covers both former branches. Best-effort crawler: report
            // the error and return what was read so far.
            e.printStackTrace();
        }
        return sb.toString();
    }

    /**
     * Returns every substring of {@code destStr} matched by {@code regexStr},
     * in order of occurrence.
     *
     * @param destStr  the text to search
     * @param regexStr the regular expression to apply
     * @return all full-match substrings; an empty list if there are none
     */
    public static List<String> getSubStrs(String destStr, String regexStr) {
        Pattern p = Pattern.compile(regexStr);
        Matcher m = p.matcher(destStr);
        List<String> result = new ArrayList<>();
        while (m.find()) {
            result.add(m.group());
        }
        return result;
    }

    public static void main(String[] args) {
        String urlStr = "https://daohang.qq.com/?fr=hmpage";

        String destStr = getURLContext(urlStr, "utf-8");
        // Matches href="..." attributes; the character class limits targets to
        // word chars, whitespace, '.', '/' and ':'.
        String regexStr = "href=\"([\\w\\s./:]+?)\"";
        List<String> result = getSubStrs(destStr, regexStr);
        for (String temp : result) {
            System.out.println(temp);
        }
    }
}

// You may also be interested in: (java)