A web crawler over the HTTP protocol

My supervisor asked me to build a feature that scrapes Taobao data. Taobao is the hard case, though, so I'm starting with something simpler: scraping Sina News.

Environment: Apache offers the HttpClient source and JAR packages as free downloads from the Apache HttpComponents site; I'm using version 4.5.1 (Maven artifact org.apache.httpcomponents:httpclient).

Working from the examples Apache ships, I used regular expressions to put together the following program: one pattern collects the links on the hot-news page, and two more pull the title and paragraphs out of each article.


```java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class Main {

    // Fetch one article page and print its title and body paragraphs.
    public static void Detail(String url) throws Exception {
        CloseableHttpClient httpclient = HttpClients.createDefault();
        String oldStr;
        try {
            HttpGet httpget = new HttpGet(url);
            // Sina serves article pages as GBK but comment pages as UTF-8.
            String encoding = "gbk";
            if (url.contains("comments")) {
                encoding = "utf-8";
            }
            System.out.println(encoding);
            System.out.println("Executing request " + httpget.getURI());
            CloseableHttpResponse response = httpclient.execute(httpget);
            try {
                System.out.println("----------------------------------------");
                System.out.println(response.getStatusLine());
                HttpEntity entity = response.getEntity();
                oldStr = EntityUtils.toString(entity, encoding);
                // Call abort on the request object
                httpget.abort();
            } finally {
                response.close();
            }
        } finally {
            httpclient.close();
        }

        // The page title: substring(7, length - 8) strips the surrounding
        // <title> and </title> tags from the match.
        Pattern pattern = Pattern.compile("<title>[^<]*</title>");
        Matcher matcher = pattern.matcher(oldStr);
        if (matcher.find()) {
            String str = matcher.group();
            str = str.substring(7, str.length() - 8);
            System.out.println("---" + str);
        }

        // The body paragraphs: substring(3, length - 4) strips the
        // surrounding <p> and </p> tags from each match.
        pattern = Pattern.compile("<p>[^<]*</p>");
        matcher = pattern.matcher(oldStr);
        while (matcher.find()) {
            String str = matcher.group();
            str = str.substring(3, str.length() - 4);
            System.out.println(str);
        }
    }

    public static void main(String[] args) throws Exception {
        CloseableHttpClient httpclient = HttpClients.createDefault();
        String oldStr;
        try {
            String str = "http://news.sina.com.cn/hotnews/";
            HttpGet httpget = new HttpGet(str);
            System.out.println("Executing request " + httpget.getURI());
            CloseableHttpResponse response = httpclient.execute(httpget);
            try {
                System.out.println("----------------------------------------");
                System.out.println(response.getStatusLine());
                HttpEntity entity = response.getEntity();
                oldStr = EntityUtils.toString(entity, "UTF-8");
                // Call abort on the request object
                httpget.abort();
            } finally {
                response.close();
            }
        } finally {
            httpclient.close();
        }

        // Every single-quoted absolute link on the hot-news page;
        // substring(6, length - 1) strips the leading href=' and trailing '.
        Pattern pattern = Pattern.compile("href='http://[^']*'");
        Matcher matcher = pattern.matcher(oldStr);
        int i = 1;
        while (matcher.find()) {
            String str = matcher.group();
            str = str.substring(6, str.length() - 1);
            System.out.println(str);
            Detail(str);
            System.out.println(i++);
        }
    }
}
```

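One weakness in Detail is that the encoding is guessed from the URL: gbk for article pages, utf-8 for comment pages. HttpClient can instead read the charset straight out of the Content-Type response header via ContentType.getOrDefault. The sketch below is my own variation, not part of the program above; the fetch helper and its fallbackEncoding parameter are names invented for illustration.

```java
import java.nio.charset.Charset;

import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.entity.ContentType;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

public class CharsetAwareFetch {

    // Hypothetical helper: prefer the charset declared in the
    // Content-Type header, falling back to a caller-supplied guess.
    static String fetch(String url, String fallbackEncoding) throws Exception {
        try (CloseableHttpClient httpclient = HttpClients.createDefault();
             CloseableHttpResponse response = httpclient.execute(new HttpGet(url))) {
            HttpEntity entity = response.getEntity();
            // getCharset() is null when the header has no charset parameter.
            Charset declared = ContentType.getOrDefault(entity).getCharset();
            String encoding = (declared != null) ? declared.name() : fallbackEncoding;
            return EntityUtils.toString(entity, encoding);
        }
    }

    public static void main(String[] args) throws Exception {
        // With a "gbk" fallback this behaves like Detail on Sina's article
        // pages, but follows the server's declaration when one is present.
        System.out.println(fetch("http://news.sina.com.cn/hotnews/", "gbk"));
    }
}
```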
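Another fragile spot is the link pattern: href='http://[^']*' only matches single-quoted attributes, so any link written with double quotes is silently skipped. The self-contained sketch below (my own, with a made-up sample string) accepts either quote style and uses a capturing group, so the URL comes out of group(1) with no substring arithmetic.

```java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class LinkPatternDemo {
    public static void main(String[] args) {
        // Accept either quote style; group(1) captures just the URL.
        Pattern link = Pattern.compile("href=[\"'](http://[^\"']*)[\"']");
        String sample = "<a href='http://news.sina.com.cn/a'>A</a>"
                + " <a href=\"http://news.sina.com.cn/b\">B</a>";
        Matcher m = link.matcher(sample);
        while (m.find()) {
            System.out.println(m.group(1));
        }
    }
}
```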