java 一个简单的爬虫项目详解

我们在不久的将来就会相遇的,在那之前,我要成为一名优秀的程序员

爬虫对我们来说是一个让人又爱又恨的家伙。我记得大学时期,有个朋友学会了Python爬虫之后,整天去爬各种网站,不亦乐乎。在工程中,经常需要做爬虫相关的业务。爬虫一般是和多线程挂钩的,今天先详细介绍一个简单版,后续再增加一个线程池版本。

实现的功能:从文件中逐行读取url列表,抓取每个页面,去掉html中的标签和javascript代码之后,把每个页面的结果分别存储到各自的文件中(按行号命名,例如0.txt、1.txt)。

首先,创建一个maven工程,然后引入依赖

<dependency>
            <groupId>net.sourceforge.htmlunit</groupId>
            <artifactId>htmlunit</artifactId>
            <version>2.32</version>
</dependency>

这个包是专门用于爬取页面的。

// Entity that owns a simulated browser and fetches one web page.
public class WebEntity implements AutoCloseable {
    /** Number of times {@link #executeReq()} tries the request before giving up. */
    private static final int MAX_ATTEMPTS = 5;

    /** User-Agent header sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0";

    private WebClient       webclient;      // simulated browser object
    private String          url;            // request URL

    /**
     * Creates an entity for the given URL with its own web client.
     * The client is configured with JavaScript and CSS processing disabled
     * and insecure SSL turned off.
     *
     * @param url the address that {@link #executeReq()} will request
     */
    public WebEntity(String url) {
        // Distinct local name so the field is not shadowed while configuring.
        WebClient client = new WebClient(BrowserVersion.CHROME);
        client.getOptions().setJavaScriptEnabled(false);
        client.getOptions().setCssEnabled(false);
        client.getOptions().setUseInsecureSSL(false);
        client.addRequestHeader("User-Agent", USER_AGENT);
        this.webclient = client;
        this.url = url;
    }

    /**
     * Requests the page, retrying on {@link IOException} up to
     * {@value #MAX_ATTEMPTS} times.
     *
     * <p>Exceptions other than {@code IOException} are not caught here and
     * propagate to the caller.
     *
     * @return the fetched page, or {@code null} when every attempt failed
     */
    public HtmlPage executeReq() {
        IOException lastFailure = null;
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            try {
                return webclient.getPage(url);
            } catch (IOException e) {
                // Remember the cause so the final failure is not silent.
                lastFailure = e;
            }
        }
        System.err.println(String.format(
                "failed to fetch %s after %d attempts: %s", url, MAX_ATTEMPTS, lastFailure));
        return null;
    }

    /**
     * Closes the underlying web client, if there is one.
     * Callers that are done with this entity should invoke this (or use
     * try-with-resources) so the client does not stay open.
     */
    @Override
    public void close() {
        if (webclient != null) {
            webclient.close();
        }
    }

    // Human-readable description used in log output.
    @Override
    public String toString() {
        return String.format("url = %s", url);
    }

    // Accessors.
    public WebClient getWebclient() {
        return webclient;
    }

    public void setWebclient(WebClient webclient) {
        this.webclient = webclient;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }
}
// Fetches pages on behalf of callers; a single shared instance is used.
public class Craw {
    private static volatile Craw singleton;

    private Craw() {
    }

    /**
     * Returns the shared instance, creating it on first use.
     * The field is re-checked inside the class-level lock so that only one
     * instance is ever constructed.
     *
     * @return the single {@code Craw} instance
     */
    public static Craw getInstance() {
        Craw instance = singleton;
        if (instance == null) {
            synchronized (Craw.class) {
                instance = singleton;
                if (instance == null) {
                    instance = new Craw();
                    singleton = instance;
                }
            }
        }
        return instance;
    }

    /**
     * Executes the request held by {@code webEntity}.
     *
     * @param webEntity the entity describing the request
     * @return the fetched page, or {@code null} if the request returned
     *         {@code null} or threw any exception
     */
    public HtmlPage parsePage(WebEntity webEntity){
        try {
            return webEntity.executeReq();
        } catch (Exception e) {
            // Report which request failed and why instead of a bare message.
            System.err.println("获取页面失败: " + webEntity + " (" + e + ")");
            return null;
        }
    }

}

// Main class: reads a URL list from a file and writes one cleaned text file per page.
public class Pachong {

    /** Input file used when no command-line argument is given. */
    private static final String DEFAULT_INPUT_FILE = "d://test/in.txt";

    /** Output directory used when no second command-line argument is given. */
    private static final String DEFAULT_OUTPUT_DIR = "d://test/";

    /**
     * Writes the text of {@code page}, with markup removed by
     * {@code LabelUtil.handleHtmlLabel}, to {@code file} as UTF-8.
     *
     * <p>When {@code file} is {@code null} nothing happens. When {@code page}
     * is {@code null} the file is still opened (and therefore created or
     * truncated) but nothing is written. I/O errors are printed, not thrown.
     *
     * @param file destination file, may be {@code null}
     * @param page page whose content is written, may be {@code null}
     */
    static void outfile(File file, HtmlPage page){
        if (file == null) {
            return;
        }
        // try-with-resources closes (and flushes) the stream even when writing fails.
        try (BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(file))) {
            if (page != null) {
                String text = LabelUtil.handleHtmlLabel(page.asXml());
                // Explicit charset: the platform default differs between machines.
                out.write(text.getBytes(java.nio.charset.StandardCharsets.UTF_8));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Fetches one URL and stores the cleaned page text in {@code target}.
     * The web client created for the request is closed afterwards.
     */
    private static void crawl(String url, File target) {
        WebEntity webEntity = new WebEntity(url);
        try {
            HtmlPage page = Craw.getInstance().parsePage(webEntity);
            if (page != null) {
                outfile(target, page);
            }
        } finally {
            // Each WebEntity creates its own client; close it once the page is written.
            if (webEntity.getWebclient() != null) {
                webEntity.getWebclient().close();
            }
        }
    }

    /**
     * Reads URLs line by line and crawls each one. The output file for the
     * URL on zero-based input line {@code i} is {@code i.txt}; blank lines
     * are skipped but still consume an index.
     *
     * @param args optional: {@code args[0]} input file, {@code args[1]}
     *             output directory; defaults are used when absent
     */
    public static void main(String args[]){
        String inputFile = args.length > 0 ? args[0] : DEFAULT_INPUT_FILE;
        File outputDir = new File(args.length > 1 ? args[1] : DEFAULT_OUTPUT_DIR);
        if (!outputDir.isDirectory() && !outputDir.mkdirs()) {
            System.err.println("cannot create output directory: " + outputDir);
            return;
        }

        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                new FileInputStream(new File(inputFile)),
                java.nio.charset.StandardCharsets.UTF_8))) {
            String line;
            int lineIndex = 0;
            while ((line = reader.readLine()) != null) {
                // Every input line consumes an index, matching the original numbering.
                int index = lineIndex++;
                String url = line.trim();
                if (url.isEmpty()) {
                    continue;
                }
                crawl(url, new File(outputDir, index + ".txt"));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

// Strips HTML tags and JavaScript code from page markup.
public class LabelUtil {

    /** Decimal ({@code &#65;}) or hexadecimal ({@code &#x41;}) character reference. */
    private static final Pattern NUMERIC_ENTITY =
            Pattern.compile("&#(?:[xX]([0-9a-fA-F]+)|([0-9]+));");

    /**
     * A script element including its content. An unterminated script runs to
     * the end of the input, so trailing script code never leaks into the text.
     */
    private static final Pattern SCRIPT_BLOCK = Pattern.compile(
            "<script\\b[^>]*>.*?(?:</script\\s*>|\\z)",
            Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** An HTML comment; matched as a unit because it may contain a '>' character. */
    private static final Pattern COMMENT = Pattern.compile("<!--.*?-->", Pattern.DOTALL);

    /** Any remaining tag, possibly spanning several lines. */
    private static final Pattern TAG = Pattern.compile("<[^>]*>");

    /** Named entities that are dropped instead of decoded. */
    private static final Pattern LEFTOVER_ENTITY = Pattern.compile("&nbsp;|&[gl]t;?");

    /** Whitespace, including the no-break space (U+00A0). */
    private static final Pattern WHITESPACE = Pattern.compile("[\\s\\u00A0]+");

    /**
     * Reduces HTML markup to its bare text.
     *
     * <p>Steps, in order:
     * <ol>
     *   <li>{@code &amp;} is turned into {@code &};</li>
     *   <li>decimal and hexadecimal character references are decoded;</li>
     *   <li>script elements are removed together with their content;</li>
     *   <li>comments and all remaining tags are removed;</li>
     *   <li>{@code &nbsp;}, {@code &lt;} and {@code &gt;} are removed;</li>
     *   <li>all whitespace is removed.</li>
     * </ol>
     *
     * <p>Entities are decoded before tags are stripped, so an encoded tag
     * such as {@code &#60;b&#62;} is stripped as well. Whitespace between
     * words is removed too, not only the whitespace around the text.
     *
     * @param html markup to clean; {@code null} is treated as empty
     * @return the text content without markup and whitespace, never {@code null}
     */
    public static String handleHtmlLabel(String html){
        if (html == null) {
            return "";
        }
        String text = html.replace("&amp;", "&");
        text = decodeNumericEntities(text);
        text = SCRIPT_BLOCK.matcher(text).replaceAll("");
        text = COMMENT.matcher(text).replaceAll("");
        text = TAG.matcher(text).replaceAll("");
        text = LEFTOVER_ENTITY.matcher(text).replaceAll("");
        text = WHITESPACE.matcher(text).replaceAll("");
        return text.trim();
    }

    /**
     * Replaces every numeric character reference with the character it names.
     * References that are not valid code points, or whose number does not fit
     * in an {@code int}, are left untouched.
     */
    private static String decodeNumericEntities(String text) {
        Matcher matcher = NUMERIC_ENTITY.matcher(text);
        StringBuffer decoded = new StringBuffer(text.length());
        while (matcher.find()) {
            String replacement = matcher.group();
            String hexDigits = matcher.group(1);
            try {
                int codePoint = hexDigits != null
                        ? Integer.parseInt(hexDigits, 16)
                        : Integer.parseInt(matcher.group(2));
                if (Character.isValidCodePoint(codePoint)) {
                    // toChars yields a surrogate pair for code points above the BMP.
                    replacement = new String(Character.toChars(codePoint));
                }
            } catch (NumberFormatException ignored) {
                // Number too large for an int: keep the reference as written.
            }
            // Quote the replacement so a decoded '$' or '\' is taken literally.
            matcher.appendReplacement(decoded, Matcher.quoteReplacement(replacement));
        }
        matcher.appendTail(decoded);
        return decoded.toString();
    }
}

你可能感兴趣的:(java)