java抓取古诗文的单线程爬虫

准备知识

1.HTML, CSS, HTML DOM树

参考http://www.w3school.com.cn/htmldom/

2.Jsoup的使用,使用DOM方法遍历一个document对象,使用选择器语法来选择一个元素,从元素中抽取数据。

参考http://www.open-open.com/jsoup/example-list-links.htm

3.java正则表达式及其语法

参考http://www.cnblogs.com/chuiyuan/p/5187359.html

 

我们先来做一个单线程的爬虫。

整体步骤:

1.定义爬取内容的对象Poem结构。

2.完成从网上爬取Document对象的模块HttpService功能。

3.从Document对象中解析出所有唐诗的href,并保存到List<Poem>中。

4.从3中得到的href再爬取出每首古诗的内容。

 

下面贴一下代码实现。

1.Poem对象只列出其属性

java抓取古诗文的单线程爬虫_第1张图片

2.抓取Document对象模块HttpService

  首先定义一个Rule,封装所有的请求,不管是get还是post。

java抓取古诗文的单线程爬虫_第2张图片

在HttpService中,使用Rule来抓取document对象。

/**
 * Created by chuiyuan on 2/11/16.
 *
 * Downloads the page described by a {@link Rule} with Jsoup and returns it
 * as a parsed {@link Document}.
 */
public class HttpService {
    /** Timeout, in milliseconds, passed to Jsoup for every request. */
    private static final int TIMEOUT_MILLIS = 10000;

    /** User-Agent header sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0 Iceweasel/38.5.0";

    /**
     * Fetches the document addressed by {@code rule}.
     *
     * @param rule request description: url, optional form params/values and
     *             the request method ({@code Rule.GET} or {@code Rule.POST})
     * @return the parsed document, or {@code null} when the request fails with
     *         an {@link IOException} or the request method is not supported
     * @throws RuleException if {@code rule} does not pass {@link #validateRule(Rule)}
     */
    public  Document extrace(Rule rule){
        validateRule(rule);

        String url = rule.getUrl();
        String[] params = rule.getParams();
        String[] values = rule.getValues();
        int requestMethod = rule.getRequestMethod();

        Document doc = null;
        try {
            Connection conn = Jsoup.connect(url)
                    .userAgent(USER_AGENT)
                    .timeout(TIMEOUT_MILLIS);
            if (params != null) {
                // validateRule guarantees values is non-null and of equal length here
                for (int i = 0; i < params.length; i++) {
                    conn.data(params[i], values[i]);
                }
            }
            switch (requestMethod) {
                case Rule.GET:
                    doc = conn.get();
                    break;
                case Rule.POST:
                    doc = conn.post();
                    break;
                default:
                    // Callers already treat null as "nothing fetched"; say why.
                    System.out.println("Unsupported request method: " + requestMethod);
                    break;
            }
        } catch (IOException e) {
            // Keep the best-effort contract (return null) but report the cause,
            // since an IOException is not necessarily a missing network.
            System.out.println("No network: " + e);
        }
        return doc;
    }

    /**
     * Validates the parts of a rule that {@link #extrace(Rule)} relies on.
     * This is not a complete validation of every rule field.
     *
     * @param rule the rule to check
     * @throws RuleException if the rule is null, the url is empty or is not an
     *         http/https url, or params are given without a matching values array
     */
    private static void validateRule(Rule rule){
        if (rule == null) {
            throw new RuleException("rule can't be null");
        }
        String url = rule.getUrl();
        if (url == null || url.length() == 0) {
            throw new RuleException("url can't be null");
        }
        if (!url.startsWith("http://") && !url.startsWith("https://")) {
            throw new RuleException("url not in correct format");
        }
        String[] params = rule.getParams();
        if (params != null) {
            String[] values = rule.getValues();
            // A missing values array would otherwise surface as a
            // NullPointerException while the request data is being filled in.
            if (values == null || params.length != values.length) {
                throw new RuleException("params length!= values length");
            }
        }
    }
}

 可以看到,我们先对rule进行了验证,rule中的resultTagName并没有用到,可以去掉。

3.从Document中解析出所有唐诗的href

4.从每首诗文的href中提取详细的内容

/**
 * Parses pages fetched through {@link HttpService} into {@link Poem} objects:
 * first the index page (category, title and link of every poem), then the
 * detail page of a single poem (dynasty, author and content).
 */
public class ProcessDoc {

    /** Matches an ASCII or a full-width (Chinese) colon. */
    private static final String COLON_REGEX = "[:：]";

    /**
     * Collects category, title and detail link of every poem on the index page.
     *
     * @param url         address of the index page
     * @param httpService service used to download the page
     * @return the poems found; an empty list (never {@code null}) when the page
     *         could not be fetched or contains no poems
     */
    public List<Poem> processGuShiWen(String url, HttpService httpService ){
        List<Poem> poemList = new ArrayList<Poem>();

        Rule rule = new Rule(url,
                null, null,
                null,
                -1,
                Rule.GET);
        Document doc = httpService.extrace(rule);
        if (doc == null) {
            System.out.println("doc null");
            // An empty list keeps callers that iterate the result from crashing.
            return poemList;
        }

        // Left panel; each div.son2 inside it holds one category of poems.
        Elements categoryEles = doc.select("div.leftlei").select("div.son2");
        for (Element categoryEle : categoryEles) {
            Elements spans = categoryEle.select("span");
            if (spans.isEmpty()) {
                continue;
            }
            // The first span is the category label with a trailing colon.
            String category = dropLastChar(spans.get(0).text());
            for (int i = 1; i < spans.size(); i++) {
                Elements link = spans.get(i).select("a");
                String href = link.attr("abs:href");
                if (href.length() == 0) {
                    // A span without a link would yield the href "/", which
                    // the url validation in HttpService rejects later on.
                    continue;
                }
                Poem poem = new Poem();
                poem.setCategory(category);
                poem.setHref(href + "/");
                poem.setTitle(link.text());
                poemList.add(poem);
            }
        }
        return poemList;
    }

    /**
     * Downloads the detail page of {@code poem} and fills in dynasty, author
     * and content. A field whose markup is missing on the page is left
     * untouched instead of aborting with an index error.
     *
     * @param poem        poem whose href points at the detail page
     * @param httpService service used to download the page
     */
    public void processDetails(Poem poem, HttpService httpService ){
        Rule rule = new Rule(poem.getHref(),
                null, null,
                null,
                -1,
                Rule.GET);
        Document doc = httpService.extrace(rule);
        if (doc == null) {
            System.out.println("doc=null");
            return;
        }

        Elements mainEles = doc.select("div.son2");
        Elements paragraphs = mainEles.select("p");

        // The first paragraph is read as "<label>:<dynasty>"; the colon may be
        // the full-width one, so both forms are accepted.
        if (paragraphs.size() > 0) {
            String dynasty = secondPart(paragraphs.get(0).text(), COLON_REGEX);
            if (dynasty != null) {
                poem.setDynasty(dynasty);
            }
        }

        // The second paragraph carries the author as a link; not every poem has one.
        if (paragraphs.size() > 1) {
            poem.setAuthor(paragraphs.get(1).getElementsByTag("a").text());
        }

        // The poem text follows the "原文:" marker; the translation is not parsed.
        String content = secondPart(mainEles.text(), "原文" + COLON_REGEX);
        if (content != null) {
            poem.setContent(content);
        }
    }

    /** Returns {@code text} without its last character, or "" when it is empty. */
    private static String dropLastChar(String text) {
        return text.length() == 0 ? "" : text.substring(0, text.length() - 1);
    }

    /**
     * Splits {@code text} on {@code regex} and returns the second piece, or
     * {@code null} when the split yields fewer than two pieces.
     */
    private static String secondPart(String text, String regex) {
        String[] parts = text.split(regex);
        return parts.length > 1 ? parts[1] : null;
    }
}

整体调用如下

/**
     * Single-thread model: fetches the Tang poem index page, fetches the
     * details of every poem one after another, then saves all poems through
     * {@link PoemDao}.
     */
    public void getGuShiWenSingleThread(){
        String url ="http://so.gushiwen.org/gushi/tangshi.aspx/";

        HttpService httpService = new HttpService();
        ProcessDoc processDoc= new ProcessDoc();

        List<Poem> poemList = processDoc.processGuShiWen(url, httpService);
        // Nothing to fetch or store when the index page could not be read.
        if (poemList == null || poemList.isEmpty()){
            System.out.println("no poems found at " + url);
            return;
        }

        //get poem content details
        for (Poem poem : poemList){
            processDoc.processDetails(poem,httpService);
            System.out.println(poem.toString());
        }

        //store to mysql; the variable is typed as the PoemDao interface
        PoemDao poemDao = new PoemDaoImpl() ;
        for (Poem poem: poemList){
            try {
                poemDao.add(poem);
            }catch (SQLException e){
                // One failed insert must not stop the rest; report which poem it was.
                System.err.println("failed to store poem: " + poem);
                e.printStackTrace();
            }
        }
    }

 最后结果保存到了MySQL中,数据库部分将在下一篇文章中讲解。

 

 

你可能感兴趣的:(java抓取古诗文的单线程爬虫)