Writing a Simple Java Crawler with Gecco

  • Introduction
      Gecco is a lightweight Java crawler framework written by a Chinese developer; see the official site: Gecco official website.
      I have written a few simple crawlers with Python frameworks before, but since I don't use Python regularly, it never felt natural. By chance I came across Gecco, a Java crawler framework, gave it a try, and found it excellent: it is very easy to use and well suited to anyone who doesn't know much about crawlers but wants to quickly write one to scrape some data.

  • Development environment
      (1) Environment:
       – JDK version: JDK 1.8.0
       – IDE: IntelliJ IDEA
       – Project type: Maven project

      (2) Initializing the project:
       step1: Create a Maven project and choose the quickstart archetype

       step2: Add the Gecco core dependency to the pom file

<dependency>
    <groupId>com.geccocrawler</groupId>
    <artifactId>gecco</artifactId>
    <version>1.1.0</version>
</dependency>
  • Writing a simple crawler with Gecco
      This example crawls the M site (missevan.com). The goal is to crawl both the site's list-page data and the corresponding detail-page data.
      (1) Create the main entry class NewsSpiderEntry
// In the @Gecco annotation, matchUrl is the URL pattern whose pages this class can parse,
// and pipelines names the processing pipelines: once a matching page has been downloaded,
// the pipeline classes process the data bound into this HtmlBean.
// {page} is a path variable whose value can be injected into a field via @RequestParameter.
@Gecco(matchUrl = "http://news.missevan.com/news/index?p={page}", 
                      pipelines = {"mainPipeline", "newsDetailPipe"})
public class NewsSpiderEntry implements HtmlBean {

    @Request
    private HttpRequest request;

    // cssPath uses jQuery-like selector syntax to pick the matching elements
    // out of the document downloaded from matchUrl.
    // NewsSummaryView (shown below) holds the parsed list-page data.
    @HtmlField(cssPath = ".newslist")
    private List<NewsSummaryView> newsSummaryViews;

    // @Text extracts the element's text (@Html extracts its HTML, @Attr an attribute, etc.)
    @Text
    @HtmlField(cssPath = ".selected > a")
    private String nextPage;


    public HttpRequest getRequest() {
        return request;
    }

    public void setRequest(HttpRequest request) {
        this.request = request;
    }

    public String getNextPage() {
        return nextPage;
    }

    public void setNextPage(String nextPage) {
        this.nextPage = nextPage;
    }

    public List<NewsSummaryView> getNewsSummaryViews() {
        return newsSummaryViews;
    }

    public void setNewsSummaryViews(List<NewsSummaryView> newsSummaryViews) {
        this.newsSummaryViews = newsSummaryViews;
    }

    public static void main(String[] args) {
        GeccoEngine.create()
                // Package that Gecco scans; mine is com.news.spider, adjust for your project.
                // Note: if the pipeline classes and HtmlBean classes live in different
                // packages, list all of them here, or the beans will not be found.
                .classpath("com.news.spider")
                // Start URL for the crawl
                .start("http://news.missevan.com/news/index?p=1")
                // Number of crawler threads
                .thread(1)
                // Interval (ms) a crawler thread waits after completing each request
                .interval(500).start();
    }
}
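      Incidentally, the {page} variable in matchUrl is never actually read in this class. If you need it (for logging progress, for example), it can be injected the same way {newsid} is in NewsDetailEntry below. A minimal sketch of the extra field (my addition, not part of the original class):

    // Hypothetical addition to NewsSpiderEntry: bind the {page} path variable from matchUrl.
    @RequestParameter("page")
    private int page;

    public int getPage() {
        return page;
    }

    public void setPage(int page) {
        this.page = page;
    }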

  (2) Create the list-page bean NewsSummaryView (it must implement the HtmlBean interface for the Gecco framework to populate it).

public class NewsSummaryView implements HtmlBean {

    private int id;

    @Attr("href")
    @HtmlField(cssPath = ".newstitle > a")
    private String newsId;

    @Text
    @HtmlField(cssPath = ".newstitle > a")
    private String title;

    @Text
    @HtmlField(cssPath = ".newscontent > p")
    private String content;

    @Attr("data-original")
    @HtmlField(cssPath = ".newstag > a > img")
    private String imgUrl;

    // @Href(click = true) puts the extracted link straight into the crawl queue
    @Href(click = true)
    @HtmlField(cssPath = ".newstitle > a")
    private String detailUrl;

    public String getDetailUrl() {
        return detailUrl;
    }

    public void setDetailUrl(String detailUrl) {
        this.detailUrl = detailUrl;
    }

    public int getId() {
        return id;
    }

    public void setId(int id) {
        this.id = id;
    }

    public String getNewsId() {
        return newsId;
    }

    public void setNewsId(String newsId) {
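        // The href looks like "http://news.missevan.com/news/article?newsid=<id>";
        // indexOf("newsid") + 7 skips past "newsid=" and keeps only the numeric id.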
        this.newsId = newsId.substring(newsId.indexOf("newsid") + 7);
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    public String getImgUrl() {
        return imgUrl;
    }

    public void setImgUrl(String imgUrl) {
        this.imgUrl = imgUrl;
    }

    // Print the data for easier debugging
    @Override
    public String toString() {
        return "NewsSummaryView{" +
                "id=" + id +
                ", newsId=" + newsId +
                ", title='" + title + '\'' +
                ", content='" + content + '\'' +
                ", imgUrl='" + imgUrl + '\'' +
                ", detailUrl='" + detailUrl + '\'' +
                '}';
    }
}

  (3) Create the detail-page entry class NewsDetailEntry (so the links queued by @Href(click = true) can be parsed)

@Gecco(matchUrl = "http://news.missevan.com/news/article?newsid={newsid}", pipelines = "newsDetailPipe")
public class NewsDetailEntry implements HtmlBean {

    @HtmlField(cssPath = "#articlebox")
    private NewsDetail newsDetail;

    // Binds the {newsid} variable from the matchUrl pattern
    @RequestParameter("newsid")
    private int newsid;

    public int getNewsid() {
        return newsid;
    }

    public void setNewsid(int newsid) {
        this.newsid = newsid;
    }

    public NewsDetail getNewsDetail() {
        return newsDetail;
    }

    public void setNewsDetail(NewsDetail newsDetail) {
        this.newsDetail = newsDetail;
    }
}

  (4) Create the detail-page bean NewsDetail


public class NewsDetail implements HtmlBean {

    private int id;
    private String author;
    private String source;

    @Text
    @HtmlField(cssPath = "#articletitle")
    private String title;

    @Html
    @HtmlField(cssPath = "#articlecontent")
    private String content;

    @Text
    @HtmlField(cssPath = ".newsinfo2")
    private List<String> createdDate;

    private int commentCount;

    @Text
    @HtmlField(cssPath = ".newstags1 > a")
    private List<String> tags;

    public int getId() {
        return id;
    }

    public void setId(int id) {
        this.id = id;
    }

    public String getAuthor() {
        return author;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    public String getSource() {
        return source;
    }

    public void setSource(String source) {
        this.source = source;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        // Clean up the extracted data, filtering out unwanted content.
        // (The original regex was garbled in extraction; stripping newlines is assumed here.)
        content = content.replaceAll("\n", "");
        this.content = content;
    }

    public List<String> getCreatedDate() {
        return createdDate;
    }

    public void setCreatedDate(List<String> createdDate) {
        this.createdDate = createdDate;
    }

    public int getCommentCount() {
        return commentCount;
    }

    public void setCommentCount(int commentCount) {
        this.commentCount = commentCount;
    }

    public List<String> getTags() {
        return tags;
    }

    public void setTags(List<String> tags) {
        this.tags = tags;
    }

    @Override
    public String toString() {
        return "NewsDetail{" +
                "id=" + id +
                ", author='" + author + '\'' +
                ", source='" + source + '\'' +
                ", title='" + title + '\'' +
                ", content='" + content + '\'' +
                ", createdDate=" + createdDate +
                ", commentCount=" + commentCount +
                ", tags=" + tags +
                '}';
    }
}

  (5) Create the pipeline classes MainPipeline and NewsDetailPipe

// Note: the value in @PipelineName must match a name listed in the entry class's pipelines
@PipelineName("mainPipeline")
public class MainPipeline implements Pipeline<NewsSpiderEntry> {

    public void process(NewsSpiderEntry newsSpiderEntry) {
        HttpRequest request = newsSpiderEntry.getRequest();
        List<NewsSummaryView> newsSummaryViews = newsSpiderEntry.getNewsSummaryViews();
        for (NewsSummaryView newsSummaryView : newsSummaryViews) {
            System.out.println(newsSummaryView);
        }
        // Build the next page's URL
        int nextPage = Integer.parseInt(newsSpiderEntry.getNextPage()) + 1;
        // Put the next page's URL into the crawl queue
        String nextPageUrl = "http://news.missevan.com/news/index?p=" + nextPage;
        SchedulerContext.into(request.subRequest(nextPageUrl));
    }
    }
}
@PipelineName("newsDetailPipe")
public class NewsDetailPipe implements Pipeline<NewsDetailEntry> {

    public void process(NewsDetailEntry newsDetailEntry) {
        NewsDetail newsDetail = newsDetailEntry.getNewsDetail();      
        System.out.println(newsDetail);
    }
}
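      One caveat: on the last list page the ".selected > a" element's text may be missing or non-numeric, in which case Integer.parseInt in MainPipeline throws and the pipeline aborts. A defensive variant of the next-page scheduling (my suggestion, not in the original code):

        // Guarded version of the next-page logic: only schedule the next page
        // when the pager text is actually a number.
        String next = newsSpiderEntry.getNextPage();
        if (next != null && next.matches("\\d+")) {
            String nextPageUrl = "http://news.missevan.com/news/index?p=" + (Integer.parseInt(next) + 1);
            SchedulerContext.into(request.subRequest(nextPageUrl));
        }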
  • Results
      With just the few classes above, the M site's list-page and detail-page data can be crawled. Simple, isn't it?
      If you want to persist the crawled data, just add the persistence logic to the pipeline classes; a sketch follows below.
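      For instance, here is a minimal sketch of a persisting pipeline using plain JDBC. The pipeline name, table name, columns, and connection URL are all assumptions for illustration; to use it, add its name to the pipelines list in the @Gecco annotation on NewsDetailEntry.

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;

// Hypothetical persisting pipeline; name, schema, and JDBC URL are assumptions.
@PipelineName("newsDetailPersistPipe")
public class NewsDetailPersistPipe implements Pipeline<NewsDetailEntry> {

    public void process(NewsDetailEntry entry) {
        NewsDetail detail = entry.getNewsDetail();
        String sql = "INSERT INTO news_detail (title, author, content) VALUES (?, ?, ?)";
        try (Connection conn = DriverManager.getConnection(
                     "jdbc:mysql://localhost:3306/spider", "user", "password");
             PreparedStatement ps = conn.prepareStatement(sql)) {
            ps.setString(1, detail.getTitle());
            ps.setString(2, detail.getAuthor());
            ps.setString(3, detail.getContent());
            ps.executeUpdate();
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }
}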
