简介
Gecco是一个国人写的Java轻量级爬虫框架,官网请戳:Gecco官网。
个人以前用python的爬虫框架也写过一些简单爬虫,但是因为自己不常用python,所以用python写起来并不顺手。偶然情况下,接触Gecco这个Java爬虫框架,尝试用了一下,感觉非常不错,用起来非常简单,非常适合不太懂爬虫基础,又想快速写个爬虫来爬取数据的人。
开发环境简单说明
(1) 环境说明:
– JDK版本:JDK 1.8.0
– 开发工具:IntelliJ idea
– 项目类别:Maven项目
(2) 初始化项目环境:
step1: 建立Maven项目,并选择quickstart原型模板
step2:pom文件引入Gecco核心依赖
<dependency>
    <groupId>com.geccocrawler</groupId>
    <artifactId>gecco</artifactId>
    <version>1.1.0</version>
</dependency>
// Gecco注解中的matchUrl代表可被解析的url路径,pipelines代表一个处理管道,
//当路径页面被下载下来之后,能够通过管道类对封装到HtmlBean中的数据进行处理
//{page}代表一个路径匹配变量,这个变量可以在类中通过@RequestParameter注解解析获得
/**
 * Entry bean for the paginated news list page; also hosts the {@code main} method
 * that starts the crawl.
 *
 * <p>{@code matchUrl} is the URL pattern this bean is parsed from, and {@code pipelines}
 * names the pipeline that receives the populated bean once the page has been downloaded.
 * Only {@code mainPipeline} is registered here: {@code newsDetailPipe} is declared as a
 * {@code Pipeline<NewsDetailEntry>}, so handing it a {@code NewsSpiderEntry} would fail
 * with a {@code ClassCastException}.
 */
@Gecco(matchUrl = "http://news.missevan.com/news/index?p={page}",
        pipelines = "mainPipeline")
public class NewsSpiderEntry implements HtmlBean {

    /** The request that produced this page; used by the pipeline to derive follow-up requests. */
    @Request
    private HttpRequest request;

    /**
     * List items selected from the page. cssPath uses a jQuery-like selector syntax and is
     * evaluated against the document fetched for matchUrl. The element type must be declared
     * so that each match can be mapped onto a {@link NewsSummaryView}.
     */
    @HtmlField(cssPath = ".newslist")
    private List<NewsSummaryView> newsSummaryViews;

    /**
     * Text of the {@code .selected > a} element. {@code @Text} extracts the element's text
     * (as opposed to its HTML or one of its attributes).
     */
    @Text
    @HtmlField(cssPath = ".selected > a")
    private String nextPage;

    public HttpRequest getRequest() {
        return request;
    }

    public void setRequest(HttpRequest request) {
        this.request = request;
    }

    public String getNextPage() {
        return nextPage;
    }

    public void setNextPage(String nextPage) {
        this.nextPage = nextPage;
    }

    public List<NewsSummaryView> getNewsSummaryViews() {
        return newsSummaryViews;
    }

    public void setNewsSummaryViews(List<NewsSummaryView> newsSummaryViews) {
        this.newsSummaryViews = newsSummaryViews;
    }

    /**
     * Configures and starts the crawler.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        GeccoEngine.create()
                // Package that Gecco scans for beans and pipelines; adjust to your own layout.
                // If pipelines and HtmlBean classes live in different packages, every package
                // has to be configured, otherwise the beans cannot be found.
                .classpath("com.news.spider")
                // Address of the first page to fetch.
                .start("http://news.missevan.com/news/index?p=1")
                // Number of crawler threads.
                .thread(1)
                // Pause (the original tutorial does not state the unit) after each request
                // of a single crawler thread.
                .interval(500)
                .start();
    }
}
(2) 建立主列表页bean: NewsSummaryView(需要实现HtmlBean接口, Gecco框架才能对它进行解析)。
private int id;
@Attr("href")
@HtmlField(cssPath = ".newstitle > a")
private String newsId;
@Text
@HtmlField(cssPath = ".newstitle > a")
private String title;
@Text
@HtmlField(cssPath = ".newscontent > p")
private String content;
@Attr("data-original")
@HtmlField(cssPath = ".newstag > a > img")
private String imgUrl;
//Href(click = true)能够让解析出来的链接直接放入待解析队列
@Href(click = true)
@HtmlField(cssPath = ".newstitle > a")
private String detailUrl;
public String getDetailUrl() {
return detailUrl;
}
public void setDetailUrl(String detailUrl) {
this.detailUrl = detailUrl;
}
public int getId() {
return id;
}
public void setId(int id) {
this.id = id;
}
public String getNewsId() {
return newsId;
}
public void setNewsId(String newsId) {
this.newsId = newsId.substring(newsId.indexOf("newsid") + 7);
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getContent() {
return content;
}
public void setContent(String content) {
this.content = content;
}
public String getImgUrl() {
return imgUrl;
}
public void setImgUrl(String imgUrl) {
this.imgUrl = imgUrl;
}
//打印数据,方便调试
@Override
public String toString() {
return "NewsSummaryView{" +
"id=" + id +
", newsId=" + newsId +
", title='" + title + '\'' +
", content='" + content + '\'' +
", imgUrl='" + imgUrl + '\'' +
", detailUrl='" + detailUrl + '\'' +
'}';
}
}
(3) 建立列表详情页入口:NewsDetailEntry(以供@Href(click = true)加入的链接解析)
/**
 * Entry bean for a news detail page. Links queued through {@code @Href(click = true)}
 * on the list items are parsed with this bean and handed to {@code newsDetailPipe}.
 */
@Gecco(
        matchUrl = "http://news.missevan.com/news/article?newsid={newsid}",
        pipelines = "newsDetailPipe")
public class NewsDetailEntry implements HtmlBean {

    /** Article data mapped from the {@code #articlebox} element. */
    @HtmlField(cssPath = "#articlebox")
    private NewsDetail newsDetail;

    /** Value of the {newsid} placeholder in matchUrl. */
    @RequestParameter("newsid")
    private int newsid;

    public NewsDetail getNewsDetail() {
        return this.newsDetail;
    }

    public void setNewsDetail(NewsDetail newsDetail) {
        this.newsDetail = newsDetail;
    }

    public int getNewsid() {
        return this.newsid;
    }

    public void setNewsid(int newsid) {
        this.newsid = newsid;
    }
}
(4) 建立详情页Bean: NewsDetail
private String author;
private String source;
@Text
@HtmlField(cssPath = "#articletitle")
private String title;
@Html
@HtmlField(cssPath = "#articlecontent")
private String content;
@Text
@HtmlField(cssPath = ".newsinfo2")
private List createdDate;
private int commentCount;
@Text
@HtmlField(cssPath = ".newstags1 > a")
private List tags;
public int getId() {
return id;
}
public void setId(int id) {
this.id = id;
}
public String getAuthor() {
return author;
}
public void setAuthor(String author) {
this.author = author;
}
public String getSource() {
return source;
}
public void setSource(String source) {
this.source = source;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getContent() {
return content;
}
public void setContent(String content) {
//对数据做正则处理,过滤掉不要的内容
content = content.replaceAll("","");
this.content = content;
}
public List getCreatedDate() {
return createdDate;
}
public void setCreatedDate(List createdDate) {
this.createdDate = createdDate;
}
public int getCommentCount() {
return commentCount;
}
public void setCommentCount(int commentCount) {
this.commentCount = commentCount;
}
public List getTags() {
return tags;
}
public void setTags(List tags) {
this.tags = tags;
}
@Override
public String toString() {
return "NewsDetail{" +
"id=" + id +
", author='" + author + '\'' +
", source='" + source + '\'' +
", title='" + title + '\'' +
", content='" + content + '\'' +
", createdDate=" + createdDate +
", commentCount=" + commentCount +
", tags='" + tags + '\'' +
'}';
}
(5) 建立管道处理类:MainPipeline和NewsDetailPipe
//注意,这里的PipelineName中的值必须与入口类中的pipelines对应
@PipelineName("mainPipeline")
public class MainPipeline implements Pipeline<NewsSpiderEntry> {
public void process(NewsSpiderEntry newsSpiderEntry) {
HttpRequest request = newsSpiderEntry.getRequest();
List newsSummaryViews = newsSpiderEntry.getNewsSummaryViews();
for(NewsSummaryView newsSummaryView : newsSummaryViews) {
System.out.println(newsSummaryView);
}
//获取下一页url
int nextPage = Integer.parseInt(newsSpiderEntry.getNextPage()) + 1;
//将下一页的url加入待解析队列
String nextPageurl = "http://news.missevan.com/news/index?p=" + nextPage;
SchedulerContext.into(request.subRequest(nextPageurl));
}
}
@PipelineName("newsDetailPipe")
public class NewsDetailPipe implements Pipeline<NewsDetailEntry> {
public void process(NewsDetailEntry newsDetailEntry) {
NewsDetail newsDetail = newsDetailEntry.getNewsDetail();
System.out.println(newsDetail);
}
}