如题:
目标:爬取天善最热博文列表(https://blog.hellobi.com/hot/weekly)对应的博文信息存入mysql数据库中。
暂定的博文相关信息有:博文链接(url)、标题(title)、作者(author)、阅读数(readNum)、推荐数(recommendNum)、作者博客地址(blogHomeUrl)、评论数(commentNum)、发布时间(publishTime)、博文内容(content)。
根据相应的信息建表:
CREATE TABLE `hot_weekly_blogs` (
`id` INT(11) NOT NULL AUTO_INCREMENT,
`url` VARCHAR(100) DEFAULT NULL,
`title` VARCHAR(100) DEFAULT NULL,
`author` VARCHAR(50) DEFAULT NULL,
`readNum` INT(11) DEFAULT NULL,
`recommendNum` INT(11) DEFAULT NULL,
`blogHomeUrl` VARCHAR(100) DEFAULT NULL,
`commentNum` INT(11) DEFAULT NULL,
`publishTime` VARCHAR(20) DEFAULT NULL,
`content` MEDIUMTEXT,
PRIMARY KEY (`id`)
) ENGINE=INNODB AUTO_INCREMENT=69 DEFAULT CHARSET=utf8;
1)新建maven工程,添加依赖:
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-core</artifactId>
    <version>0.7.3</version>
</dependency>
<dependency>
    <groupId>us.codecraft</groupId>
    <artifactId>webmagic-extension</artifactId>
    <version>0.7.3</version>
</dependency>
2)编写数据库工具类:
package com.qingqiuyue.ashura.util;
import java.sql.*;
import java.util.List;
/**
 * Thin JDBC helper around a single shared MySQL connection.
 *
 * <p>The connection is created lazily and cached in a static field (one per
 * JVM). Callers create a {@code DBHelper}, run queries/updates, then call
 * {@link #close()}.</p>
 *
 * <p>NOTE(review): connection parameters are hard-coded; consider moving them
 * to external configuration.</p>
 */
public class DBHelper {
    public static final String driver_class = "com.mysql.jdbc.Driver";
    // Fix: the Connector/J property is "useUnicode" (case-sensitive); the
    // original "useunicode" was silently ignored, risking broken UTF-8.
    public static final String driver_url =
            "jdbc:mysql://localhost/ashura?useUnicode=true&characterEncoding=utf8";
    public static final String user = "root";
    public static final String password = "root";

    // Shared, lazily-created connection — see getConnInstance().
    private static Connection conn = null;
    private PreparedStatement pst = null;
    private ResultSet rst = null;

    /**
     * Opens (or reuses) the shared connection.
     */
    public DBHelper() {
        try {
            conn = DBHelper.getConnInstance();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Lazily creates the shared connection. Synchronized so concurrent
     * callers cannot race on the null check.
     *
     * @return the shared connection, or null if it could not be opened
     */
    private static synchronized Connection getConnInstance() {
        if (conn == null) {
            try {
                Class.forName(driver_class);
                conn = DriverManager.getConnection(driver_url, user, password);
                // Fix: only report success after the connection actually
                // opened (the original printed this even on failure).
                System.out.println("连接数据库成功");
            } catch (ClassNotFoundException e) {
                e.printStackTrace();
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
        return conn;
    }

    /**
     * Releases JDBC resources innermost-first (ResultSet, then
     * PreparedStatement, then Connection).
     *
     * <p>Fix: the original closed the connection first and never reset the
     * static {@code conn} to null, so every DBHelper created after a close()
     * received a dead cached connection.</p>
     */
    public void close() {
        try {
            if (rst != null) {
                rst.close();
                rst = null;
            }
            if (pst != null) {
                pst.close();
                pst = null;
            }
            synchronized (DBHelper.class) {
                if (conn != null) {
                    conn.close();
                    conn = null; // allow a later DBHelper to reconnect
                }
            }
            System.out.println("关闭数据库成功");
        } catch (SQLException e) {
            e.printStackTrace();
        }
    }

    /**
     * Executes a SELECT with optional positional parameters.
     *
     * @param sql       statement with '?' placeholders
     * @param sqlValues values bound in order, may be null or empty
     * @return the ResultSet, or null if the query failed
     */
    public ResultSet executeQuery(String sql, List<?> sqlValues) {
        try {
            pst = conn.prepareStatement(sql);
            if (sqlValues != null && sqlValues.size() > 0) {
                setSqlValues(pst, sqlValues);
            }
            rst = pst.executeQuery();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return rst;
    }

    /**
     * Executes an INSERT/UPDATE/DELETE with optional positional parameters.
     *
     * @param sql       statement with '?' placeholders
     * @param sqlValues values bound in order, may be null or empty
     * @return affected row count, or -1 on failure
     */
    public int executeUpdate(String sql, List<?> sqlValues) {
        int result = -1;
        try {
            pst = conn.prepareStatement(sql);
            if (sqlValues != null && sqlValues.size() > 0) {
                setSqlValues(pst, sqlValues);
            }
            result = pst.executeUpdate();
        } catch (SQLException e) {
            e.printStackTrace();
        }
        return result;
    }

    /**
     * Binds each value to its 1-based placeholder position.
     */
    private void setSqlValues(PreparedStatement pst, List<?> sqlValues) {
        for (int i = 0; i < sqlValues.size(); i++) {
            try {
                pst.setObject(i + 1, sqlValues.get(i));
            } catch (SQLException e) {
                e.printStackTrace();
            }
        }
    }
}
3)创建对应的实体对象:
package com.qingqiuyue.ashura.domain;
/**
 * Value object holding one crawled blog post.
 *
 * <p>Counter-like fields (readNum, recommendNum, commentNum) are kept as
 * Strings because they are extracted as raw page text; the database layer
 * stores them into INT columns.</p>
 */
public class BlogInfo {
    private String url;           // detail-page URL of the post
    private String title;         // post title
    private String author;        // author display name
    private String readNum;       // view count (raw text)
    private String recommendNum;  // like/recommend count (raw text)
    private String blogHomeUrl;   // author's blog home URL
    private String commentNum;    // comment count (raw text)
    private String publishTime;   // normalized "yyyy-MM-dd"
    private String content;       // post body, HTML included

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getAuthor() {
        return author;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    public String getReadNum() {
        return readNum;
    }

    public void setReadNum(String readNum) {
        this.readNum = readNum;
    }

    public String getRecommendNum() {
        return recommendNum;
    }

    public void setRecommendNum(String recommendNum) {
        this.recommendNum = recommendNum;
    }

    public String getBlogHomeUrl() {
        return blogHomeUrl;
    }

    public void setBlogHomeUrl(String blogHomeUrl) {
        this.blogHomeUrl = blogHomeUrl;
    }

    public String getCommentNum() {
        return commentNum;
    }

    public void setCommentNum(String commentNum) {
        this.commentNum = commentNum;
    }

    public String getPublishTime() {
        return publishTime;
    }

    public void setPublishTime(String publishTime) {
        this.publishTime = publishTime;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    /**
     * Fix: the crawler logs {@code blog.toString()} for every post; without
     * this override it printed only the identity hash. Content is omitted
     * because it can be very large HTML.
     */
    @Override
    public String toString() {
        return "BlogInfo{url=" + url
                + ", title=" + title
                + ", author=" + author
                + ", readNum=" + readNum
                + ", recommendNum=" + recommendNum
                + ", blogHomeUrl=" + blogHomeUrl
                + ", commentNum=" + commentNum
                + ", publishTime=" + publishTime
                + "}";
    }
}
4)Dao接口层
package com.qingqiuyue.ashura.service;
import com.qingqiuyue.ashura.domain.BlogInfo;
/**
* 博文 数据持久化 接口
* @author Jasmine
*/
/**
 * Persistence contract for crawled blog posts.
 *
 * @author Jasmine
 */
public interface BlogDao {

    /**
     * Persists a single blog post.
     *
     * @param blog the post to store
     * @return number of affected rows (1 on success, -1 on failure)
     */
    int saveBlog(BlogInfo blog);
}
5)Dao实现类
package com.qingqiuyue.ashura.service.impl;
import com.qingqiuyue.ashura.domain.BlogInfo;
import com.qingqiuyue.ashura.service.BlogDao;
import com.qingqiuyue.ashura.util.DBHelper;
import java.util.ArrayList;
import java.util.List;
/**
* 博客 数据库持久化接口 实现
* @author Jasmine
*/
/**
 * JDBC implementation of {@link BlogDao} backed by {@link DBHelper}.
 *
 * @author Jasmine
 */
public class BlogDaoImpl implements BlogDao {

    /**
     * Inserts one post into {@code hot_weekly_blogs} via a parameterized
     * statement (values bound with '?', never string-concatenated).
     *
     * @param blog the post to store
     * @return affected row count, or -1 on failure
     */
    @Override
    public int saveBlog(BlogInfo blog) {
        DBHelper dbhelper = new DBHelper();
        String sql = "INSERT INTO hot_weekly_blogs"
                + "(url,title,author,readNum,recommendNum,blogHomeUrl,commentNum,publishTime,content)"
                + "VALUES (? , ? , ? , ? , ? , ? , ? , ? , ? ) ";
        // Bind values in column order.
        // Fix: the getters already return String — the original "" + get...()
        // concatenation turned a null field into the literal string "null"
        // in the database instead of SQL NULL.
        List<Object> sqlValues = new ArrayList<>();
        sqlValues.add(blog.getUrl());
        sqlValues.add(blog.getTitle());
        sqlValues.add(blog.getAuthor());
        sqlValues.add(blog.getReadNum());
        sqlValues.add(blog.getRecommendNum());
        sqlValues.add(blog.getBlogHomeUrl());
        sqlValues.add(blog.getCommentNum());
        sqlValues.add(blog.getPublishTime());
        sqlValues.add(blog.getContent());
        return dbhelper.executeUpdate(sql, sqlValues);
    }
}
6)编写PageProcessor:
PageProcessor中的process方法是webmagic的核心,负责抽取目标url的逻辑。
package com.qingqiuyue.ashura.webmagic;
import com.qingqiuyue.ashura.service.BlogDao;
import com.qingqiuyue.ashura.service.impl.BlogDaoImpl;
import com.qingqiuyue.ashura.domain.BlogInfo;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Created by Administrator on 2017/6/11.
*/
/**
 * WebMagic processor that crawls the hellobi "hot weekly" blog listing,
 * follows every post's detail page, extracts its fields, and persists each
 * post through {@link BlogDao}.
 */
public class BlogPageProcessor implements PageProcessor {
    // Crawl configuration: 10 retries per failed request, 1 s politeness delay.
    private Site site = Site.me().setRetryTimes(10).setSleepTime(1000);

    // Number of posts scraped so far.
    // NOTE(review): incremented from up to 5 spider threads without
    // synchronization — counts may be slightly off under contention.
    private static int num = 0;

    // Persistence layer used to store each parsed post.
    private BlogDao blogDao = new BlogDaoImpl();

    // Matches an already-normalized "yyyy-MM-dd" date.
    // Fix: compiled once instead of on every detail page.
    private static final Pattern DATE_PATTERN = Pattern.compile("^\\d{4}-\\d{2}-\\d{2}$");

    public static void main(String[] args) throws Exception {
        long startTime, endTime;
        System.out.println("========天善最热博客小爬虫【启动】喽!=========");
        startTime = new Date().getTime();
        Spider.create(new BlogPageProcessor()).addUrl("https://blog.hellobi.com/hot/weekly?page=1").thread(5).run();
        endTime = new Date().getTime();
        System.out.println("========天善最热博客小爬虫【结束】喽!=========");
        System.out.println("一共爬到" + num + "篇博客!用时为:" + (endTime - startTime) / 1000 + "s");
    }

    @Override
    public void process(Page page) {
        // 1. Listing page (entry point): queue every post's detail URL plus
        //    the next listing page.
        if (page.getUrl().regex("https://blog\\.hellobi\\.com/hot/weekly\\?page=\\d+").match()) {
            // Detail-page links.
            page.addTargetRequests(page.getHtml().xpath("//h2[@class='title']/a").links().all());
            // Next listing page.
            page.addTargetRequests(page.getHtml().xpath("//a[@rel='next']").links().all());
        }
        // 2. Detail page: extract the post's fields and persist them.
        else {
            try {
                BlogInfo blog = new BlogInfo();
                // Title.
                String title = page.getHtml().xpath("//h1[@class='clearfix']/a/text()").get();
                // Post URL.
                String url = page.getHtml().xpath("//h1[@class='clearfix']/a/@href").get();
                // Author display name.
                String author = page.getHtml().xpath("//section[@class='sidebar']/div/div/a[@class='aw-user-name']/text()").get();
                // Author's blog home URL.
                String blogHomeUrl = page.getHtml().xpath("//section[@class='sidebar']/div/div/a[@class='aw-user-name']/@href").get();
                // Post body — raw HTML kept as-is; post-processing can strip tags later.
                String content = page.getHtml().xpath("//div[@class='message-content editor-style']").get();
                // Recommend (like) count.
                String recommendNum = page.getHtml().xpath("//a[@class='agree']/b/text()").get();
                // Comment count, e.g. "3 个评论" -> "3".
                String commentNum = page.getHtml().xpath("//div[@class='aw-mod']/div/h2/text()").get().split("个")[0].trim();
                // View count, text after the full-width colon.
                String readNum = page.getHtml().xpath("//div[@class='row']/div/div/div/div/span/text()").get().split(":")[1].trim();
                // Raw publish-time text after the full-width colon; normalized below.
                String time = page.getHtml().xpath("//time[@class='time']/text()").get().split(":")[1].trim();

                SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd");
                // Current date (fix: the original called Calendar.getInstance() twice).
                Calendar cal = Calendar.getInstance();
                String publishTime = null;
                Matcher m = DATE_PATTERN.matcher(time);
                if (m.matches()) {
                    // Already "yyyy-MM-dd" — keep as-is.
                    publishTime = time;
                } else if (time.contains("天")) {
                    // e.g. "1天前" -> today minus that many days.
                    int days = Integer.parseInt(time.split("天")[0].trim());
                    cal.add(Calendar.DAY_OF_MONTH, -days);
                    publishTime = df.format(cal.getTime());
                } else {
                    // "x分钟前" / "x小时前" etc. -> today's date.
                    publishTime = df.format(cal.getTime());
                }

                // Populate and persist.
                blog.setUrl(url);
                blog.setTitle(title);
                blog.setAuthor(author);
                blog.setBlogHomeUrl(blogHomeUrl);
                blog.setCommentNum(commentNum);
                blog.setRecommendNum(recommendNum);
                blog.setReadNum(readNum);
                blog.setContent(content);
                blog.setPublishTime(publishTime);
                num++;
                System.out.println("num:" + num + " " + blog.toString());
                blogDao.saveBlog(blog);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    @Override
    public Site getSite() {
        return this.site;
    }
}
完事,截图目录结构:
运行以及效果:
PS:1.本来直接转载的,发现有些问题,又截了一遍图
2.代码顺序变了一下,后面的需要引用前面的……
原文链接:
基于webmagic框架的爬虫小Demo:https://ask.hellobi.com/blog/jasmine3happy/8537