代码清单:
==============================SQL====================================
计划任务表
==============================定时任务模块类====================================
计划管理DAO接口 CmsSchedulerDao.java
计划管理DAO接口实现类 CmsSchedulerDaoImpl.java
计划任务管理服务接口 CmsSchedulerMng.java
计划任务管理服务接口实现类 CmsSchedulerMngImpl.java
定时任务管理接口 SchedulerTaskManageSvc.java
定时任务管理接口实现类 SchedulerTaskManageSvcImpl.java
定时任务接口 SchedulerTaskSvc.java
定时任务抽象实现类 AbstractSchedulerTaskSvc.java
定时任务接口-采集器实现类-多线程版 SchedulerAcquisitionSvcImpl.java
定时服务关联任务bean SchedulerTaskBean.java
计划任务Controller CmsSchedulerAct.java
持久对象基类 BaseCmsScheduler.java
持久对象 CmsScheduler.java
HBM文件 CmsScheduler.hbm.xml
==============================定时任务模块相关辅助类====================================
计划框架
计划框架-任务调度 Scheduler.java
计划框架-时间生成器接口 ScheduleIterator.java
计划任务抽象类 SchedulerTask.java
计划框架-时间生成器接口实现类 SimpleScheduleIterator.java
时间计划参数bean ScheduleParamBean.java
采集相关
HTML解析工具类接口 ParseHtmlTool.java
HTML解析工具,HtmlParser实现类 HtmlParserImpl.java
采集参数封装bean ParamBean.java
队列 Queue.java
URL队列 UrlQueue.java
接下来是XML配置
==============================定时任务模块XML配置====================================
dao配置
<bean id="cmsSchedulerDao" class="com.jeecms.cms.dao.assist.impl.CmsSchedulerDaoImpl"/>
manager配置
<bean id="cmsSchedulerMng" class="com.jeecms.cms.manager.assist.impl.CmsSchedulerMngImpl"/>
SERVICE配置
<bean id="schedulerAcquisitionSvc" class="com.jeecms.cms.service.scheduler.SchedulerAcquisitionSvcImpl"/>
<bean id="schedulerTaskManageSvc" class="com.jeecms.cms.service.scheduler.SchedulerTaskManageSvcImpl"/>
接下来是messages_zh_CN.properties 添加了常量
==============================messages_zh_CN.properties====================================
messages_zh_CN.properties
接下来是模板
==============================模板====================================
generate_left.html 有修改
scheduler/add.html
scheduler/edit.html
scheduler/list.html
具体代码如下:
==============================SQL====================================
1:计划任务表
/*
MySQL Data Transfer
Source Host: localhost
Source Database: jeecms
Target Host: localhost
Target Database: jeecms
Date: 2011-11-8 11:36:55
*/
-- Turn off foreign-key checking before creating and seeding the table.
-- NOTE(review): the part of the script shown here never turns it back on.
SET FOREIGN_KEY_CHECKS=0;
-- ----------------------------
-- Table structure for jc_scheduler
-- ----------------------------
-- One row per scheduled task managed by the scheduler module.
CREATE TABLE `jc_scheduler` (
`scheduler_id` int(11) NOT NULL AUTO_INCREMENT COMMENT '任务主键',
`site_id` int(11) DEFAULT NULL,
-- Id of the object the task operates on; the acquisition service looks up
-- a CmsAcquisition by this value.
`associate_id` int(11) DEFAULT NULL COMMENT '相关ID',
-- Key used to pick the SchedulerTaskSvc implementation; the sample rows
-- store the Spring bean id 'schedulerAcquisitionSvc'.
`module_type` varchar(100) DEFAULT NULL COMMENT '模块类型',
`name` varchar(100) DEFAULT NULL COMMENT '任务名称',
`start_time` datetime DEFAULT NULL COMMENT '开始时间',
`end_time` datetime DEFAULT NULL COMMENT '结束时间',
`status` int(1) NOT NULL DEFAULT '0' COMMENT '当前状态(0:静止;1:采集)',
-- Six comma-separated fields, parsed in this order: week-of-month,
-- day-of-week, day-of-month, hour-of-day, minute, second. Non-numeric
-- fields such as '*' are left unset by the parser.
`expression` varchar(50) NOT NULL COMMENT '计划表达式',
PRIMARY KEY (`scheduler_id`)
) ENGINE=InnoDB AUTO_INCREMENT=10 DEFAULT CHARSET=utf8;
-- ----------------------------
-- Records
-- ----------------------------
-- Sample tasks; column order follows the table definition above.
INSERT INTO `jc_scheduler` VALUES ('4', '1', '1', 'schedulerAcquisitionSvc', '测试', '2011-11-07 18:02:30', '2011-11-07 18:04:00', '0', '*,*,*,*,3,0');
INSERT INTO `jc_scheduler` VALUES ('8', '1', '5', 'schedulerAcquisitionSvc', '测试采集java', '2011-11-08 10:25:15', '2011-11-08 10:27:04', '0', '*,*,*,*,26,0');
INSERT INTO `jc_scheduler` VALUES ('9', '1', '1', 'schedulerAcquisitionSvc', '测试采集新闻', '2011-11-08 10:37:58', '2011-11-08 10:38:11', '0', '*,*,*,*,38,0');
==============================定时任务模块类====================================
计划管理DAO接口 CmsSchedulerDao.java
package com.jeecms.cms.dao.assist;
import java.util.List;
import com.jeecms.cms.entity.assist.CmsScheduler;
import com.jeecms.common.hibernate3.Updater;
/**
* 计划管理DAO接口
* @author javacoo
* @since 2011-11-07
*/
public interface CmsSchedulerDao {
    /**
     * Loads every scheduled task.
     *
     * @return all tasks
     */
    List<CmsScheduler> getList();

    /**
     * Loads the tasks matching the module type and site carried by the
     * given example bean.
     *
     * @param bean example bean supplying the module type and site
     * @return the matching tasks
     */
    List<CmsScheduler> getListBy(CmsScheduler bean);

    /**
     * Looks up a single task by primary key.
     *
     * @param id task id
     * @return the task for that id
     */
    CmsScheduler findById(Integer id);

    /**
     * Persists a new task.
     *
     * @param bean task to store
     * @return the same task instance
     */
    CmsScheduler save(CmsScheduler bean);

    /**
     * Applies the changes described by the updater to the stored task.
     *
     * @param updater update descriptor wrapping the modified bean
     * @return the updated task
     */
    CmsScheduler updateByUpdater(Updater<CmsScheduler> updater);

    /**
     * Deletes the task with the given id.
     *
     * @param id task id
     * @return the deleted task
     */
    CmsScheduler deleteById(Integer id);
}
计划管理DAO接口实现类 CmsSchedulerDaoImpl.java
package com.jeecms.cms.dao.assist.impl;
import java.util.List;
import org.apache.commons.lang.StringUtils;
import org.springframework.stereotype.Repository;
import com.jeecms.cms.dao.assist.CmsSchedulerDao;
import com.jeecms.cms.entity.assist.CmsScheduler;
import com.jeecms.common.hibernate3.Finder;
import com.jeecms.common.hibernate3.HibernateBaseDao;
/**
 * Hibernate-backed implementation of {@link CmsSchedulerDao}.
 */
@Repository
public class CmsSchedulerDaoImpl extends
        HibernateBaseDao<CmsScheduler, Integer> implements CmsSchedulerDao {

    /**
     * Loads every scheduled task ordered by ascending id.
     *
     * @return all tasks
     */
    @SuppressWarnings("unchecked")
    public List<CmsScheduler> getList() {
        Finder f = Finder.create("from CmsScheduler bean order by bean.id asc");
        return find(f);
    }

    /**
     * Loads tasks ordered by ascending id, restricted to the module type and
     * site of the example bean when both are present. When the bean, its
     * module type, its site or the site id is missing, no restriction is
     * applied.
     *
     * @param bean example bean supplying module type and site; may be null
     * @return the matching tasks
     */
    @SuppressWarnings("unchecked")
    public List<CmsScheduler> getListBy(CmsScheduler bean) {
        Finder f = Finder.create("from CmsScheduler bean");
        if (hasModuleTypeAndSite(bean)) {
            f.append(" where bean.moduleType=:moduleType and bean.site.id=:siteId");
            f.setParam("moduleType", bean.getModuleType());
            f.setParam("siteId", bean.getSite().getId());
        }
        f.append(" order by bean.id asc");
        return find(f);
    }

    /**
     * Tells whether the example bean carries both filter values. Guards the
     * site dereference, which previously threw a NullPointerException when
     * no site had been set on the bean.
     */
    private boolean hasModuleTypeAndSite(CmsScheduler bean) {
        return bean != null
                && StringUtils.isNotEmpty(bean.getModuleType())
                && bean.getSite() != null
                && bean.getSite().getId() != null;
    }

    /**
     * Looks up a task through the inherited {@code get(id)}.
     *
     * @param id task id
     * @return whatever the inherited lookup returns for that id
     */
    public CmsScheduler findById(Integer id) {
        return get(id);
    }

    /**
     * Saves the task in the current Hibernate session.
     *
     * @param bean task to store
     * @return the same task instance
     */
    public CmsScheduler save(CmsScheduler bean) {
        getSession().save(bean);
        return bean;
    }

    /**
     * Deletes the task with the given id if the lookup finds one.
     *
     * @param id task id
     * @return the deleted task, or null when the lookup returned null
     */
    public CmsScheduler deleteById(Integer id) {
        CmsScheduler entity = super.get(id);
        if (entity != null) {
            getSession().delete(entity);
        }
        return entity;
    }

    @Override
    protected Class<CmsScheduler> getEntityClass() {
        return CmsScheduler.class;
    }
}
计划任务管理服务接口 CmsSchedulerMng.java
package com.jeecms.cms.manager.assist;
import java.util.List;
import com.jeecms.cms.entity.assist.CmsScheduler;
/**
* 计划任务管理服务接口
* @author javacoo
* @since 2011-11-07
* @version 1.0
*/
public interface CmsSchedulerMng {

    // ---- queries ----

    /**
     * Returns every scheduled task.
     *
     * @return all scheduled tasks
     */
    List<CmsScheduler> getList();

    /**
     * Returns the scheduled tasks of the site and module type carried by
     * the given example bean.
     *
     * @param bean example bean supplying site and module type
     * @return the matching scheduled tasks
     */
    List<CmsScheduler> getListBy(CmsScheduler bean);

    /**
     * Returns the scheduled task with the given id.
     *
     * @param id task id
     * @return the scheduled task
     */
    CmsScheduler findById(Integer id);

    // ---- life cycle ----

    /**
     * Marks the given scheduled task as started.
     *
     * @param id task id
     * @return the scheduled task that was started
     */
    CmsScheduler start(Integer id);

    /**
     * Marks the given scheduled task as stopped.
     *
     * @param id task id
     */
    void stop(Integer id);

    /**
     * Marks the given scheduled task as finished.
     *
     * @param id task id
     */
    void end(Integer id);

    // ---- persistence ----

    /**
     * Stores a new scheduled task.
     *
     * @param bean task to store
     * @return the stored task
     */
    CmsScheduler save(CmsScheduler bean);

    /**
     * Updates an existing scheduled task.
     *
     * @param bean task carrying the new values
     * @return the updated task
     */
    CmsScheduler update(CmsScheduler bean);

    /**
     * Deletes one scheduled task.
     *
     * @param id task id
     * @return the deleted task
     */
    CmsScheduler deleteById(Integer id);

    /**
     * Deletes several scheduled tasks.
     *
     * @param ids ids of the tasks to delete
     * @return the deleted tasks, one entry per id
     */
    CmsScheduler[] deleteByIds(Integer[] ids);
}
计划任务管理服务接口实现类 CmsSchedulerMngImpl.java
package com.jeecms.cms.manager.assist.impl;
import java.util.Date;
import java.util.List;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import com.jeecms.cms.dao.assist.CmsSchedulerDao;
import com.jeecms.cms.entity.assist.CmsAcquisition;
import com.jeecms.cms.entity.assist.CmsScheduler;
import com.jeecms.cms.manager.assist.CmsSchedulerMng;
import com.jeecms.common.hibernate3.Updater;
/**
* 计划任务管理服务接口实现类
* @author javacoo
* @since 2011-11-07
* @version 1.0
*/
/**
 * Transactional implementation of {@link CmsSchedulerMng} that delegates
 * persistence to {@link CmsSchedulerDao}.
 */
@Service
@Transactional
public class CmsSchedulerMngImpl implements CmsSchedulerMng{
    private CmsSchedulerDao dao;

    /**
     * Injects the DAO used for all persistence operations.
     *
     * @param dao scheduler DAO
     */
    @Autowired
    public void setDao(CmsSchedulerDao dao) {
        this.dao = dao;
    }

    /** Returns every scheduled task as delivered by the DAO. */
    @Transactional(readOnly = true)
    public List<CmsScheduler> getList() {
        return dao.getList();
    }

    /** Returns the tasks matching the example bean, as delivered by the DAO. */
    @Transactional(readOnly = true)
    public List<CmsScheduler> getListBy(CmsScheduler bean) {
        return dao.getListBy(bean);
    }

    /** Returns the task with the given id, as delivered by the DAO. */
    @Transactional(readOnly = true)
    public CmsScheduler findById(Integer id) {
        return dao.findById(id);
    }

    /**
     * Sets the status back to {@code CmsScheduler.STOP} when the task is
     * currently in {@code CmsScheduler.START}. Does nothing when the task
     * cannot be found or is not started.
     *
     * @param id task id
     */
    public void stop(Integer id) {
        CmsScheduler scheduler = findById(id);
        if (scheduler == null) {
            return;
        }
        if (scheduler.getStatus() == CmsScheduler.START) {
            scheduler.setStatus(CmsScheduler.STOP);
        }
    }

    /**
     * Marks the task as started: status {@code CmsScheduler.START}, start
     * time set to now, end time cleared.
     *
     * @param id task id
     * @return the modified task, or null when no task was found
     */
    public CmsScheduler start(Integer id) {
        CmsScheduler scheduler = findById(id);
        if (scheduler == null) {
            return scheduler;
        }
        // Use the scheduler's own status constants, as stop() does, instead
        // of borrowing the ones declared on CmsAcquisition.
        scheduler.setStatus(CmsScheduler.START);
        scheduler.setStartTime(new Date());
        scheduler.setEndTime(null);
        return scheduler;
    }

    /**
     * Marks the task as finished: status {@code CmsScheduler.STOP} and end
     * time set to now. Does nothing when the task cannot be found.
     *
     * @param id task id
     */
    public void end(Integer id) {
        CmsScheduler scheduler = findById(id);
        if (scheduler == null) {
            return;
        }
        // Same constant source as stop(); see start().
        scheduler.setStatus(CmsScheduler.STOP);
        scheduler.setEndTime(new Date());
    }

    /**
     * Initialises the bean via {@code bean.init()} and stores it.
     *
     * @param bean task to store
     * @return the same bean
     */
    public CmsScheduler save(CmsScheduler bean) {
        bean.init();
        dao.save(bean);
        return bean;
    }

    /**
     * Updates the stored task through an {@link Updater} built from the bean.
     *
     * @param bean task carrying the new values
     * @return the task returned by the DAO
     */
    public CmsScheduler update(CmsScheduler bean) {
        Updater<CmsScheduler> updater = new Updater<CmsScheduler>(bean);
        return dao.updateByUpdater(updater);
    }

    /**
     * Deletes one task.
     *
     * @param id task id
     * @return the task returned by the DAO for this deletion
     */
    public CmsScheduler deleteById(Integer id) {
        return dao.deleteById(id);
    }

    /**
     * Deletes several tasks, one DAO call per id.
     *
     * @param ids ids of the tasks to delete
     * @return the DAO results in the same order as {@code ids}
     */
    public CmsScheduler[] deleteByIds(Integer[] ids) {
        CmsScheduler[] beans = new CmsScheduler[ids.length];
        for (int i = 0, len = ids.length; i < len; i++) {
            beans[i] = deleteById(ids[i]);
        }
        return beans;
    }
}
定时任务管理接口 SchedulerTaskManageSvc.java
package com.jeecms.cms.service.scheduler;
import java.util.List;
import com.jeecms.cms.entity.assist.CmsScheduler;
/**
* 定时任务管理接口
* @author javacoo
* @since 2011-11-07
*/
public interface SchedulerTaskManageSvc {

    /**
     * Starts running the given scheduled task.
     *
     * @param scheduler task to start
     * @return true/false
     */
    boolean start(CmsScheduler scheduler);

    /**
     * Stops running the given scheduled task.
     *
     * @param scheduler task to stop
     * @return true/false
     */
    boolean stop(CmsScheduler scheduler);

    /**
     * Lists the objects the given scheduled task can be associated with.
     *
     * @param scheduler task whose candidates are wanted
     * @return list of associable task beans
     */
    List<SchedulerTaskBean> associateTaskList(CmsScheduler scheduler);
}
定时任务管理接口实现类 SchedulerTaskManageSvcImpl.java
package com.jeecms.cms.service.scheduler;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.commons.lang.StringUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import com.jeecms.cms.entity.assist.CmsScheduler;
import com.jeecms.common.scheduling.core.Scheduler;
import com.jeecms.common.scheduling.core.SchedulerTask;
import com.jeecms.common.scheduling.impl.ScheduleParamBean;
import com.jeecms.common.scheduling.impl.SimpleScheduleIterator;
/**
* 定时任务管理服务接口实现类
* @author javacoo
* @since 2011-11-07
*/
/**
 * Keeps one {@code Scheduler} per running {@link CmsScheduler} and routes
 * each execution to the {@link SchedulerTaskSvc} registered under the
 * task's module type.
 */
@Service
public class SchedulerTaskManageSvcImpl implements SchedulerTaskManageSvc {
    /** Number of comma-separated fields a schedule expression must have. */
    private static final int EXPRESSION_FIELD_COUNT = 6;
    /** Running tasks, keyed by scheduler id. */
    private static final Map<Integer,TaskManage> taskManageMap = new ConcurrentHashMap<Integer, TaskManage>();
    /** Task services looked up by module type. */
    @Autowired
    private Map<String,SchedulerTaskSvc> schedulerTaskSvcMap;

    /**
     * Bundles the timer, the parsed schedule and the target service of one
     * running task. It needs nothing from the enclosing service, hence static.
     */
    private static class TaskManage{
        /** Timer driving the task. */
        private final Scheduler scheduler = new Scheduler();
        /** Parsed schedule; set only after a successful parse. */
        private ScheduleParamBean scheduleParamBean;
        /** Service that performs the actual work. */
        private final SchedulerTaskSvc schedulerTaskSvc;
        private final CmsScheduler cmsScheduler;

        public TaskManage(SchedulerTaskSvc schedulerSvc,CmsScheduler cmsScheduler){
            this.schedulerTaskSvc = schedulerSvc;
            this.cmsScheduler = cmsScheduler;
        }

        /**
         * Parses the task's expression: six comma-separated fields in the
         * order week-of-month, day-of-week, day-of-month, hour-of-day,
         * minute, second. Numeric fields are applied; any other non-empty
         * text (for example {@code *}) leaves the field unset.
         *
         * @return true when the expression has six fields and every numeric
         *         field fits an int
         */
        private boolean parseSchedulerParam(){
            String expression = cmsScheduler.getExpression();
            System.out.println("计划表达式:"+expression);
            if (StringUtils.isEmpty(expression) || !expression.contains(",")) {
                return false;
            }
            String[] fields = expression.split(",");
            if (fields.length != EXPRESSION_FIELD_COUNT) {
                return false;
            }
            ScheduleParamBean parsed = new ScheduleParamBean();
            try {
                if (isNumber(fields[0])) {
                    parsed.setWeekOfMonth(Integer.valueOf(fields[0]));
                }
                if (isNumber(fields[1])) {
                    parsed.setDayOfWeek(Integer.valueOf(fields[1]));
                }
                if (isNumber(fields[2])) {
                    parsed.setDayOfMonth(Integer.valueOf(fields[2]));
                }
                if (isNumber(fields[3])) {
                    parsed.setHourOfDay(Integer.valueOf(fields[3]));
                }
                if (isNumber(fields[4])) {
                    parsed.setMinute(Integer.valueOf(fields[4]));
                }
                if (isNumber(fields[5])) {
                    parsed.setSecond(Integer.valueOf(fields[5]));
                }
            } catch (NumberFormatException e) {
                // All digits but too large for an int: reject the expression.
                return false;
            }
            scheduleParamBean = parsed;
            return true;
        }

        /**
         * Tells whether a field is a non-empty run of digits. The emptiness
         * test matters: commons-lang 2.x reports the empty string as numeric,
         * and {@code Integer.valueOf("")} would then throw.
         */
        private static boolean isNumber(String field) {
            return StringUtils.isNotEmpty(field) && StringUtils.isNumeric(field);
        }

        /**
         * Parses the expression and, when it is valid, hands the task to the
         * timer.
         *
         * @return true when the task was scheduled
         */
        public boolean start() {
            if (!parseSchedulerParam()) {
                return false;
            }
            scheduler.schedule(new SchedulerTask() {
                public void run() {
                    System.out.println("============开始执行计划任务=================");
                    schedulerTaskSvc.start(cmsScheduler);
                }
            }, new SimpleScheduleIterator(scheduleParamBean));
            return true;
        }

        /** Asks the task service to stop, then cancels the timer. */
        public void cancel() {
            schedulerTaskSvc.stop(cmsScheduler);
            scheduler.cancel();
        }
    }

    /**
     * Schedules the given task. A task already registered under the same id
     * is cancelled once the replacement is in place, so its timer does not
     * leak.
     *
     * @param scheduler task to run
     * @return true when the task was scheduled; false when no service is
     *         registered for its module type or its expression is invalid
     */
    public boolean start(CmsScheduler scheduler) {
        SchedulerTaskSvc schedulerSvc = getSchedulerTaskSvcByModuleType(scheduler.getModuleType());
        if (schedulerSvc == null) {
            return false;
        }
        TaskManage taskManage = new TaskManage(schedulerSvc,scheduler);
        if (!taskManage.start()) {
            return false;
        }
        TaskManage previous = taskManageMap.put(scheduler.getId(), taskManage);
        if (previous != null) {
            previous.cancel();
        }
        return true;
    }

    /**
     * Cancels the given task and forgets it.
     *
     * @param scheduler task to stop
     * @return true when a running task was cancelled; false when none was
     *         registered under the task's id
     */
    public boolean stop(CmsScheduler scheduler) {
        TaskManage taskManage = taskManageMap.remove(scheduler.getId());
        if (taskManage == null) {
            return false;
        }
        taskManage.cancel();
        return true;
    }

    /**
     * Asks the service registered for the task's module type for the objects
     * the task can be associated with.
     *
     * @param scheduler task carrying the module type
     * @return the service's answer, or an empty list when no service is
     *         registered for the module type
     */
    public List<SchedulerTaskBean> associateTaskList(CmsScheduler scheduler) {
        SchedulerTaskSvc schedulerSvc = getSchedulerTaskSvcByModuleType(scheduler.getModuleType());
        if (schedulerSvc == null) {
            return new java.util.ArrayList<SchedulerTaskBean>();
        }
        return schedulerSvc.associateTaskList(scheduler);
    }

    /**
     * Looks up the task service registered under the given module type.
     *
     * @param moduleType module type, used as the map key
     * @return the service, or null when none is registered
     */
    private SchedulerTaskSvc getSchedulerTaskSvcByModuleType(String moduleType){
        return schedulerTaskSvcMap.get(moduleType);
    }
}
定时任务接口 SchedulerTaskSvc.java
package com.jeecms.cms.service.scheduler;
import java.util.List;
import com.jeecms.cms.entity.assist.CmsScheduler;
/**
* 定时任务接口
* @author javacoo
* @since 2011-11-04
*/
public interface SchedulerTaskSvc {

    /**
     * Runs the work behind the given scheduled task.
     *
     * @param cmsScheduler task to run
     * @return true/false
     */
    boolean start(CmsScheduler cmsScheduler);

    /**
     * Ends the work behind the given scheduled task.
     *
     * @param cmsScheduler task to end
     * @return true/false
     */
    boolean stop(CmsScheduler cmsScheduler);

    /**
     * Lists the objects the given scheduled task can be associated with.
     *
     * @param cmsScheduler task whose candidates are wanted
     * @return list of associable task beans
     */
    List<SchedulerTaskBean> associateTaskList(CmsScheduler cmsScheduler);
}
定时任务抽象实现类 AbstractSchedulerTaskSvc.java
package com.jeecms.cms.service.scheduler;
import java.util.List;
import com.jeecms.cms.entity.assist.CmsScheduler;
/**
* 定时任务抽象实现类
* @author javacoo
* @since 2011-11-08
*/
/**
 * Base class for scheduled-task services. Subclasses supply the actual work
 * in {@link #execute(CmsScheduler)} and may override the other methods.
 */
public abstract class AbstractSchedulerTaskSvc implements SchedulerTaskSvc{
    /**
     * Starts the task by delegating to {@link #execute(CmsScheduler)}.
     *
     * @param scheduler task to run
     * @return the result of {@code execute}
     */
    public boolean start(CmsScheduler scheduler){
        return execute(scheduler);
    }

    /**
     * Stops the task. This default does nothing and reports success.
     *
     * @param scheduler task to stop
     * @return always true
     */
    public boolean stop(CmsScheduler scheduler){
        return true;
    }

    /**
     * Lists the objects the task can be associated with. This default knows
     * of none and returns a new, empty, mutable list rather than null, so
     * callers can iterate the result without a null check.
     *
     * @param scheduler task whose candidates are wanted
     * @return an empty list
     */
    public List<SchedulerTaskBean> associateTaskList(CmsScheduler scheduler){
        return new java.util.ArrayList<SchedulerTaskBean>();
    }

    /**
     * Performs the work of the task.
     *
     * @param scheduler task to run
     * @return true/false as decided by the subclass
     */
    protected abstract boolean execute(CmsScheduler scheduler);
}
定时任务接口-采集器实现类-多线程版 SchedulerAcquisitionSvcImpl.java
package com.jeecms.cms.service.scheduler;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.commons.lang.StringUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.StatusLine;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.HttpResponseException;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.params.ConnRoutePNames;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import com.jeecms.cms.entity.assist.CmsAcquisition;
import com.jeecms.cms.entity.assist.CmsScheduler;
import com.jeecms.cms.entity.main.Content;
import com.jeecms.cms.manager.assist.CmsAcquisitionMng;
import com.jeecms.common.crawler.UrlQueue;
import com.jeecms.common.crawler.util.HtmlParserImpl;
import com.jeecms.common.crawler.util.ParseHtmlTool;
/**
* 计划任务接口-采集器实现类-多线程版
* @author javacoo
* @since 2011-11-02
* @version 1.0
*/
/**
 * Scheduled-task service that runs a content acquisition (crawl) with
 * several worker threads.
 *
 * Each execution starts one coordinating thread. That thread queues the
 * acquisition's plan URLs, runs one worker that collects the links found on
 * those pages, and then {@code THREAD_NUM} workers that fetch the collected
 * links.
 */
@Service
public class SchedulerAcquisitionSvcImpl extends AbstractSchedulerTaskSvc {
    private static final Logger log = LoggerFactory.getLogger(SchedulerAcquisitionSvcImpl.class);
    /** Number of content-fetching workers. */
    private static final int THREAD_NUM = 2;
    /** Pause between two fetches of one worker, in milliseconds. */
    private static final int SLEEP_TIME = 100;
    /** Map key of the link. */
    private static final String LINK_KEY = "linkKey";
    /** Map key of the title. */
    private static final String TITLE_KEY = "titleKey";
    // NOTE(review): the proxy below and the base URL for relative links are
    // environment-specific values carried over unchanged from the original
    // code; they probably belong in configuration.
    /** Proxy every request is routed through. */
    private static final String PROXY_HOST = "128.160.64.5";
    private static final int PROXY_PORT = 1235;
    /** Prefix put in front of links that are not absolute. */
    private static final String RELATIVE_LINK_BASE = "http://localhost/v7/";
    /** Acquisition manager. */
    private CmsAcquisitionMng cmsAcquisitionMng;

    @Autowired
    public void setCmsAcquisitionMng(CmsAcquisitionMng cmsAcquisitionMng) {
        this.cmsAcquisitionMng = cmsAcquisitionMng;
    }

    /**
     * Looks up the acquisition referenced by the task's associate id and
     * starts a coordinating thread for it.
     *
     * @param scheduler task being executed
     * @return false when no acquisition exists for the associate id,
     *         true once the coordinating thread has been started
     */
    @Override
    protected boolean execute(CmsScheduler scheduler) {
        CmsAcquisition acqu = cmsAcquisitionMng.findById(scheduler.getAssociateId());
        if (acqu == null) {
            return false;
        }
        System.out.println("===============开始执行采集任务");
        new Thread(new MainThreadProcesser(acqu)).start();
        return true;
    }

    /**
     * Lists the acquisitions of the task's site as associable task beans
     * (id and name copied from each acquisition).
     *
     * @param scheduler task carrying the site
     * @return one bean per acquisition of the site
     */
    public List<SchedulerTaskBean> associateTaskList(CmsScheduler scheduler){
        List<CmsAcquisition> list = cmsAcquisitionMng.getList(scheduler.getSite().getId());
        List<SchedulerTaskBean> resultList = new ArrayList<SchedulerTaskBean>();
        for(CmsAcquisition acquisition : list){
            SchedulerTaskBean schedulerTaskBean = new SchedulerTaskBean();
            schedulerTaskBean.setId(acquisition.getId());
            schedulerTaskBean.setName(acquisition.getName());
            resultList.add(schedulerTaskBean);
        }
        return resultList;
    }

    /**
     * Coordinates one acquisition run: sets up the shared HTTP client,
     * parser and queues, starts the workers and waits for all of them.
     */
    private class MainThreadProcesser implements Runnable {
        private final CmsAcquisition acqu;

        public MainThreadProcesser(CmsAcquisition acqu) {
            this.acqu = acqu;
        }

        public void run() {
            long tStart = System.currentTimeMillis();
            System.out.println("主线程:"+Thread.currentThread().getName() + "开始...");
            // Per-run state lives in locals; every run has its own thread and
            // nothing outlives the run.
            HttpClient httpClient = new DefaultHttpClient();
            ExecutorService exec = Executors.newCachedThreadPool();
            try {
                // One URL worker plus THREAD_NUM content workers count down.
                // A latch of only THREAD_NUM released await() while a worker
                // was still running, so the HTTP client could be shut down
                // underneath it.
                CountDownLatch latch = new CountDownLatch(THREAD_NUM + 1);
                // Opens once the URL worker is done. A latch cannot miss the
                // signal the way a bare wait()/notifyAll() pair can when the
                // notification arrives before the waiter starts waiting.
                CountDownLatch urlsCollected = new CountDownLatch(1);
                httpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY,new HttpHost(PROXY_HOST, PROXY_PORT));
                CharsetHandler handler = new CharsetHandler(acqu.getPageEncoding());
                ParseHtmlTool parseHtmlTool = new HtmlParserImpl(acqu);
                UrlQueue planUrlQueue = new UrlQueue();
                UrlQueue urlQueue = new UrlQueue();
                // Queue all plan URLs of this acquisition.
                getAllPlans(acqu,planUrlQueue);
                // One worker collects the links found on the plan pages.
                exec.execute(new FetchUrlThread(latch,urlsCollected,httpClient,planUrlQueue,urlQueue,parseHtmlTool,handler));
                // THREAD_NUM workers fetch the collected links.
                for(int i=0;i<THREAD_NUM;i++){
                    exec.execute(new FetchContentThread(acqu,latch,urlsCollected,httpClient,urlQueue,parseHtmlTool,handler));
                }
                latch.await();
            } catch (InterruptedException e) {
                // Stop the workers before the client is closed below and keep
                // the interrupt visible to whoever owns this thread.
                exec.shutdownNow();
                Thread.currentThread().interrupt();
                log.warn("Acquisition run interrupted", e);
            } finally{
                exec.shutdown();
                httpClient.getConnectionManager().shutdown();
                long tEnd = System.currentTimeMillis();
                System.out.println("主线程:"+Thread.currentThread().getName() + "结束...");
                System.out.println("主线程:"+Thread.currentThread().getName() + "总共用时:" + (tEnd - tStart) + "ms");
            }
        }
    }

    /**
     * Worker that takes plan URLs from the plan queue and adds the links
     * found on each page to the URL queue.
     */
    private class FetchUrlThread implements Runnable{
        private final CountDownLatch latch;
        private final CountDownLatch urlsCollected;
        private final UrlQueue urlQueue;
        private final UrlQueue planUrlQueue;
        private final HttpClient httpClient;
        private final ParseHtmlTool parseHtmlTool;
        private final CharsetHandler handler;

        public FetchUrlThread(CountDownLatch latch,CountDownLatch urlsCollected,HttpClient httpClient,UrlQueue planUrlQueue,UrlQueue urlQueue,ParseHtmlTool parseHtmlTool,CharsetHandler handler){
            this.latch = latch;
            this.urlsCollected = urlsCollected;
            this.urlQueue = urlQueue;
            this.planUrlQueue = planUrlQueue;
            this.httpClient = httpClient;
            this.parseHtmlTool = parseHtmlTool;
            this.handler = handler;
        }

        public void run() {
            System.out.println("======================采集URL子线程:"+Thread.currentThread().getName() + "开始...");
            try {
                Map<String,String> urlMap;
                while((urlMap = pollUrlAndTitleMap(planUrlQueue)) != null) {
                    getAllUrls(httpClient,parseHtmlTool,handler,urlQueue,urlMap);
                    Thread.sleep(SLEEP_TIME);
                }
            } catch (URISyntaxException e) {
                log.warn("Invalid plan URL", e);
            } catch (IOException e) {
                log.warn("Fetching a plan page failed", e);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                log.warn("URL collection interrupted", e);
            }finally {
                System.out.println("======================采集URL子线程:"+Thread.currentThread().getName() + "结束.");
                // Release the content workers, then report completion.
                urlsCollected.countDown();
                latch.countDown();
            }
        }
    }

    /**
     * Worker that waits until URL collection has finished and then fetches
     * the collected links one by one.
     */
    private class FetchContentThread implements Runnable {
        private final CmsAcquisition acqu;
        private final CountDownLatch latch;
        private final CountDownLatch urlsCollected;
        private final UrlQueue urlQueue;
        private final HttpClient httpClient;
        private final ParseHtmlTool parseHtmlTool;
        private final CharsetHandler handler;

        public FetchContentThread(CmsAcquisition acqu,CountDownLatch latch,CountDownLatch urlsCollected,HttpClient httpClient,UrlQueue urlQueue,ParseHtmlTool parseHtmlTool,CharsetHandler handler) {
            this.acqu = acqu;
            this.latch = latch;
            this.urlsCollected = urlsCollected;
            this.urlQueue = urlQueue;
            this.httpClient = httpClient;
            this.parseHtmlTool = parseHtmlTool;
            this.handler = handler;
        }

        public void run() {
            System.out.println("======================采集内容子线程:"+Thread.currentThread().getName() + "开始...");
            try {
                // Wait until the URL worker has finished.
                urlsCollected.await();
                Map<String,String> urlMap;
                while((urlMap = pollUrlAndTitleMap(urlQueue)) != null) {
                    saveContent(acqu,httpClient,parseHtmlTool,handler,urlMap);
                    Thread.sleep(SLEEP_TIME);
                }
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                log.warn("Content collection interrupted", e);
            } catch (Exception e) {
                log.warn("Content collection failed", e);
            } finally {
                System.out.println("======================采集内容子线程:"+Thread.currentThread().getName() + "结束.");
                log.info("Acquisition#{} complete", acqu.getId());
                latch.countDown();
            }
        }
    }

    /**
     * Adds a link/title map to the queue.
     *
     * @param map link/title map
     * @param urlQueue queue to add to
     */
    private synchronized void addUrlAndTitleMap(Map<String,String> map,UrlQueue urlQueue){
        System.out.println("====线程:"+Thread.currentThread().getName() + ",添加 urlQueue:"+urlQueue);
        urlQueue.addUnVisitedUrl(map);
    }

    /**
     * Takes the next link/title map from the queue.
     *
     * @param urlQueue queue to take from
     * @return the next link/title map
     */
    private synchronized Map<String,String> getUrlAndTitleMap(UrlQueue urlQueue){
        System.out.println("====线程:"+Thread.currentThread().getName() + ",取得 urlQueue:"+urlQueue);
        return urlQueue.unVisitedUrlDeQueue();
    }

    /**
     * Tells whether the queue is empty.
     *
     * @param urlQueue queue to inspect
     * @return true/false
     */
    private synchronized boolean urlAndTitleMapIsEmpty(UrlQueue urlQueue){
        System.out.println("====线程:"+Thread.currentThread().getName() + ",判断 urlQueue:"+urlQueue);
        return urlQueue.isEmpty();
    }

    /**
     * Checks for emptiness and dequeues under one lock, so two workers
     * cannot both pass the emptiness check and then race for the last
     * element.
     *
     * @param urlQueue queue to take from
     * @return the next link/title map, or null when the queue is empty
     */
    private synchronized Map<String,String> pollUrlAndTitleMap(UrlQueue urlQueue){
        if (urlAndTitleMapIsEmpty(urlQueue)) {
            return null;
        }
        return getUrlAndTitleMap(urlQueue);
    }

    /**
     * Queues every plan URL of the acquisition, last plan first, each with
     * the acquisition's name as title.
     *
     * @param acqu acquisition supplying the plans
     * @param urlQueue queue receiving the plan URLs
     */
    private void getAllPlans(CmsAcquisition acqu,UrlQueue urlQueue){
        String[] plans = acqu.getAllPlans();
        for (int i = plans.length - 1; i >= 0; i--) {
            // A fresh map per plan: sharing one instance made every queued
            // entry show the values of the plan written last.
            Map<String,String> planMap = new HashMap<String,String>();
            planMap.put(LINK_KEY, plans[i]);
            planMap.put(TITLE_KEY, acqu.getName());
            addUrlAndTitleMap(planMap,urlQueue);
        }
        System.out.println("=======当前线程:"+Thread.currentThread().getName() + "计划URL连接数:"+urlQueue.getUnVisitedUrlNum());
    }

    /**
     * Downloads the page behind the map's link and queues every link/title
     * pair the parser extracts from it.
     *
     * @param httpClient client used for the request
     * @param parseHtmlTool parser extracting the link/title pairs
     * @param handler response handler decoding the page
     * @param urlQueue queue receiving the extracted links
     * @param map link/title map of the page to download
     * @throws URISyntaxException when the link is not a valid URI
     * @throws ClientProtocolException on an HTTP protocol error
     * @throws IOException when the request fails
     */
    private void getAllUrls(HttpClient httpClient,ParseHtmlTool parseHtmlTool,CharsetHandler handler,UrlQueue urlQueue,Map<String,String> map) throws URISyntaxException, ClientProtocolException, IOException{
        HttpGet httpGet = new HttpGet(new URI(map.get(LINK_KEY).trim()));
        String html = httpClient.execute(httpGet, handler);
        for(Map<String,String> planMap : parseHtmlTool.getUrlAndTitleMap(html)){
            addUrlAndTitleMap(planMap,urlQueue);
        }
        System.out.println("=======当前线程:"+Thread.currentThread().getName() + "URL连接数:"+urlQueue.getUnVisitedUrlNum());
    }

    /**
     * Downloads the page behind the map's link and runs it through the
     * parser. Persisting the result is currently disabled, so the method
     * always returns null.
     *
     * @param acqu acquisition being processed
     * @param httpClient client used for the request
     * @param parseHtmlTool parser extracting the content
     * @param handler response handler decoding the page
     * @param map link/title map of the page to download
     * @return always null
     */
    private synchronized Content saveContent(CmsAcquisition acqu,HttpClient httpClient,ParseHtmlTool parseHtmlTool,CharsetHandler handler,Map<String,String> map) {
        try {
            String link = map.get(LINK_KEY).trim();
            // https links are absolute as well and must not get the
            // relative-link prefix.
            boolean absolute = link.contains("http://") || link.contains("https://");
            HttpGet httpGet = new HttpGet(new URI(absolute ? link : RELATIVE_LINK_BASE + link));
            String html = httpClient.execute(httpGet, handler);
            System.out.println("=============================子线程:"+Thread.currentThread().getName() + "执行");
            String txt = parseHtmlTool.getHtml(html);
            //return cmsAcquisitionMng.saveContent(map.get(TITLE_KEY), txt,acqu.getId());
            return null;
        } catch (Exception e) {
            log.warn("Fetching content failed", e);
            return null;
        }
    }

    /**
     * Response handler that rejects status codes of 300 and above and
     * decodes the body with the configured charset, or with the HttpClient
     * default when none is configured.
     */
    private static class CharsetHandler implements ResponseHandler<String> {
        private final String charset;

        public CharsetHandler(String charset) {
            this.charset = charset;
        }

        /**
         * @return the decoded body, or null when the response has no entity
         * @throws HttpResponseException when the status code is 300 or above
         */
        public String handleResponse(HttpResponse response)
                throws ClientProtocolException, IOException {
            StatusLine statusLine = response.getStatusLine();
            if (statusLine.getStatusCode() >= 300) {
                throw new HttpResponseException(statusLine.getStatusCode(),
                        statusLine.getReasonPhrase());
            }
            HttpEntity entity = response.getEntity();
            if (entity == null) {
                return null;
            }
            if (StringUtils.isBlank(charset)) {
                return EntityUtils.toString(entity);
            }
            return EntityUtils.toString(entity, charset);
        }
    }
}
定时服务关联任务bean SchedulerTaskBean.java
package com.jeecms.cms.service.scheduler;
/**
* 定时服务关联任务bean
* @author javacoo
* @since 2011-11-07
*/
public class SchedulerTaskBean {
    /** Primary key of the associated task. */
    private Integer id;
    /** Display name of the associated task. */
    private String name;

    /**
     * @return the primary key, or null when none has been set
     */
    public Integer getId() {
        return this.id;
    }

    /**
     * @return the name, or null when none has been set
     */
    public String getName() {
        return this.name;
    }

    /**
     * @param id primary key of the associated task
     */
    public void setId(Integer id) {
        this.id = id;
    }

    /**
     * @param name display name of the associated task
     */
    public void setName(String name) {
        this.name = name;
    }
}
计划任务Controller CmsSchedulerAct.java
package com.jeecms.cms.action.admin.assist;
import java.util.List;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Controller;
import org.springframework.ui.ModelMap;
import org.springframework.web.bind.annotation.RequestMapping;
import com.jeecms.cms.entity.assist.CmsAcquisition;
import com.jeecms.cms.entity.assist.CmsScheduler;
import com.jeecms.cms.entity.main.CmsSite;
import com.jeecms.cms.manager.assist.CmsSchedulerMng;
import com.jeecms.cms.manager.main.CmsLogMng;
import com.jeecms.cms.service.scheduler.SchedulerTaskManageSvc;
import com.jeecms.cms.service.scheduler.SchedulerTaskBean;
import com.jeecms.cms.web.CmsUtils;
import com.jeecms.cms.web.WebErrors;
/**
* 计划任务Controller
* @author javacoo
* @since 2011-11-7
*/
@Controller
public class CmsSchedulerAct {
private static final Logger log = LoggerFactory
.getLogger(CmsSchedulerAct.class);
/**日志服务*/
@Autowired
private CmsLogMng cmsLogMng;
/**计划管理服务*/
@Autowired
private CmsSchedulerMng manager;
/**计划任务管理服务*/
@Autowired
private SchedulerTaskManageSvc schedulerTaskManageSvc;
/**
 * Shows every scheduled task.
 *
 * @param request current request
 * @param model view model; receives the tasks under "list"
 * @return the list view name
 */
@RequestMapping("/scheduler/v_list.do")
public String list(HttpServletRequest request, ModelMap model) {
    model.addAttribute("list", manager.getList());
    return "scheduler/list";
}
/**
 * Shows the scheduled tasks of the current site for one module type.
 *
 * @param moduleType module type to filter by
 * @param request current request; supplies the site
 * @param model view model; receives "list" and "moduleType"
 * @return the list view name
 */
@RequestMapping("/scheduler/v_listBy.do")
public String listBy(String moduleType,HttpServletRequest request, ModelMap model) {
    CmsSite currentSite = CmsUtils.getSite(request);
    // Example bean carrying the two filter values.
    CmsScheduler criteria = new CmsScheduler();
    criteria.setModuleType(moduleType);
    criteria.setSite(currentSite);
    List<CmsScheduler> tasks = manager.getListBy(criteria);
    model.addAttribute("list", tasks);
    model.addAttribute("moduleType", moduleType);
    return "scheduler/list";
}
/**
 * Shows the form for creating a scheduled task of the given module type.
 *
 * @param moduleType module type of the new task
 * @param request current request; supplies the site
 * @param model view model; receives "schedulerTaskList" and "moduleType"
 * @return the add view name
 */
@RequestMapping("/scheduler/v_add.do")
public String add(String moduleType,HttpServletRequest request, ModelMap model) {
    CmsSite currentSite = CmsUtils.getSite(request);
    // Transient bean that only tells the service which module and site to
    // list candidates for.
    CmsScheduler probe = new CmsScheduler();
    probe.setModuleType(moduleType);
    probe.setSite(currentSite);
    List<SchedulerTaskBean> candidates = schedulerTaskManageSvc.associateTaskList(probe);
    model.addAttribute("schedulerTaskList", candidates);
    model.addAttribute("moduleType", moduleType);
    return "scheduler/add";
}
/**
 * Shows the form for editing an existing scheduled task.
 *
 * @param id id of the task to edit
 * @param request current request; supplies the site
 * @param model view model; receives "schedulerTaskList" and "cmsScheduler"
 * @return the edit view name, or the error page when validation fails
 */
@RequestMapping("/scheduler/v_edit.do")
public String edit(Integer id, HttpServletRequest request, ModelMap model) {
    WebErrors validation = validateEdit(id, request);
    if (validation.hasErrors()) {
        return validation.showErrorPage(model);
    }
    CmsSite currentSite = CmsUtils.getSite(request);
    CmsScheduler task = manager.findById(id);
    task.setSite(currentSite);
    List<SchedulerTaskBean> candidates = schedulerTaskManageSvc.associateTaskList(task);
    model.addAttribute("schedulerTaskList", candidates);
    model.addAttribute("cmsScheduler", task);
    return "scheduler/edit";
}
/**
 * Stores a new scheduled task for the current site and redirects to the
 * filtered list.
 *
 * @param bean task bound from the request
 * @param request current request; supplies the site
 * @param model view model; receives "moduleType" for the redirect
 * @return redirect to the filtered list
 */
@RequestMapping("/scheduler/o_save.do")
public String save(CmsScheduler bean,HttpServletRequest request, ModelMap model) {
    bean.setSite(CmsUtils.getSite(request));
    CmsScheduler saved = manager.save(bean);
    model.addAttribute("moduleType", saved.getModuleType());
    log.info("save CmsScheduler id={}", saved.getId());
    String detail = "id=" + saved.getId() + ";name=" + saved.getName();
    cmsLogMng.operating(request, "cmsAcquisition.log.save", detail);
    return "redirect:v_listBy.do";
}
/**
 * Updates an existing scheduled task and shows the filtered list again.
 *
 * @param bean task bound from the request
 * @param request current request
 * @param model view model
 * @return the list view name, or the error page when validation fails
 */
@RequestMapping("/scheduler/o_update.do")
public String update(CmsScheduler bean, HttpServletRequest request, ModelMap model) {
    WebErrors errors = validateUpdate(bean.getId(), request);
    if (errors.hasErrors()) {
        return errors.showErrorPage(model);
    }
    bean = manager.update(bean);
    // The entity handled here is CmsScheduler; the message used to name
    // CmsAcquisition.
    log.info("update CmsScheduler id={}.", bean.getId());
    // NOTE(review): the operation-log key still uses the cmsAcquisition
    // prefix; confirm whether a cmsScheduler.log.update key exists.
    cmsLogMng.operating(request, "cmsAcquisition.log.update", "id="
            + bean.getId() + ";name=" + bean.getName());
    return listBy(bean.getModuleType(),request, model);
}
/**
 * Deletes the schedulers with the given ids and renders the list for the module type.
 *
 * @param moduleType module type whose list is shown afterwards
 * @param ids        ids of the schedulers to delete
 * @param request    current request, used for validation and operation logging
 * @param model      view model
 * @return the error page when validation fails, otherwise the result of {@code listBy}
 */
@RequestMapping("/scheduler/o_delete.do")
public String delete(String moduleType,Integer[] ids, HttpServletRequest request,
        ModelMap model) {
    WebErrors errors = validateDelete(ids, request);
    if (errors.hasErrors()) {
        return errors.showErrorPage(model);
    }
    CmsScheduler[] beans = manager.deleteByIds(ids);
    for (CmsScheduler bean : beans) {
        // Name the entity this controller manages (the message previously said "CmsAcquisition").
        log.info("delete CmsScheduler id={}", bean.getId());
        cmsLogMng.operating(request, "cmsScheduler.log.delete", "id="
                + bean.getId() + ";name=" + bean.getName());
    }
    return listBy(moduleType,request, model);
}
/**
 * Starts the scheduled task bound to a scheduler and marks the scheduler as started.
 *
 * @param id       scheduler id
 * @param request  current request, used for validation
 * @param response current response (unused)
 * @param model    view model; receives {@code moduleType} for the redirect
 * @return the error page when the id is missing or unknown, otherwise a redirect to
 *         {@code v_listBy.do}
 */
@RequestMapping("/scheduler/o_start.do")
public String start(Integer id, HttpServletRequest request,
        HttpServletResponse response, ModelMap model) {
    // Reject a null or unknown id up front; otherwise a null scheduler would be
    // handed to the task service and dereferenced below.
    WebErrors errors = validateEdit(id, request);
    if (errors.hasErrors()) {
        return errors.showErrorPage(model);
    }
    CmsScheduler scheduler = manager.findById(id);
    schedulerTaskManageSvc.start(scheduler);
    manager.start(id);
    model.addAttribute("moduleType", scheduler.getModuleType());
    // Name the entity this controller manages (the message previously said "CmsAcquisition").
    log.info("start CmsScheduler id={}", id);
    return "redirect:v_listBy.do";
}
/**
 * Marks a scheduler as ended and stops the scheduled task bound to it.
 *
 * @param id       scheduler id
 * @param request  current request, used for validation
 * @param response current response (unused)
 * @param model    view model; receives {@code moduleType} for the redirect
 * @return the error page when the id is missing or unknown, otherwise a redirect to
 *         {@code v_listBy.do}
 */
@RequestMapping("/scheduler/o_end.do")
public String end(Integer id, HttpServletRequest request,
        HttpServletResponse response, ModelMap model) {
    // Reject a null or unknown id before touching persistent state; the scheduler
    // looked up below is dereferenced without a null check.
    WebErrors errors = validateEdit(id, request);
    if (errors.hasErrors()) {
        return errors.showErrorPage(model);
    }
    manager.end(id);
    CmsScheduler scheduler = manager.findById(id);
    schedulerTaskManageSvc.stop(scheduler);
    model.addAttribute("moduleType", scheduler.getModuleType());
    log.info("end CmsScheduler id={}", id);
    return "redirect:v_listBy.do";
}
/**
 * Validates that the scheduler to edit exists.
 *
 * @param id      scheduler id
 * @param request current request
 * @return the collected errors (empty when the id is valid)
 */
private WebErrors validateEdit(Integer id, HttpServletRequest request) {
    WebErrors errors = WebErrors.create(request);
    Integer siteId = CmsUtils.getSite(request).getId();
    // Any problem is recorded in errors; the boolean result is not needed here.
    vldExist(id, siteId, errors);
    return errors;
}
/**
 * Validates that the scheduler to update exists.
 *
 * @param id      scheduler id
 * @param request current request
 * @return the collected errors (empty when the id is valid)
 */
private WebErrors validateUpdate(Integer id, HttpServletRequest request) {
    WebErrors errors = WebErrors.create(request);
    Integer siteId = CmsUtils.getSite(request).getId();
    // Any problem is recorded in errors; the boolean result is not needed here.
    vldExist(id, siteId, errors);
    return errors;
}
/**
 * Validates a batch delete: the id array must be non-empty and every id must exist.
 *
 * @param ids     ids of the schedulers to delete
 * @param request current request
 * @return the collected errors (empty when all ids are valid)
 */
private WebErrors validateDelete(Integer[] ids, HttpServletRequest request) {
    WebErrors errors = WebErrors.create(request);
    CmsSite site = CmsUtils.getSite(request);
    if (!errors.ifEmpty(ids, "ids")) {
        // Check every id so that all problems are reported together.
        for (int i = 0; i < ids.length; i++) {
            vldExist(ids[i], site.getId(), errors);
        }
    }
    return errors;
}
/**
 * Checks that {@code id} is non-null and refers to an existing scheduler.
 *
 * @param id     scheduler id to check
 * @param siteId id of the current site
 * @param errors collector that receives any validation error
 * @return {@code true} when a validation error was recorded, {@code false} otherwise
 */
private boolean vldExist(Integer id, Integer siteId, WebErrors errors) {
    // NOTE(review): siteId is accepted but never compared with the entity's site - confirm
    // whether cross-site access should be rejected here.
    if (errors.ifNull(id, "id")) {
        return true;
    }
    CmsScheduler entity = manager.findById(id);
    // Report the missing entity as a CmsScheduler (was CmsAcquisition, a copy-paste leftover).
    if (errors.ifNotExist(entity, CmsScheduler.class, id)) {
        return true;
    }
    return false;
}
}
持久对象基类 BaseCmsScheduler.java
package com.jeecms.cms.entity.assist.base;
import java.io.Serializable;
import java.util.Date;
/**
 * Base persistent class for a scheduler: holds the mapped properties and their accessors.
 * Subclasses may override {@link #initialize()} to set defaults after construction.
 */
public abstract class BaseCmsScheduler implements Serializable {
    // Entity and property names; declared final so they cannot be reassigned at runtime.
    public static final String REF = "CmsScheduler";
    public static final String PROP_ID = "id";
    public static final String PROP_SITE = "site";
    public static final String PROP_ASSOCIATE_ID = "associateId";
    public static final String PROP_MODULE_TYPE = "moduleType";
    public static final String PROP_NAME = "name";
    public static final String PROP_START_TIME = "startTime";
    public static final String PROP_END_TIME = "endTime";
    public static final String PROP_STATUS = "status";
    public static final String PROP_EXPRESSION = "expression";

    private int hashCode = Integer.MIN_VALUE;

    // primary key
    private Integer id;

    // fields
    private String name;
    private Date startTime;
    private Date endTime;
    private Integer status;
    private Integer associateId;
    private String moduleType;
    private String expression;
    private com.jeecms.cms.entity.main.CmsSite site;

    /** Creates an empty instance and runs {@link #initialize()}. */
    public BaseCmsScheduler () {
        initialize();
    }

    /**
     * Constructor for primary key.
     *
     * @param id scheduler id
     */
    public BaseCmsScheduler (Integer id) {
        this.setId(id);
        initialize();
    }

    /**
     * Creates a fully populated instance.
     *
     * @param id          scheduler id
     * @param name        scheduler name
     * @param startTime   start time
     * @param endTime     end time
     * @param status      status value
     * @param associateId id of the associated task
     * @param moduleType  module type
     * @param expression  schedule expression
     * @param site        owning site
     */
    public BaseCmsScheduler(Integer id,String name, Date startTime, Date endTime,
            Integer status, Integer associateId, String moduleType, String expression,com.jeecms.cms.entity.main.CmsSite site) {
        super();
        this.id = id;
        this.name = name;
        this.startTime = startTime;
        this.endTime = endTime;
        this.status = status;
        this.associateId = associateId;
        this.moduleType = moduleType;
        this.expression = expression;
        this.site = site;
        // Run the subclass hook here too, consistent with the other constructors.
        initialize();
    }

    /** Hook invoked at the end of every constructor; does nothing by default. */
    protected void initialize () {}

    /** @return the stored hash code value ({@code Integer.MIN_VALUE} until set) */
    public int getHashCode() {
        return hashCode;
    }

    public void setHashCode(int hashCode) {
        this.hashCode = hashCode;
    }

    public Integer getId() {
        return id;
    }

    public void setId(Integer id) {
        this.id = id;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public Date getStartTime() {
        return startTime;
    }

    public void setStartTime(Date startTime) {
        this.startTime = startTime;
    }

    public Date getEndTime() {
        return endTime;
    }

    public void setEndTime(Date endTime) {
        this.endTime = endTime;
    }

    public Integer getStatus() {
        return status;
    }

    public void setStatus(Integer status) {
        this.status = status;
    }

    public Integer getAssociateId() {
        return associateId;
    }

    public void setAssociateId(Integer associateId) {
        this.associateId = associateId;
    }

    public String getModuleType() {
        return moduleType;
    }

    public void setModuleType(String moduleType) {
        this.moduleType = moduleType;
    }

    public String getExpression() {
        return expression;
    }

    public void setExpression(String expression) {
        this.expression = expression;
    }

    public com.jeecms.cms.entity.main.CmsSite getSite() {
        return site;
    }

    public void setSite(com.jeecms.cms.entity.main.CmsSite site) {
        this.site = site;
    }
}
持久对象 CmsScheduler.java
package com.jeecms.cms.entity.assist;
import java.util.Date;
import com.jeecms.cms.entity.assist.base.BaseCmsScheduler;
/**
 * Scheduler persistent object.
 *
 * @author javacoo
 * @since 2011-11-07
 */
public class CmsScheduler extends BaseCmsScheduler {
    private static final long serialVersionUID = 1L;
    /**
     * Status value: stopped.
     */
    public static final int STOP = 0;
    /**
     * Status value: started.
     */
    public static final int START = 1;

    /**
     * Tells whether this scheduler is stopped.
     * A missing status counts as stopped, matching the default that {@link #init()} applies.
     *
     * @return {@code true} when the status is unset or equals {@link #STOP}
     */
    public boolean isStop() {
        // Keep the boxed value: unboxing a null status straight to int would throw.
        Integer status = getStatus();
        return status == null || status.intValue() == STOP;
    }

    /**
     * Applies defaults: an unset status becomes {@link #STOP}.
     */
    public void init() {
        if (getStatus() == null) {
            setStatus(STOP);
        }
    }

    public CmsScheduler(){
        super();
    }

    /**
     * Constructor for primary key.
     *
     * @param id scheduler id
     */
    public CmsScheduler(java.lang.Integer id){
        super(id);
    }

    /**
     * Creates a fully populated instance; every argument is passed to the base class constructor.
     */
    public CmsScheduler(Integer id,String name, Date startTime, Date endTime,
            Integer status, Integer associateId, String moduleType, String expression,com.jeecms.cms.entity.main.CmsSite site) {
        super(id,name,startTime,endTime,status,associateId,moduleType,expression,site);
    }
}
HBM文件 CmsScheduler.hbm.xml
<?xml version="1.0"?>
<!DOCTYPE hibernate-mapping PUBLIC "-//Hibernate/Hibernate Mapping DTD//EN" "http://hibernate.sourceforge.net/hibernate-mapping-3.0.dtd">
<hibernate-mapping package="com.jeecms.cms.entity.assist">
<!-- Maps the CmsScheduler entity to the jc_scheduler table. -->
<class name="CmsScheduler" table="jc_scheduler">
<meta attribute="sync-DAO">false</meta>
<!-- Primary key: scheduler_id, generated by the database (identity). -->
<id name="id" type="integer" column="scheduler_id"><generator class="identity"/></id>
<!-- Required columns (not-null): associate_id, status, expression and the site reference. -->
<property name="associateId" column="associate_id" type="integer" not-null="true" length="11"/>
<property name="moduleType" column="module_type" type="string" not-null="false" length="100"/>
<property name="name" column="name" type="string" not-null="false" length="100"/>
<property name="startTime" column="start_time" type="timestamp" not-null="false" length="19"/>
<property name="endTime" column="end_time" type="timestamp" not-null="false" length="19"/>
<property name="status" column="status" type="integer" not-null="true" length="1"/>
<property name="expression" column="expression" type="string" not-null="true" length="50"/>
<!-- Owning site, referenced through the site_id column. -->
<many-to-one name="site" column="site_id" class="com.jeecms.cms.entity.main.CmsSite" not-null="true"></many-to-one>
</class>
</hibernate-mapping>
==============================定时任务模块相关互助类====================================
计划框架
计划框架-任务调度 Scheduler.java
package com.jeecms.common.scheduling.core;
import java.util.Date;
import java.util.Timer;
import java.util.TimerTask;
/**
 * Scheduling framework - task scheduler.
 * <li>
 * Every Scheduler owns one {@link Timer}. It chains single-shot timer executions so that a
 * {@link SchedulerTask} runs at each time produced by a {@link ScheduleIterator}.
 * </li>
 * @author javacoo
 * @since 2011-11-02
 */
public class Scheduler {
    /** Underlying timer used for every single-shot execution. */
    private final Timer timer = new Timer();

    /**
     * Timer task that runs the wrapped scheduler task once and then plans its next execution.
     * @author javacoo
     * @since 2011-11-02
     */
    class SchedulerTimerTask extends TimerTask {
        private final SchedulerTask schedulerTask;
        private final ScheduleIterator iterator;

        public SchedulerTimerTask(SchedulerTask schedulerTask,
                ScheduleIterator iterator) {
            this.schedulerTask = schedulerTask;
            this.iterator = iterator;
        }

        public void run() {
            // Run first, then ask the iterator for the following execution time.
            schedulerTask.run();
            reschedule(schedulerTask, iterator);
        }
    }

    public Scheduler() {
    }

    /**
     * Cancels the underlying timer.
     */
    public void cancel() {
        timer.cancel();
    }

    /**
     * Entry point for scheduling a task.
     * <li>
     * Asks the iterator for the first execution time and schedules a single-shot
     * {@link SchedulerTimerTask} for it. A {@code null} time cancels the task instead.
     * </li>
     * @param schedulerTask task to run; must never have been scheduled before
     * @param iterator source of execution times
     * @throws IllegalStateException when the task is not in its initial state
     */
    public void schedule(SchedulerTask schedulerTask, ScheduleIterator iterator) {
        final Date firstTime = iterator.next();
        if (firstTime == null) {
            schedulerTask.cancel();
            return;
        }
        synchronized (schedulerTask.lock) {
            if (schedulerTask.state != SchedulerTask.VIRGIN) {
                throw new IllegalStateException("任务已经执行/取消");
            }
            schedulerTask.state = SchedulerTask.SCHEDULED;
            final SchedulerTimerTask oneShot = new SchedulerTimerTask(schedulerTask, iterator);
            schedulerTask.timerTask = oneShot;
            timer.schedule(oneShot, firstTime);
        }
    }

    /**
     * Plans the next execution of a task that has just run.
     * A {@code null} time cancels the task; an already cancelled task is not planned again.
     * @param schedulerTask task that has just run
     * @param iterator source of execution times
     */
    private void reschedule(SchedulerTask schedulerTask,
            ScheduleIterator iterator) {
        final Date nextTime = iterator.next();
        if (nextTime == null) {
            schedulerTask.cancel();
            return;
        }
        synchronized (schedulerTask.lock) {
            if (schedulerTask.state == SchedulerTask.CANCELLED) {
                return;
            }
            final SchedulerTimerTask oneShot = new SchedulerTimerTask(schedulerTask, iterator);
            schedulerTask.timerTask = oneShot;
            timer.schedule(oneShot, nextTime);
        }
    }
}
计划框架-时间生成器接口 ScheduleIterator.java
package com.jeecms.common.scheduling.core;
import java.util.Date;
/**
 * Scheduling framework - execution time generator.
 * <li>Describes the planned execution times of a SchedulerTask as a series of
 * java.util.Date objects. next() iterates over those dates in chronological order;
 * a null return value cancels the task (it will never run again).</li>
 * @author javacoo
 * @since 2011-11-02
 */
public interface ScheduleIterator {
/**
 * Returns the next planned execution time.
 * @return the next planned execution time
 */
Date next();
}
计划任务抽象类 SchedulerTask.java
package com.jeecms.common.scheduling.core;
import java.util.TimerTask;
/**
 * Abstract scheduled task.
 * <li>
 * A task starts in the VIRGIN state (never scheduled), becomes SCHEDULED once it has been
 * planned, and ends up CANCELLED after {@link #cancel()}. Code that may change the state
 * must hold the task's {@link #lock}.
 * </li>
 * @author javacoo
 * @since 2011-11-02
 */
public abstract class SchedulerTask implements Runnable {
    /** Initial state: never scheduled. */
    static final int VIRGIN = 0;
    /** The task has been scheduled. */
    static final int SCHEDULED = 1;
    /** The task has been cancelled. */
    static final int CANCELLED = 2;

    /** Lock guarding {@link #state} and {@link #timerTask}. */
    final Object lock = new Object();
    /** Current life-cycle state. */
    int state = VIRGIN;
    /** Timer task of the pending execution, if any. */
    TimerTask timerTask;

    protected SchedulerTask() {
    }

    /** The work to perform; implemented by subclasses. */
    public abstract void run();

    /**
     * Cancels this task.
     * <li>
     * The pending timer task (if any) is cancelled and the state becomes CANCELLED.
     * </li>
     * @return {@code true} when the task was in the SCHEDULED state before this call
     */
    public boolean cancel() {
        synchronized (lock) {
            final boolean wasScheduled = state == SCHEDULED;
            if (timerTask != null) {
                timerTask.cancel();
            }
            state = CANCELLED;
            return wasScheduled;
        }
    }

    /**
     * @return the scheduled execution time reported by the pending timer task,
     *         or 0 when there is none
     */
    public long scheduledExecutionTime() {
        synchronized (lock) {
            if (timerTask == null) {
                return 0;
            }
            return timerTask.scheduledExecutionTime();
        }
    }
}
计划框架-时间生成器接口实现类 SimpleScheduleIterator.java
package com.jeecms.common.scheduling.impl;
import java.util.Calendar;
import java.util.Date;
import java.util.GregorianCalendar;
import com.jeecms.common.scheduling.core.ScheduleIterator;
/**
 * Scheduling framework - execution time generator implementation.
 * <li>Returns the next execution time of a monthly / weekly / daily / hourly / per-minute /
 * per-second plan.</li>
 * <li>Convention: the expression is comma separated, {@code *} means "no value".</li>
 * <li>Positions:
 * <br>1: week of the month</br>
 * <br>2: day of the week</br>
 * <br>3: day of the month</br>
 * <br>4: hour (24-hour clock)</br>
 * <br>5: minute</br>
 * <br>6: second</br>
 * </li>
 * <li>Examples:
 * <br> 1,6,4,15,20,30 - start today at 15:20:30, then run once a month</br>
 * <br> *,6,4,15,20,30 - start today at 15:20:30, then run once a week (every 7 days)</br>
 * <br> *,*,4,15,20,30 - start today at 15:20:30, then run once a day</br>
 * <br> *,*,*,15,20,30 - start today at 15:20:30, then run once an hour</br>
 * <br> *,*,*,*,20,30 - start at minute 20, second 30 of this hour, then run once a minute</br>
 * <br> *,*,*,*,*,30 - start at second 30 of the current minute, then run once a second</br>
 * </li>
 * The repeat period is decided by the first non-null parameter in the order above.
 * @author javacoo
 * @since 2011-11-03
 */
public class SimpleScheduleIterator implements ScheduleIterator {
    /** Length of the weekly period in days. */
    private static final int DAYS_PER_WEEK = 7;

    private final ScheduleParamBean scheduleParamBean;
    /** Running cursor; each call to {@link #next()} advances it by one period. */
    private final Calendar calendar = Calendar.getInstance();
    /** Snapshot of the reference date passed to the constructor. */
    private final Calendar orginCalendar = Calendar.getInstance();

    /**
     * Creates an iterator relative to the current time.
     * @param scheduleParamBean parsed schedule parameters
     */
    public SimpleScheduleIterator(final ScheduleParamBean scheduleParamBean) {
        this(scheduleParamBean, new Date());
    }

    /**
     * Creates an iterator relative to {@code date}.
     * @param scheduleParamBean parsed schedule parameters
     * @param date reference date the plan starts from
     */
    public SimpleScheduleIterator(final ScheduleParamBean scheduleParamBean, Date date) {
        this.scheduleParamBean = scheduleParamBean;
        orginCalendar.setTime(date);
        calendar.setTime(date);
        if(null != scheduleParamBean.getWeekOfMonth()){
            calendar.set(Calendar.WEEK_OF_MONTH, scheduleParamBean.getWeekOfMonth());
        }
        // When both day-of-week and day-of-month are given, day-of-month is ignored.
        if(null != scheduleParamBean.getDayOfWeek()){
            calendar.set(Calendar.DAY_OF_WEEK, scheduleParamBean.getDayOfWeek());
        }else if(null != scheduleParamBean.getDayOfMonth()){
            calendar.set(Calendar.DAY_OF_MONTH, scheduleParamBean.getDayOfMonth());
        }
        if(null != scheduleParamBean.getHourOfDay()){
            calendar.set(Calendar.HOUR_OF_DAY, scheduleParamBean.getHourOfDay());
        }
        if(null != scheduleParamBean.getMinute()){
            calendar.set(Calendar.MINUTE, scheduleParamBean.getMinute());
        }
        if(null != scheduleParamBean.getSecond()){
            calendar.set(Calendar.SECOND, scheduleParamBean.getSecond());
        }
        calendar.set(Calendar.MILLISECOND, 0);
        if (!calendar.getTime().before(date)) {
            // The configured time is not in the past: step back one period so that the
            // first next() call lands exactly on it.
            System.out.println(calendar.getTime() +"大于当前时间:"+date);
            shiftByPeriod(-1);
        }else{
            // The configured time is already past. Without a correction the task would fire
            // several times in a row, so the difference to the reference date is added for
            // the day / hour / minute / second based plans.
            // NOTE(review): monthly and weekly plans get no correction here - confirm intended.
            System.out.println(calendar.getTime() +"小于当前时间:"+date);
            if(null != scheduleParamBean.getDayOfMonth()){
                calendar.add(Calendar.DAY_OF_MONTH, orginCalendar.get(Calendar.DAY_OF_MONTH) - scheduleParamBean.getDayOfMonth());
            }else if(null != scheduleParamBean.getHourOfDay()){
                calendar.add(Calendar.HOUR_OF_DAY, orginCalendar.get(Calendar.HOUR_OF_DAY) - scheduleParamBean.getHourOfDay());
            }else if(null != scheduleParamBean.getMinute()){
                calendar.add(Calendar.MINUTE, orginCalendar.get(Calendar.MINUTE) - scheduleParamBean.getMinute());
            }else if(null != scheduleParamBean.getSecond()){
                calendar.add(Calendar.SECOND, orginCalendar.get(Calendar.SECOND) - scheduleParamBean.getSecond());
            }
        }
    }

    /**
     * Advances the cursor by one period and returns the resulting time.
     * @return the next execution time (never {@code null})
     */
    public Date next() {
        shiftByPeriod(1);
        System.out.println("下次执行时间:"+calendar.getTime());
        return calendar.getTime();
    }

    /**
     * Moves the cursor by {@code periods} repeat periods (negative values move backwards).
     * A weekly period is a full 7 days so the weekday stays fixed between runs; it was
     * previously 6 days, which shifted the weekday back by one on every run.
     * NOTE(review): the monthly period only adds calendar months; week-of-month and
     * day-of-week are not re-applied afterwards - confirm this matches the intended plan.
     * @param periods number of periods to move
     */
    private void shiftByPeriod(int periods) {
        if(null != scheduleParamBean.getWeekOfMonth()){
            calendar.add(Calendar.MONTH, periods);
        }else if(null != scheduleParamBean.getDayOfWeek()){
            calendar.add(Calendar.DAY_OF_YEAR, periods * DAYS_PER_WEEK);
        }else if(null != scheduleParamBean.getDayOfMonth()){
            calendar.add(Calendar.DAY_OF_MONTH, periods);
        }else if(null != scheduleParamBean.getHourOfDay()){
            calendar.add(Calendar.HOUR_OF_DAY, periods);
        }else if(null != scheduleParamBean.getMinute()){
            calendar.add(Calendar.MINUTE, periods);
        }else if(null != scheduleParamBean.getSecond()){
            calendar.add(Calendar.SECOND, periods);
        }
    }
}
时间计划参数bean ScheduleParamBean.java
package com.jeecms.common.scheduling.impl;
/**
 * Holder for the parsed parts of a schedule expression. Every part is optional
 * ({@code null} when not set).
 * @author javacoo
 * @since 2011-11-04
 */
public class ScheduleParamBean {
    /** Week of the month. */
    private Integer weekOfMonth;
    /** Day of the week. */
    private Integer dayOfWeek;
    /** Day of the month. */
    private Integer dayOfMonth;
    /** Hour (24-hour clock). */
    private Integer hourOfDay;
    /** Minute. */
    private Integer minute;
    /** Second. */
    private Integer second;

    /** Creates a bean with every part unset. */
    public ScheduleParamBean(){
    }

    /**
     * Creates a bean with all six parts given; any of them may be {@code null}.
     */
    public ScheduleParamBean(Integer weekOfMonth, Integer dayOfWeek,
            Integer dayOfMonth, Integer hourOfDay, Integer minute,
            Integer second) {
        this.weekOfMonth = weekOfMonth;
        this.dayOfWeek = dayOfWeek;
        this.dayOfMonth = dayOfMonth;
        this.hourOfDay = hourOfDay;
        this.minute = minute;
        this.second = second;
    }

    public Integer getWeekOfMonth() {
        return this.weekOfMonth;
    }

    public void setWeekOfMonth(Integer weekOfMonth) {
        this.weekOfMonth = weekOfMonth;
    }

    public Integer getDayOfWeek() {
        return this.dayOfWeek;
    }

    public void setDayOfWeek(Integer dayOfWeek) {
        this.dayOfWeek = dayOfWeek;
    }

    public Integer getDayOfMonth() {
        return this.dayOfMonth;
    }

    public void setDayOfMonth(Integer dayOfMonth) {
        this.dayOfMonth = dayOfMonth;
    }

    public Integer getHourOfDay() {
        return this.hourOfDay;
    }

    public void setHourOfDay(Integer hourOfDay) {
        this.hourOfDay = hourOfDay;
    }

    public Integer getMinute() {
        return this.minute;
    }

    public void setMinute(Integer minute) {
        this.minute = minute;
    }

    public Integer getSecond() {
        return this.second;
    }

    public void setSecond(Integer second) {
        this.second = second;
    }

    /** Lists all six parts in a fixed order, printing unset parts as {@code null}. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("ScheduleParamBean [dayOfMonth=");
        text.append(dayOfMonth);
        text.append(", dayOfWeek=").append(dayOfWeek);
        text.append(", hourOfDay=").append(hourOfDay);
        text.append(", minute=").append(minute);
        text.append(", second=").append(second);
        text.append(", weekOfMonth=").append(weekOfMonth);
        return text.append("]").toString();
    }
}
采集相关
HTML解析工具类接口 ParseHtmlTool.java
package com.jeecms.common.crawler.util;
import java.util.List;
import java.util.Map;
/**
 * HTML parsing tool interface.
 * @author javacoo
 * @since 2011-10-31
 */
public interface ParseHtmlTool {
/**
 * Gets the list of links.
 * @param orginHtml original HTML
 * @return list of links
 */
List<String> getUrlList( String orginHtml);
/**
 * Gets the list of titles.
 * @param orginHtml original HTML
 * @return list of titles
 */
List<String> getTitleList(String orginHtml);
/**
 * Gets the HTML content of the configured region.
 * @param orginHtml original HTML
 * @return HTML content of the configured region
 */
String getHtml(String orginHtml);
/**
 * Gets the list of link/title maps.
 * @param orginHtml original HTML
 * @return list of maps, each holding one link and its title
 */
List<Map<String,String>> getUrlAndTitleMap(String orginHtml);
}
HTML解析工具,HtmlParser实现类 HtmlParserImpl.java
package com.jeecms.common.crawler.util;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.nodes.RemarkNode;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import com.jeecms.cms.entity.assist.CmsAcquisition;
import com.jeecms.common.crawler.ParamBean;
/**
* HTML解析工具,HtmlParser实现类
* @author javacoo
* @since 2011-10-31
*/
public class HtmlParserImpl implements ParseHtmlTool{
/** Map key under which a link (href value) is stored. */
private static final String LINK_KEY = "linkKey";
/** Map key under which a link title is stored. */
private static final String TITLE_KEY = "titleKey";
/** Map key under which plain tag names (as opposed to attribute=value pairs) are stored. */
private static final String SINGLE_TAG = "singleTag";
/**
 * Regular expression for anchors: group 1 is the href value, group 2 the anchor text.
 * NOTE(review): the {@code .*} parts are greedy, so when several anchors share one line
 * only the last one on that line appears to be captured - confirm whether that is acceptable.
 */
private static final String LINK_REGX = "<a.*href=\"(.*?)\".*>(.*?)</a>";
/** Compiled anchor pattern; compiled once and shared by all instances. */
private static final Pattern pt = Pattern.compile(LINK_REGX);
/** Parsed acquisition parameters. */
private ParamBean paramBean;
/**
 * Creates a parser configured from the given acquisition settings.
 * @param acqu acquisition settings whose link-set/content start and end rules are parsed
 */
public HtmlParserImpl(CmsAcquisition acqu){
parseRequestParam(acqu);
}
/**
 * Gets the titles of all anchors found in the configured link-set region.
 * @param orginHtml original HTML
 * @return list of titles, or {@code null} when the filtered region is empty
 */
public List<String> getTitleList(String orginHtml) {
    String region = getHtmlByFilter(paramBean.getLinksetStartMap(),
            paramBean.getLinksetEndMap(), orginHtml);
    if (StringUtils.isEmpty(region)) {
        return null;
    }
    return getUrlOrTitleListByType(region, TITLE_KEY);
}
/**
 * Gets the href values of all anchors found in the configured link-set region.
 * @param orginHtml original HTML
 * @return list of links, or {@code null} when the filtered region is empty
 */
public List<String> getUrlList(String orginHtml) {
    String region = getHtmlByFilter(paramBean.getLinksetStartMap(),
            paramBean.getLinksetEndMap(), orginHtml);
    if (StringUtils.isEmpty(region)) {
        return null;
    }
    return getUrlOrTitleListByType(region, LINK_KEY);
}
/**
 * Gets the HTML of the configured content region, with the configured exclusions removed.
 * @param orginHtml original HTML
 * @return HTML of the content region
 */
public String getHtml(String orginHtml) {
    return getHtmlByFilter(paramBean.getContentStartMap(),
            paramBean.getContentEndMap(), orginHtml);
}
/**
 * Gets the link/title pairs found in the original HTML.
 * Delegates to {@code getUrlAandTitleMap}.
 * @param orginHtml original HTML
 * @return list of maps, each holding one link and its title
 */
public List<Map<String,String>> getUrlAndTitleMap(String orginHtml){
return getUrlAandTitleMap(orginHtml);
}
/**
 * Parses the acquisition settings into a fresh {@code ParamBean} stored in {@code paramBean}.
 * A rule that is null or empty leaves the bean's corresponding map untouched.
 * @param acqu original acquisition settings
 */
private void parseRequestParam(CmsAcquisition acqu){
    paramBean = new ParamBean();

    String linksetStart = acqu.getLinksetStart();
    if (StringUtils.isNotEmpty(linksetStart)) {
        paramBean.setLinksetStartMap(populateParamMap(linksetStart));
    }

    String linksetEnd = acqu.getLinksetEnd();
    if (StringUtils.isNotEmpty(linksetEnd)) {
        paramBean.setLinksetEndMap(populateParamMap(linksetEnd));
    }

    String contentStart = acqu.getContentStart();
    if (StringUtils.isNotEmpty(contentStart)) {
        paramBean.setContentStartMap(populateParamMap(contentStart));
    }

    String contentEnd = acqu.getContentEnd();
    if (StringUtils.isNotEmpty(contentEnd)) {
        paramBean.setContentEndMap(populateParamMap(contentEnd));
    }
}
/**
 * Collects link/title pairs from the configured link-set region.
 * Anchors with an empty link or an empty title are skipped.
 * @param html HTML content
 * @return list of maps holding the link under {@code LINK_KEY} and the title under {@code TITLE_KEY}
 */
private List<Map<String,String>> getUrlAandTitleMap(String html) {
    String region = getHtmlByFilter(paramBean.getLinksetStartMap(),
            paramBean.getLinksetEndMap(), html);
    List<Map<String,String>> pairs = new ArrayList<Map<String,String>>();
    for (Matcher anchor = pt.matcher(region); anchor.find();) {
        String link = anchor.group(1);
        String title = anchor.group(2);
        if (StringUtils.isEmpty(link) || StringUtils.isEmpty(title)) {
            continue;
        }
        Map<String,String> pair = new HashMap<String, String>();
        pair.put(LINK_KEY, link);
        pair.put(TITLE_KEY, title);
        pairs.add(pair);
    }
    return pairs;
}
/**
 * Extracts either the links or the titles of all anchors in the given HTML.
 * @param html HTML content
 * @param type {@code TITLE_KEY} to collect titles; any other value collects links
 * @return list of links or titles, in match order
 */
private List<String> getUrlOrTitleListByType(String html, String type) {
    // Regex group 1 is the href value, group 2 the anchor text.
    final int group = TITLE_KEY.equals(type) ? 2 : 1;
    List<String> values = new ArrayList<String>();
    for (Matcher anchor = pt.matcher(html); anchor.find();) {
        values.add(anchor.group(group));
    }
    return values;
}
/**
 * Extracts the HTML of the selected regions and strips the excluded parts from it.
 * <li>Step 1: collect the HTML of every node matching an entry of {@code tagMap}.</li>
 * <li>Step 2: remove every node matching an entry of {@code removeTagMap}.</li>
 * <li>Step 3: remove HTML comments.</li>
 * A map value may hold several alternatives separated by {@code |}.
 * @param tagMap attribute/tag rules selecting the regions to keep
 * @param removeTagMap attribute/tag rules selecting the parts to remove
 * @param orginHtml original HTML
 * @return the resulting HTML, or an empty string when parsing fails
 */
private String getHtmlByFilter(Map<String, String> tagMap,
        Map<String, String> removeTagMap, String orginHtml) {
    try {
        Parser parser = new Parser();
        parser.setInputHTML(orginHtml);
        // Step 1: collect the selected regions.
        StringBuilder sb = new StringBuilder();
        for (Map.Entry<String, String> rule : tagMap.entrySet()) {
            for (String value : rule.getValue().split("\\|")) {
                // Re-prime the parser for every filter: an extraction consumes the input,
                // so without this only the first filter would see any nodes.
                parser.setInputHTML(orginHtml);
                appendHtmlByFilter(parser, populateFilter(rule.getKey(), value), sb);
            }
        }
        // Step 2: strip the excluded attributes/tags.
        String contentHtml = sb.toString();
        for (Map.Entry<String, String> rule : removeTagMap.entrySet()) {
            for (String value : rule.getValue().split("\\|")) {
                contentHtml = removeHtmlByFilter(parser,
                        populateFilter(rule.getKey(), value), contentHtml);
            }
        }
        // Step 3: strip comments.
        NodeFilter remarkFilter = new NodeClassFilter(RemarkNode.class);
        contentHtml = removeHtmlByFilter(parser, remarkFilter, contentHtml);
        System.out.println("=================================结果=======================================");
        System.out.println(contentHtml);
        return contentHtml;
    } catch (ParserException e) {
        // Best effort: report the failure and fall through to the empty result.
        e.printStackTrace();
    }
    return "";
}
/**
 * Parses an acquisition parameter string into a rule map. Attribute/value rules and plain
 * tag names may be mixed.
 * <li>Expected formats:</li>
 * <li>1. attribute/value rules, e.g. class=articleList|tips,id=fxwb|fxMSN|fxMSN</li>
 * <li>2. tag names, e.g. div,p,span</li>
 * <li>3. mixed, e.g. class=articleList|tips,id=fxwb|fxMSN|fxMSN,div,p,span</li>
 * An attribute rule is split at its first {@code =} only, so values may contain {@code =}.
 * Rules with an empty name or an empty value (e.g. {@code class=}) are skipped instead of
 * failing. Tag names are joined with {@code |} under the {@code SINGLE_TAG} key.
 * @param paramStr parameter string
 * @return rule map (empty when nothing usable was found)
 */
private Map<String, String> populateParamMap(String paramStr) {
    Map<String, String> paramMap = new HashMap<String, String>();
    StringBuilder tagNames = new StringBuilder();
    for (String token : paramStr.split(",")) {
        int eq = token.indexOf('=');
        if (eq >= 0) {
            String key = token.substring(0, eq);
            String value = token.substring(eq + 1);
            if (StringUtils.isNotEmpty(key) && StringUtils.isNotEmpty(value)) {
                paramMap.put(key, value);
            }
        } else if (StringUtils.isNotEmpty(token)) {
            tagNames.append(token).append("|");
        }
    }
    if (tagNames.length() > 0) {
        // Drop the trailing separator.
        paramMap.put(SINGLE_TAG, tagNames.substring(0, tagNames.length() - 1));
    }
    return paramMap;
}
/**
 * Builds the node filter for one rule.
 * @param key {@code SINGLE_TAG} for a tag-name rule, otherwise an attribute name
 * @param value tag name or attribute value
 * @return a tag-name filter for {@code SINGLE_TAG}, otherwise an attribute filter
 */
private NodeFilter populateFilter(String key,String value) {
    if (!SINGLE_TAG.equals(key)) {
        return new HasAttributeFilter(key, value);
    }
    return new TagNameFilter(value);
}
/**
 * Removes from the HTML the markup of every node matching the filter.
 * @param parser parser to use; its input is replaced with {@code orginHtml}
 * @param filter filter selecting the nodes to remove
 * @param orginHtml HTML to clean
 * @return the HTML with all matching nodes removed
 * @throws ParserException when the HTML cannot be parsed
 */
private String removeHtmlByFilter(Parser parser, NodeFilter filter,String orginHtml) throws ParserException {
    parser.setInputHTML(orginHtml);
    NodeList matches = parser.extractAllNodesThatMatch(filter);
    String cleaned = orginHtml;
    final int count = matches.size();
    for (int idx = 0; idx < count; idx++) {
        Node match = (Node) matches.elementAt(idx);
        cleaned = StringUtils.remove(cleaned, match.toHtml());
    }
    return cleaned;
}
/**
 * Appends the markup of every node matching the filter to the buffer.
 * @param parser parser holding the HTML to search
 * @param filter filter selecting the nodes to keep
 * @param sb buffer receiving the markup of the matching nodes
 * @throws ParserException when the HTML cannot be parsed
 */
private void appendHtmlByFilter(Parser parser, NodeFilter filter,
        StringBuilder sb) throws ParserException {
    NodeList matches = parser.extractAllNodesThatMatch(filter);
    final int count = matches.size();
    for (int idx = 0; idx < count; idx++) {
        Node match = (Node) matches.elementAt(idx);
        sb.append(match.toHtml());
    }
}
/**
 * Parses an acquisition parameter string into the given rule map. Attribute/value rules and
 * plain tag names may be mixed.
 * <li>Expected formats:</li>
 * <li>1. attribute/value rules, e.g. class=articleList|tips,id=fxwb|fxMSN|fxMSN</li>
 * <li>2. tag names, e.g. div,p,span</li>
 * <li>3. mixed, e.g. class=articleList|tips,id=fxwb|fxMSN|fxMSN,div,p,span</li>
 * An attribute rule is split at its first {@code =} only, so values may contain {@code =}.
 * Rules with an empty name or an empty value (e.g. {@code class=}) are skipped instead of
 * failing. Tag names are joined with {@code |} under the {@code SINGLE_TAG} key.
 * @param paramMap map receiving the parsed rules
 * @param paramStr parameter string
 */
private void populateParamMap(Map<String, String> paramMap,String paramStr) {
    StringBuilder tagNames = new StringBuilder();
    for (String token : paramStr.split(",")) {
        int eq = token.indexOf('=');
        if (eq >= 0) {
            String key = token.substring(0, eq);
            String value = token.substring(eq + 1);
            if (StringUtils.isNotEmpty(key) && StringUtils.isNotEmpty(value)) {
                paramMap.put(key, value);
            }
        } else if (StringUtils.isNotEmpty(token)) {
            tagNames.append(token).append("|");
        }
    }
    if (tagNames.length() > 0) {
        // Drop the trailing separator.
        paramMap.put(SINGLE_TAG, tagNames.substring(0, tagNames.length() - 1));
    }
}
/**
 * Test helper: reads a whole text file, appending a newline after every line.
 * The underlying stream is always closed, including when reading or decoding fails.
 * @param szFileName absolute path of the file
 * @param charset name of the character set used to decode the file
 * @return the file content, or an empty string when the file cannot be read
 */
public static String openFile(String szFileName,String charset) {
    FileInputStream in = null;
    try {
        in = new FileInputStream(new File(szFileName));
        BufferedReader reader = new BufferedReader(new InputStreamReader(in, charset));
        StringBuilder szContent = new StringBuilder();
        String szTemp;
        while ((szTemp = reader.readLine()) != null) {
            szContent.append(szTemp).append("\n");
        }
        return szContent.toString();
    } catch (Exception e) {
        // Best effort by design: any failure yields an empty result.
        return "";
    } finally {
        if (in != null) {
            try {
                in.close();
            } catch (IOException ignored) {
                // Nothing useful can be done when closing fails.
            }
        }
    }
}
/**
 * Manual test: prints the links and titles found in the {@code class=m_list} region of a
 * local sample file.
 * @throws ParserException declared for callers; parse failures inside the filter step are handled there
 */
public void testFetchLinkAndTitle() throws ParserException{
    String sampleHtml = openFile("F:\\4.htm","UTF-8");
    Map<String, String> includeRules = new HashMap<String, String>();
    includeRules.put("class", "m_list");
    Map<String, String> excludeRules = new HashMap<String, String>();
    String region = getHtmlByFilter(includeRules, excludeRules, sampleHtml);
    System.out.println("=============================result============================");
    System.out.println(region);
    System.out.println("==========================================================");
    Matcher anchor = Pattern.compile("<a.*href=\"(.*?)\".*>(.*?)</a>").matcher(region);
    while (anchor.find()) {
        String link = anchor.group(1);
        String title = anchor.group(2);
        if (StringUtils.isEmpty(link)) {
            continue;
        }
        System.out.println("url : " + link);
        System.out.println("title : " + title);
    }
}
/**
 * Manual test: extracts the {@code id=artibody} region of a local sample file and strips a
 * set of tags and attributes from it.
 * @throws ParserException declared for callers; parse failures inside the filter step are handled there
 */
public void testFetchContent() throws ParserException{
    final String sampleHtml = openFile("F:\\6.shtml","GB2312");

    // Region to keep.
    final Map<String, String> includeRules = new HashMap<String, String>();
    includeRules.put("id", "artibody");

    // Parts to strip from that region.
    final Map<String, String> excludeRules = new HashMap<String, String>();
    excludeRules.put(SINGLE_TAG, "style|script");
    excludeRules.put("type", "text/javascript");
    excludeRules.put("class", "icon_fx|blkComment otherContent_01");
    excludeRules.put("style", "text-align: right;padding-right:10px;|margin-top:6px;|font-size: 12px ! important;|font-size:12px");
    excludeRules.put("id", "fxwb|fxMSN|fxMSN|comment_t_show_top");

    getHtmlByFilter(includeRules, excludeRules, sampleHtml);
}
/**
 * Manual test: parses a sample parameter string and prints every key with each of its
 * {@code |}-separated values.
 */
public void testParseParam(){
    Map<String, String> rules = new HashMap<String, String>();
    populateParamMap(rules,"class=articleList|tips,p,div");
    for (Map.Entry<String, String> rule : rules.entrySet()) {
        String ruleValue = rule.getValue();
        String[] alternatives = ruleValue.contains("|")
                ? ruleValue.split("\\|")
                : new String[]{ruleValue};
        for (String alternative : alternatives) {
            System.out.println("tempKey:" + rule.getKey());
            System.out.println("value:" + alternative);
        }
    }
}
/**
 * Manual test: prints a local sample file before and after its HTML comments are removed.
 * @throws ParserException when the sample cannot be parsed
 */
public void testRemarkFilter() throws ParserException{
    String before = openFile("F:\\6.shtml","GB2312");
    System.out.println("=========================过滤注释前HTML==================================");
    System.out.println(before);
    NodeFilter remarkFilter = new NodeClassFilter(RemarkNode.class);
    String after = removeHtmlByFilter(new Parser(), remarkFilter, before);
    System.out.println("=========================过滤注释后HTML==================================");
    System.out.println(after);
}
/**
 * Manual test entry point: builds a parser from empty acquisition settings.
 * The individual test calls below are commented out; enable the one to run.
 */
public static void main(String[] args) throws ParserException,
URISyntaxException, IOException {
HtmlParserImpl parseHtmlTool = new HtmlParserImpl(new CmsAcquisition());
//parseHtmlTool.testParseParam();
//parseHtmlTool.testFetchLinkAndTitle();
//parseHtmlTool.testFetchContent();
//parseHtmlTool.testRemarkFilter();
}
}
采集参数封装bean ParamBean.java
package com.jeecms.common.crawler;
import java.util.HashMap;
import java.util.Map;
/**
 * Holder for the parsed acquisition rules. Each rule set is a map from attribute name
 * (or a tag-name marker key) to its value; all four maps start out empty.
 * @author javacoo
 * @since 2011-10-31
 */
public class ParamBean {
    /** Rules selecting the region that holds the links to collect. */
    private Map<String, String> linksetStartMap;
    /** Rules selecting what to filter out of the link region. */
    private Map<String, String> linksetEndMap;
    /** Rules selecting the region that holds the content to collect. */
    private Map<String, String> contentStartMap;
    /** Rules selecting what to filter out of the content region. */
    private Map<String, String> contentEndMap;

    // Instance initializer: every rule set starts as its own empty map.
    {
        linksetStartMap = new HashMap<String, String>();
        linksetEndMap = new HashMap<String, String>();
        contentStartMap = new HashMap<String, String>();
        contentEndMap = new HashMap<String, String>();
    }

    public Map<String, String> getLinksetStartMap() {
        return this.linksetStartMap;
    }

    public void setLinksetStartMap(Map<String, String> linksetStartMap) {
        this.linksetStartMap = linksetStartMap;
    }

    public Map<String, String> getLinksetEndMap() {
        return this.linksetEndMap;
    }

    public void setLinksetEndMap(Map<String, String> linksetEndMap) {
        this.linksetEndMap = linksetEndMap;
    }

    public Map<String, String> getContentStartMap() {
        return this.contentStartMap;
    }

    public void setContentStartMap(Map<String, String> contentStartMap) {
        this.contentStartMap = contentStartMap;
    }

    public Map<String, String> getContentEndMap() {
        return this.contentEndMap;
    }

    public void setContentEndMap(Map<String, String> contentEndMap) {
        this.contentEndMap = contentEndMap;
    }
}
队列 Queue.java
package com.jeecms.common.crawler;
import java.util.LinkedList;
/**
 * Minimal FIFO queue backed by a linked list.
 * @author javacoo
 * @since 2011-11-01
 * @param <T> element type
 */
public class Queue<T> {
    private final LinkedList<T> items = new LinkedList<T>();

    /**
     * Adds an element at the tail.
     * @param t element to add
     */
    public void enQueue(T t){
        items.add(t);
    }

    /**
     * Removes and returns the head element.
     * @return the head element
     * @throws java.util.NoSuchElementException when the queue is empty
     */
    public T deQueue(){
        return items.remove();
    }

    /**
     * @return {@code true} when the queue holds no elements
     */
    public boolean isEmpty(){
        return items.size() == 0;
    }

    /**
     * @param t element to look for
     * @return {@code true} when the queue contains {@code t}
     */
    public boolean contains(T t){
        return items.indexOf(t) >= 0;
    }

    /**
     * @return number of elements in the queue
     */
    public int getSize(){
        return items.size();
    }
}
URL队列 UrlQueue.java
package com.jeecms.common.crawler;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import org.springframework.util.CollectionUtils;
/**
 * URL bookkeeping for the crawler: a FIFO queue of URL descriptors still to be
 * visited plus a set of descriptors already visited.
 * <p>
 * Each URL is represented as a {@code Map<String, String>}. This class never
 * moves an entry from the pending queue to the visited set by itself; callers
 * do that through {@link #unVisitedUrlDeQueue()} and
 * {@link #addVisitedUrl(Map)}.
 *
 * @author javacoo
 * @since 2011-11-01
 */
public class UrlQueue {
    /** URL descriptors that have already been visited. */
    private Set<Map<String, String>> visitedUrl = new HashSet<Map<String, String>>();
    /** URL descriptors waiting to be visited, in arrival order. */
    private Queue<Map<String, String>> unVisitedUrl = new Queue<Map<String, String>>();

    /**
     * Exposes the pending-URL queue itself (not a copy).
     *
     * @return the queue of URL descriptors not yet visited
     */
    public Queue<Map<String, String>> getUnVisitedUrl() {
        return this.unVisitedUrl;
    }

    /**
     * Takes the next pending URL descriptor off the queue.
     *
     * @return the oldest pending URL descriptor
     */
    public Map<String, String> unVisitedUrlDeQueue() {
        return this.unVisitedUrl.deQueue();
    }

    /**
     * Queues a URL descriptor for a later visit, so that each URL is visited
     * only once: the descriptor is ignored when {@code CollectionUtils.isEmpty}
     * rejects it, when it is already pending, or when it was already visited.
     *
     * @param urlMap URL descriptor to enqueue
     */
    public void addUnVisitedUrl(Map<String, String> urlMap) {
        if (CollectionUtils.isEmpty(urlMap)) {
            return;
        }
        if (this.unVisitedUrl.contains(urlMap)) {
            return;
        }
        if (this.visitedUrl.contains(urlMap)) {
            return;
        }
        this.unVisitedUrl.enQueue(urlMap);
    }

    /**
     * Tells whether there is nothing left to visit.
     *
     * @return {@code true} when the pending queue is empty
     */
    public boolean isEmpty() {
        return this.unVisitedUrl.isEmpty();
    }

    /**
     * Reports how many URL descriptors are still pending.
     *
     * @return number of pending URL descriptors
     */
    public int getUnVisitedUrlNum() {
        return this.unVisitedUrl.getSize();
    }

    /**
     * Records a URL descriptor as visited.
     *
     * @param urlMap URL descriptor that has been visited
     */
    public void addVisitedUrl(Map<String, String> urlMap) {
        this.visitedUrl.add(urlMap);
    }

    /**
     * Forgets a previously recorded visited URL descriptor.
     *
     * @param urlMap URL descriptor to drop from the visited set
     */
    public void removeVisitedUrl(Map<String, String> urlMap) {
        this.visitedUrl.remove(urlMap);
    }

    /**
     * Reports how many URL descriptors have been recorded as visited.
     *
     * @return number of visited URL descriptors
     */
    public int getVisitedUrlNum() {
        return this.visitedUrl.size();
    }
}
接下来是XML配置
==============================定时任务模块XML配置====================================
dao配置
<bean id="cmsSchedulerDao" class="com.jeecms.cms.dao.assist.impl.CmsSchedulerDaoImpl"/>
manage配置
<bean id="cmsSchedulerMng" class="com.jeecms.cms.manager.assist.impl.CmsSchedulerMngImpl"/>
SERVICE配置
<bean id="schedulerAcquisitionSvc" class="com.jeecms.cms.service.scheduler.SchedulerAcquisitionSvcImpl"/>
<bean id="schedulerTaskManageSvc" class="com.jeecms.cms.service.scheduler.SchedulerTaskManageSvcImpl"/>
接下来是messages_zh_CN.properties 添加了常量
==============================messages_zh_CN.properties====================================
cmsScheduler.acquisition.function=\u91C7\u96C6\u4EFB\u52A1\u7BA1\u7406
cmsScheduler.name=\u4EFB\u52A1\u540D\u79F0
cmsScheduler.expression=\u8BA1\u5212\u8868\u8FBE\u5F0F
cmsScheduler.expression.help=\u53C2\u6570\u4EE5\u9017\u53F7\u5206\u9694,*\u53F7\u8868\u793A\u65E0\u503C,\u51716\u4F4D\:\u6BCF\u4E2A\u6708\u7684\u7B2C\u51E0\u5468,\u6BCF\u5468\u7684\u7B2C\u51E0\u5929,\u5929(\u51E0\u53F7),\u5C0F\u65F6(24\u5C0F\u65F6\u5236),\u5206\u949F,\u79D2\u3002\u5982\uFF1A1,6,4,15,20,30 \u8868\u793A \u4ECE\u4ECA\u5929\u768415\:20\:30\u5F00\u59CB\uFF0C\u6BCF\u9694\u4E00\u4E2A\u6708\u6267\u884C\u4E00\u6B21,\u5373\u4E0B\u6B21\u6267\u884C\u65F6\u95F4\u662F \u4E0B\u4E2A\u6708\u7684\u7B2C\u4E00\u5468\u7684\u7B2C6\u5929\u768415\:20\:30
cmsScheduler.associate=\u5173\u8054\u4EFB\u52A1
cmsScheduler.status.0=\u505C\u6B62
cmsScheduler.status.1=\u8FD0\u884C
cmsScheduler.opt.start=\u5F00\u59CB
cmsScheduler.opt.end=\u505C\u6B62
cmsScheduler.status=\u72B6\u6001
cmsScheduler.startTime=\u5F00\u59CB\u65F6\u95F4
cmsScheduler.endTime=\u7ED3\u675F\u65F6\u95F4
cmsScheduler.log.delete=\u5220\u9664\u4EFB\u52A1
==============================模板====================================
scheduler/add.html
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
<title></title>
<#include "/jeecms_sys/head.html"/>
<script type="text/javascript">
// This form only contains the name, associateId and expression fields.
// The former "leafChannel" method and the channelId rule/message were copied
// from the acquisition template and referred to a field that does not exist
// here, so they never ran. Validation is still enabled on the form; the
// mandatory fields are marked with class="required" on the inputs
// (presumably picked up by the validation plugin - confirm against head.html).
$(function() {
    $("#jvForm").validate();
});
</script>
<style type="text/css">
.sel-disabled{background-color:#ccc}
</style>
</head>
<body>
<div class="body-box">
<div class="rhead">
<div class="rpos"><@s.m "global.position"/>: <@s.m "cmsScheduler.acquisition.function"/> - <@s.m "global.add"/></div>
<form class="ropt">
<input type="hidden" name="moduleType" value="${moduleType!}" />
<input type="submit" value="<@s.m "global.backToList"/>" onclick="this.form.action='v_listBy.do';"/>
</form>
<div class="clear"></div>
</div>
<@p.form id="jvForm" action="o_save.do" labelWidth="12">
<input type="hidden" name="moduleType" value="${moduleType!}" />
<@p.text colspan="1" width="50" label="cmsScheduler.name" name="name" required="true" class="required" maxlength="50"/>
<@p.td colspan="1" width="50" label="cmsScheduler.associate" required="true">
<@p.select list=schedulerTaskList name="associateId" listKey="id" listValue="name"/>
</@p.td><@p.tr/>
<@p.textarea colspan="2" label="cmsScheduler.expression" name="expression" help="cmsScheduler.expression.help" helpPosition="3" rows="1" cols="70" required="true" class="required" /><@p.tr/>
<@p.td colspan="2"><@p.submit code="global.submit"/> <@p.reset code="global.reset"/></@p.td>
</@p.form>
</div>
</body>
</html>
scheduler/edit.html
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
<title></title>
<#include "/jeecms_sys/head.html"/>
<script type="text/javascript">
// This form only contains the name, associateId, expression and id fields.
// The former "leafChannel" method and the channelId rule/message were copied
// from the acquisition template and referred to a field that does not exist
// here, so they never ran. Validation is still enabled on the form; the
// mandatory fields are marked with class="required" on the inputs
// (presumably picked up by the validation plugin - confirm against head.html).
$(function() {
    $("#jvForm").validate();
});
</script>
<style type="text/css">
.sel-disabled{background-color:#ccc}
</style>
</head>
<body>
<div class="body-box">
<div class="rhead">
<div class="rpos"><@s.m "global.position"/>: <@s.m "cmsScheduler.acquisition.function"/> - <@s.m "global.edit"/></div>
<form class="ropt">
<input type="button" value="<@s.m "global.backToList"/>" onclick="history.back();"/>
</form>
<div class="clear"></div>
</div>
<@p.form id="jvForm" action="o_update.do" labelWidth="12">
<input type="hidden" name="moduleType" value="${cmsScheduler.moduleType!}" />
<@p.text colspan="1" width="50" label="cmsScheduler.name" name="name" value=cmsScheduler.name required="true" class="required" maxlength="50"/>
<@p.td colspan="1" width="50" label="cmsScheduler.associate" required="true">
<@p.select list=schedulerTaskList name="associateId" value=cmsScheduler.associateId listKey="id" listValue="name"/>
</@p.td><@p.tr/>
<@p.textarea colspan="2" label="cmsScheduler.expression" name="expression" rows="1" help="cmsScheduler.expression.help" helpPosition="3" value=cmsScheduler.expression required="true" class="required" cols="70" /><@p.tr/>
<@p.td colspan="2">
<@p.hidden name="id" value=cmsScheduler.id/>
<@p.submit code="global.submit"/> <@p.reset code="global.reset"/>
</@p.td>
</@p.form>
</div>
</body>
</html>
scheduler/list.html
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
<title></title>
<#include "/jeecms_sys/head.html"/>
<script type="text/javascript">
// Looks up the list form (id "tableForm") that the bulk actions submit.
function getTableForm() {
    var tableForm = document.getElementById('tableForm');
    return tableForm;
}
// Bulk delete: requires at least one checked "ids" box and a user
// confirmation, then posts the list form to o_delete.do.
function optDelete() {
    var nothingChecked = Pn.checkedCount('ids')<=0;
    if(nothingChecked) {
        alert("<@s.m 'error.checkRecord'/>");
        return;
    }
    if(confirm("<@s.m 'global.confirm.delete'/>")) {
        var form = getTableForm();
        form.action="o_delete.do";
        form.submit();
    }
}
</script>
</head>
<body>
<div class="body-box">
<div class="rhead">
<div class="rpos"><@s.m "global.position"/>: <@s.m "cmsScheduler.acquisition.function"/> - <@s.m "global.list"/></div>
<form class="ropt">
<input type="hidden" name="moduleType" value="${moduleType!}" />
<input type="submit" value="<@s.m "global.add"/>" onclick="this.form.action='v_add.do';"/>
</form>
<div class="clear"></div>
</div>
<form id="tableForm" method="post">
<input type="hidden" name="pageNo" value="${pageNo!}"/>
<@p.table value=list;cmsScheduler,i,has_next><#rt/>
<@p.column title="<input type='checkbox' onclick='Pn.checkbox(\"ids\",this.checked)'/>" width="20">
<input type='checkbox' name='ids' value='${cmsScheduler.id}'/><#t/>
</@p.column><#t/>
<@p.column title="ID">${cmsScheduler.id}</@p.column><#t/>
<@p.column code="cmsScheduler.name">${cmsScheduler.name}</@p.column><#t/>
<@p.column code="cmsScheduler.status" align="center"><#if cmsScheduler.status==1><strong style="color:red"></#if><@s.m "cmsScheduler.status."+cmsScheduler.status/><#if cmsScheduler.status==1></strong></#if></@p.column><#t/>
<@p.column code="cmsScheduler.startTime" align="center">${(cmsScheduler.startTime?string('yyyy-MM-dd HH:mm:ss'))!}</@p.column><#t/>
<@p.column code="cmsScheduler.endTime" align="center">${(cmsScheduler.endTime?string('yyyy-MM-dd HH:mm:ss'))!}</@p.column><#t/>
<#-- Operations column: start/stop are links only when the action applies to the row's current status (0 = stopped, 1 = running); otherwise plain text is shown. -->
<@p.column code="global.operate" align="center">
<#if cmsScheduler.status==0>
<a href="o_start.do?id=${cmsScheduler.id}" class="pn-opt"><@s.m "cmsScheduler.opt.start"/></a> | <#rt/>
<#else>
<@s.m "cmsScheduler.opt.start"/> | <#rt/>
</#if>
<#-- The original tested status==1 twice; a single comparison is equivalent. -->
<#if cmsScheduler.status==1>
<a href="o_end.do?id=${cmsScheduler.id}" class="pn-opt"><@s.m "cmsScheduler.opt.end"/></a> | <#rt/>
<#else>
<@s.m "cmsScheduler.opt.end"/> | <#rt/>
</#if>
<a href="v_edit.do?id=${cmsScheduler.id}" class="pn-opt"><@s.m "global.edit"/></a> | <#rt/>
<#-- "&" is escaped for XHTML, and moduleType gets an empty default so a missing value does not abort rendering. -->
<a href="o_delete.do?ids=${cmsScheduler.id}&amp;moduleType=${cmsScheduler.moduleType!}" class="pn-opt" onclick="if(!confirm('<@s.m "global.confirm.delete"/>')) {return false;}"><@s.m "global.delete"/></a><#t/>
</@p.column><#t/>
</@p.table>
<div><input type="button" value="<@s.m "global.delete"/>" onclick="optDelete();"/></div>
</form>
</div>
<#include "/common/alert_message.html"/>
</body>
</html>
generate_left.html 有修改
加上
<@cms_perm url="/scheduler/v_listBy.do">
<li><a href="../scheduler/v_listBy.do?moduleType=schedulerAcquisitionSvc" target="rightFrame"><@s.m "cmsScheduler.acquisition.function"/></a></li>
</@cms_perm>