CREATE TABLE extract_task_temp ( id integer NOT NULL DEFAULT nextval('extract_task_temp_731_id_seq'::regclass), task_init_time timestamp with time zone, -- 初始化抽取任务时间 task_current_time timestamp with time zone, -- 当前任务抽取时间 task_next_time timestamp with time zone, -- 下一次任务抽取时间 create_time timestamp with time zone DEFAULT now(), update_time timestamp with time zone, -- 修改时间 task_type integer, -- 任务类型1:文章,2回复 website_id integer, -- 站点类型id start_size integer, -- 分页起始大小 limit_size integer, -- 分次取多少条数据 cid integer, -- 客户id authors text, -- 作者昵称 interval_time integer -- 间隔时间单位(分钟) )
package com.cyyun.mobile.tools; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; import javax.annotation.Resource; import org.apache.commons.collections.CollectionUtils; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.time.DateUtils; import org.apache.log4j.Logger; import org.springframework.scheduling.annotation.Scheduled; import org.springframework.stereotype.Component; import com.cyyun.mobile.dao.ICommentAccountTempDao; import com.cyyun.mobile.dao.IExtractTaskTempDao; import com.cyyun.mobile.pojo.CommentAccountTemp; import com.cyyun.mobile.pojo.ExtractTaskTemp; import com.cyyun.mobile.service.ExtractTaskTempService; import com.cyyun.mobile.tools.httpconnection.HttpUrlConnection; import com.cyyun.mobile.tools.json.JsonEntity; import com.cyyun.mobile.tools.json.JsonEntityArray; import com.twmacinta.util.MD5; /** * 抽取数据任务 * * @author zhangzm * */ @Component public class ExtractTask { static Logger log = Logger.getLogger(ExtractTask.class); @Resource IExtractTaskTempDao iExtractTaskTempDao; @Resource ICommentAccountTempDao iCommentAccountTempDao; @Resource ExtractTaskTempService extractTaskTempService; /** * 获取任务对象 * * @return */ public List<ExtractTaskTemp> getExtractTaskTemp() { List<ExtractTaskTemp> extractTaskTemps = null; try { extractTaskTemps = extractTaskTempService .queryExtractTaskTemp(null); } catch (Exception e) { log.error(e.getMessage(), e); } return extractTaskTemps; } @Scheduled(cron = "0 0/1 * * * ?") public void execute() { List<ExtractTaskTemp> extractTaskTemps = getExtractTaskTemp(); if (CollectionUtils.isEmpty(extractTaskTemps)) { log.warn("extractTaskTemps isEmpty"); return; } for (ExtractTaskTemp e : extractTaskTemps) { if (null != e) { if ("1".equals(String.valueOf(e.getTaskType()))) { createTask(e); } else { createReplyTask(e); } } } } /** * 获取回复数 * * @param bean */ public void createReplyTask(ExtractTaskTemp bean) { if (bean == null) { log.warn("SpidTaskSynBean is null "); return; } if (null == bean.getTaskType()) { log.warn("ExtractTaskTemp getTaskType is null "); return; } initTask(bean); HttpUrlConnection connection = new HttpUrlConnection(); Map<String, String> dataMap = new HashMap<String, String>(); String url = Constant.GET_ARTICLE_REPLY_URL; dataMap.put("order", "rid"); dataMap.put("desc", "asc"); dataMap.put("cid", String.valueOf(bean.getCid())); dataMap.put("limit", String.valueOf(bean.getLimitSize())); dataMap.put("fid", String.valueOf(bean.getWebsiteId())); dataMap.put("authors", bean.getAuthors()); dataMap.put("from", String.valueOf(bean.getTaskCurrentTime().getTime())); dataMap.put("to", String.valueOf(bean.getTaskNextTime().getTime())); StringBuilder logBuilder = new StringBuilder(); logBuilder.append("开始时间为:").append(bean.getCreateTime()) .append("结束时间为:").append(bean.getTaskNextTime()) .append(" url :" + url).append(" dataMap :" + dataMap); log.info(logBuilder.toString()); String response = null; try { response = connection.readData(dataMap, url); } catch (Exception e) { log.error(e.getMessage(), e); return; } if ("[]".equals(response)) { bean.setAuthors(null); bean.setStartSize(0); extractTaskTempService.updateExtractTaskTemp(bean); log.warn("get response is null " + url + " " + dataMap); return; } JsonEntityArray array = new JsonEntityArray(response); List<Map<String, Object>> addArticleBeans = new ArrayList<Map<String, Object>>(); if (array != null && array.size() > 0) { for (int i = 0; i < array.size(); i++) { JsonEntity jsonE = array.getJsonEntity(i); String rid = jsonE.getString("rid"); Map<String, Object> map = new HashMap<String, Object>(); map.put("rid", Integer.valueOf(rid)); addArticleBeans.add(map); } try { bean.setStartSize(bean.getStartSize() + addArticleBeans.size()); extractTaskTempService .updateExtractTaskTempAndArticleReplyTemp(bean, addArticleBeans); } catch (Exception e) { log.error(e); } } } public String getCommentAccountTempName( List<CommentAccountTemp> accountTemps) { StringBuilder builder = new StringBuilder(); try { if (CollectionUtils.isNotEmpty(accountTemps)) { for (CommentAccountTemp c : accountTemps) { if (c != null) { String name = c.getCommentNickname(); builder.append(name).append(","); } } } } catch (Exception e) { log.error(e); } String s = builder.toString(); if (s.endsWith(",")) { s = s.substring(0, s.length() - 1); } return s; } /** * 初始化任务,设置 开始时间,结束时间,以及账号表中的任务起始时间。(每一次任务表中的结束时间=账号表中的开始时间) 在账号表中的时间会 * 出现的时间范围是 初始化时间+时间间隔*次数 * * @param bean */ public void initTask(ExtractTaskTemp bean) { if (StringUtils.isNotBlank(bean.getAuthors())) { return; } Map<String, Object> map = new HashMap<String, Object>(); map.put("websiteId", bean.getWebsiteId()); map.put("deleteFlag", 1); map.put("taskTime", bean.getTaskInitTime()); List<CommentAccountTemp> accountTemps = null; try { accountTemps = extractTaskTempService.queryCommentAccountTemp(map); } catch (Exception e) { log.error(e); } if (CollectionUtils.isEmpty(accountTemps)) { map.clear(); map.put("websiteId", bean.getWebsiteId()); map.put("deleteFlag", 1); map.put("taskTime", bean.getTaskNextTime()); try { accountTemps = extractTaskTempService .queryCommentAccountTemp(map); bean.setStartSize(0);// 起始页 bean.setTaskCurrentTime(bean.getTaskNextTime());// 当前处理时间 Date taskNextTime = DateUtils.addMinutes( bean.getTaskNextTime(), bean.getIntervalTime()); bean.setTaskNextTime(taskNextTime);// 下次处理时间 } catch (Exception e) { log.error(e); } } else { bean.setStartSize(0);// 起始页 bean.setTaskCurrentTime(bean.getTaskInitTime());// 当前处理时间 Date taskNextTime = DateUtils.addMinutes(bean.getTaskInitTime(), bean.getIntervalTime()); bean.setTaskNextTime(taskNextTime);// 下次处理时间 } try { bean.setAuthors(getCommentAccountTempName(accountTemps));// 设置作者 extractTaskTempService.updateExtractTaskTempAndCommentAccountTemp( bean, accountTemps); } catch (Exception e) { log.error(e); } } /** * 抽取文章数据 * * @param bean */ public void createTask(ExtractTaskTemp bean) { if (bean == null) { log.warn("SpidTaskSynBean is null "); return; } if (null == bean.getTaskType()) { log.warn("ExtractTaskTemp getSysTypeId is null "); return; } initTask(bean); while (true) { HttpUrlConnection connection = new HttpUrlConnection(); Map<String, String> dataMap = new HashMap<String, String>(); String url = Constant.GET_ARTICLE_URL; dataMap.put("action", "full"); dataMap.put("sort", "6"); dataMap.put("order", "1"); dataMap.put("start", String.valueOf(bean.getStartSize())); dataMap.put("cid", String.valueOf(bean.getCid())); dataMap.put("limit", String.valueOf(bean.getLimitSize())); dataMap.put("fid", String.valueOf(bean.getWebsiteId())); dataMap.put("authors", bean.getAuthors()); dataMap.put("from", String.valueOf(bean.getTaskCurrentTime().getTime())); dataMap.put("to", String.valueOf(bean.getTaskNextTime().getTime())); StringBuilder logBuilder = new StringBuilder(); logBuilder.append("开始时间为:").append(bean.getCreateTime()) .append("结束时间为:").append(bean.getTaskNextTime()) .append(" url :" + url).append(" dataMap :" + dataMap); log.info(logBuilder.toString()); String response = null; try { response = connection.readData(dataMap, url); } catch (Exception e) { log.error(e.getMessage(), e); return; } JsonEntity jsonEntity = new JsonEntity(response); int result = Integer.valueOf(jsonEntity.getString("count")); if (result == 0) { log.error("抽取文件数据为0条 "); bean.setAuthors(null); bean.setStartSize(0); extractTaskTempService.updateExtractTaskTemp(bean); return; } else { JsonEntityArray array = jsonEntity.getJsonEntityArray("items"); List<Map<String, Object>> addArticleBeans = new ArrayList<Map<String, Object>>(); if (array != null && array.size() > 0) { for (int i = 0; i < array.size(); i++) { JsonEntity jsonE = array.getJsonEntity(i); String aid = jsonE.getString("aid"); Map<String, Object> map = new HashMap<String, Object>(); map.put("aid", Integer.valueOf(aid)); addArticleBeans.add(map); } } try { bean.setStartSize(bean.getStartSize() + addArticleBeans.size()); extractTaskTempService.updateExtractTaskTempAndArticleTemp( bean, addArticleBeans); } catch (Exception e) { log.error(e); } } } } public void getArticleByGuid(Set<String> guids) { if (CollectionUtils.isNotEmpty(guids)) { for (String guid : guids) { getArticleByGuid(Constant.CID, guid); } } } /** * 抽取文章数据 * * @param bean */ public void getArticleByGuid(String cid, String guid) { if (StringUtils.isBlank(cid)) { log.warn("cid is null "); return; } if (StringUtils.isBlank(guid)) { log.warn("cid is null "); return; } HttpUrlConnection connection = new HttpUrlConnection(); Map<String, String> dataMap = new HashMap<String, String>(); String url = Constant.GET_ARTICLE_URL; dataMap.put("cid", cid); dataMap.put("guid", guid); String response = null; try { response = connection.readData(dataMap, url); } catch (Exception e) { log.error(e.getMessage(), e); return; } // 解析json JsonEntity jsonEntity = new JsonEntity(response); int result = Integer.valueOf(jsonEntity.getString("count")); if (result == 0) { log.error("获取0条数据"); } else { } } /** * 抽取文章数据 * * @param bean */ public static void testGetArticleByGuid(String cid, String guid) { if (StringUtils.isBlank(cid)) { log.warn("cid is null "); return; } if (StringUtils.isBlank(guid)) { log.warn("cid is null "); return; } HttpUrlConnection connection = new HttpUrlConnection(); Map<String, String> dataMap = new HashMap<String, String>(); dataMap.put("cid", cid); dataMap.put("guid", guid); String response = null; try { response = connection.readData(dataMap, url); } catch (Exception e) { log.error(e.getMessage(), e); return; } // 解析json JsonEntity jsonEntity = new JsonEntity(response); int result = Integer.valueOf(jsonEntity.getString("count")); if (result == 0) { log.error("获取0条数据"); } else { } } /** * 获取文章内容 * * @param aid * @return */ public String getArticleContent(Long aid) { HttpUrlConnection connection = new HttpUrlConnection(); Map<String, String> dataMap = new HashMap<String, String>(); String url = Constant.GET_ARTICLE_CONTENT_URL; dataMap.put("aid", String.valueOf(aid)); String response = null; try { response = connection.readData(dataMap, url); } catch (Exception e) { log.error(e.getMessage(), e); return null; } if (StringUtils.isNotBlank(response)) { // 解析json JsonEntity jsonEntity = new JsonEntity(response); return jsonEntity.getString("content"); } return null; } public static void testSpidArticle() { HttpUrlConnection connection = new HttpUrlConnection(); Map<String, String> dataMap = new HashMap<String, String>(); dataMap.put("start", "0"); dataMap.put("cid", "731"); dataMap.put("limit", "5"); dataMap.put("action", "full"); dataMap.put("sort", "6"); dataMap.put("order", "1"); dataMap.put("author", "品味咖啡"); dataMap.clear(); String response = connection.readData(dataMap, url); JsonEntity jsonEntity = new JsonEntity(response); JsonEntityArray array = jsonEntity.getJsonEntityArray("items"); } public static void testArticleReply() { HttpUrlConnection connection = new HttpUrlConnection(); Map<String, String> dataMap = new HashMap<String, String>(); dataMap.put("start", "0"); dataMap.put("cid", Constant.CID); dataMap.put("limit", "10"); dataMap.put("author", "最爱看九爷"); String response = connection.readData(dataMap, url); // JsonEntity jsonEntity = new JsonEntity(response); JsonEntityArray array = new JsonEntityArray(response); // 采集完成 // JsonEntityArray array = jsonEntity.getJsonEntityArray("items"); } public static String getMD5Url(String url) { if (StringUtils.isBlank(url)) { return url; } try { MD5 md5 = new MD5(); md5.Update(url); return md5.asHex(); } catch (Exception e) { log.error("md5 加密异常", e); } return null; } public static void main(String[] args) { } }
最好把传输数据接口的参数都配置到数据库中,
每次任务记录好当前任务的参数参数,当宕机或者重启的时候,有利于保存当前的查询参数,有利于下一次 的查询