Java爬虫框架WebMagic的使用总结

最近在项目中做了一个公司新闻网站,分为PC端和移动端(H5),数据来源于HSZX与huanqiu两个网站,主要使用Java编写的WebMagic作为爬虫框架。抓取分为批量抓取和增量抓取:批量抓取当前所有历史数据,增量抓取每10分钟定时执行一次。由于要从两个网站抓取,频道多、数据量大、更新频繁,开发过程中遇到了很多坑,今天腾出时间,觉得有必要做个总结。

工具说明:

           1、WebMagic是一个简单灵活的爬虫框架。基于WebMagic,你可以快速开发出一个高效、易维护的爬虫。

                  官网地址:http://webmagic.io/

                  文档说明:http://webmagic.io/docs/zh/

            2、jsoup是Java的一个HTML解析工具,解析性能很不错。

                    文档地址:http://www.open-open.com/jsoup/

            3、Jdiy是一款超轻量的Java极速开发框架,JavaEE/JavaSE环境均适用,提供便捷的数据库CRUD操作API,支持各大主流数据库。

                    官网地址:http://www.jdiy.org/jdiy.jd



一、使用到的技术,如下:
       WebMagic作为爬虫框架,httpclient用于获取网页,Jsoup用于解析页面并定位抓取内容,ScheduledExecutorService线程池用于定时增量抓取,Jdiy作为持久层框架
       
二、历史抓取代码,如下:

[java] view plain copy
  1. package com.spider.huanqiu.history;  
  2.   
  3. import java.util.ArrayList;  
  4. import java.util.List;  
  5. import org.apache.commons.lang3.StringUtils;  
  6. import org.jdiy.core.Rs;  
  7. import org.jsoup.Jsoup;  
  8. import org.jsoup.nodes.Document;  
  9. import org.jsoup.nodes.Element;  
  10. import org.jsoup.select.Elements;  
  11. import us.codecraft.webmagic.Page;  
  12. import us.codecraft.webmagic.Site;  
  13. import us.codecraft.webmagic.Spider;  
  14. import us.codecraft.webmagic.processor.PageProcessor;  
  15. import com.spider.huasheng.history.Pindao;  
  16. import com.spider.utils.Config;  
  17. import com.spider.utils.ConfigBase;  
  18. import com.spider.utils.DateUtil;  
  19. import com.spider.utils.HttpClientUtil;  
  20. import com.spider.utils.service.CommService;  
  21.   
  22. /** 
  23.  * 描        述:抓取xxx-国际频道历史数据 
  24.  * 创建时间:2016-11-9 
  25.  * @author Jibaole 
  26.  */  
  27. public class HQNewsDao extends ConfigBase  implements PageProcessor{  
  28.        public static final String index_list = "(.*).huanqiu.com/(.*)pindao=(.*)";//校验地址正则  
  29.        public static String pic_dir = fun.getProValue(PINDAO_PIC_FILE_PATH);//获取图片保存路径  
  30.          
  31.       
  32.     // 部分一:抓取网站的相关配置,包括编码、重试次数、抓取间隔、超时时间、请求消息头、UA信息等  
  33.         private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(6000)  
  34.                            .addHeader("Accept-Encoding""/").setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.59 Safari/537.36");  
  35.     @Override  
  36.     public Site getSite() {    
  37.         return site;  
  38.     }  
  39.       
  40.       
  41.      @Override  
  42.         public void process(Page page) {  
  43.         try {  
  44.             //列表页  
  45.             if (page.getUrl().regex(index_list).match()) {  
  46.                 List Urllist =new ArrayList();  
  47.                 String url =page.getUrl().toString();  
  48.                 String pageUrl = url.substring(0,url.lastIndexOf("?"));  
  49.                 String pindaoId =url.substring(url.lastIndexOf("=")+1);  
  50.                 Urllist = saveNewsListData(pageUrl,pindaoId);  
  51.                 page.addTargetRequests(Urllist);//添加地址,根据url对该地址处理  
  52.             }  
  53.             //可增加else if 处理不同URL地址  
  54.         } catch (Exception e) {  
  55.             e.printStackTrace();  
  56.         }  
  57.      }  
  58.   
  59.   
  60.     private List saveNewsListData(String pageUrl,String pindaoId) {  
  61.     List urlList = new ArrayList();  
  62.     Document docList = null;  
  63.    String newsIdFirst="";  
  64.    String pageListStr=HttpClientUtil.getPage(pageUrl);//HttpClientUtil方式获取网页内容  
  65.     if(StringUtils.isNotEmpty(pageListStr)){  
  66.      try {  
  67.         docList = Jsoup.parse(pageListStr);  
  68.         Elements fallsFlow=docList.getElementsByClass("fallsFlow");  
  69.         if(!fallsFlow.isEmpty()){  
  70.             Elements liTag=fallsFlow.get(0).getElementsByTag("li");  
  71.             if(!liTag.isEmpty()){  
  72.                 for(int i=0;i
  73.                      String  title="",contentUrl="",newsId="",pic="",absContent="",pushTime="",timeFalg="";  
  74.                     Element obj=liTag.get(i);  
  75.                     try{  
  76.                           contentUrl=obj.getElementsByTag("h3").select("a").attr("href");  
  77.                          if(StringUtils.isNotEmpty(contentUrl)){  
  78.                               title=obj.getElementsByTag("h3").select("a").attr("title");//标题  
  79.                               Rs isTitle = CommService.checkNewsName(title); //校验新闻标题  
  80.                                 if(!isTitle.isNull()){  
  81.                                     continue;  
  82.                                 }  
  83.                                   
  84.                                   
  85.                               System.err.println("<<<<<<--DAO------当前抓取文章为(xxx历史):"+title+"------------");  
  86.                               newsId =  contentUrl.substring(contentUrl.lastIndexOf("/") + 1,contentUrl.lastIndexOf(".html"));  
  87.                               if(!pageUrl.contains(".htm") && i == 0){  
  88.                                   newsIdFirst = newsId;  
  89.                               }  
  90.                              //图片  
  91.                              if(!obj.getElementsByTag("img").attr("src").isEmpty()){  
  92.                                     pic=obj.getElementsByTag("img").first().attr("src");  
  93.                                     if(StringUtils.isNotEmpty(pic) ){  
  94.                                         pic = fun.downloadPic(pic,pic_dir+"list/"+newsId+"/");//获取列表图片,保存本地  
  95.                                     }  
  96.                                 }  
  97.                              if(!obj.getElementsByTag("h5").isEmpty()){  
  98.                                  //简介  
  99.                                  absContent = obj.getElementsByTag("h5").first().text();  
  100.                                  if(StringUtils.isNotEmpty(absContent) && absContent.indexOf("[")>0){  
  101.                                       absContent = absContent.substring(0, absContent.indexOf("["));  
  102.                                   }  
  103.                              }  
  104.                                
  105.                               if(!obj.getElementsByTag("h6").isEmpty()){  
  106.                                   pushTime = obj.getElementsByTag("h6").text();   
  107.                                   timeFalg=pushTime.substring(04);  
  108.                               }  
  109.                               String hrmlStr=HttpClientUtil.getPage(contentUrl);  
  110.                               if(StringUtils.isNotEmpty(hrmlStr)){  
  111.                                   Document docPage = Jsoup.parse(hrmlStr);  
  112.                               Elements pageContent = docPage.getElementsByClass("conText");  
  113.                                 if(!pageContent.isEmpty()){  
  114.                                     String comefrom = pageContent.get(0).getElementsByClass("fromSummary").text();//来源  
  115.                                     if(StringUtils.isNotEmpty(comefrom) && comefrom.contains("环球")){  
  116.                                         String author=pageContent.get(0).getElementsByClass("author").text();//作者  
  117.                                         Element contentDom = pageContent.get(0).getElementById("text");  
  118.                                         if(!contentDom.getElementsByTag("a").isEmpty()){  
  119.                                             contentDom.getElementsByTag("a").removeAttr("href");//移除外跳连接  
  120.                                         }  
  121.                                         if(!contentDom.getElementsByClass("reTopics").isEmpty()){  
  122.                                              contentDom.getElementsByClass("reTopics").remove();//推荐位  
  123.                                         }  
  124.                                        if(!contentDom.getElementsByClass("spTopic").isEmpty()){  
  125.                                            contentDom.getElementsByClass("spTopic").remove(); //去除排行榜列表  
  126.                                        }  
  127.                                        if(!contentDom.getElementsByClass("editorSign").isEmpty()){  
  128.                                            contentDom.getElementsByClass("editorSign").remove();//移除编辑标签  
  129.                                        }  
  130.                                          
  131.                                         String content = contentDom.toString();  
  132.                                         if(!StringUtils.isEmpty(content)){  
  133.                                             content = content.replaceAll("\r\n|\r|\n|\t|\b|~|\f""");//去掉回车换行符  
  134.                                             content = replaceForNews(content,pic_dir+"article/"+newsId+"/");//替换内容中的图片  
  135.                                             while (true) {  
  136.                                                  if(content.indexOf("")>0){  
  137.                                                           String moveContent= content.substring(content.indexOf("")+3);//去除注释  
  138.                                                           content = content.replace(moveContent, "");  
  139.                                                             }  
  140.                                                          if(content.indexOf(") >0 && content.lastIndexOf("")>0){  
  141.                                                           String moveContent= content.substring(content.indexOf("), content.indexOf("")+9);//去除JS  
  142.                                                           content = content.replace(moveContent, "");  
  143.                                                           }  
  144.                                                }  
  145.                                             }  
  146.                                         }  
  147.                                     if(StringUtils.isEmpty(timeFalg) || "2016".equals(timeFalg) ||   
  148.                                         "28".equals(pindaoId) || "29".equals(pindaoId) || "30".equals(pindaoId)){  
  149.                                             Rs news= new Rs("News");  
  150.                                             news.set("title", title);  
  151.                                             news.set("shortTitle",title);  
  152.                                             news.set("beizhu",absContent);  
  153.                                             news.set("savetime", pushTime);  
  154.                                             if(StringUtils.isNotEmpty(pic)){  
  155.                                                 news.set("path", pic);  
  156.                                                 news.set("mini_image", pic);  
  157.                                             }  
  158.                                             news.set("pindaoId", pindaoId);  
  159.                                             news.set("status"0);//不显示  
  160.                                             news.set("canComment"1);//是否被评论  
  161.                                             news.set("syn"1);//是否异步  
  162.                                             news.set("type"1);//是否异步  
  163.                                             news.set("comefrom",comefrom);  
  164.                                             news.set("author", author);  
  165.                                             news.set("content", content);  
  166.                                             news.set("content2", content);  
  167.                                             CommService.save(news);  
  168.   
  169.                                               System.err.println("------新增(xxx历史):"+title+"------>>>>>>>");  
  170.                                     }else{  
  171.                                         break;  
  172.                                     }  
  173.                                       }  
  174.                                   }  
  175.                               }  
  176.                               }  
  177.                     }catch (Exception e) {  
  178.                         e.printStackTrace();    
  179.                     }  
  180.                 }  
  181.             }  
  182.             if(!pageUrl.contains(".htm")){  
  183.                 //得到分页内容  
  184.                 Element pages = docList.getElementById("pages");  
  185.                 int num = pages.getElementsByTag("a").size();  
  186.                 String pageMaxStr = pages.getElementsByTag("a").get(num-2).text();  
  187.                 int pageMax=0;  
  188.                 if(StringUtils.isNotEmpty(pageMaxStr)){  
  189.                     pageMax= Integer.parseInt(pageMaxStr);  
  190.                 }  
  191.                 if(pageMax>historyMaxPage){//控制历史抓取页数  
  192.                     pageMax = historyMaxPage;  
  193.                 }  
  194.                 for(int i=1 ;i//翻页请求  
  195.                     String link = "";  
  196.                     link = pageUrl+(i+1)+".html?pindao="+pindaoId;  
  197.                     urlList.add(link);//循环处理url,翻页内容  
  198.                 }  
  199.                 //获取增量标识  
  200.                  Rs flag = CommService.checkPd(pindaoId,pageUrl,Config.SITE_HQ);  
  201.                     //初始化  
  202.                     if(flag.isNull()){  
  203.                         Rs task= new Rs("TaskInfo");  
  204.                         task.set("pindao_id", pindaoId);  
  205.                         task.set("news_id", newsIdFirst);  
  206.                         task.set("page_url", pageUrl);  
  207.                         task.set("site", Config.SITE_HQ);  
  208.                         task.set("create_time", DateUtil.fullDate());  
  209.                         CommService.save(task);   
  210.                     }  
  211.              }  
  212.         }  
  213.       } catch (Exception e) {  
  214.         e.printStackTrace();  
  215.       }  
  216.     }  
  217.     return urlList;  
  218.     }  
  219.   
  220.     public static void main(String[] args) {  
  221.         List strList=new ArrayList();  
  222.             strList.add("http://www.xxx/exclusive/?pindao="+Pindao.getKey("国际"));  
  223.             //滚动新闻  
  224.             strList.add("http://www.xxx/article/?pindao="+Pindao.getKey("国际"));  
  225.           
  226.         for(String str:strList){  
  227.             Spider.create(new HQNewsDao()).addUrl(str).thread(1).run();   
  228.         }  
  229.  }  
  230.       
  231.     //所有频道Action  
  232.     public static void runNewsList(List strList){  
  233.         for(String str:strList){  
  234.             Spider.create(new HQNewsDao()).addUrl(str).thread(1).run(); //添加爬取地址、设置线程数    
  235.         }  
  236.     }  
  237. }  

三、增量抓取代码,如下(在历史上改动):

 说明:增量每10分钟执行一次,每次只抓取最新一页数据,根据增量标识(上一次第一条新闻news_id),存在相同news_id或一页爬完就终止抓取。

[java] view plain copy
  1. package com.spider.huanqiu.task;  
  2.   
  3. import java.util.ArrayList;  
  4. import java.util.List;  
  5. import org.apache.commons.lang3.StringUtils;  
  6. import org.jdiy.core.Rs;  
  7. import org.jsoup.Jsoup;  
  8. import org.jsoup.nodes.Document;  
  9. import org.jsoup.nodes.Element;  
  10. import org.jsoup.select.Elements;  
  11. import us.codecraft.webmagic.Page;  
  12. import us.codecraft.webmagic.Site;  
  13. import us.codecraft.webmagic.Spider;  
  14. import us.codecraft.webmagic.processor.PageProcessor;  
  15. import com.spider.huasheng.history.Pindao;  
  16. import com.spider.utils.Config;  
  17. import com.spider.utils.ConfigBase;  
  18. import com.spider.utils.DateUtil;  
  19. import com.spider.utils.HttpClientUtil;  
  20. import com.spider.utils.service.CommService;  
  21.   
  22. public class HQNewsTaskDao extends ConfigBase  implements PageProcessor{  
  23.        public static final String index_list = "(.*).huanqiu.com/(.*)pindao=(.*)";  
  24.        public static String pic_dir = fun.getProValue(PINDAO_PIC_FILE_PATH);  
  25.        public static String new_id="";       
  26.       
  27.     // 部分一:抓取网站的相关配置,包括编码、抓取间隔、重试次数等  
  28.     private Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setTimeOut(6000)  
  29.                        .addHeader("Accept-Encoding""/").setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.59 Safari/537.36");  
  30.     @Override  
  31.     public Site getSite() {    
  32.         return site;  
  33.     }  
  34.       
  35.      @Override  
  36.         public void process(Page page) {  
  37.         try {  
  38.             //列表页  
  39.             if (page.getUrl().regex(index_list).match()) {  
  40.                 List Urllist =new ArrayList();  
  41.                 String url =page.getUrl().toString();  
  42.                 String pageUrl = url.substring(0,url.lastIndexOf("?"));  
  43.                 String pindaoId =url.substring(url.lastIndexOf("=")+1);  
  44.                 Rs isFlag = CommService.checkPd(pindaoId,pageUrl,Config.SITE_HQ);  
  45.                 if(!isFlag.isNull()){  
  46.                      new_id=isFlag.getString("news_id");  
  47.                 }  
  48.                 Urllist = saveNewsListData(pageUrl,pindaoId);  
  49.                 page.addTargetRequests(Urllist);  
  50.             }  
  51.         } catch (Exception e) {  
  52.             e.printStackTrace();  
  53.         }  
  54.      }  
  55.   
  56.     private List saveNewsListData(String pageUrl,String pindaoId) {  
  57.     List urlList = new ArrayList();  
  58.     Document docList = null;  
  59.     String pageListStr=HttpClientUtil.getPage(pageUrl);  
  60.     if(StringUtils.isNotEmpty(pageListStr)){  
  61.      try {  
  62.         docList = Jsoup.parse(pageListStr);  
  63.         Elements fallsFlow=docList.getElementsByClass("fallsFlow");  
  64.         if(!fallsFlow.isEmpty()){  
  65.             String newsIdFirst="";  
  66.             Boolean isIng = true;  
  67.             Elements liTag=fallsFlow.get(0).getElementsByTag("li");  
  68.             if(!liTag.isEmpty()){  
  69.                 for(int i=0;i
  70.                     String  title="",contentUrl="",newsId="",pic="",absContent="",pushTime="";  
  71.                     Element obj=liTag.get(i);  
  72.                     try{  
  73.                           contentUrl=obj.getElementsByTag("h3").select("a").attr("href");  
  74.                          if(StringUtils.isNotEmpty(contentUrl)){  
  75.                               title=obj.getElementsByTag("h3").select("a").attr("title");//标题  
  76.                               Rs isTitle = CommService.checkNewsName(title); //校验新闻标题  
  77.                                 if(!isTitle.isNull()){  
  78.                                     continue;  
  79.                                 }  
  80.                               System.err.println("---------当前抓取文章为(增量):"+title+"------------");  
  81.                               newsId =  contentUrl.substring(contentUrl.lastIndexOf("/") + 1,contentUrl.lastIndexOf(".html"));  
  82.                             if(!newsId.equals(new_id)){  
  83.                               if(!pageUrl.contains(".htm") && i == 0){  
  84.                                   newsIdFirst = newsId;  
  85.                               }  
  86.                              //图片  
  87.                              if(!obj.getElementsByTag("img").attr("src").isEmpty()){  
  88.                                     pic=obj.getElementsByTag("img").first().attr("src");  
  89.                                     if(StringUtils.isNotEmpty(pic) ){  
  90.                                         pic = fun.downloadPic(pic,pic_dir+"list/"+newsId+"/");  
  91.                                     }  
  92.                                 }  
  93.                              if(!obj.getElementsByTag("h5").isEmpty()){  
  94.                                  //简介  
  95.                                  absContent = obj.getElementsByTag("h5").first().text();  
  96.                                  if(StringUtils.isNotEmpty(absContent) && absContent.indexOf("[")>0){  
  97.                                       absContent = absContent.substring(0, absContent.indexOf("["));  
  98.                                   }   
  99.                              }  
  100.                             if(!obj.getElementsByTag("h6").isEmpty()){  
  101.                                 pushTime = obj.getElementsByTag("h6").text();  
  102.                             }  
  103.                             String hrmlStr=HttpClientUtil.getPage(contentUrl);  
  104.                             if(StringUtils.isNotEmpty(hrmlStr)){  
  105.                               Document docPage = Jsoup.parse(hrmlStr);  
  106.                               Elements pageContent = docPage.getElementsByClass("conText");  
  107.                                 if(!pageContent.isEmpty()){  
  108.                                     String comefrom = pageContent.get(0).getElementsByClass("fromSummary").text();//来源  
  109.                                     if(StringUtils.isNotEmpty(comefrom) && comefrom.contains("环球")){  
  110.                                         String author=pageContent.get(0).getElementsByClass("author").text();//作者  
  111.                                         Element contentDom = pageContent.get(0).getElementById("text");  
  112.                                         if(!contentDom.getElementsByTag("a").isEmpty()){  
  113.                                             contentDom.getElementsByTag("a").removeAttr("href");//移除外跳连接  
  114.                                         }  
  115.                                         if(!contentDom.getElementsByClass("reTopics").isEmpty()){  
  116.                                              contentDom.getElementsByClass("reTopics").remove();//推荐位  
  117.                                         }  
  118.                                         if(!contentDom.getElementsByClass("spTopic").isEmpty()){  
  119.                                            contentDom.getElementsByClass("spTopic").remove();   
  120.                                        }  
  121.                                         if(!contentDom.getElementsByClass("editorSign").isEmpty()){  
  122.                                            contentDom.getElementsByClass("editorSign").remove();//移除编辑   
  123.                                        }  
  124.                                         String content = contentDom.toString();  
  125.                                         if(!StringUtils.isEmpty(content)){  
  126.                                             content = content.replaceAll("\r\n|\r|\n|\t|\b|~|\f""");//去掉回车换行符  
  127.                                             content = replaceForNews(content,pic_dir+"article/"+newsId+"/");//替换内容中的图片  
  128.                                             while (true) {  
  129.                                                  if(content.indexOf("")>0){  
  130.                                                           String moveContent= content.substring(content.indexOf("")+3);//去除注释  
  131.                                                           content = content.replace(moveContent, "");  
  132.                                                             }  
  133.                                                          if(content.indexOf(") >0 && content.lastIndexOf("")>0){  
  134.                                                           String moveContent= content.substring(content.indexOf("), content.indexOf("")+9);//去除JS  
  135.                                                           content = content.replace(moveContent, "");  
  136.                                                           }  
  137.                                                }  
  138.                                             }  
  139.                                         }  
  140.                                         if(StringUtils.isNotEmpty(content) && StringUtils.isNotEmpty(title)){  
  141.                                             Rs news= new Rs("News");  
  142.                                             news.set("title", title);  
  143.                                             news.set("shortTitle",title);  
  144.                                             news.set("beizhu",absContent);  
  145.                                             news.set("savetime", pushTime);  
  146.                                             if(StringUtils.isNotEmpty(pic)){  
  147.                                                 news.set("path", pic);  
  148.                                                 news.set("mini_image", pic);  
  149.                                             }  
  150.                                             news.set("pindaoId", pindaoId);  
  151.                                             news.set("status"1);//不显示  
  152.                                             news.set("canComment"1);//是否被评论  
  153.                                             news.set("syn"1);//是否异步  
  154.                                             news.set("type"1);//是否异步  
  155.                                             news.set("comefrom",comefrom);  
  156.                                             news.set("author", author);  
  157.                                             news.set("content", content);  
  158.                                             news.set("content2", content);  
  159.                                             CommService.save(news);  
  160.                                         }  
  161.                                       }  
  162.                                   }  
  163.                             }  
  164.                             }else{  
  165.                                 isIng=false;  
  166.                                 break;  
  167.                             }  
  168.                               }  
  169.                     }catch (Exception e) {  
  170.                         e.printStackTrace();    
  171.                     }  
  172.                 }  
  173.             }  
  174.             if(!pageUrl.contains(".htm")){  
  175.                 //增量标识  
  176.                  Rs flag = CommService.checkPd(pindaoId,pageUrl,Config.SITE_HQ);  
  177.                     //初始化  
  178.                     if(flag.isNull()){  
  179.                         Rs task= new Rs("TaskInfo");  
  180.                         task.set("pindao_id", pindaoId);  
  181.                         task.set("news_id", newsIdFirst);  
  182.                         task.set("page_url", pageUrl);  
  183.                         task.set("site", Config.SITE_HQ);  
  184.                         task.set("create_time", DateUtil.fullDate());  
  185.                         CommService.save(task);   
  186.                     }else if(StringUtils.isNotEmpty(newsIdFirst)){  
  187.                         flag.set("news_id", newsIdFirst);  
  188.                         flag.set("update_time", DateUtil.fullDate());  
  189.                         CommService.save(flag);  
  190.                     }  
  191.              }  
  192.         }  
  193.       } catch (Exception e) {  
  194.         e.printStackTrace();  
  195.       }  
  196.     }  
  197.     return urlList;  
  198.     }  
  199.   
  200.     public static void main(String[] args) {  
  201.         List strList=new ArrayList();  
  202.             strList.add("http://www.xxx/exclusive/?pindao="+Pindao.getKey("国际"));  
  203.             //滚动新闻  
  204.             strList.add("http://www.xxx/article/?pindao="+Pindao.getKey("国际"));  
  205.           
  206.         for(String str:strList){  
  207.             Spider.create(new HQNewsTaskDao()).addUrl(str).thread(1).run();   
  208.         }  
  209.  }  
  210.       
  211.     //所有频道Action  
  212.     public static void runNewsList(List strList){  
  213.         for(String str:strList){  
  214.             Spider.create(new HQNewsTaskDao()).addUrl(str).thread(1).run();       
  215.         }  
  216.     }  
  217. }  


四、定时抓取,配置如下:
           1、在web.xml中配置监听器
[java] view plain copy
  1.   
  2. <listener>  
  3.     <listener-class>com.spider.utils.AutoRun</listener-class>  
  4. </listener>  
        2、定时代码

       

[java] view plain copy
  1. package com.spider.utils;  
  2.   
  3. import java.util.concurrent.Executors;  
  4. import java.util.concurrent.ScheduledExecutorService;  
  5. import java.util.concurrent.TimeUnit;  
  6. import javax.servlet.ServletContextEvent;  
  7. import javax.servlet.ServletContextListener;  
  8. import com.spider.huanqiu.timer.HQJob1;  
  9. import com.spider.huanqiu.timer.HQJob2;  
  10. import com.spider.huanqiu.timer.HQJob3;  
  11. import com.spider.huanqiu.timer.HQJob4;  
  12. import com.spider.huasheng.timer.HSJob1;  
  13. import com.spider.huasheng.timer.HSJob2;  
  14. /** 
  15.  * 描        述:监听增量抓取Job 
  16.  * 创建时间:2016-11-4 
  17.  * @author Jibaole 
  18.  */  
  19. public class AutoRun implements ServletContextListener {   
  20.     
  21.   public void contextInitialized(ServletContextEvent event) {   
  22.       ScheduledExecutorService scheduExec =  Executors.newScheduledThreadPool(6);  
  23.     /*  
  24.      * 这里开始循环执行 HSJob()方法了  
  25.      * schedule(param1, param2,param3)这个函数的三个参数的意思分别是:  
  26.      *    param1:你要执行的方法;param2:延迟执行的时间,单位毫秒;param3:循环间隔时间,单位毫秒  
  27.      */   
  28.     scheduExec.scheduleAtFixedRate(new HSJob1(), 1*1000*60,1000*60*10,TimeUnit.MILLISECONDS);  //延迟1分钟,设置没10分钟执行一次  
  29.     scheduExec.scheduleAtFixedRate(new HSJob2(), 3*1000*60,1000*60*10,TimeUnit.MILLISECONDS);  //延迟3分钟,设置没10分钟执行一次  
  30.       
  31.     scheduExec.scheduleAtFixedRate(new HQJob1(), 5*1000*60,1000*60*10,TimeUnit.MILLISECONDS);  //延迟5分钟,设置没10分钟执行一次  
  32.     scheduExec.scheduleAtFixedRate(new HQJob2(), 7*1000*60,1000*60*10,TimeUnit.MILLISECONDS);  //延迟7分钟,设置没10分钟执行一次  
  33.     scheduExec.scheduleAtFixedRate(new HQJob3(), 9*1000*60,1000*60*14,TimeUnit.MILLISECONDS);  //延迟9分钟,设置没10分钟执行一次  
  34.     scheduExec.scheduleAtFixedRate(new HQJob4(), 11*1000*60,1000*60*10,TimeUnit.MILLISECONDS);  //延迟11分钟,设置没10分钟执行一次  
  35.   }   
  36.   public void contextDestroyed(ServletContextEvent event) {   
  37.       System.out.println("=======timer销毁==========");  
  38.     //timer.cancel();   
  39.   }   
  40. }   

       3、具体执行业务(举一个例子)

         

[java] view plain copy
  1. package com.spider.huasheng.timer;  
  2.   
  3. import java.util.ArrayList;  
  4. import java.util.List;  
  5. import java.util.TimerTask;  
  6. import com.spider.huasheng.task.HSTaskDao;  
  7. import com.spider.huasheng.task.HSTaskDao1;  
  8. import com.spider.huasheng.task.HSTaskDao2;  
  9.   
  10. /** 
  11.  * 描        述:国际、社会、国内、评论等频道定时任务 
  12.  * 创建时间:2016-11-9 
  13.  * @author Jibaole 
  14.  */  
  15. public class HSJob1 implements Runnable{   
  16.     @Override   
  17.     public void run() {   
  18.         System.out.println("======>>>开始:xxx-任务1====");  
  19.   try {   
  20.       runNews();  
  21.         runNews1();  
  22.         runNews2();  
  23.      } catch (Throwable t) {    
  24.          System.out.println("Error");    
  25.      }   
  26.        System.out.println("======xxx-任务1>>>结束!!!====");  
  27.     }   
  28.     /** 
  29.      * 抓取-新闻 频道列表 
  30.      */  
  31.     public void runNews(){  
  32.         List strList=new ArrayList();  
  33.         /**##############>>>16、国际<<<##################*/  
  34.         //国际视野  
  35.         strList.add("http://xxx/class/2199.html?pindao=国际");  
  36.           
  37.         /**##############>>>17、社会<<<##################*/  
  38.         //社会  
  39.         strList.add("http://xxx/class/2200.html?pindao=社会");  
  40.           
  41.         /**##############>>>18、国内<<<##################*/  
  42.         //国内动态  
  43.         strList.add("http://xxx/class/1922.html?pindao=国内");  
  44.         HQNewsTaskDao.runNewsList(strList);  
  45.     }  
  46.       
  47.     /** 
  48.      * 抓取-新闻 频道列表 
  49.      */  
  50.     public void runNews1(){  
  51.         List strList=new ArrayList();  
  52.         /**##############>>>19、评论<<<##################*/  
  53.         //华声视点  
  54.         strList.add("http://xxx/class/709.html?pindao=评论");  
  55.         //财经观察  
  56.         strList.add("http://xxx/class/2557.html?pindao=评论");  
  57.         /**##############>>>20、军事<<<##################*/  
  58.         //军事  
  59.         strList.add("http://xxx/class/2201.html?pindao=军事");  
  60.         HQNewsTaskDao.runNewsList(strList);  
  61.     }  
  62.     /** 
  63.      * 抓取-新闻 频道列表 
  64.      */  
  65.     public void runNews2(){  
  66.         List strList=new ArrayList();  
  67.         /**##############>>>24、财经<<<##################*/  
  68.         //财讯  
  69.         strList.add("http://xxx/class/2353.html?pindao=财经");  
  70.         //经济观察  
  71.         strList.add("http://xxx/class/2348.html?pindao=财经");  
  72.         /**##############>>>30、人文<<<##################*/  
  73.         //历史上的今天  
  74.         strList.add("http://xxx/class/1313.html?pindao=人文");  
  75.         //正史风云  
  76.         strList.add("http://xxx/class/1362.html?pindao=人文");  
  77.         HSTaskDao2.runNewsList(strList);  
  78.     }  
  79. }  



 五、使用到的工具类

       1、HttpClientUtil工具类

[java] view plain copy
  1. package com.spider.utils;  
  2.   
  3.   
  4. import java.io.BufferedReader;  
  5. import java.io.File;  
  6. import java.io.IOException;  
  7. import java.io.InputStreamReader;  
  8. import java.net.URL;  
  9. import java.util.ArrayList;  
  10. import java.util.List;  
  11. import java.util.Map;  
  12. import org.apache.commons.httpclient.HttpClient;  
  13. import org.apache.commons.httpclient.HttpStatus;  
  14. import org.apache.commons.httpclient.methods.GetMethod;  
  15. import org.apache.http.HttpEntity;  
  16. import org.apache.http.NameValuePair;  
  17. import org.apache.http.client.config.RequestConfig;  
  18. import org.apache.http.client.entity.UrlEncodedFormEntity;  
  19. import org.apache.http.client.methods.CloseableHttpResponse;  
  20. import org.apache.http.client.methods.HttpGet;  
  21. import org.apache.http.client.methods.HttpPost;  
  22. import org.apache.http.conn.ssl.DefaultHostnameVerifier;  
  23. import org.apache.http.conn.util.PublicSuffixMatcher;  
  24. import org.apache.http.conn.util.PublicSuffixMatcherLoader;  
  25. import org.apache.http.entity.ContentType;  
  26. import org.apache.http.entity.StringEntity;  
  27. import org.apache.http.entity.mime.MultipartEntityBuilder;  
  28. import org.apache.http.entity.mime.content.FileBody;  
  29. import org.apache.http.entity.mime.content.StringBody;  
  30. import org.apache.http.impl.client.CloseableHttpClient;  
  31. import org.apache.http.impl.client.HttpClients;  
  32. import org.apache.http.message.BasicNameValuePair;  
  33. import org.apache.http.util.EntityUtils;  
  34.   
  35.   
  36. public class HttpClientUtil {  
  37.     private final static String charset = "UTF-8";  
  38.     private RequestConfig requestConfig = RequestConfig.custom().setSocketTimeout(15000)  
  39.             .setConnectTimeout(15000)  
  40.             .setConnectionRequestTimeout(15000)  
  41.             .build();  
  42.       
  43.     private static HttpClientUtil instance = null;  
  44.     private HttpClientUtil(){}  
  45.     public static HttpClientUtil getInstance(){  
  46.         if (instance == null) {  
  47.             instance = new HttpClientUtil();  
  48.         }  
  49.         return instance;  
  50.     }  
  51.       
  52.     /** 
  53.      * 发送 post请求 
  54.      * @param httpUrl 地址 
  55.      */  
  56.     public String sendHttpPost(String httpUrl) {  
  57.         HttpPost httpPost = new HttpPost(httpUrl);// 创建httpPost    
  58.         return sendHttpPost(httpPost);  
  59.     }  
  60.       
  61.     /** 
  62.      * 发送 post请求 
  63.      * @param httpUrl 地址 
  64.      * @param params 参数(格式:key1=value1&key2=value2) 
  65.      */  
  66.     public String sendHttpPost(String httpUrl, String params) {  
  67.         HttpPost httpPost = new HttpPost(httpUrl);// 创建httpPost    
  68.         try {  
  69.             //设置参数  
  70.             StringEntity stringEntity = new StringEntity(params, "UTF-8");  
  71.             stringEntity.setContentType("application/x-www-form-urlencoded");   
  72.             httpPost.setEntity(stringEntity);  
  73.         } catch (Exception e) {  
  74.             e.printStackTrace();  
  75.         }  
  76.         return sendHttpPost(httpPost);  
  77.     }  
  78.        
  79.     /** 
  80.      * 发送 post请求 
  81.      * @param httpUrl 地址 
  82.      * @param maps 参数 
  83.      */  
  84.     public String sendHttpPost(String httpUrl, Map maps) {  
  85.         HttpPost httpPost = new HttpPost(httpUrl);// 创建httpPost    
  86.         httpPost.setHeader("Content-Type","application/x-www-form-urlencoded;charset="+charset);  
  87.         httpPost.setHeader("User-Agent","Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.");  
  88.         // 创建参数队列    
  89.         List nameValuePairs = new ArrayList();  
  90.         for (String key : maps.keySet()) {  
  91.             nameValuePairs.add(new BasicNameValuePair(key, maps.get(key)));  
  92.         }  
  93.         try {  
  94.             httpPost.setEntity(new UrlEncodedFormEntity(nameValuePairs, "UTF-8"));  
  95.         } catch (Exception e) {  
  96.             e.printStackTrace();  
  97.         }  
  98.         return sendHttpPost(httpPost);  
  99.     }  
  100.       
  101.       
  102.     /** 
  103.      * 发送 post请求(带文件) 
  104.      * @param httpUrl 地址 
  105.      * @param maps 参数 
  106.      * @param fileLists 附件 
  107.      */  
  108.     public String sendHttpPost(String httpUrl, Map maps, List fileLists) {  
  109.         HttpPost httpPost = new HttpPost(httpUrl);// 创建httpPost  
  110.         MultipartEntityBuilder meBuilder = MultipartEntityBuilder.create();  
  111.         for (String key : maps.keySet()) {  
  112.             meBuilder.addPart(key, new StringBody(maps.get(key), ContentType.TEXT_PLAIN));  
  113.         }  
  114.         for(File file : fileLists) {  
  115.             FileBody fileBody = new FileBody(file);  
  116.             meBuilder.addPart("files", fileBody);  
  117.         }  
  118.         HttpEntity reqEntity = meBuilder.build();  
  119.         httpPost.setEntity(reqEntity);  
  120.         return sendHttpPost(httpPost);  
  121.     }  
  122.       
  123.     /** 
  124.      * 发送Post请求 
  125.      * @param httpPost 
  126.      * @return 
  127.      */  
  128.     private String sendHttpPost(HttpPost httpPost) {  
  129.         CloseableHttpClient httpClient = null;  
  130.         CloseableHttpResponse response = null;  
  131.         HttpEntity entity = null;  
  132.         String responseContent = null;  
  133.         try {  
  134.             // 创建默认的httpClient实例.  
  135.             httpClient = HttpClients.createDefault();  
  136.             httpPost.setConfig(requestConfig);  
  137.             // 执行请求  
  138.             response = httpClient.execute(httpPost);  
  139.             entity = response.getEntity();  
  140.             responseContent = EntityUtils.toString(entity, "UTF-8");  
  141.         } catch (Exception e) {  
  142.             e.printStackTrace();  
  143.         } finally {  
  144.             try {  
  145.                 // 关闭连接,释放资源  
  146.                 if (response != null) {  
  147.                     response.close();  
  148.                 }  
  149.                 if (httpClient != null) {  
  150.                     httpClient.close();  
  151.                 }  
  152.             } catch (IOException e) {  
  153.                 e.printStackTrace();  
  154.             }  
  155.         }  
  156.         return responseContent;  
  157.     }  
  158.   
  159.     /** 
  160.      * 发送 get请求 
  161.      * @param httpUrl 
  162.      */  
  163.     public String sendHttpGet(String httpUrl) {  
  164.         HttpGet httpGet = new HttpGet(httpUrl);// 创建get请求  
  165.         return sendHttpGet(httpGet);  
  166.     }  
  167.   
  168.     /** 
  169.      * 发送 get请求Https 
  170.      * @param httpUrl 
  171.      */  
  172.     public String sendHttpsGet(String httpUrl) {  
  173.         HttpGet httpGet = new HttpGet(httpUrl);// 创建get请求  
  174.         return sendHttpsGet(httpGet);  
  175.     }  
  176.       
  177.     /** 
  178.      * 发送Get请求 
  179.      * @param httpPost 
  180.      * @return 
  181.      */  
  182.     private String sendHttpGet(HttpGet httpGet) {  
  183.         CloseableHttpClient httpClient = null;  
  184.         CloseableHttpResponse response = null;  
  185.         HttpEntity entity = null;  
  186.         String responseContent = null;  
  187.         try {  
  188.             // 创建默认的httpClient实例.  
  189.             httpClient = HttpClients.createDefault();  
  190.             httpGet.setConfig(requestConfig);  
  191.             // 执行请求  
  192.             response = httpClient.execute(httpGet);  
  193.             entity = response.getEntity();  
  194.             responseContent = EntityUtils.toString(entity, "UTF-8");  
  195.         } catch (Exception e) {  
  196.             e.printStackTrace();  
  197.         } finally {  
  198.             try {  
  199.                 // 关闭连接,释放资源  
  200.                 if (response != null) {  
  201.                     response.close();  
  202.                 }  
  203.                 if (httpClient != null) {  
  204.                     httpClient.close();  
  205.                 }  
  206.             } catch (IOException e) {  
  207.                 e.printStackTrace();  
  208.             }  
  209.         }  
  210.         return responseContent;  
  211.     }  
  212.       
  213.     /** 
  214.      * 发送Get请求Https 
  215.      * @param httpPost 
  216.      * @return 
  217.      */  
  218.     private String sendHttpsGet(HttpGet httpGet) {  
  219.         CloseableHttpClient httpClient = null;  
  220.         CloseableHttpResponse response = null;  
  221.         HttpEntity entity = null;  
  222.         String responseContent = null;  
  223.         try {  
  224.             // 创建默认的httpClient实例.  
  225.             PublicSuffixMatcher publicSuffixMatcher = PublicSuffixMatcherLoader.load(new URL(httpGet.getURI().toString()));  
  226.             DefaultHostnameVerifier hostnameVerifier = new DefaultHostnameVerifier(publicSuffixMatcher);  
  227.             httpClient = HttpClients.custom().setSSLHostnameVerifier(hostnameVerifier).build();  
  228.             httpGet.setConfig(requestConfig);  
  229.             // 执行请求  
  230.             response = httpClient.execute(httpGet);  
  231.             entity = response.getEntity();  
  232.             responseContent = EntityUtils.toString(entity, "UTF-8");  
  233.         } catch (Exception e) {  
  234.             e.printStackTrace();  
  235.         } finally {  
  236.             try {  
  237.                 // 关闭连接,释放资源  
  238.                 if (response != null) {  
  239.                     response.close();  
  240.                 }  
  241.                 if (httpClient != null) {  
  242.                     httpClient.close();  
  243.                 }  
  244.             } catch (IOException e) {  
  245.                 e.printStackTrace();  
  246.             }  
  247.         }  
  248.         return responseContent;  
  249.     }  
  250.       
  251.     /** 
  252.      * 利用httpClient获取页面 
  253.      * @param url 
  254.      * @return 
  255.      */  
  256.      public static String getPage(String url){  
  257.          String result="";  
  258.         HttpClient httpClient = new HttpClient();  
  259.         GetMethod getMethod = new GetMethod(url+"?date=" + new Date().getTime());//加时间戳,防止页面缓存  
  260.         try {  
  261.             int statusCode = httpClient.executeMethod(getMethod);  
  262.             httpClient.setTimeout(5000);  
  263.             httpClient.setConnectionTimeout(5000);  
  264.             if (statusCode != HttpStatus.SC_OK) {  
  265.                 System.err.println("Method failed: "+ getMethod.getStatusLine());  
  266.             }  
  267.               
  268.             // 读取内容  
  269.             //byte[] responseBody = getMethod.getResponseBody();  
  270.             BufferedReader reader = new BufferedReader(new InputStreamReader(getMethod.getResponseBodyAsStream()));    
  271.             StringBuffer stringBuffer = new StringBuffer();    
  272.             String str = "";    
  273.             while((str = reader.readLine())!=null){    
  274.                 stringBuffer.append(str);    
  275.             }    
  276.             // 处理内容  
  277.             result = stringBuffer.toString();  
  278.         } catch (Exception e) {  
  279.             System.err.println("页面无法访问");  
  280.         }  
  281.         getMethod.releaseConnection();  
  282.         return result;  
  283.   }  
  284. }  
2、下载图片方法

 

[java] view plain copy
  1. /** 
  2.      * 下载图片到本地 
  3.      * @param picUrl 图片Url 
  4.      * @param localPath 本地保存图片地址 
  5.      * @return 
  6.      */  
  7.     public String downloadPic(String picUrl,String localPath){  
  8.         String filePath = null;  
  9.         String url = null;  
  10.         try {    
  11.             URL httpurl = new URL(picUrl);    
  12.             String fileName = getFileNameFromUrl(picUrl);    
  13.             filePath = localPath + fileName;  
  14.             File f = new File(filePath);    
  15.             FileUtils.copyURLToFile(httpurl, f);   
  16.             Function fun = new Function();  
  17.             url = filePath.replace("/www/web/imgs", fun.getProValue("IMG_PATH"));  
  18.         } catch (Exception e) {    
  19.             logger.info(e);    
  20.             return null;    
  21.         }   
  22.         return url;  
  23.     }  

          3、替换资讯内容图片方法

         

[java] view plain copy
  1. /** 
  2.      * 替换内容中图片地址为本地地址 
  3.      * @param content html内容 
  4.      * @param pic_dir 本地地址文件路径 
  5.      * @return html内容 
  6.      */  
  7.     public static String replaceForNews(String content,String pic_dir){  
  8.         String str = content;  
  9.         String cont = content;  
  10.         while (true) {  
  11.             int i = str.indexOf("src=\"");  
  12.             if (i != -1) {  
  13.                 str = str.substring(i+5, str.length());  
  14.                 int j = str.indexOf("\"");  
  15.                 String pic_url = str.substring(0, j);  
  16.                 //下载图片到本地并返回图片地址  
  17.                 String pic_path = fun.downloadPicForNews(pic_url,pic_dir);  
  18.                 if(StringUtils.isNotEmpty(pic_url) && StringUtils.isNotEmpty(pic_path)){  
  19.                 cont = cont.replace(pic_url, pic_path);  
  20.                 str = str.substring(j,str.length());  
  21.                 }  
  22.             } else{  
  23.                 break;  
  24.             }  
  25.         }  
  26.         return cont;  
  27.     }  

         
[java] view plain copy
  1. /** 
  2.      * 下载图片到本地 
  3.      * @param picUrl 图片Url 
  4.      * @param localPath 本地保存图片地址 
  5.      * @return 
  6.      */  
  7.     public String downloadPicForNews(String picUrl,String localPath){  
  8.         String filePath = "";  
  9.         String url = "";  
  10.         try {    
  11.             URL httpurl = new URL(picUrl);  
  12.            HttpURLConnection urlcon = (HttpURLConnection) httpurl.openConnection();  
  13.            urlcon.setReadTimeout(3000);  
  14.            urlcon.setConnectTimeout(3000);  
  15.            int state = urlcon.getResponseCode(); //图片状态  
  16.            if(state == 200){  
  17.                String fileName = getFileNameFromUrl(picUrl);    
  18.                filePath = localPath + fileName;  
  19.                File f = new File(filePath);    
  20.                FileUtils.copyURLToFile(httpurl, f);   
  21.                Function fun = new Function();  
  22.                url = filePath.replace("/www/web/imgs", fun.getProValue("IMG_PATH"));  
  23.            }  
  24.         } catch (Exception e) {    
  25.             logger.info(e);    
  26.             return null;    
  27.         }   
  28.         return url;  
  29.     }  

     获取文件名称,根据时间戳自定义
[java] view plain copy
  1. /** 
  2.      * 根据url获取文件名 
  3.      * @param url  
  4.      * @return 文件名 
  5.      */  
  6.     public static String getFileNameFromUrl(String url){    
  7.         //获取后缀  
  8.         String sux = url.substring(url.lastIndexOf("."));  
  9.         if(sux.length() > 4){  
  10.             sux = ".jpg";  
  11.         }  
  12.         int i = (int)(Math.random()*1000);  
  13.         //随机时间戳文件名称  
  14.         String name = new Long(System.currentTimeMillis()).toString()+ i + sux;   
  15.         return name;    
  16.     }  


  




五、遇到的坑
  1、增量抓取经常遇到这2个异常,如下
        抓取超时:原先由 Jsoup 直接获取页面内容,改为由 HttpClient 获取页面内容,Jsoup 只负责解析

   页面gzip异常(这个问题特别坑,导致历史、增量抓取数据严重缺失,线上一直有问题)



     解决方案:
                      增加:Site.addHeader("Accept-Encoding", "/")

                      这个是WebMagic的框架源码有点小Bug,如果没有设置Header,默认页面Accept-Encoding为:gzip

      

 
   
  2、定时抓取
     由ScheduledExecutorService多线程并行执行任务,替换Timer单线程串行

      原方式代码,如下:

[java] view plain copy
  1. package com.spider.utils;  
  2.   
  3. import java.util.Timer;   
  4. import javax.servlet.ServletContextEvent;  
  5. import javax.servlet.ServletContextListener;  
  6. import com.spider.huanqiu.timer.HQJob1;  
  7. import com.spider.huanqiu.timer.HQJob2;  
  8. import com.spider.huanqiu.timer.HQJob3;  
  9. import com.spider.huanqiu.timer.HQJob4;  
  10. import com.spider.huasheng.timer.HSJob1;  
  11. import com.spider.huasheng.timer.HSJob2;  
  12. /** 
  13.  * 描    述:监听增量抓取Job 
  14.  * 创建时间:2016-11-4 
  15.  * @author Jibaole 
  16.  */  
  17. public class AutoRun implements ServletContextListener {   
  18.   //HS-job   
  19.   private Timer hsTimer1 = null;   
  20.   private Timer hsTimer2 = null;   
  21.   //HQZX-job  
  22.   private Timer hqTimer1 = null;   
  23.   private Timer hqTimer2 = null;   
  24.   private Timer hqTimer3 = null;   
  25.   private Timer hqTimer4 = null;   
  26.     
  27.   public void contextInitialized(ServletContextEvent event) {   
  28.     hsTimer1 = new Timer(true);   
  29.     hsTimer2 = new Timer(true);  
  30.       
  31.     hqTimer1 = new Timer(true);  
  32.     hqTimer2 = new Timer(true);  
  33.     hqTimer3 = new Timer(true);  
  34.     hqTimer4 = new Timer(true);  
  35.     /*  
  36.      * 这里开始循环执行 HSJob()方法了  
  37.      * schedule(param1, param2,param3)这个函数的三个参数的意思分别是:  
  38.      *    param1:你要执行的方法;param2:延迟执行的时间,单位毫秒;param3:循环间隔时间,单位毫秒  
  39.      */   
  40.     hsTimer1.scheduleAtFixedRate(new HSJob1(), 1*1000*60,1000*60*10);  //延迟1分钟,设置没10分钟执行一次  
  41.     hsTimer2.scheduleAtFixedRate(new HSJob2(), 3*1000*60,1000*60*10);  //延迟3分钟,设置没10分钟执行一次  
  42.       
  43.     hqTimer1.scheduleAtFixedRate(new HQJob1(), 5*1000*60,1000*60*10);  //延迟5分钟,设置没10分钟执行一次  
  44.     hqTimer2.scheduleAtFixedRate(new HQJob2(), 7*1000*60,1000*60*10);  //延迟7分钟,设置没10分钟执行一次  
  45.     hqTimer3.scheduleAtFixedRate(new HQJob3(), 9*1000*60,1000*60*10);  //延迟9分钟,设置没10分钟执行一次  
  46.     hqTimer4.scheduleAtFixedRate(new HQJob4(), 11*1000*60,1000*60*10);  //延迟11分钟,设置没10分钟执行一次     
  47.   }   
  48.   public void contextDestroyed(ServletContextEvent event) {   
  49.       System.out.println("=======timer销毁==========");  
  50.     //timer.cancel();   
  51.   }   
  52. }   


3、定时多个任务时,使用多线程,遇到某个线程抛异常终止任务

     解决方案:在多线程run()方法里面,增加try{}catch{}

4、通过HttpClient定时获取页面内容时,页面缓存,抓不到最新内容

     解决方案:在工具类请求URL地址后面增加:url+"?date=" + new Date().getTime()

   
六、一些方面的处理    
  1、页面抓取规则调整
     先抓列表,再抓内容;改为 抓取列表的同时,需要获取内容详情
  2、保存数据方式作调整
      先抓取标题等概要信息,保存数据库,然后,更新内容信息,根据业务需求再删除一些非来源文章(版权问题);改为:直接控制来源,得到完整数据,再做批量保存;
 3、页面有一个不想要的内容,处理方法
       注释、JS代码、移除无用标签块  

你可能感兴趣的:(网络爬虫)