Java 爬虫爬取网站数据实例

package com.zzger.model;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.CountDownLatch;

import com.zzger.module.queue.UrlQueue;
import com.zzger.util.HttpUtils;
import com.zzger.util.RegexUtils;

013 public class WebSite {
014   
015     /**
016      * 站点url
017      */
018     private String url;
019       
020     /**
021      * 需要爬行的url队列
022      */
023     private UrlQueue urls = new UrlQueue<>();
024       
025     /**
026      * 已爬行过的页面url
027      */
028     private List exitUrls = Collections.synchronizedList(new ArrayList<>());
029       
030     private static final int TOTAL_THREADS = 12
031       
032     private final CountDownLatch mStartSignal = new CountDownLatch(1); 
033       
034     private final CountDownLatch mDoneSignal = new CountDownLatch(TOTAL_THREADS);  
035       
036     public WebSite(String url){
037         this.url = url;
038         urls.offer(url);//把网站首页加入需要爬行的队列中
039     }
040       
041     public void guangDu(){
042         new Thread(new Runnable() {
043             @Override
044             public void run() {
045                 paxing(HttpUtils.httpGet(url));
046             }
047         }).start();
048     }
049       
050     public void paxing(String html){
051         if(html.lastIndexOf("下一页")<0)    return ;
052         String strList = html.substring(html.indexOf("
  • next-page\\">"),
  • 053                 html.lastIndexOf("下一页"));
    054         String url = RegexUtils.RegexString("(.+?)\\"", strList);
    055         if(url.equals("Nothing")) return ;
    056         urls.put(url);//把url存储到队列中
    057         paxing(HttpUtils.httpGet(url));
    058     }
    059       
    060     public void dxcPx(){
    061         Page page = new Gxpage(urls.take());
    062         List> list = page.ybhqSection().getSections();
    063         for(Section section : list){
    064             new Thread(new Runnable() {
    065                 @Override
    066                 public void run() {
    067                     mStartSignal.countDown();// 计数减一为0,工作线程真正启动具体操作  
    068                     try {
    069                         mStartSignal.await();// 阻塞,等待mStartSignal计数为0运行后面的代码  
    070                         // 所有的工作线程都在等待同一个启动的命令  
    071                     catch (InterruptedException e) {
    072                         e.printStackTrace();
    073                     }
    074                     DuanZi duanzi = section.select().getModel();
    075                     System.out.println(duanzi.getTitle());
    076                     mDoneSignal.countDown();// 完成以后计数减一  
    077                 }
    078             }
    079             ).start();
    080         }
    081         try
    082         
    083             mDoneSignal.await();// 等待所有工作线程结束  
    084         
    085         catch (InterruptedException e) 
    086         
    087             e.printStackTrace(); 
    088         "vertical-align: inherit;">"vertical-align: inherit;">
    089         dxcPx(); //线程任务执行完后,再次获取URL队列进行任务
    090     }"vertical-align: inherit;">"vertical-align: inherit;">
    091     public static void main(String [] args){"vertical-align: inherit;">"vertical-align: inherit;">
    092         WebSite web = new WebSite(“https://www.bdqnhyq.com”);
    093         web.guangDu();"vertical-align: inherit;">"vertical-align: inherit;">
    094         forint i = 0; i <10; i ++){"vertical-align: inherit;">"vertical-align: inherit;">
    095             新线程(new Runnable(){"vertical-align: inherit;">"vertical-align: inherit;">
    096                 @覆盖"vertical-align: inherit;">"vertical-align: inherit;">
    097                 public void run(){"vertical-align: inherit;">"vertical-align: inherit;">
    098                     web.dxcPx();"vertical-align: inherit;">"vertical-align: inherit;">
    099                 }"vertical-align: inherit;">"vertical-align: inherit;">
    100             })。开始();"vertical-align: inherit;">"vertical-align: inherit;">
    101         }
    102           "vertical-align: inherit;">"vertical-align: inherit;">
    103     }"vertical-align: inherit;">"vertical-align: inherit;">
    104 }"vertical-align: inherit;">"vertical-align: inherit;">

    文章来源: 北大青鸟 开发小组

    你可能感兴趣的:(java爬虫爬取网站数据实例)