webcollector爬虫demo

由于我们公司第二季度亏了7-8亿,所以项目组没有多余的资金让我们去正规渠道买数据。然后我就走上了一条爬虫的不归路。

其实Java爬虫有很多开源的框架,这边我选择的是webcollector这个中小型的框架(官网:https://github.com/CrawlScript/WebCollector,教程文档:http://datahref.com/archives/category/webcollector%E6%95%99%E7%A8%8B)


爬虫新手一只,现在我把代码贴出来,我们共同学习:



import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.net.HttpRequest;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import cn.edu.hfut.dmic.webcollector.util.CharsetDetector;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Small demo crawler that extracts flight numbers from the pages reachable from
 * http://www.variflight.com/sitemap.html?AE71649A58c77=
 *
 * Flight numbers are taken from anchor links of the form
 * http://www.variflight.com/flight/fnum/&lt;FLIGHT&gt;.html
 */
public class DemoPostCrawlerTest extends BreadthCrawler {

    /** URL prefix that marks a flight-detail link; the flight number directly follows it. */
    private static final String FLIGHT_URL_PREFIX = "http://www.variflight.com/flight/fnum/";

    /**
     * @param crawlPath directory where WebCollector persists its crawl state
     * @param autoParse whether WebCollector should automatically extract and follow links
     */
    public DemoPostCrawlerTest(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);
    }

    /**
     * Re-downloads the visited page with HttpClient, guesses its charset,
     * parses it with jsoup and prints every flight number found in its links.
     *
     * NOTE(review): WebCollector has already downloaded this page once; using the
     * page content it provides would avoid this second HTTP request — confirm
     * against the WebCollector 2.x API before changing.
     *
     * @param page the page WebCollector just fetched (only its URL is used here)
     * @param next queue of follow-up URLs (unused; autoParse handles link discovery)
     */
    @Override
    public void visit(Page page, CrawlDatums next) {
        String url = page.getUrl();
        // try-with-resources: the original leaked one CloseableHttpClient per visited page.
        try (CloseableHttpClient client = HttpClients.createDefault()) {
            HttpGet get = new HttpGet(url);
            HttpResponse response = client.execute(get);
            HttpEntity entity = response.getEntity();
            byte[] content = EntityUtils.toByteArray(entity);
            // Guess the charset from the raw bytes; the server may not declare one.
            String charset = CharsetDetector.guessEncoding(content);
            String html = new String(content, charset);
            Document doc = Jsoup.parse(html, url);
            for (Element link : doc.select("a[href]")) {
                String href = link.attr("abs:href");
                if (href.startsWith(FLIGHT_URL_PREFIX)) {
                    // ".../flight/fnum/MU123.html" -> "MU123"
                    String flightNo = href.substring(FLIGHT_URL_PREFIX.length()).split("\\.")[0];
                    System.out.println(flightNo);
                }
            }
        } catch (Exception e) {
            // The original swallowed every error silently; at least report the failing URL.
            System.err.println("Failed to process " + url + ": " + e);
        }
    }

    public static void main(String[] args) throws Exception {
        DemoPostCrawlerTest crawler = new DemoPostCrawlerTest("crawl", true);
        crawler.setThreads(20);
        crawler.addSeed("http://www.variflight.com/sitemap.html?AE71649A58c77=");
        crawler.start(3); // crawl up to 3 levels deep from the seed
    }
}


pom.xml 依赖（原文的 XML 标签在排版时丢失，此处已补全）:

<dependencies>
    <dependency>
        <groupId>cn.edu.hfut.dmic.webcollector</groupId>
        <artifactId>WebCollector</artifactId>
        <version>2.31</version>
    </dependency>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5</version>
    </dependency>
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.8.3</version>
    </dependency>
</dependencies>

你可能感兴趣的:(爬虫学习,java,爬虫,webcollector)