多线程爬虫 用jsoup爬杭州房价

初学者学爬虫可以玩玩我做的这个小demo,原理就是用了jsoup这个小玩意,数据源是房天下的数据,杭州的房价,总共爬了100页。

三个类,超简单的:一个是爬虫demo的入口类;一个是简单的开发商类,里面存放了这个开发商有多少套房源、均价多少、总价多少,方便后面对所有开发商的均价做排行;还有一个是负责抓取并解析页面的任务类。

import lombok.Data;
import lombok.SneakyThrows;
import lombok.Synchronized;

import java.util.*;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

/**                                  s
 * Created by Precious_Life on 2018/9/2.
 */
@Data
public class CrawlByJsoupTest {
    public static int count = 0;
    public static final Map treeMap=new TreeMap();
    public static final Queue urlList=new LinkedList<>();
    public static final CountDownLatch countDownLatch=new CountDownLatch(100);
    @Synchronized
    public void addDeveloperInfo(String name,int totalPrice,int avengePrice){
        if(treeMap.containsKey(name)){
            DeveloperInfo tmp=treeMap.get(name);
            int tmpTotalPrice=tmp.getTotalPrice()+totalPrice;
            int tmpCount=tmp.getHouseNum()+1;
            int tmpAvengePrice=(tmp.getAveragePrice()+avengePrice)/2;
            treeMap.put(name,new DeveloperInfo(name,tmpCount,tmpTotalPrice,tmpAvengePrice));
        }else{
            treeMap.put(name,new DeveloperInfo(name,1,totalPrice,avengePrice));
        }
    }
    public static Map getDeveloperInfoMap(){
        return treeMap;

    }
    @Synchronized
    public static void increaseCount(){
        count++;
    }
    @SneakyThrows
    public static void main(String[] args){
        ExecutorService executorService= Executors.newCachedThreadPool();
        CrawlByJsoupTest crawlByJsoupTest=new CrawlByJsoupTest();
        for(int j=1;j<=100;j++){
            urlList.offer("http://esf.hz.fang.com/house-a0154/i3" + j);
        }
        for(int i=0;i<10;i++){
            executorService.execute(new ElementsAnalysis(countDownLatch,crawlByJsoupTest));
        }
        countDownLatch.await();
        System.out.println("总共" + count + "套房源!");
        sortByAveragePrice(treeMap);
    }
    public static void sortByAveragePrice(Map treeMap){

        List> list = new ArrayList>(treeMap.entrySet());
        list.sort((Map.Entry o1, Map.Entry o2)->o1.getValue().getAveragePrice()-o2.getValue().getAveragePrice());
        System.out.println("下面是滨江区的开放商房价排行");
        int houseCount=0;
        for (Map.Entry e: list) {
            houseCount+=e.getValue().getHouseNum();
            System.out.println(e.getKey()+"  均价:"+e.getValue().getAveragePrice()+"元/平"+ " 房源共"+e.getValue().getHouseNum()+"套");
        }
        System.out.println("滨江现有房源"+houseCount+"套");
    }
    @Synchronized
    public static String pollUrl(){
        String url="";
        if(urlList.size()!=0){
            url=urlList.poll();
        }
        return url;
    }
    public static int getCount(){
        return count;
    }


}

下面就是开发商的类了,字段我都是用直白的英文单词命名的,应该很容易看懂吧。

/**
 * Aggregated listing statistics for a single property developer.
 *
 * <p>Plain data holder: Lombok generates the accessors, {@code equals},
 * {@code hashCode}, {@code toString}, an all-arguments constructor (in field
 * declaration order) and a no-argument constructor.
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
public class DeveloperInfo {
    /** Name of the developer. */
    private String developerName;
    /** Number of listings recorded for this developer. */
    private int houseNum;
    /** Sum of the total prices of the recorded listings (presumably in yuan — confirm against the caller). */
    private int totalPrice;
    /** Average unit price over the recorded listings (presumably yuan per square metre — confirm against the caller). */
    private int averagePrice;
}
import com.sun.corba.se.impl.orbutil.concurrent.Sync;
import lombok.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.util.*;
import java.util.concurrent.CountDownLatch;

/**
 * Worker task: repeatedly takes a listing-page URL from
 * {@link CrawlByJsoupTest#pollUrl()}, downloads the page with jsoup, extracts
 * every listing on it and reports each one to the shared
 * {@link CrawlByJsoupTest} instance. The latch is counted down exactly once
 * per page, whether or not the page could be processed.
 *
 * Created by Precious_Life on 2018/9/2.
 */
@Data
@NoArgsConstructor
public class ElementsAnalysis implements Runnable{
    /** Upper bound, in milliseconds, for connecting to and reading one page. */
    private static final int TIMEOUT_MILLIS = 30_000;

    public CountDownLatch countDownLatch;
    CrawlByJsoupTest crawlByJsoupTest;

    /**
     * @param countDownLatch   latch to count down once per processed page
     * @param crawlByJsoupTest collector that receives the parsed listings
     */
    public ElementsAnalysis(CountDownLatch countDownLatch,CrawlByJsoupTest crawlByJsoupTest){
        this.countDownLatch=countDownLatch;
        this.crawlByJsoupTest=crawlByJsoupTest;
    }

    /** Processes pages until the URL queue is exhausted. */
    @Override
    public void run() {
        while (analysisPage(CrawlByJsoupTest.pollUrl())) {
            // All work happens inside analysisPage.
        }
    }

    /**
     * Downloads and analyses one listing page.
     *
     * <p>A page that cannot be fetched or parsed is logged and skipped, so a
     * single bad page neither kills this worker nor leaves the latch short.
     *
     * @param url page URL; {@code null} or empty means "no more work"
     * @return {@code false} when there is no more work, {@code true} otherwise
     */
    public boolean analysisPage(String url){
        if (url == null || url.isEmpty()) {
            return false;
        }
        System.out.println("now will analysis Num"+url+"页");
        try {
            Document document = Jsoup.connect(url)
                    .userAgent("Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; MALC)")
                    .timeout(TIMEOUT_MILLIS)
                    .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
                    .header("Accept-Encoding", "gzip, deflate")
                    .header("Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3")
                    .header("Connection", "keep-alive")
                    .header("Host", "esf.hz.fang.com")
                    // Do not reject responses whose content type is not HTML/XML.
                    .ignoreContentType(true)
                    .get();
            // first() yields null instead of throwing when the container is absent.
            Element listContainer = document.getElementsByClass("shop_list").first();
            if (listContainer == null) {
                System.err.println("no shop_list element found in " + url);
            } else {
                for (Element listing : listContainer.select("dl")) {
                    analysisListing(listing);
                }
            }
        } catch (java.io.IOException | RuntimeException e) {
            // Worker boundary: report and move on to the next page.
            System.err.println("failed to analyse " + url + ": " + e);
        } finally {
            // Always count the page as done, otherwise main() waits forever.
            if (countDownLatch != null) {
                countDownLatch.countDown();
            }
        }
        return true;
    }

    /** Extracts one listing from its {@code dl} element and reports it; malformed entries are skipped. */
    private void analysisListing(Element listing) {
        Elements cells = listing.children();
        if (cells.size() < 3) {
            return;
        }
        Element infoCell = cells.get(1);
        Element priceCell = cells.get(2);

        // The developer name is read from the third link, so three links are required.
        Elements links = infoCell.select("a");
        if (links.size() < 3) {
            return;
        }
        Elements priceSpans = priceCell.select("span");
        if (priceSpans.size() < 2) {
            return;
        }
        OptionalDouble unitPrice = leadingNumber(priceSpans.get(1).text(), "元");
        OptionalDouble totalPriceInWan = leadingNumber(priceCell.getElementsByClass("red").text(), "万");
        if (!unitPrice.isPresent() || !totalPriceInWan.isPresent()) {
            return;
        }

        String developer = links.get(2).attr("title");
        String address = infoCell.getElementsByClass("add_shop").select("span").text();
        String type = infoCell.getElementsByClass("tel_shop").text();
        int finalAveragePrice = (int) unitPrice.getAsDouble();
        // Multiply before converting to int so the fractional part of the 万 figure is kept.
        int finalTotalPrice = (int) Math.round(totalPriceInWan.getAsDouble() * 10000);

        // Count only listings that were actually parsed and recorded.
        CrawlByJsoupTest.increaseCount();
        crawlByJsoupTest.addDeveloperInfo(developer,finalTotalPrice,finalAveragePrice);
        System.out.println("Num"+CrawlByJsoupTest.getCount()+"  开发商" + developer + " 地址:" + address + " 户型:" + type + " 单价:" + finalAveragePrice + "元/每平 总价:" + finalTotalPrice+"元");
    }

    /** Parses the number that precedes {@code unitMarker} in {@code text}; empty when absent or not numeric. */
    private static OptionalDouble leadingNumber(String text, String unitMarker) {
        int markerIndex = text.indexOf(unitMarker);
        if (markerIndex <= 0) {
            return OptionalDouble.empty();
        }
        try {
            return OptionalDouble.of(Double.parseDouble(text.substring(0, markerIndex).trim()));
        } catch (NumberFormatException ignored) {
            // Text in front of the unit marker is not a number: treat as missing.
            return OptionalDouble.empty();
        }
    }
}

上面代码的运行结果:先实时输出每套房源的信息,然后对房源数据做了简单的分析,按开发商均价做了个排行。

截下图~~

多线程爬虫 用jsoup爬杭州房价_第1张图片

这个图是各开发商的房源均价排行

多线程爬虫 用jsoup爬杭州房价_第2张图片

把这个数据爬下来之后,有啥用么??没啥用。。。。。哈哈哈,查了下房价最低的这个聆涛苑到滨江这边的距离

多线程爬虫 用jsoup爬杭州房价_第3张图片

要两个小时呢!!!!!!!!!!!

后面看了一个那种开源的爬虫框架,实现逻辑和手写框架差不多,只不过封装性做的更好,扩展性更好,功能更强大,这个手写框架看好了的话,其他的框架代码也可以熟练上手的!

你可能感兴趣的:(java工作日常,JAVA,爬虫,jsoup,房价,多线程)