初学者学爬虫可以玩玩我做的这个小 demo:原理就是用 jsoup 抓取并解析页面,数据源是房天下上杭州的房价数据,总共爬了 100 页。
一共三个类,超简单:一个是爬虫主程序 demo(CrawlByJsoupTest);一个是简单的开发商类(DeveloperInfo),里面存放了这个开发商有多少套房源、均价多少、总价多少,方便后面对所有开发商的均价做排行;还有一个是负责抓取并解析页面的任务类(ElementsAnalysis)。
import lombok.Data;
import lombok.SneakyThrows;
import lombok.Synchronized;
import java.util.*;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
/** s
* Created by Precious_Life on 2018/9/2.
*/
@Data
public class CrawlByJsoupTest {
public static int count = 0;
public static final Map treeMap=new TreeMap();
public static final Queue urlList=new LinkedList<>();
public static final CountDownLatch countDownLatch=new CountDownLatch(100);
@Synchronized
public void addDeveloperInfo(String name,int totalPrice,int avengePrice){
if(treeMap.containsKey(name)){
DeveloperInfo tmp=treeMap.get(name);
int tmpTotalPrice=tmp.getTotalPrice()+totalPrice;
int tmpCount=tmp.getHouseNum()+1;
int tmpAvengePrice=(tmp.getAveragePrice()+avengePrice)/2;
treeMap.put(name,new DeveloperInfo(name,tmpCount,tmpTotalPrice,tmpAvengePrice));
}else{
treeMap.put(name,new DeveloperInfo(name,1,totalPrice,avengePrice));
}
}
public static Map getDeveloperInfoMap(){
return treeMap;
}
@Synchronized
public static void increaseCount(){
count++;
}
@SneakyThrows
public static void main(String[] args){
ExecutorService executorService= Executors.newCachedThreadPool();
CrawlByJsoupTest crawlByJsoupTest=new CrawlByJsoupTest();
for(int j=1;j<=100;j++){
urlList.offer("http://esf.hz.fang.com/house-a0154/i3" + j);
}
for(int i=0;i<10;i++){
executorService.execute(new ElementsAnalysis(countDownLatch,crawlByJsoupTest));
}
countDownLatch.await();
System.out.println("总共" + count + "套房源!");
sortByAveragePrice(treeMap);
}
public static void sortByAveragePrice(Map treeMap){
List> list = new ArrayList>(treeMap.entrySet());
list.sort((Map.Entry o1, Map.Entry o2)->o1.getValue().getAveragePrice()-o2.getValue().getAveragePrice());
System.out.println("下面是滨江区的开放商房价排行");
int houseCount=0;
for (Map.Entry e: list) {
houseCount+=e.getValue().getHouseNum();
System.out.println(e.getKey()+" 均价:"+e.getValue().getAveragePrice()+"元/平"+ " 房源共"+e.getValue().getHouseNum()+"套");
}
System.out.println("滨江现有房源"+houseCount+"套");
}
@Synchronized
public static String pollUrl(){
String url="";
if(urlList.size()!=0){
url=urlList.poll();
}
return url;
}
public static int getCount(){
return count;
}
}
下面就是开发商的类了,字段都是直接用英文单词命名的,应该很容易看懂吧。
/**
 * Aggregated statistics for one property developer, as collected by the crawler.
 * Constructors and accessors are generated by the Lombok annotations below; the
 * all-args constructor takes the fields in declaration order.
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
public class DeveloperInfo {
/** Developer name; the crawler also uses it as the key of its statistics map. */
private String developerName;
/** Number of listings recorded for this developer. */
private int houseNum;
/** Sum of the total prices of the recorded listings. */
private int totalPrice;
/** Average unit price over the recorded listings (printed by the crawler as yuan per square metre). */
private int averagePrice;
}
import com.sun.corba.se.impl.orbutil.concurrent.Sync;
import lombok.*;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.*;
import java.util.concurrent.CountDownLatch;
/**
 * Worker task of the crawler demo. It repeatedly takes a page URL from
 * {@link CrawlByJsoupTest#pollUrl()}, downloads the page with jsoup, and reports every
 * listing found on it to the shared {@link CrawlByJsoupTest} instance.
 *
 * Created by Precious_Life on 2018/9/2.
 */
@Data
@NoArgsConstructor
public class ElementsAnalysis implements Runnable{
    /** Connect/read timeout for one page; a stalled connection must not block a worker for days. */
    private static final int TIMEOUT_MILLIS = 30_000;

    /** Counted down once for every page this worker has finished, successfully or not. */
    public CountDownLatch countDownLatch;
    /** Receiver of the parsed listings. */
    CrawlByJsoupTest crawlByJsoupTest;

    /**
     * @param countDownLatch   latch to count down after each processed page
     * @param crawlByJsoupTest aggregator that receives each parsed listing
     */
    public ElementsAnalysis(CountDownLatch countDownLatch,CrawlByJsoupTest crawlByJsoupTest){
        this.countDownLatch=countDownLatch;
        this.crawlByJsoupTest=crawlByJsoupTest;
    }

    /**
     * Processes pages until the shared URL queue is empty. A page that fails is logged
     * and skipped so that one bad page does not kill the worker.
     */
    @Override
    public void run() {
        while (true) {
            String url = CrawlByJsoupTest.pollUrl();
            try {
                if (!analysisPage(url)) {
                    return;
                }
            } catch (Exception e) {
                // Boundary of the worker: keep going with the next page.
                System.err.println("failed to analyse " + url + ": " + e);
            }
        }
    }

    /**
     * Downloads one listing page and records every listing on it.
     *
     * <p>The latch is counted down exactly once for every non-empty URL, even when
     * fetching or parsing fails, so the thread waiting on the latch cannot hang on a
     * failed page.
     *
     * @param url page to fetch; {@code null} or the empty string means "no more work"
     * @return {@code false} if {@code url} is null or empty, {@code true} otherwise
     */
    @SneakyThrows
    public boolean analysisPage(String url){
        if (url == null || url.isEmpty()) {
            return false;
        }
        try {
            System.out.println("now will analysis Num"+url+"页");
            Document document = Jsoup.connect(url)
                    .userAgent("Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; MALC)")
                    .timeout(TIMEOUT_MILLIS)
                    .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
                    .header("Accept-Encoding", "gzip, deflate")
                    .header("Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3")
                    .header("Connection", "keep-alive")
                    .header("Host", "esf.hz.fang.com")
                    // do not reject the response based on its content type
                    .ignoreContentType(true)
                    .get();
            Elements containers = document.getElementsByClass("shop_list");
            if (containers.isEmpty()) {
                // The page has no listing container at all: nothing to record.
                System.err.println("no shop_list element found on " + url);
                return true;
            }
            for (Element listing : containers.get(0).select("dl")) {
                CrawlByJsoupTest.increaseCount();
                recordListing(listing);
            }
            return true;
        } finally {
            countDownLatch.countDown();
        }
    }

    /** Parses one {@code dl} listing element and reports it; elements that lack the expected parts are skipped. */
    private void recordListing(Element listing) {
        Elements parts = listing.children();
        if (parts.size() < 3) {
            return;
        }
        Element info = parts.get(1);
        Element priceBox = parts.get(2);

        Elements links = info.select("a");
        Elements priceSpans = priceBox.select("span");
        // Index 2 of the links and index 1 of the spans are read below, so require them.
        if (links.size() < 3 || priceSpans.size() < 2) {
            return;
        }
        String developer = links.get(2).attr("title");
        String address = info.getElementsByClass("add_shop").select("span").text();
        String type = info.getElementsByClass("tel_shop").text();
        String totalPrice = priceBox.getElementsByClass("red").text();
        String averagePrice = priceSpans.get(1).text();

        int yuanIndex = averagePrice.indexOf("元");
        int wanIndex = totalPrice.indexOf("万");
        if (yuanIndex < 0 || wanIndex < 0) {
            // Price text is not in the expected "<number>元" / "<number>万" form.
            return;
        }
        int finalAveragePrice = (int) Double.parseDouble(averagePrice.substring(0, yuanIndex));
        // Multiply before narrowing to int: casting first would drop the fractional
        // part of the 万 figure (285.5万 would become 2,850,000).
        int finalTotalPrice = (int) Math.round(Double.parseDouble(totalPrice.substring(0, wanIndex)) * 10000);

        crawlByJsoupTest.addDeveloperInfo(developer,finalTotalPrice,finalAveragePrice);
        // Unit price goes with the per-square-metre label, total price with the plain yuan label.
        System.out.println("Num"+CrawlByJsoupTest.count+" 开发商" + developer + " 地址:" + address + " 户型:" + type + " 单价:" + finalAveragePrice + "元/每平 总价:" + finalTotalPrice+"元");
    }
}
程序运行时会实时输出每套房源的信息,全部爬完之后再对房源数据做一下分析,给出各开发商的均价排行。
截下图~~
这个图是各开发商的房源均价排行
把这个数据爬下来之后,有啥用么??没啥用。。。。。哈哈哈,查了下房价最低的这个聆涛苑到滨江这边的距离
要两个小时呢!!!!!!!!!!!
后面又看了一个开源的爬虫框架,实现逻辑和这个手写的 demo 差不多,只不过封装做得更好、扩展性更好、功能更强大。把这个手写的 demo 看明白了的话,其他框架的代码也可以很快上手!