spark集群搭建及介绍:敬请关注
数据集:http://pan.baidu.com/s/1i4yMwHB
总结:sparkR通过data.table中的fread函数读取大数据集,相比于正常读取方式大致有3-5倍的提升;在重新处理大数据集时,最好重新登录R或者sparkR环境,因为建模或预测占用了大量的内存,不能及时释放。
查看原始数据集:通过iris数据集生成
[root@master data]# pwd
/data
[root@master data]# ls -lhsrt iris1g.txt
1.3G -rw-r--r-- 1 root root 1.3G Feb 16 14:16 iris1g.txt
登录sparkR:
sparkR --master yarn-client --num-executors 15
#1、加载数据:47671650行(千万级)数据,耗时34.4509 secs
library(data.table)
> (time1 <-Sys.time())
[1] "2016-02-18 10:29:54 CST"
> data_iris <-fread("/data/iris1g.txt", stringsAsFactors=T, sep=",",header=T, encoding="UTF-8")
Read 47671650 rows and 5 (of 5) columns from 1.216 GB file in 00:00:33
> Sys.time() -time1
Time difference of 34.4509 secs
#2、数据预处理
> dim(data_iris)
[1] 47671650 5
> str(data_iris)
Classes ‘data.table’ and 'data.frame': 47671650 obs. of 5 variables:
$ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
$ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
$ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
$ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
$ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
- attr(*,".internal.selfref")=<externalptr>
> names(iris)
[1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
> names(data_iris)
[1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
>names(data_iris) <- names(iris)
>names(data_iris)
[1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
#3、创建训练集和测试集数据
library(caret)
#创建训练集和测试集数据:耗时3.32017 secs
> (time1 <-Sys.time())
[1] "2016-02-18 10:31:16 CST"
> ind <-base:::sample(3, nrow(data_iris), prob=c(0.3, 0.2, 0.5), replace=T)
> train <-data_iris[ind==1, ]
> test <-data_iris[ind==2, ]
> Sys.time() -time1
Time difference of 3.32017 secs
#使用createDataPartition导致内存溢出
#(time1 <-Sys.time())
#index <-createDataPartition(data$Species, nrow(data), p=0.7, list=F)
#Sys.time() - time1
#train <-data[index, ]
#test <-data[-index, ]
> dim(train)
[1] 14300956 5
> dim(test)
[1] 9531827 5
#memory.size()
#gc()
#4、建模
#1)随机森林
#library(randomForest)
#model <-randomForest(train$X.Species.~., data=train, ntree=50, nPerm=10, mtry=3,proximity=T, importance=T)
#随机森林建模导致内存溢出
#2)使用决策树建模:1.969041 mins
library(party)
> (time1 <-Sys.time())
[1] "2016-02-18 11:02:18 CST"
> model <-ctree(Species~., data=train)
> Sys.time() -time1
Time difference of 1.969041 mins
>print(object.size(model), units="Mb")
6317.7 Mb
#str(model)
> summary(model)
Length Class Mode
1 BinaryTree S4
#5、预测
> (time1 <-Sys.time())
[1] "2016-02-18 11:12:46 CST"
> pred <-predict(model, test)
> Sys.time() -time1
Time difference of 49.95143 secs
#6、模型评估
>mean(pred==test$Species)
[1] 1
>base:::table(pred, test$Species)
pred setosa versicolor virginica
setosa 3176958 0 0
versicolor 0 3177160 0
virginica 0 0 3177709