sparklyr: R interface for Apache Spark

关于sparklyr的一个简单的教程

安装sparklyr


# Install the sparklyr package from CRAN
install.packages("sparklyr")

下载并安装spark


# Load sparklyr, then download and install a local copy of
# Spark 2.1.0 for use from R
library(sparklyr)
spark_install(version = "2.1.0")


连接本地spark

# Connect to the local Spark installation; sc is the connection
# handle used by all subsequent sparklyr calls
library(sparklyr)
sc <- spark_connect(master = "local")
* Using Spark: 2.1.0

将数据传入spark

# Copy the built-in iris data frame into Spark; iris_tbl is a
# dplyr-compatible reference to the remote Spark table
library(dplyr)
iris_tbl <- copy_to(sc, iris)

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

数据操作,就和本地使用dplyr是一样的


# Standard dplyr verbs on iris_tbl are translated to Spark SQL
# and executed remotely, as the output below shows
iris_tbl %>% filter(Sepal_Length < 5)
# Source: spark [?? x 5]
   Sepal_Length Sepal_Width Petal_Length Petal_Width Species
 *                                 
 1          4.9         3            1.4         0.2 setosa 
 2          4.7         3.2          1.3         0.2 setosa 
 3          4.6         3.1          1.5         0.2 setosa 
 4          4.6         3.4          1.4         0.3 setosa 
 5          4.4         2.9          1.4         0.2 setosa 
 6          4.9         3.1          1.5         0.1 setosa 
 7          4.8         3.4          1.6         0.2 setosa 
 8          4.8         3            1.4         0.1 setosa 
 9          4.3         3            1.1         0.1 setosa 
10          4.6         3.6          1           0.2 setosa 
# ... with more rows

使用sql

# Query the Spark table directly with SQL via the DBI interface
library(DBI)
iris_preview <- dbGetQuery(sc, "SELECT * FROM iris LIMIT 10")
iris_preview
  Sepal_Length Sepal_Width Petal_Length Petal_Width Species
1           5.1         3.5          1.4         0.2  setosa
2           4.9         3.0          1.4         0.2  setosa
3           4.7         3.2          1.3         0.2  setosa
4           4.6         3.1          1.5         0.2  setosa
5           5.0         3.6          1.4         0.2  setosa
6           5.4         3.9          1.7         0.4  setosa
7           4.6         3.4          1.4         0.3  setosa
8           5.0         3.4          1.5         0.2  setosa
9           4.4         2.9          1.4         0.2  setosa
10          4.9         3.1          1.5         0.1  setosa

机器学习

# Partition the data into 50/50 training and test sets
# (seed fixes the split for reproducibility)
partitions <-
  iris_tbl %>% sdf_partition(training = 0.5,
                             test = 0.5,
                             seed = 1099)

# Train a random forest classifier on the training partition.
# The formula interface is the supported sparklyr API; the
# response = / features = arguments used previously are deprecated.
fit <- partitions$training %>%
  ml_random_forest(Species ~ Sepal_Length + Sepal_Width +
                     Petal_Length + Petal_Width)

# 查看模型
fit
Formula: Species ~ Sepal_Length + Sepal_Width + Petal_Length + Petal_Width

RandomForestClassificationModel (uid=rfc_eff545dc5b6f) with 20 trees
# 查看模型细节
summary(fit)
              Length Class                                 Mode     
pipeline        5     ml_pipeline                           list     
pipeline_model  5     ml_pipeline_model                     list     
model          14     ml_random_forest_classification_model list     
dataset         2     tbl_spark                             list     
formula         1     -none-                                character
.response       1     -none-                                character
.features       4     -none-                                character
.index_labels   3     -none-                                character

# Score the held-out test partition with the fitted model;
# ml_predict returns the test rows plus prediction columns
pred <- ml_predict(fit,partitions$test)
pred
# Source: spark [?? x 14]
   Sepal_Length Sepal_Width Petal_Length Petal_Width Species features label rawPrediction probability prediction predicted_label probability_0 probability_1
 *                                                                                      
 1          4.4         2.9          1.4         0.2 setosa                   2 setosa                   0                0
 2          4.4         3            1.3         0.2 setosa                   2 setosa                   0                0
 3          4.5         2.3          1.3         0.3 setosa                   2 setosa                   0                0
 4          4.6         3.2          1.4         0.2 setosa                   2 setosa                   0                0
 5          4.6         3.4          1.4         0.3 setosa                   2 setosa                   0                0
 6          4.6         3.6          1           0.2 setosa                   2 setosa                   0                0
 7          4.8         3            1.4         0.1 setosa                   2 setosa                   0                0
 8          4.8         3            1.4         0.3 setosa                   2 setosa                   0                0
 9          4.8         3.1          1.6         0.2 setosa                   2 setosa                   0                0
10          4.8         3.4          1.9         0.2 setosa                   2 setosa                   0.45             0
# ... with more rows, and 1 more variable: probability_2 
> 
# Compute a classification metric (accuracy) from the predictions.
# NOTE(review): ml_classification_eval() is deprecated in newer
# sparklyr releases in favor of
# ml_multiclass_classification_evaluator() — confirm against the
# installed sparklyr version.
ml_classification_eval(pred)

分布式计算


# Run arbitrary R code on each partition of the Spark data frame.
# Note: rgamma(1, 2) draws a fresh random value per partition, so
# the result is non-deterministic across runs.
# NOTE(review): the "worker rscript failure" error reported below
# usually means the Spark workers could not execute the R closure
# (e.g. R not available to the workers) — check the worker logs;
# the closure itself is valid R.
spark_apply(iris_tbl, function(data) {
  data[1:4] + rgamma(1,2)
})

# Error in force(code) : 
#   sparklyr worker rscript failure, check worker logs for details
这里我遇到了一些小小的问题

使用h2o

# rsparkling bridges sparklyr and H2O's machine-learning library
library(rsparkling)
library(sparklyr)
library(dplyr)
library(h2o)

# Connect to local Spark 2.1.0 and copy mtcars into it
sc <- spark_connect(master = "local", version = "2.1.0")
mtcars_tbl <- copy_to(sc, mtcars, "mtcars")

#  Convert the Spark data frame into an H2O frame
mtcars_h2o <- as_h2o_frame(sc, mtcars_tbl, strict_version_check = FALSE)
#  Fit a GLM predicting mpg from weight (wt) and cylinders (cyl);
#  lambda_search = TRUE searches over the regularization path
mtcars_glm <- h2o.glm(x = c("wt", "cyl"), 
                      y = "mpg",
                      training_frame = mtcars_h2o,
                      lambda_search = TRUE)

你可能感兴趣的:(sparklyr: R interface for Apache Spark)