SparkR is an R package that provides a lightweight frontend for using Spark from R. As of Spark 1.6, SparkR provides a distributed DataFrame that supports operations such as selection, filtering, and aggregation, as well as distributed machine learning via MLlib.
Similar to R data frames, SparkR DataFrames can be created from structured data files, Hive tables, external databases, or local R data frames.
A SparkContext connects the R program to a Spark cluster; it is created with sparkR.init. To work with DataFrames you also need a SQLContext, which is created from the SparkContext. In the SparkR shell, both are already created for you:
sc <- sparkR.init()
sqlContext <- sparkRSQL.init(sc)
Outside the shell, you need to set the SPARK_HOME environment variable and load SparkR from it before initializing:
if (nchar(Sys.getenv("SPARK_HOME")) < 1) {
  Sys.setenv(SPARK_HOME = "/home/spark")
}
library(SparkR, lib.loc = c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib")))
sc <- sparkR.init(master = "local[*]", sparkEnvir = list(spark.driver.memory="2g"))
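When the R session is done with Spark, the context can be shut down explicitly with the standard sparkR.stop() function; a minimal sketch:
# Stop the SparkContext once all work is finished
sparkR.stop()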
The simplest way to create a DataFrame is to convert a local R data frame, for example the built-in faithful dataset:
df <- createDataFrame(sqlContext, faithful)
# Display the first rows of the DataFrame
head(df)
## eruptions waiting
##1 3.600 79
##2 1.800 54
##3 3.333 74
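The reverse direction works as well: collect() (a standard SparkR function) pulls a distributed DataFrame back into a local R data.frame. A minimal sketch:
# Bring the distributed DataFrame back as a local R data.frame
localDF <- collect(df)
head(localDF)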
The general method for creating DataFrames from data sources is read.df. It takes three arguments: the SQLContext, the path of the file to load, and the type of the data source. SparkR supports reading JSON and Parquet files natively, and through Spark Packages it can also read formats such as CSV and Avro. These packages can be added with the --packages flag (when using spark-submit or the sparkR shell), or with the sparkPackages parameter when creating the SparkContext with sparkR.init:
sc <- sparkR.init(sparkPackages="com.databricks:spark-csv_2.11:1.0.3")
sqlContext <- sparkRSQL.init(sc)
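With the package loaded, a CSV file can then be read through read.df by naming the spark-csv data source; a sketch (the cars.csv path is a hypothetical example):
# Read a CSV file via the spark-csv data source; extra options such as
# header are passed through as named arguments
carsDf <- read.df(sqlContext, "cars.csv", source = "com.databricks.spark.csv", header = "true")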
Reading JSON data:
people <- read.df(sqlContext, "./examples/src/main/resources/people.json", "json")
head(people)
## age name
##1 NA Michael
##2 30 Andy
##3 19 Justin
# SparkR automatically infers the schema from the JSON file
printSchema(people)
# root
# |-- age: integer (nullable = true)
# |-- name: string (nullable = true)
Saving: the data sources API can also be used to write DataFrames out, for example to a Parquet file:
write.df(people, path="people.parquet", source="parquet", mode="overwrite")
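The saved file can be loaded back the same way by giving parquet as the source:
# Read the Parquet file written above back into a DataFrame
people2 <- read.df(sqlContext, "people.parquet", "parquet")
printSchema(people2)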
SparkR DataFrames can also be created from Hive tables. This requires a HiveContext, which can access tables in the Hive MetaStore (Spark must be built with Hive support):
# sc is an existing SparkContext.
hiveContext <- sparkRHive.init(sc)
sql(hiveContext, "CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")
sql(hiveContext, "LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src")
# Queries can be expressed in HiveQL.
results <- sql(hiveContext, "FROM src SELECT key, value")
# results is now a DataFrame
head(results)
## key value
## 1 238 val_238
## 2 86 val_86
## 3 311 val_311
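Since results is an ordinary SparkR DataFrame, the usual DataFrame operations apply to it as well; for example:
# Hive-backed results support the same DataFrame operations
head(filter(results, results$key < 100))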
SparkR DataFrames support a number of functions for structured data processing; some basic examples follow, again using faithful.
# Create the DataFrame
df <- createDataFrame(sqlContext, faithful)
# Get basic information about the DataFrame
df
## DataFrame[eruptions:double, waiting:double]
# Select only the "eruptions" column
head(select(df, df$eruptions))
## eruptions
##1 3.600
##2 1.800
##3 3.333
# You can also pass in column name as strings
head(select(df, "eruptions"))
# Filter the DataFrame to only retain rows with wait times shorter than 50 mins
head(filter(df, df$waiting < 50))
# We use the `n` function to count the number of times each waiting time appears
head(summarize(groupBy(df, df$waiting), count = n(df$waiting)))
## waiting count
##1 81 13
##2 60 6
##3 68 1
# We can also sort the output from the aggregation to get the most common waiting times
waiting_counts <- summarize(groupBy(df, df$waiting), count = n(df$waiting))
head(arrange(waiting_counts, desc(waiting_counts$count)))
## waiting count
##1 78 15
##2 83 14
##3 81 13
# Convert waiting time from hours to seconds.
# Note that we can assign this to a new column in the same DataFrame
df$waiting_secs <- df$waiting * 60
head(df)
## eruptions waiting waiting_secs
##1 3.600 79 4740
##2 1.800 54 3240
##3 3.333 74 4440
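The same derived column can also be added without modifying df in place: withColumn returns a new DataFrame with the extra column. A small sketch:
# Equivalent: derive the column with withColumn instead of `$<-`
df2 <- withColumn(df, "waiting_secs", df$waiting * 60)
head(df2)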
A SparkR DataFrame can also be registered as a temporary table in Spark SQL, allowing SQL queries over its data.
# Load a JSON file
people <- read.df(sqlContext, "./examples/src/main/resources/people.json", "json")
# Register this DataFrame as a table.
registerTempTable(people, "people")
# SQL statements can be run by using the sql method
teenagers <- sql(sqlContext, "SELECT name FROM people WHERE age >= 13 AND age <= 19")
head(teenagers)
## name
##1 Justin
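Once the temporary table is no longer needed, it can be removed with dropTempTable:
# Remove the temporary table when done
dropTempTable(sqlContext, "people")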
SparkR also lets you fit generalized linear models over DataFrames with the glm() function; under the hood, the model is trained by MLlib. The gaussian and binomial families are supported.
# Create the DataFrame
df <- createDataFrame(sqlContext, iris)
# Fit a gaussian GLM model over the dataset.
model <- glm(Sepal_Length ~ Sepal_Width + Species, data = df, family = "gaussian")
# The model summary is returned in a similar format to R's native glm().
summary(model)
##$devianceResiduals
## Min Max
## -1.307112 1.412532
##
##$coefficients
## Estimate Std. Error t value Pr(>|t|)
##(Intercept) 2.251393 0.3697543 6.08889 9.568102e-09
##Sepal_Width 0.8035609 0.106339 7.556598 4.187317e-12
##Species_versicolor 1.458743 0.1121079 13.01195 0
##Species_virginica 1.946817 0.100015 19.46525 0
# Make predictions based on the model.
predictions <- predict(model, newData = df)
head(select(predictions, "Sepal_Length", "prediction"))
## Sepal_Length prediction
##1 5.1 5.063856
##2 4.9 4.662076
##3 4.7 4.822788
##4 4.6 4.742432
##5 5.0 5.144212
##6 5.4 5.385281
For a binomial model the response must be binary, so first restrict iris to two species:
# Create the DataFrame
df <- createDataFrame(sqlContext, iris)
training <- filter(df, df$Species != "setosa")
# Fit a binomial GLM model over the dataset.
model <- glm(Species ~ Sepal_Length + Sepal_Width, data = training, family = "binomial")
# Model coefficients are returned in a similar format to R's native glm().
summary(model)
##$coefficients
## Estimate
##(Intercept) -13.046005
##Sepal_Length 1.902373
##Sepal_Width 0.404655
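As with the gaussian model, predict can score a DataFrame with the fitted binomial model; a minimal sketch reusing the training data (the prediction column follows the gaussian example above):
# Score the training data with the fitted binomial model
predictions <- predict(model, newData = training)
head(select(predictions, "Species", "prediction"))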