Starting up: SparkContext, SQLContext


# Initialize the SparkContext (sc).
sc <- sparkR.init()
# Initialize the SQLContext from the existing SparkContext; later examples
# pass sqlContext to createDataFrame() and read.df().
sqlContext <- sparkRSQL.init(sc)

Starting Up from RStudio


# Fall back to a default Spark installation path when SPARK_HOME is not set.
if (nchar(Sys.getenv("SPARK_HOME")) < 1) {
  Sys.setenv(SPARK_HOME = "/home/spark")
}
# Load SparkR from the R library directory inside the Spark installation.
library(SparkR, lib.loc = c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib")))
# Start a SparkContext against the local master; driver memory is supplied
# through sparkEnvir.
sc <- sparkR.init(
  master = "local[*]",
  sparkEnvir = list(spark.driver.memory = "2g")
)

From local data frames
# Convert the local R data frame `faithful` into a SparkR DataFrame.
df <- createDataFrame(sqlContext, faithful)

# Displays the content of the DataFrame to stdout
head(df)
## eruptions waiting
##1 3.600 79
##2 1.800 54
##3 3.333 74

From Data Sources


# Start a SparkContext that requests the spark-csv package via sparkPackages,
# then create the SQLContext from it.
sc <- sparkR.init(sparkPackages = "com.databricks:spark-csv_2.11:1.0.3")
sqlContext <- sparkRSQL.init(sc)


# Load a JSON file as a DataFrame; the third argument names the data source.
people <- read.df(sqlContext, "./examples/src/main/resources/people.json", "json")
head(people)
## age name
##1 NA Michael
##2 30 Andy
##3 19 Justin

# SparkR automatically infers the schema from the JSON file
printSchema(people)
# root
# |-- age: integer (nullable = true)
# |-- name: string (nullable = true)


# Save the DataFrame in Parquet format using the "overwrite" save mode.
write.df(people, path = "people.parquet", source = "parquet", mode = "overwrite")

From Hive tables

# sc is an existing SparkContext.
hiveContext <- sparkRHive.init(sc)

# Create the table if it does not exist and load the sample file into it.
sql(hiveContext, "CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")
sql(hiveContext, "LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src")

# Queries can be expressed in HiveQL.
results <- sql(hiveContext, "FROM src SELECT key, value")

# results is now a DataFrame
head(results)
## key value
## 1 238 val_238
## 2 86 val_86
## 3 311 val_311


# Create the DataFrame
df <- createDataFrame(sqlContext, faithful)

# Get basic information about the DataFrame
df
## DataFrame[eruptions:double, waiting:double]

# Select only the "eruptions" column
head(select(df, df$eruptions))
## eruptions
##1 3.600
##2 1.800
##3 3.333

# You can also pass in column name as strings
head(select(df, "eruptions"))

# Filter the DataFrame to only retain rows with wait times shorter than 50 mins
head(filter(df, df$waiting < 50))
# Group the rows by waiting time and aggregate each group into a `count` column.
# We use the `n` operator to count the number of times each waiting time appears
head(summarize(groupBy(df, df$waiting), count = n(df$waiting)))
## waiting count
##1 81 13
##2 60 6
##3 68 1

# We can also sort the output from the aggregation to get the most common waiting times
# The aggregation is kept in waiting_counts so it can be ordered by its
# `count` column in descending order.
waiting_counts <- summarize(groupBy(df, df$waiting), count = n(df$waiting))
head(arrange(waiting_counts, desc(waiting_counts$count)))

## waiting count
##1 78 15
##2 83 14
##3 81 13
# Convert waiting time from minutes to seconds.
# Note that we can assign this to a new column in the same DataFrame
df$waiting_secs <- df$waiting * 60
head(df)
## eruptions waiting waiting_secs
##1 3.600 79 4740
##2 1.800 54 3240
##3 3.333 74 4440


# Load a JSON file
people <- read.df(sqlContext, "./examples/src/main/resources/people.json", "json")

# Register this DataFrame as a table.
registerTempTable(people, "people")

# SQL statements can be run by using the sql method
teenagers <- sql(sqlContext, "SELECT name FROM people WHERE age >= 13 AND age <= 19")
head(teenagers)
## name
##1 Justin


Gaussian GLM model

# Create the DataFrame
df <- createDataFrame(sqlContext, iris)

# Fit a gaussian GLM model over the dataset.
model <- glm(Sepal_Length ~ Sepal_Width + Species, data = df, family = "gaussian")

# Model summary are returned in a similar format to R's native glm().
summary(model)
## Min Max 
## -1.307112 1.412532
## Estimate Std. Error t value Pr(>|t|) 
##(Intercept) 2.251393 0.3697543 6.08889 9.568102e-09
##Sepal_Width 0.8035609 0.106339 7.556598 4.187317e-12
##Species_versicolor 1.458743 0.1121079 13.01195 0 
##Species_virginica 1.946817 0.100015 19.46525 0 

# Make predictions based on the model.
predictions <- predict(model, newData = df)
head(select(predictions, "Sepal_Length", "prediction"))
## Sepal_Length prediction
##1 5.1 5.063856
##2 4.9 4.662076
##3 4.7 4.822788
##4 4.6 4.742432
##5 5.0 5.144212
##6 5.4 5.385281

Binomial GLM model

# Create the DataFrame
df <- createDataFrame(sqlContext, iris)
# Drop the "setosa" rows so that only the remaining species are used for
# training the binomial model.
training <- filter(df, df$Species != "setosa")

# Fit a binomial GLM model over the dataset.
model <- glm(Species ~ Sepal_Length + Sepal_Width, data = training, family = "binomial")

# Model coefficients are returned in a similar format to R's native glm().
summary(model)
## Estimate
##(Intercept) -13.046005
##Sepal_Length 1.902373
##Sepal_Width 0.404655
