[SparkR]

# Word count over the text lines held in `data`.
# NOTE(review): `data` is not defined in this listing; it is presumably a
# SparkR RDD of text lines created earlier -- confirm against the caller.

# Split every line on single spaces and flatten the pieces into one
# collection of words.
words <- flatMap(data, function(line) {
  strsplit(line, " ")[[1]]
})

# Pair each word with an integer count of 1.
wordCount <- lapply(words, function(word) {
  list(word, 1L)
})

# Add up the counts per word. R is case-sensitive, so the function has to be
# spelled `reduceByKey`; `reduceBykey` is an unknown name. The third argument
# is kept as in the original (the partition count in SparkR's RDD API).
counts <- reduceByKey(wordCount, "+", 2L)
output <- collect(counts)

# Print one "word : count" line per entry. The loop variable gets its own
# name so that it does not overwrite the `wordCount` collection built above.
for (pair in output) {
  cat(pair[[1]], ": ", pair[[2]], "\n")
}
#dataframe.R
library(SparkR)

# Shell step, to be run from a terminal rather than from R:
#   hadoop fs -rm /user/yyl/alldata.csv
# NOTE(review): this deletes the same HDFS path that is read below, so a fresh
# copy presumably has to be uploaded before the script runs -- confirm.

# Request the spark-csv package. This is set before sparkR.init() because the
# submit arguments are only consulted when the Spark backend is launched.
Sys.setenv("SPARKR_SUBMIT_ARGS" = '"--packages" "com.databricks:spark-csv_2.10:1.0.3" "sparkr-shell"')

# Initialize SparkContext and SQLContext
sc <- sparkR.init(appName = "SparkR-DataFrame-example")
sqlContext <- sparkRSQL.init(sc)

# Create a DataFrame from the CSV file on HDFS. read.text() would produce a
# single string column, which cannot satisfy the name/age query below, so the
# file is loaded through the spark-csv data source instead.
# NOTE(review): assumes the file has a header row with `name` and `age`
# columns -- confirm against the actual data.
path <- "hdfs://hadoop-namenode1:8020/user/yyl/alldata.csv"
peopleDF <- read.df(
  sqlContext,
  path,
  source = "com.databricks.spark.csv",
  header = "true"
)
printSchema(peopleDF)

# Register this DataFrame as a table.
registerTempTable(peopleDF, "people")

# SQL statements can be run by using the sql methods provided by sqlContext
teenagers <- sql(sqlContext, "SELECT name FROM people WHERE age >= 13 AND age <= 19")

# Call collect to get a local data.frame
teenagersLocalDF <- collect(teenagers)

# Print the teenagers in our dataset
print(teenagersLocalDF)
#data-manipulation.R
library(SparkR)
# Shell step, to be run from a terminal rather than from R; it copies a local
# path into /user/yyl on HDFS:
#   hadoop dfs -copyFromLocal /opt/cloudera/parcels/spark-1.6.2-bin-cdh5/data/hoho /user/yyl

# Command-line arguments supplied when this R session was invoked. The
# argument name is spelled out in full (`trailingOnly`) instead of relying on
# partial matching of `trailing`.
args <- commandArgs(trailingOnly = TRUE)

# Require exactly one command-line argument; otherwise print the usage line
# and quit without saving the workspace.
# NOTE(review): the published listing lost everything between the usage
# message and the summarize() call (the text between a "<" and a ">" appears
# to have been stripped as markup), leaving an unterminated string. The usage
# text and the setup below are a minimal reconstruction so that the surviving
# aggregation can run -- confirm them against the original script.
if (length(args) != 1) {
  print("Usage: data-manipulation.R <path-to-flights.csv>")
  q("no")
}

# Initialize SparkContext and SQLContext
sc <- sparkR.init(appName = "SparkR-data-manipulation-example")
sqlContext <- sparkRSQL.init(sc)

# Load the file named on the command line as a SparkR DataFrame.
# NOTE(review): assumes a CSV with a header row and that the spark-csv package
# is available to the Spark backend (e.g. via --packages) -- confirm.
flightsDF <- read.df(
  sqlContext,
  args[[1]],
  source = "com.databricks.spark.csv",
  header = "true"
)

# Average departure and arrival delay per group. Written as nested calls so
# that no pipe operator (and no extra package) is needed.
# NOTE(review): the grouping column `date` is inferred from the result name
# `dailyDelayDF`; the original grouping expression was lost -- confirm.
dailyDelayDF <- summarize(
  groupBy(flightsDF, flightsDF$date),
  avg(flightsDF$dep_delay),
  avg(flightsDF$arr_delay)
)

# Print the computed data frame
head(dailyDelayDF)

# Stop the SparkContext now
sparkR.stop()

你可能感兴趣的:([SparkR])