For more code, see: https://github.com/xubo245/SparkLearning
Environment:
Ubuntu: Spark 1.5.2 (already installed), R 3.2.1
Windows: RStudio
1. Configuration on Ubuntu
1.1 Installing R
Starting SparkR right after installing Spark fails with an error that R cannot be found, so R has to be installed first.
1.1.1 Download R:
https://cran.r-project.org/src/base/R-3/
or:
https://cran.rstudio.com/src/base/R-3/
1.1.2 Install: after extracting the archive, build and install from source:
./configure
make
sudo make install
(To uninstall later: sudo make uninstall)
Then update the environment variables in /etc/profile and reload them:
vi /etc/profile
source /etc/profile
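A rough sketch of what that edit might contain; the exact lines depend on where R was installed, and the paths below merely assume the default /usr/local prefix used by make install:
# Hypothetical /etc/profile additions (assumes the default /usr/local install prefix)
export R_HOME=/usr/local/lib/R
export PATH=$PATH:/usr/local/bin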
1.2 Usage
1.2.1 Launch:
./bin/sparkR
The sparkR shell already initializes sc, sqlContext, and so on, equivalent to:
sc <- sparkR.init()
sqlContext <- sparkRSQL.init(sc)
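When SparkR is used from a plain R session instead of the sparkR shell, the contexts have to be created by hand, and sparkR.init accepts a few options. A minimal sketch, assuming SPARK_HOME is already set in the environment (the master URL, app name, and memory value are only illustrative):
# Manual initialization from a plain R session (values are illustrative)
.libPaths(c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib"), .libPaths()))
library(SparkR)
sc <- sparkR.init(master = "local[2]", appName = "SparkRLearning",
                  sparkEnvir = list(spark.executor.memory = "1g"))
sqlContext <- sparkRSQL.init(sc)
# ... run SparkR code ...
sparkR.stop()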
1.2.2.1 R's built-in faithful dataset:
> df <- createDataFrame(sqlContext, faithful)
> head(df)
  eruptions waiting
1     3.600      79
2     1.800      54
3     3.333      74
4     2.283      62
5     4.533      85
6     2.883      55
> people <- read.df(sqlContext, "/examples/src/main/resources/people.json", "json")
> head(people)
  age    name
1  NA Michael
2  30    Andy
3  19  Justin
> printSchema(people)
root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)
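A few other inspection helpers from the same SparkR API can be used here as well; a brief sketch (output omitted):
columns(people)   # column names, e.g. "age" "name"
dtypes(people)    # column names paired with their Spark SQL types
count(people)     # number of rows in the DataFrame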
> write.df(people, path="/xubo/spark/people.parquet", source="parquet", mode="overwrite")
NULL
hadoop@Master:~$ hadoop fs -ls /xubo/spark
Found 5 items
drwxr-xr-x   - hadoop supergroup          0 2016-03-29 21:24 /xubo/spark/data
drwxr-xr-x   - hadoop supergroup          0 2016-04-14 15:55 /xubo/spark/dataSQL
drwxr-xr-x   - hadoop supergroup          0 2016-04-14 16:45 /xubo/spark/examples
drwxr-xr-x   - xubo   supergroup          0 2016-04-15 10:56 /xubo/spark/file
drwxr-xr-x   - xubo   supergroup          0 2016-03-29 15:32 /xubo/spark/output

hadoop@Master:~$ hadoop fs -ls /xubo/spark
Found 6 items
drwxr-xr-x   - hadoop supergroup          0 2016-03-29 21:24 /xubo/spark/data
drwxr-xr-x   - hadoop supergroup          0 2016-04-14 15:55 /xubo/spark/dataSQL
drwxr-xr-x   - hadoop supergroup          0 2016-04-14 16:45 /xubo/spark/examples
drwxr-xr-x   - xubo   supergroup          0 2016-04-15 10:56 /xubo/spark/file
drwxr-xr-x   - xubo   supergroup          0 2016-03-29 15:32 /xubo/spark/output
drwxr-xr-x   - hadoop supergroup          0 2016-04-20 00:34 /xubo/spark/people.parquet

hadoop@Master:~$ hadoop fs -ls /xubo/spark/people.parquet
Found 5 items
-rw-r--r--   3 hadoop supergroup          0 2016-04-20 00:34 /xubo/spark/people.parquet/_SUCCESS
-rw-r--r--   3 hadoop supergroup        277 2016-04-20 00:34 /xubo/spark/people.parquet/_common_metadata
-rw-r--r--   3 hadoop supergroup        750 2016-04-20 00:34 /xubo/spark/people.parquet/_metadata
-rw-r--r--   3 hadoop supergroup        537 2016-04-20 00:34 /xubo/spark/people.parquet/part-r-00000-9d377482-1bb6-46c3-bb19-d107a7da660a.gz.parquet
-rw-r--r--   3 hadoop supergroup        531 2016-04-20 00:34 /xubo/spark/people.parquet/part-r-00001-9d377482-1bb6-46c3-bb19-d107a7da660a.gz.parquet
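The Parquet directory written above can be loaded back into a DataFrame with read.df; a quick sketch of what that might look like:
# Read the Parquet output back into a DataFrame
people2 <- read.df(sqlContext, "/xubo/spark/people.parquet", source = "parquet")
head(people2)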
> df <- createDataFrame(sqlContext, faithful)
> df
DataFrame[eruptions:double, waiting:double]
> head(select(df, df$eruptions))
  eruptions
1     3.600
2     1.800
3     3.333
4     2.283
5     4.533
6     2.883
> head(select(df, "eruptions"))
  eruptions
1     3.600
2     1.800
3     3.333
4     2.283
5     4.533
6     2.883
> head(filter(df, df$waiting < 50))
  eruptions waiting
1     1.750      47
2     1.750      47
3     1.867      48
4     1.750      48
5     2.167      48
6     2.100      49
> head(summarize(groupBy(df, df$waiting), count = n(df$waiting)))
  waiting count
1      81    13
2      60     6
3      93     2
4      68     1
5      47     4
6      80     8
> waiting_counts <- summarize(groupBy(df, df$waiting), count = n(df$waiting))
> head(arrange(waiting_counts, desc(waiting_counts$count)))
  waiting count
1      78    15
2      83    14
3      81    13
4      77    12
5      82    12
6      84    10
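Other aggregate functions follow the same groupBy/summarize pattern; a small illustrative sketch (output omitted):
# Average eruption length per waiting time (illustrative)
head(summarize(groupBy(df, df$waiting), avg_eruptions = avg(df$eruptions)))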
> df$waiting_secs <- df$waiting * 60
> head(df)
  eruptions waiting waiting_secs
1     3.600      79         4740
2     1.800      54         3240
3     3.333      74         4440
4     2.283      62         3720
5     4.533      85         5100
6     2.883      55         3300
> people <- read.df(sqlContext, "/examples/src/main/resources/people.json", "json")
> registerTempTable(people, "people")
> teenagers <- sql(sqlContext, "SELECT name FROM people WHERE age >= 13 AND age <= 19")
> head(teenagers)
    name
1 Justin
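head only shows the first rows; to bring a (small) query result back to the driver as an ordinary local data.frame, collect can be used. A brief sketch:
# Pull the SQL result back to the driver as a local data.frame
teenagers_local <- collect(teenagers)
class(teenagers_local)   # "data.frame"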
> df <- createDataFrame(sqlContext, iris)
Warning messages:
1: In FUN(X[[i]], ...) : Use Sepal_Length instead of Sepal.Length as column name
2: In FUN(X[[i]], ...) : Use Sepal_Width instead of Sepal.Width as column name
3: In FUN(X[[i]], ...) : Use Petal_Length instead of Petal.Length as column name
4: In FUN(X[[i]], ...) : Use Petal_Width instead of Petal.Width as column name
> model <- glm(Sepal_Length ~ Sepal_Width + Species, data = df, family = "gaussian")
> head(df)
  Sepal_Length Sepal_Width Petal_Length Petal_Width Species
1          5.1         3.5          1.4         0.2  setosa
2          4.9         3.0          1.4         0.2  setosa
3          4.7         3.2          1.3         0.2  setosa
4          4.6         3.1          1.5         0.2  setosa
5          5.0         3.6          1.4         0.2  setosa
6          5.4         3.9          1.7         0.4  setosa
> summary(model)
$coefficients
                     Estimate
(Intercept)         2.2513930
Sepal_Width         0.8035609
Species__versicolor 1.4587432
Species__virginica  1.9468169
> predictions <- predict(model, newData = df)
> head(select(predictions, "Sepal_Length", "prediction"))
  Sepal_Length prediction
1          5.1   5.063856
2          4.9   4.662076
3          4.7   4.822788
4          4.6   4.742432
5          5.0   5.144212
6          5.4   5.385281
Since Hive was not set up on this cluster, Hive operations were not tried here (they are shown in section 2.4.6 on Windows).
2. Configuration on Windows
2.1 Installing R
2.1.1 Download:
https://cran.r-project.org/mirrors.html
https://mirrors.tuna.tsinghua.edu.cn/CRAN/
2.1.2 Install: straightforward; just run the downloaded installer.
2.2 Installing RStudio (on Windows 7)
2.2.1 Download:
https://www.rstudio.com/products/rstudio/download/
2.2.2 Install: straightforward.
2.3 Configuring RStudio with SparkR
2.3.1 Download a pre-built Spark package to the local machine, e.g. spark-1.5.2-bin-hadoop2.6.tar
2.3.2 Load SparkR in RStudio:
# Set this to where Spark is installed
Sys.setenv(SPARK_HOME="D:/1win7/java/spark-1.5.2-bin-hadoop2.6")
# This line loads SparkR from the installed directory
.libPaths(c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib"), .libPaths()))
library(SparkR)
sc <- sparkR.init(master="local")
sqlContext <- sparkRSQL.init(sc)
print("SparkR") df <- createDataFrame(sqlContext, faithful) head(df) print(df) people <- read.df(sqlContext, "D:/all/R/examples/src/main/resources/people.json", "json") head(people) print(people) print("end")
> source('D:/all/R/1.R')
[1] "SparkR"
DataFrame[eruptions:double, waiting:double]
DataFrame[age:bigint, name:string]
[1] "end"
2.4 Using SparkR from RStudio
2.4.1 Built-in dataset:
> df <- createDataFrame(sqlContext, faithful)
> head(df)
  eruptions waiting
1     3.600      79
2     1.800      54
3     3.333      74
4     2.283      62
5     4.533      85
6     2.883      55
> sc <- sparkR.init(sparkPackages="com.databricks:spark-csv_2.11:1.0.3")
Re-using existing Spark Context. Please stop SparkR with sparkR.stop() or restart R to create a new Spark Context
> sqlContext <- sparkRSQL.init(sc)
> sparkR.stop()
> sc <- sparkR.init(sparkPackages="com.databricks:spark-csv_2.11:1.0.3")
Launching java with spark-submit command D:/1win7/java/spark-1.5.2-bin-hadoop2.6/bin/spark-submit.cmd --packages com.databricks:spark-csv_2.11:1.0.3 sparkr-shell C:\Users\xubo\AppData\Local\Temp\RtmpaGdWr8\backend_porte9c63a41172
> sqlContext <- sparkRSQL.init(sc)
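With the spark-csv package loaded this way, CSV files can then be read through read.df. A hedged sketch (the file path below is a placeholder, not a file used in this post):
# Hypothetical CSV read via the spark-csv data source (path is a placeholder)
csvDf <- read.df(sqlContext, "D:/all/R/examples/cars.csv",
                 source = "com.databricks.spark.csv", header = "true")
head(csvDf)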
> people <- read.df(sqlContext, "D:/all/R/examples/src/main/resources/people.json", "json") > head(people) age name 1 NA Michael 2 30 Andy 3 19 Justin
> printSchema(people)
root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)
# Set this to where Spark is installed
#Sys.setenv(SPARK_HOME="D:/1win7/java/spark-1.5.2-bin-hadoop2.6")
# This line loads SparkR from the installed directory
#.libPaths(c(file.path(Sys.getenv("SPARK_HOME"), "R","lib"), .libPaths()))
#library(SparkR)
#sc <- sparkR.init(master="local")
#sqlContext <- sparkRSQL.init(sc)
print("SparkR")
df <- createDataFrame(sqlContext, faithful)
head(df)
print(df)
people <- read.df(sqlContext, "D:/all/R/examples/src/main/resources/people.json", "json")
head(people)
print(people)
printSchema(people)
print("end")
> source('D:/all/R/1.R') [1] "SparkR" DataFrame[eruptions:double, waiting:double] DataFrame[age:bigint, name:string] root |-- age: long (nullable = true) |-- name: string (nullable = true) [1] "end"
write.df(people, path="D:/all/R/people.parquet", source="parquet", mode="overwrite")
2.4.6 Hive operations:
> hiveContext <- sparkRHive.init(sc)
> sql(hiveContext, "CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")
DataFrame[result:string]
> sql(hiveContext, "LOAD DATA LOCAL INPATH 'D:/all/R/examples/src/main/resources/kv1.txt' INTO TABLE src")
DataFrame[result:string]
>
> results <- sql(hiveContext, "FROM src SELECT key, value")
> head(results)
  key   value
1 238 val_238
2  86  val_86
3 311 val_311
4  27  val_27
5 165 val_165
6 409 val_409
> # Create the DataFrame
> df <- createDataFrame(sqlContext, faithful)
>
> # Get basic information about the DataFrame
> df
DataFrame[eruptions:double, waiting:double]
> ## DataFrame[eruptions:double, waiting:double]
>
> # Select only the "eruptions" column
> head(select(df, df$eruptions))
  eruptions
1     3.600
2     1.800
3     3.333
4     2.283
5     4.533
6     2.883
> ## eruptions
> ##1 3.600
> ##2 1.800
> ##3 3.333
>
> # You can also pass in column name as strings
> head(select(df, "eruptions"))
  eruptions
1     3.600
2     1.800
3     3.333
4     2.283
5     4.533
6     2.883
>
> # Filter the DataFrame to only retain rows with wait times shorter than 50 mins
> head(filter(df, df$waiting < 50))
  eruptions waiting
1     1.750      47
2     1.750      47
3     1.867      48
4     1.750      48
5     2.167      48
6     2.100      49
> ## eruptions waiting
> ##1 1.750 47
> ##2 1.750 47
> ##3 1.867 48
> # We use the `n` operator to count the number of times each waiting time appears
> head(summarize(groupBy(df, df$waiting), count = n(df$waiting)))
  waiting count
1      81    13
2      60     6
3      93     2
4      68     1
5      47     4
6      80     8
> ## waiting count
> ##1 81 13
> ##2 60 6
> ##3 68 1
>
> # We can also sort the output from the aggregation to get the most common waiting times
> waiting_counts <- summarize(groupBy(df, df$waiting), count = n(df$waiting))
> head(arrange(waiting_counts, desc(waiting_counts$count)))
  waiting count
1      78    15
2      83    14
3      81    13
4      77    12
5      82    12
6      84    10
>
> ## waiting count
> ##1 78 15
> ##2 83 14
> ##3 81 13
> # Convert waiting time from hours to seconds.
> # Note that we can assign this to a new column in the same DataFrame
> df$waiting_secs <- df$waiting * 60
> head(df)
  eruptions waiting waiting_secs
1     3.600      79         4740
2     1.800      54         3240
3     3.333      74         4440
4     2.283      62         3720
5     4.533      85         5100
6     2.883      55         3300
> ## eruptions waiting waiting_secs
> ##1 3.600 79 4740
> ##2 1.800 54 3240
> ##3 3.333 74 4440
> # Load a JSON file
> people <- read.df(sqlContext, "D:/all/R/examples/src/main/resources/people.json", "json")
>
> # Register this DataFrame as a table.
> registerTempTable(people, "people")
>
> # SQL statements can be run by using the sql method
> teenagers <- sql(sqlContext, "SELECT name FROM people WHERE age >= 13 AND age <= 19")
> head(teenagers)
    name
1 Justin
> ## name
> ##1 Justin
> # Create the DataFrame
> df <- createDataFrame(sqlContext, iris)
Warning messages:
1: In FUN(X[[i]], ...) : Use Sepal_Length instead of Sepal.Length as column name
2: In FUN(X[[i]], ...) : Use Sepal_Width instead of Sepal.Width as column name
3: In FUN(X[[i]], ...) : Use Petal_Length instead of Petal.Length as column name
4: In FUN(X[[i]], ...) : Use Petal_Width instead of Petal.Width as column name
>
> # Fit a linear model over the dataset.
> model <- glm(Sepal_Length ~ Sepal_Width + Species, data = df, family = "gaussian")
>
> # Model coefficients are returned in a similar format to R's native glm().
> summary(model)
$coefficients
                     Estimate
(Intercept)         2.2513930
Sepal_Width         0.8035609
Species__versicolor 1.4587432
Species__virginica  1.9468169
> ##$coefficients
> ##            Estimate
> ##(Intercept) 2.2513930
> ##Sepal_Width 0.8035609
> ##Species_versicolor 1.4587432
> ##Species_virginica  1.9468169
>
> # Make predictions based on the model.
> predictions <- predict(model, newData = df)
> head(select(predictions, "Sepal_Length", "prediction"))
  Sepal_Length prediction
1          5.1   5.063856
2          4.9   4.662076
3          4.7   4.822788
4          4.6   4.742432
5          5.0   5.144212
6          5.4   5.385281
> ## Sepal_Length prediction
> ##1 5.1 5.063856
> ##2 4.9 4.662076
> ##3 4.7 4.822788
> ##4 4.6 4.742432
> ##5 5.0 5.144212
> ##6 5.4 5.385281
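To sanity-check the fit, the predictions can be collected back into base R and compared with the observed values; a small illustrative sketch:
# Pull the predictions to the driver and look at the residuals locally (illustrative)
local_pred <- collect(select(predictions, "Sepal_Length", "prediction"))
summary(local_pred$Sepal_Length - local_pred$prediction)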
2.5 Notes: the configuration did not work at first:
> library(SparkR)
Error in library(SparkR) : there is no package called 'SparkR'
(to be resolved)
# Set this to where Spark is installed Sys.setenv(SPARK_HOME="D:/1win7/java/spark-1.5.2") # This line loads SparkR from the installed directory .libPaths(c(file.path(Sys.getenv("SPARK_HOME"), "R"), .libPaths())) library(SparkR) sc <- sparkR.init(master="local") sqlContext <- sparkRSQL.init(sc) df <- createDataFrame(sqlContext, faithful) head(df) print("end")
The SparkR package was downloaded from where it had been built on the cluster and then placed on the local machine.
> source('D:/all/R/1.R')
Launching java with spark-submit command D:/1win7/java/spark-1.5.2/bin/spark-submit.cmd sparkr-shell C:\Users\xubo\AppData\Local\Temp\RtmpwpZOpB\backend_port2cd416031ca9
Error in sparkR.init(master = "local") : JVM is not ready after 10 seconds
The configuration that eventually worked (SPARK_HOME pointing at the pre-built spark-1.5.2-bin-hadoop2.6 package and the R/lib directory on .libPaths) is the one shown in section 2.3.2.
References:
【1】http://spark.apache.org/docs/1.5.2/sparkr.html
【2】http://www.csdn.net/article/1970-01-01/2826010
【3】http://files.meetup.com/3138542/SparkR-meetup.pdf
【4】https://github.com/amplab-extras/SparkR-pkg