Tom,DataBase,80
Tom,Algorithm,50
Tom,DataStructure,60
Jim,DataBase,90
Jim,Algorithm,60
Jim,DataStructure,80
How many students are there in the department in total?
val lines = sc.textFile("file:///usr/local/spark/sparksqldata/Data01.txt")
val par = lines.map(row=>row.split(",")(0))
val distinct_par = par.distinct() // remove duplicate names
distinct_par.count // total number of students
val lines = sc.textFile("file:///usr/local/spark/sparksqldata/Data01.txt")
val par = lines.map(row=>row.split(",")(1))
val distinct_par = par.distinct()
distinct_par.count
val lines = sc.textFile("file:///usr/local/spark/sparksqldata/Data01.txt")
val pare = lines.filter(row=>row.split(",")(0)=="Tom")
pare.foreach(println)
Tom,DataBase,26
Tom,Algorithm,12
Tom,OperatingSystem,16
Tom,Python,40
Tom,Software,60
pare.map(row=>(row.split(",")(0),row.split(",")(2).toInt))
.mapValues(x=>(x,1))
.reduceByKey((x,y) => (x._1+y._1,x._2 + y._2))
.mapValues(x => (x._1 / x._2)).collect()
//res9: Array[(String, Int)] = Array((Tom,30))
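Note that x._1 / x._2 above is integer division, so Tom's average is truncated to 30; the same applies to the per-course averages computed further below. If a fractional average is wanted, the sum can be carried as a Double instead, a minimal variant of the same pattern:

pare.map(row => (row.split(",")(0), row.split(",")(2).toDouble))
  .mapValues(x => (x, 1))
  .reduceByKey((x, y) => (x._1 + y._1, x._2 + y._2))
  .mapValues(x => x._1 / x._2) // Double division keeps the fraction
  .collect()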
val lines = sc.textFile("file:///usr/local/spark/sparksqldata/Data01.txt")
val pare = lines.map(row=>(row.split(",")(0),row.split(",")(1)))
pare.mapValues(x => (x,1))
.reduceByKey((x,y) => (" ",x._2 + y._2))
.mapValues(x => x._2).foreach(println)
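The (" ", ...) placeholder in the reduce above only keeps the tuple shape; an equivalent and arguably clearer way to count how many courses each student has taken is to map every record to (name, 1) and sum, for example:

lines.map(row => (row.split(",")(0), 1))
  .reduceByKey(_ + _)
  .foreach(println)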
val lines = sc.textFile("file:///usr/local/spark/sparksqldata/Data01.txt")
val pare = lines.map(row=>(row.split(",")(1),row.split(",")(2).toInt))
pare.mapValues(x=>(x,1))
.reduceByKey((x,y) => (x._1+y._1,x._2 + y._2))
.mapValues(x => (x._1 / x._2)).collect()
res0: Array[(String, Int)] = Array((Python,57), (OperatingSystem,54), (CLanguage,50),
val lines = sc.textFile("file:///usr/local/spark/sparksqldata/Data01.txt")
val pare = lines.filter(row=>row.split(",")(1)=="DataBase")
.map(row=>(row.split(",")(1),1))
val accum = sc.longAccumulator("My Accumulator")
pare.values.foreach(x => accum.add(x))
accum.value
res19: Long = 126
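The accumulator is filled as a side effect of the foreach action; the same number can also be obtained directly by counting the filtered records, for example:

lines.filter(row => row.split(",")(1) == "DataBase").count()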
The source file has the following content (fields: id, name, age). Copy the data and save it on the Ubuntu system as /usr/local/spark/employee.txt, convert the data from an RDD into a DataFrame, and print all rows of the DataFrame in the format id:1,name:Ella,age:36. Write the program code.
1,Ella,36
2,Bob,29
3,Jack,29
Method 1:
import org.apache.spark.sql.SparkSession

object RDDtoDF {
  // The case class must sit outside main so that the encoder needed by toDF() can be derived
  case class Employee(id: Long, name: String, age: Long)

  def main(args: Array[String]) {
    val spark = SparkSession.builder().appName("RDDtoDF").getOrCreate()
    import spark.implicits._
    val employeeDF = spark.sparkContext.textFile("file:///usr/local/spark/employee.txt")
      .map(_.split(","))
      .map(attributes => Employee(attributes(0).trim.toInt, attributes(1), attributes(2).trim.toInt))
      .toDF()
    employeeDF.createOrReplaceTempView("employee")
    val employeeRDD = spark.sql("select id, name, age from employee")
    employeeRDD.map(t => "id:" + t(0) + "," + "name:" + t(1) + "," + "age:" + t(2)).show()
  }
}
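When the conversion is done interactively in spark-shell, where spark and its implicits are already in scope, the core of Method 1 reduces to a few lines; a minimal sketch:

case class Employee(id: Long, name: String, age: Long)
val employeeDF = sc.textFile("file:///usr/local/spark/employee.txt")
  .map(_.split(","))
  .map(a => Employee(a(0).trim.toInt, a(1), a(2).trim.toInt))
  .toDF()
employeeDF.map(t => "id:" + t(0) + ",name:" + t(1) + ",age:" + t(2)).show()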
Method 2:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._
import org.apache.spark.sql.Row

object RDDtoDF {
  def main(args: Array[String]) {
    val spark = SparkSession.builder().appName("RDDtoDF").getOrCreate()
    import spark.implicits._
    val employeeRDD = spark.sparkContext.textFile("file:///usr/local/spark/employee.txt")
    // Build the schema programmatically from a string of field names
    val schemaString = "id name age"
    val fields = schemaString.split(" ")
      .map(fieldName => StructField(fieldName, StringType, nullable = true))
    val schema = StructType(fields)
    val rowRDD = employeeRDD.map(_.split(","))
      .map(attributes => Row(attributes(0).trim, attributes(1), attributes(2).trim))
    val employeeDF = spark.createDataFrame(rowRDD, schema)
    employeeDF.createOrReplaceTempView("employee")
    val results = spark.sql("SELECT id, name, age FROM employee")
    results.map(t => "id:" + t(0) + "," + "name:" + t(1) + "," + "age:" + t(2)).show()
  }
}
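Because every field of the programmatically built schema is StringType, the id and age columns in Method 2 come back as strings. If typed columns are wanted, the fields inside main could instead be declared with IntegerType and the values converted when the rows are built; an illustrative fragment (typedFields, typedRowRDD and typedDF are names introduced here, reusing Method 2's spark and employeeRDD):

val typedFields = Array(
  StructField("id", IntegerType, nullable = true),
  StructField("name", StringType, nullable = true),
  StructField("age", IntegerType, nullable = true))
val typedRowRDD = employeeRDD.map(_.split(","))
  .map(a => Row(a(0).trim.toInt, a(1), a(2).trim.toInt))
val typedDF = spark.createDataFrame(typedRowRDD, StructType(typedFields))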
{ "id":1 ,"name":" Ella","age":36 }
{ "id":2,"name":"Bob","age":29 }
{ "id":3 ,"name":"Jack","age":29 }
{ "id":4 ,"name":"Jim","age":28 }
{ "id":5 ,"name":"Damon" }
{ "id":5 ,"name":"Damon" }
scala> import org.apache.spark.sql.SparkSession
scala> val spark=SparkSession.builder().getOrCreate()
scala> import spark.implicits._
scala> val df = spark.read.json("file:///usr/local/spark/employee.json")
Answer:
scala> df.show()
Answer:
scala> df.distinct().show()
Answer:
scala> df.drop("id").show()
Answer:
scala> df.filter(df("age") > 30 ).show()
Answer:
scala> df.groupBy("name").count().show()
Answer:
scala> df.sort(df("name").asc).show()
Answer:
scala> df.take(3)  or  scala> df.head(3)
Answer:
scala> df.select(df("name").as("username")).show()
Answer:
scala> df.agg("age"->"avg")
Answer:
scala> df.agg("age"->"min")
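The two agg calls above return a DataFrame without printing its contents in spark-shell; appending show() displays the computed values, for example:

df.agg("age" -> "avg").show()
df.agg("age" -> "min").show()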
Write a program that uses DataFrames to read and write MySQL data
(1) In MySQL, create a database named sparktest and then a table named employee containing the following two rows of data;

Table 1: existing data in the employee table

id | name | gender | age |
---|---|---|---|
1 | Alice | F | 22 |
2 | John | M | 25 |
Assume the current directory is /usr/local/spark/mycode/testmysql. In this directory, create a new directory with mkdir -p src/main/scala, and then create a file named testmysql.scala under /usr/local/spark/mycode/testmysql/src/main/scala:
import java.util.Properties
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._
import org.apache.spark.sql.Row

object TestMySQL {
  def main(args: Array[String]) {
    val spark = SparkSession.builder().appName("TestMySQL").getOrCreate()
    // Build an RDD holding the two new rows and give it an explicit schema
    val employeeRDD = spark.sparkContext.parallelize(Array("3 Mary F 26", "4 Tom M 23")).map(_.split(" "))
    val schema = StructType(
      List(
        StructField("id", IntegerType, true), StructField("name", StringType, true),
        StructField("gender", StringType, true), StructField("age", IntegerType, true)
      )
    )
    val rowRDD = employeeRDD.map(p => Row(p(0).toInt, p(1).trim, p(2).trim, p(3).toInt))
    val employeeDF = spark.createDataFrame(rowRDD, schema)
    // Append the two rows to the existing MySQL table
    val prop = new Properties()
    prop.put("user", "root")
    prop.put("password", "hadoop")
    prop.put("driver", "com.mysql.jdbc.Driver")
    employeeDF.write.mode("append")
      .jdbc("jdbc:mysql://localhost:3306/sparktest", "sparktest.employee", prop)
    // Read the table back and print the maximum and the sum of the age column
    val jdbcDF = spark.read.format("jdbc")
      .option("url", "jdbc:mysql://localhost:3306/sparktest")
      .option("driver", "com.mysql.jdbc.Driver")
      .option("dbtable", "employee")
      .option("user", "root").option("password", "hadoop").load()
    jdbcDF.agg("age" -> "max", "age" -> "sum").show()
  }
}
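Note that both the JDBC write and the JDBC read need the MySQL driver (e.g. a mysql-connector-java jar) on Spark's classpath, typically supplied through the --jars option when the program is submitted; the exact jar file name depends on the connector version installed.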
cd /usr/local/spark
./bin/spark-shell
:quit
sudo tar -zxf ~/下载/spark-2.1.0-bin-without-hadoop.tgz -C /usr/local
cd /usr/local
sudo mv ./spark-2.1.0-bin-without-hadoop ./spark
sudo chown -R hadoop:hadoop ./spark
Operation | Meaning |
---|---|
filter(func) | Selects the elements that satisfy function func and returns them as a new dataset |
map(func) | Passes each element to function func and returns the results as a new dataset |
flatMap(func) | Similar to map(), but each input element can be mapped to 0 or more output results |
groupByKey() | When applied to a dataset of (K,V) pairs, returns a dataset of (K, Iterable<V>) pairs |
reduceByKey(func) | When applied to a dataset of (K,V) pairs, returns a new dataset of (K,V) pairs in which each value is the result of aggregating all values of that key with function func |
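A minimal sketch tying these transformations together (the sample rdd is illustrative):

val rdd = sc.parallelize(List(("spark", 1), ("hadoop", 2), ("spark", 3)))
rdd.filter(p => p._2 > 1)                   // keep pairs whose value is greater than 1
rdd.map(p => (p._1, p._2 * 10))             // transform every element
rdd.flatMap(p => List(p._1, p._2.toString)) // one input element may yield several outputs
rdd.groupByKey()                            // (K, Iterable<V>) pairs
rdd.reduceByKey((a, b) => a + b)            // aggregate the values of each key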
Operation | Meaning |
---|---|
count() | Returns the number of elements in the dataset |
collect() | Returns all the elements of the dataset as an array |
first() | Returns the first element of the dataset |
take(n) | Returns the first n elements of the dataset as an array |
reduce(func) | Aggregates the elements of the dataset using function func |
foreach(func) | Passes each element of the dataset to function func and runs it |
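A corresponding sketch of the actions (the sample nums RDD is illustrative):

val nums = sc.parallelize(List(1, 2, 3, 4, 5))
nums.count()                 // 5
nums.collect()               // Array(1, 2, 3, 4, 5)
nums.first()                 // 1
nums.take(3)                 // Array(1, 2, 3)
nums.reduce((a, b) => a + b) // 15
nums.foreach(println)        // prints each element (on the workers when run on a cluster)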