There is a file users.csv under D:\test\t. Load it into Spark and remove the first (header) line.
import org.apache.spark.{SparkConf, SparkContext}
object CsvDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("CsvDemo")
    val sc = SparkContext.getOrCreate(conf)
    val lines = sc.textFile("D:\\test\\t\\users.csv")
    println("lines:" + lines.count())
    // Method 1: drop the first element of partition 0 (the header line lives there)
    val fields = lines.mapPartitionsWithIndex((i, it) => {
      if (i == 0) it.drop(1) else it
    }).map(x => x.split(","))
    println("fields:" + fields.count())
    // Method 2 (preferred): filter out the header line by its leading field name
    val fields2 = lines.filter(x => !x.startsWith("user_id")).map(x => x.split(","))
    println("fields2:" + fields2.count())
  }
}
/* Output:
lines:38210
fields:38209
fields2:38209
*/
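As a supplement (a sketch, not in the original), when the header text is not known up front, a common pattern is to read the first line with first() and filter it out; this assumes the lines RDD loaded above:
// Sketch: drop the header without hard-coding its content
val header = lines.first() // the header line
val fields3 = lines.filter(line => line != header).map(_.split(","))
println("fields3:" + fields3.count())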
Reading the same CSV through SparkSession returns a DataFrame directly; option("header","true") treats the first line as the column names, so no manual header removal is needed.
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext, sql}
object CsvDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("CsvDemo")
    val sc = SparkContext.getOrCreate(conf)
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
    val df: sql.DataFrame = spark.read.format("csv").option("header", "true").load("D:\\test\\t\\users.csv")
    df.printSchema()
    df.select("user_id", "locale", "birthyear").show(5)
  }
}
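With option("header","true") every CSV column is read as string. As a supplement (not in the original), the inferSchema option asks Spark to guess the column types; a sketch assuming the same spark session and file as above:
val dfTyped = spark.read
  .format("csv")
  .option("header", "true")
  .option("inferSchema", "true") // extra pass over the data to infer each column's type
  .load("D:\\test\\t\\users.csv")
dfTyped.printSchema()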
There is also a file users.json under D:\test\t. Load it into Spark.
Loading it through SparkContext means parsing each line yourself, here with scala.util.parsing.json (bundled with older Scala versions; in newer ones it ships separately as scala-parser-combinators and is deprecated).
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object JsonDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("JsonDemo")
    val sc = SparkContext.getOrCreate(conf)
    val lines = sc.textFile("D:\\test\\t\\users.json")
    lines.collect().foreach(println)
    // Parse each line with Scala's JSON parser
    import scala.util.parsing.json._
    val rdd: RDD[Option[Any]] = lines.map(x => JSON.parseFull(x))
    rdd.collect().foreach(println)
  }
}
/* Output:
{"name":"Michael"}
{"name":"Andy","Age":30}
{"name":"Justin","Age":19}
Some(Map(name -> Michael))              note that each result is an Option
Some(Map(name -> Andy, Age -> 30.0))
Some(Map(name -> Justin, Age -> 19.0))
*/
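The parsed values come back as Option[Any] wrapping a Map, so extracting a field means pattern matching. A small sketch (not in the original) that pulls out the name field, assuming the rdd built above:
// Sketch: keep only lines that parsed to a Map containing "name", and extract that field
val names = rdd.collect {
  case Some(m: Map[String, Any] @unchecked) if m.contains("name") => m("name").toString
}
names.collect().foreach(println) // Michael, Andy, Justin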
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}
object JsonDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("JsonDemo")
    val sc = SparkContext.getOrCreate(conf)
    val spark = SparkSession.builder().config(conf).getOrCreate()
    val df: DataFrame = spark.read.format("json").load("D:\\test\\t\\users.json")
    df.printSchema() // print the DataFrame's schema
    df.show()        // print the DataFrame's contents
  }
}
/* Output:
root
 |-- Age: long (nullable = true)
 |-- name: string (nullable = true)
+----+-------+
| Age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+
*/
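By default the JSON reader infers Age as long. As a supplement (not in the original), an explicit schema can be supplied so the column types are fixed up front; a sketch assuming the same spark session and file:
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
val schema = StructType(Seq(
  StructField("name", StringType, nullable = true),
  StructField("Age", IntegerType, nullable = true)
))
val dfTyped = spark.read.schema(schema).json("D:\\test\\t\\users.json")
dfTyped.printSchema() // Age is now integer instead of long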
Since we already have a DataFrame, let's try some column operations, still using the JSON file above.
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
object JsonDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("JsonDemo")
    val sc = SparkContext.getOrCreate(conf)
    val spark = SparkSession.builder().config(conf).getOrCreate()
    val df = spark.read.format("json").load("D:\\test\\t\\users.json")
    df.show()
    /* Output:
    +----+-------+
    | Age|   name|
    +----+-------+
    |null|Michael|
    |  30|   Andy|
    |  19| Justin|
    +----+-------+
    */
    // Rename a column: Age -> age
    println("-------------------rename column-----------------------")
    val df2 = df.withColumnRenamed("Age", "age")
    df2.printSchema()
    df2.select("age", "name").show()
    /* Output:
    root
     |-- age: long (nullable = true)    the rename took effect
     |-- name: string (nullable = true)
    +----+-------+
    | age|   name|
    +----+-------+
    |null|Michael|
    |  30|   Andy|
    |  19| Justin|
    +----+-------+
    */
    // Change a column's type (long -> integer)
    val df3 = df.withColumn("Age", df.col("Age").cast("int"))
    df3.printSchema()
    /* Output:
    root
     |-- Age: integer (nullable = true)    the cast took effect
     |-- name: string (nullable = true)
    */
    // Change the Age column's type (long -> string)
    val df4 = df.withColumn("Age", df.col("Age").cast("string"))
    df4.printSchema()
    df4.show()
    /* Output:
    root
     |-- Age: string (nullable = true)
     |-- name: string (nullable = true)
    +----+-------+
    | Age|   name|
    +----+-------+
    |null|Michael|
    |  30|   Andy|
    |  19| Justin|
    +----+-------+
    */
    // Change name's type (string -> integer)
    val df5 = df.withColumn("name", df.col("name").cast("int"))
    df5.printSchema() // no error, and the schema shows integer
    df5.show()        // but a string like "Michael" cannot be cast to int, so the values become null
    /* Output:
    root
     |-- Age: long (nullable = true)
     |-- name: integer (nullable = true)    the schema says the cast succeeded
    +----+----+
    | Age|name|
    +----+----+
    |null|null|    but!!! the actual values are all null
    |  30|null|
    |  19|null|
    +----+----+
    */
    // Operate on columns: add 5 to Age and append "hello" to name
    val df6 = df.withColumn("Age", df.col("Age") + 5)
      .withColumn("name", df.col("name") + "hello")
    df6.show() // Age + 5 works; name + "hello" fails and the column is all null (see the concat sketch below)
    /* Output:
    +----+----+
    | Age|name|
    +----+----+
    |null|null|
    |  35|null|
    |  24|null|
    +----+----+
    */
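    // Supplement (a sketch, not in the original): "+" attempts numeric addition, which is why
    // name + "hello" produced nulls. To append a string, use the concat function instead:
    import org.apache.spark.sql.functions.{concat, lit}
    val df6b = df.withColumn("name", concat(df.col("name"), lit("hello")))
    df6b.show() // name becomes Michaelhello, Andyhello, Justinhello; Age is unchanged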
    // Add columns: just use a column name that does not already exist. Note that names are
    // not case-sensitive, so writing to "age" would overwrite Age instead of adding a column.
    val df7 = df.withColumn("Age1", df.col("Age") + 5)
      .withColumn("name1", df.col("name") + "hello")
    df7.show()
    /* Output:
    +----+-------+----+-----+
    | Age|   name|Age1|name1|
    +----+-------+----+-----+
    |null|Michael|null| null|
    |  30|   Andy|  35| null|
    |  19| Justin|  24| null|
    +----+-------+----+-----+
    */
    // How to rename a column and change its type, keeping the number of columns the same
    // Method 1: rename first, then cast
    val df8 = df.withColumnRenamed("Age", "Age1")
    val df9 = df8.withColumn("Age1", df8.col("Age1").cast("int"))
    df9.printSchema()
    df9.show()
    /* Output:
    root
     |-- Age1: integer (nullable = true)
     |-- name: string (nullable = true)
    +----+-------+
    |Age1|   name|
    +----+-------+
    |null|Michael|
    |  30|   Andy|
    |  19| Justin|
    +----+-------+
    */
    // Method 2: add a new column with the cast, then drop the old one
    val df10 = df.withColumn("Age1", df.col("Age").cast("int")).drop("Age")
    df10.printSchema()
    df10.show()
    // Note: column names are not case-sensitive, so assigning Age + 5 to a "new" column named
    // age would not add a column; it would just modify the existing Age column.
    /* Output:
    root
     |-- name: string (nullable = true)
     |-- Age1: integer (nullable = true)
    +-------+----+
    |   name|Age1|
    +-------+----+
    |Michael|null|
    |   Andy|  30|
    | Justin|  19|
    +-------+----+
    */
  }
}
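A third way to rename and re-type in one pass, not shown in the original, is a select with an alias; a small sketch assuming the same df loaded from users.json above (the new name age1 is just an illustration):
val df11 = df.select(
  df.col("Age").cast("int").alias("age1"),
  df.col("name")
)
df11.printSchema() // age1: integer, name: string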