本节课的主要内容为在Spark中使用Scala对数据进行分析,并熟悉简单的RDD相关的操作
#HDFS文件:
[hadoop@gpmaster ~]$ hdfs dfs -ls /linkage
Found 10 items
-rw-r--r-- 2 hadoop supergroup 26248574 2016-05-23 20:59 /linkage/block_1.csv
-rw-r--r-- 2 hadoop supergroup 26255957 2016-05-23 20:59 /linkage/block_10.csv
-rw-r--r-- 2 hadoop supergroup 26241784 2016-05-23 20:59 /linkage/block_2.csv
-rw-r--r-- 2 hadoop supergroup 26253247 2016-05-23 20:59 /linkage/block_3.csv
-rw-r--r-- 2 hadoop supergroup 26247471 2016-05-23 20:59 /linkage/block_4.csv
-rw-r--r-- 2 hadoop supergroup 26249424 2016-05-23 20:59 /linkage/block_5.csv
-rw-r--r-- 2 hadoop supergroup 26256126 2016-05-23 20:59 /linkage/block_6.csv
-rw-r--r-- 2 hadoop supergroup 26261911 2016-05-23 20:59 /linkage/block_7.csv
-rw-r--r-- 2 hadoop supergroup 26253911 2016-05-23 20:59 /linkage/block_8.csv
-rw-r--r-- 2 hadoop supergroup 26254012 2016-05-23 20:59 /linkage/block_9.csv
#下面使用spark-shell操作
[hadoop@gpmaster ~]$ spark-shell
16/05/23 22:53:15 INFO SecurityManager: Changing view acls to: hadoop
16/05/23 22:53:15 INFO SecurityManager: Changing modify acls to: hadoop
16/05/23 22:53:15 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(hadoop); users with modify permissions: Set(hadoop)
16/05/23 22:53:16 INFO HttpServer: Starting HTTP Server
16/05/23 22:53:16 INFO Utils: Successfully started service 'HTTP class server' on port 15893.
Welcome to
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/
/___/ .__/\_,_/_/ /_/\_\ version 1.5.0
/_/
Using Scala version 2.10.4 (Java HotSpot(TM) 64-Bit Server VM, Java 1.7.0_60)
Type in expressions to have them evaluated.
Type :help for more information.
scala>
#读取HDFS文件并创建RDD
scala> val rawblocks = sc.textFile("/linkage")
rawblocks: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[6] at textFile at <console>:21
#获取前三行数据
scala> rawblocks.take(3)
res36: Array[String] = Array("id_1","id_2","cmp_fname_c1","cmp_fname_c2","cmp_lname_c1","cmp_lname_c2","cmp_sex","cmp_bd","cmp_bm","cmp_by","cmp_plz","is_match", 37291,53113,0.833333333333333,?,1,?,1,1,1,1,0,TRUE, 39086,47614,1,?,1,?,1,1,1,1,1,TRUE)
#定义判断行中是否包含指定字符串的函数
scala> def isHeader(line:String) = line.contains("id_1")
scala> val head = rawblocks.take(3)
#获取指定字符串的行
scala> head.filter(isHeader(_)).foreach(println)
"id_1","id_2","cmp_fname_c1","cmp_fname_c2","cmp_lname_c1","cmp_lname_c2","cmp_sex","cmp_bd","cmp_bm","cmp_by","cmp_plz","is_match"
#过滤掉指定字符串的行
scala> head.filter(!isHeader(_)).foreach(println)
37291,53113,0.833333333333333,?,1,?,1,1,1,1,0,TRUE
39086,47614,1,?,1,?,1,1,1,1,1,TRUE
#获取除指定字符串外的所有数据
scala> val nohead = rawblocks.filter(!isHeader(_))
#获取除去指定字符串外的第一行
scala> nohead.first
res41: String = 37291,53113,0.833333333333333,?,1,?,1,1,1,1,0,TRUE
#获取除去指定字符串外的总行数
scala> nohead.count()
res42: Long = 5749132
#使用元组和case class对数据进行结构化
#通过查询/linkage下面的数据可以发现数据都以逗号进行分隔,并且包含如下信息:
#1. 前两个字段是整数型ID,代表记录中匹配的两个病人
#2. 后面九个值是双精度浮点数,代表病人记录中不同字段(姓名,生日,地址)的匹配分值(可能包含数据丢失的情况)
#3. 最后一个字段是布尔型(TRUE或FALSE),代表该行病人记录对是否匹配
49451,90407,1,?,1,?,1,1,1,1,0,TRUE
39932,40902,1,?,1,?,1,1,1,1,1,TRUE
25965,64753,1,?,1,?,1,1,1,1,1,TRUE
46626,47940,1,?,1,?,1,1,1,1,1,TRUE
84795,97439,1,?,1,?,1,1,1,1,1,TRUE
36950,42116,1,?,1,1,1,1,1,1,1,TRUE
42413,48491,1,?,1,?,1,1,1,1,1,TRUE
...............
#我们将/linkage下面的数据解析为4个值的元组:第一个病人的整数ID,第二个病人的整数ID,包含九个双精度浮点数的一个数组和表示是否匹配的布尔型字段。
scala> val head = rawblocks.take(10)
scala> val line = head(5)
line: String = 36950,42116,1,?,1,1,1,1,1,1,1,TRUE
scala> val pieces = line.split(',')
pieces: Array[String] = Array(36950, 42116, 1, ?, 1, 1, 1, 1, 1, 1, 1, TRUE)
scala> val id1 = pieces(0).toInt
id1: Int = 36950
scala> val id2 = pieces(1).toInt
id2: Int = 42116
scala> val matched = pieces(11).toBoolean
matched: Boolean = true
scala> val rawscores = pieces.slice(2,11)
rawscores: Array[String] = Array(1, ?, 1, 1, 1, 1, 1, 1, 1)
#发生了错误,因为其中包含?无法转换为Double类型
scala> rawscores.map(s => s.toDouble)
java.lang.NumberFormatException: For input string: "?"
at sun.misc.FloatingDecimal.readJavaFormatString(FloatingDecimal.java:1250)
#写一个函数处理
/** Converts one raw score field to a Double.
  *
  * The data set marks a missing score with "?"; that placeholder is mapped to
  * NaN so the field can still live in an Array[Double]. Any other text is
  * parsed with `String.toDouble`, which throws NumberFormatException when the
  * text is not a valid number.
  *
  * @param s raw text of a single score field
  * @return the parsed value, or Double.NaN for the "?" placeholder
  */
def toDouble(s: String): Double = s match {
  case "?"   => Double.NaN
  case other => other.toDouble
}
#遇到? 时返回NaN
scala> val scores = rawscores.map(toDouble)
scores: Array[Double] = Array(1.0, NaN, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0)
#下面我们将上面的代码合并到一个函数中
/** Parses one comma-separated data line into a 4-element tuple.
  *
  * Field layout: index 0 and 1 are the two integer record IDs, indexes 2 to 10
  * are the nine match scores (converted with `toDouble`, so "?" becomes NaN),
  * and index 11 is the TRUE/FALSE match flag.
  *
  * Header lines must be filtered out before calling this: a non-numeric ID
  * makes `toInt` throw, and a line with fewer than 12 fields fails on indexing.
  *
  * @param line one non-header line of the linkage data
  * @return (id1, id2, scores, matched)
  */
def parse(line: String) = {
  val fields = line.split(",")
  // Tuple elements are evaluated left to right, same order as the fields.
  (
    fields(0).toInt,
    fields(1).toInt,
    fields.slice(2, 11).map(toDouble),
    fields(11).toBoolean
  )
}
val tup = parse(line)
#验证
scala> val line = head(2)
line: String = 39086,47614,1,?,1,?,1,1,1,1,1,TRUE
val tup = parse(line)
scala> parse(line)
res50: (Int, Int, Array[Double], Boolean) = (39086,47614,Array(1.0, NaN, 1.0, NaN, 1.0, 1.0, 1.0, 1.0, 1.0),true)
scala> val tup = parse(line)
#从元组中获取单个字段的值,可以使用下标函数,从_1开始,或者用productElement方法,它是从0开始计数的
scala> tup._1
res52: Int = 39086
scala> tup.productElement(0)
res53: Any = 39086
#获取元组大小
scala> tup.productArity
res54: Int = 4
#但是通过下标访问不是很好理解,下面介绍如何通过字段名称来访问
#case class提供这种功能,case class是不可变类的一种简单类型,内置了所有Java类的基本方法,比如toString,equals和hashCode
/** One parsed line of the linkage data, with fields accessible by name.
  *
  * @param id1     integer ID of the first record in the pair
  * @param id2     integer ID of the second record in the pair
  * @param scores  the match scores for the pair; a missing score is NaN
  * @param matched whether the pair was labelled as a match
  */
case class MatchData(
  id1: Int,
  id2: Int,
  scores: Array[Double],
  matched: Boolean
)
#修改parse方法,以返回MatchData实例
/** Parses one comma-separated data line into a MatchData instance.
  *
  * Field layout: index 0 and 1 are the two integer record IDs, indexes 2 to 10
  * are the nine match scores (converted with `toDouble`, so "?" becomes NaN),
  * and index 11 is the TRUE/FALSE match flag.
  *
  * Header lines must be filtered out before calling this: a non-numeric ID
  * makes `toInt` throw, and a line with fewer than 12 fields fails on indexing.
  *
  * @param line one non-header line of the linkage data
  * @return the parsed record
  */
def parse(line: String) = {
  val fields = line.split(",")
  // Named arguments are given in declaration order, so the fields are still
  // converted left to right.
  MatchData(
    id1 = fields(0).toInt,
    id2 = fields(1).toInt,
    scores = fields.slice(2, 11).map(toDouble),
    matched = fields(11).toBoolean
  )
}
#验证,根据名称来访问数据
scala> val md = parse(line)
md: MatchData = MatchData(39086,47614,[D@39334fb6,true)
scala> md.id1
res57: Int = 39086
scala> md.id2
res58: Int = 47614
scala> md.scores
res59: Array[Double] = Array(1.0, NaN, 1.0, NaN, 1.0, 1.0, 1.0, 1.0, 1.0)
#应用到/linkage目录下面的所有数据上
scala> val mds = rawblocks.filter(x => !isHeader(x)).map(x => parse(x))
mds: org.apache.spark.rdd.RDD[MatchData] = MapPartitionsRDD[9] at map at <console>:50
#在内存中缓存这个RDD
scala> mds.cache()
res69: mds.type = MapPartitionsRDD[9] at map at <console>:50
scala> mds.count
res70: Long = 5749132
#Spark为持久化RDD定义了几种不同的机制,用不同的StorageLevel值表示。
#1. rdd.cache()是rdd.persist(StorageLevel.MEMORY_ONLY)的简写,它将RDD存储为未序列化的Java对象。在对象需要频繁访问或低延迟访问时适合使用StorageLevel.MEMORY_ONLY,因为它
#避免序列化的开销。相比其他选项,StorageLevel.MEMORY_ONLY占用更大的内存空间。
#2. Spark也提供了MEMORY_ONLY_SER的存储级别,用于在内存中分配大字节缓冲区以存储RDD序列化内容。
#3. Spark也可以用磁盘来缓存RDD,存储级别MEMORY_AND_DISK 和MEMORY_AND_DISK_SER分别类似于MEMORY_ONLY和MEMORY_ONLY_SER。