spark版本:spark 2.0.2
scala版本:2.11.8
服务器版本:CentOS 6.7
对比map和flatMap在RDD中的使用:
// Spark-shell session comparing `map` and `flatMap` on RDDs.
// NOTE(review): `sc` is not defined in this snippet; presumably it is the
// SparkContext that spark-shell provides -- confirm before running elsewhere.
// The block comment after each expression is the REPL output recorded for it.

// ---------------------------------------------------------------------------
// Part 1: an RDD of plain strings
// ---------------------------------------------------------------------------
val rdd1 = sc.parallelize(Seq(
  "one two three four five six seven",
  "one two three four five six seven",
  "one two three four five six seven"
))

// map: exactly one output element per input element, so every line turns
// into its own Array of words (the result is nested).
rdd1.map(_.split(" ")).collect
/* res6: Array[Array[String]] = Array(Array(one, two, three, four, five, six, seven), Array(one, two, three, four, five, six, seven), Array(one, two, three, four, five, six, seven)) */

// flatMap: the per-line word arrays are concatenated into one flat result.
rdd1.flatMap(_.split(" ")).collect
/* res7: Array[String] = Array(one, two, three, four, five, six, seven, one, two, three, four, five, six, seven, one, two, three, four, five, six, seven) */

// ---------------------------------------------------------------------------
// Part 2: an RDD of (id, sentence) pairs
// ---------------------------------------------------------------------------
val rdd2 = sc.parallelize(Seq(
  (1, "one two three four five six seven"),
  (2, "one two three four five six seven"),
  (3, "one two three four five six seven")
))

// map over the pairs: the id is kept and the sentence becomes an Array[String].
rdd2.map { case (id, text) => (id, text.split(" ")) }.collect
/* res14: Array[(Int, Array[String])] = Array((1,Array(one, two, three, four, five, six, seven)), (2,Array(one, two, three, four, five, six, seven)), (3,Array(one, two, three, four, five, six, seven))) */

// Same as above, but the words are exposed as an Iterable[String].
// The split result is converted inline: keeping it in a temporary followed by
// a tuple literal on the same line would parse as `split(" ")(...)`.
rdd2.map { case (id, text) =>
  (id, text.split(" ").toIterable)
}.collect
/* res4: Array[(Int, Iterable[String])] = Array((1,WrappedArray(one, two, three, four, five, six, seven)), (2,WrappedArray(one, two, three, four, five, six, seven)), (3,WrappedArray(one, two, three, four, five, six, seven))) */

// flatMap over the (id, words) pairs: emit one (id, word) pair per word.
// (PairRDDFunctions.flatMapValues(_.split(" ")) yields the same pairs in a single step.)
rdd2.map { case (id, text) =>
  (id, text.split(" ").toIterable)
}.flatMap { case (id, words) =>
  for (word <- words) yield (id, word)
}.collect
/* res7: Array[(Int, String)] = Array((1,one), (1,two), (1,three), (1,four), (1,five), (1,six), (1,seven), (2,one), (2,two), (2,three), (2,four), (2,five), (2,six), (2,seven), (3,one), (3,two), (3,three), (3,four), (3,five), (3,six), (3,seven)) */