import java.io.{BufferedReader, InputStreamReader}
import java.util.zip.ZipInputStream
import org.apache.spark.input.PortableDataStream
val dataAndPortableRDD = sc.binaryFiles("zipData path")
val dataRDD = dataAndPortableRDD.flatMap { case (name: String, content: PortableDataStream) =>
  val zis = new ZipInputStream(content.open)
  Stream.continually(zis.getNextEntry)
    .takeWhile(_ != null)
    .flatMap { _ =>
      // note: this part is not parallel (the entries of one zip are read sequentially)
      val br = new BufferedReader(new InputStreamReader(zis))
      // strictly speaking, br and the InputStreamReader(zis) should be closed as well
      Stream.continually(br.readLine()).takeWhile(_ != null)
    } #::: { zis.close; Stream.empty[String] } // without appending #::: { zis.close; Stream.empty[String] }, the connection is never closed
}
dataRDD.take(10).foreach(println)
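An alternative that avoids the lazy-close trick altogether is to read eagerly and close in a finally block. This is only a sketch under the assumption that one zip's decompressed lines fit in executor memory; eagerRDD and the "zipData path" placeholder are reused from above, not part of the original code.
import java.io.{BufferedReader, InputStreamReader}
import java.util.zip.ZipInputStream
import org.apache.spark.input.PortableDataStream

// Eager variant (a sketch, not the approach used above): buffer each zip's
// lines into a Vector inside try/finally so the ZipInputStream is always
// closed, at the cost of holding one zip's decompressed text in memory.
val eagerRDD = sc.binaryFiles("zipData path").flatMap {
  case (name: String, content: PortableDataStream) =>
    val zis = new ZipInputStream(content.open())
    try {
      Iterator.continually(zis.getNextEntry)
        .takeWhile(_ != null)
        .filterNot(_.isDirectory)
        .flatMap { _ =>
          val br = new BufferedReader(new InputStreamReader(zis, "UTF-8"))
          Iterator.continually(br.readLine()).takeWhile(_ != null)
        }
        .toVector // materialize all lines before the stream is closed
    } finally {
      zis.close() // always release the underlying connection
    }
}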
// another variant: scala.io.Source per entry, skipping directories
sc.binaryFiles(path).repartition(2000)
  .flatMap { case (zipFilePath: String, context: PortableDataStream) =>
    val zis = new ZipInputStream(context.open())
    Stream.continually(zis.getNextEntry)
      .takeWhile(_ != null)
      .flatMap { zipEn =>
        if (!zipEn.isDirectory) {
          scala.io.Source.fromInputStream(zis, "UTF-8").getLines()
        } else {
          None
        }
      } #::: { zis.close; Stream.empty[String] }
  }.repartition(2000)
Problem:
When the file is fairly large, this is extremely slow. For example:
a 13 GB zip read with 50 cores and 300 GB of memory still had not finished after 5 hours (in fact the job was stuck because the stream was never closed).
The official Spark 2.3 documentation for binaryFiles also notes:
Partitioning is determined by data locality. This may result in too few partitions by default.
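A small sketch of the two knobs used in this note to get more partitions out of binaryFiles. The path and the numbers are placeholders; as discussed in the partitioning section below, the minPartitions hint is ignored on Spark 2.1-2.3, which is why repartition() is the workaround used here.
// Ask binaryFiles for more partitions up front (may be ignored on 2.1-2.3),
// then force a wider layout with repartition() as a fallback.
val zips = sc.binaryFiles("zipData path", minPartitions = 200)
println(zips.getNumPartitions)       // often just 1-2 for a single large zip
val spread = zips.repartition(2000)  // the workaround used in this note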
For reference, Stream.takeWhile from the Scala standard library (note the lazy cons):
/** Returns the longest prefix of this `Stream` whose elements satisfy the
* predicate `p`.
*
* @param p the test predicate.
* @return A new `Stream` representing the values that satisfy the predicate
* `p`.
*
* @example {{{
* naturalsFrom(0) takeWhile { _ < 5 } mkString ", "
* produces: "0, 1, 2, 3, 4"
* }}}
*/
override def takeWhile(p: A => Boolean): Stream[A] =
if (!isEmpty && p(head)) cons(head, tail takeWhile p)
else Stream.Empty
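Why the #::: { zis.close; Stream.empty[String] } trick works: the right-hand side of #::: is passed by name, so the block (and hence zis.close) runs only once the stream has been consumed past its last element. A plain-Scala sketch of that behaviour (no Spark involved; closed is just an illustrative flag):
// Laziness demo: the appended block runs only when the prefix is exhausted.
var closed = false
val s = Stream.continually("line").take(3) #::: { closed = true; Stream.empty[String] }
println(closed)      // false: nothing beyond the head has been forced yet
s.foreach(_ => ())   // drain the whole stream
println(closed)      // true: the close-like side effect ran at the end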
binaryFiles: https://spark.apache.org/docs/2.0.2/api/scala/index.html#org.apache.spark.SparkContext
ZipInputStream:
https://docs.oracle.com/javase/7/docs/api/java/util/zip/ZipInputStream.html
PortableDataStream:
https://spark.apache.org/docs/2.3.0/api/scala/index.html#org.apache.spark.input.PortableDataStream
Stream:
https://zhuanlan.zhihu.com/p/130958554
Partitioning issue:
https://www.coder.work/article/6527071
Not closing the stream causes problems, which show up as the job hanging after a large file appears to have been processed:
https://stackoverflow.com/questions/32080475/how-to-read-a-zip-containing-multiple-files-in-apache-spark
Best answer (translated from the partitioning link above):
Spark 2.4+: the problem should be fixed; see @Rahul's comment below that answer.
Spark 2.1-2.3: the minPartitions argument of binaryFiles() is ignored. See Spark-16575 and the commit that changed setMinPartitions(); note that the function no longer uses minPartitions at all after that commit.
If you read multiple binary files with binaryFiles(), the input files are coalesced into partitions based on:
spark.files.maxPartitionBytes, default 128 MB
spark.files.openCostInBytes, default 4 MB
spark.default.parallelism
For example, to cap each input partition at roughly 40 MB, set spark.files.maxPartitionBytes before calling binaryFiles():
spark = SparkSession \
    .builder \
    .config("spark.files.maxPartitionBytes", 40*1024*1024)
binaryFiles() will split a single file into multiple partitions. On versions where binaryFiles() honors it, the minPartitions argument does work and you must use it; if you don't, you hit the Spark-16575 problem and all of your input files are read into only two partitions. The answerer found that setting minPartitions to "number of input files * 7/10" was roughly right for one workload, and that "number of input files * 2" gave one input partition per input file in another. The default binaryFiles() behavior in that case: one partition per input file.
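The snippet above is PySpark; below is a rough Scala equivalent of the same idea. The values are illustrative only, and per the answer the minPartitions hint may be ignored on Spark 2.1-2.3.
import org.apache.spark.sql.SparkSession

// Cap each input partition at roughly 40 MB before reading the zips.
val spark = SparkSession.builder()
  .config("spark.files.maxPartitionBytes", 40L * 1024 * 1024)
  .config("spark.files.openCostInBytes", 4L * 1024 * 1024)
  .getOrCreate()

// minPartitions is only a hint, and per the answer it is ignored on 2.1-2.3.
val zips = spark.sparkContext.binaryFiles("zipData path", minPartitions = 200)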
的行为:每个输入文件都有一个分区。记一次zip测试:
spark-shell \
--name "x00450248_test" \
--master yarn-client \
--num-executors 5 \
--executor-cores 2 \
--executor-memory 8G \
--driver-memory 5G \
--conf spark.driver.maxResultSize=10g \
--conf spark.yarn.executor.memoryOverhead=10000 \
--conf spark.serializer="org.apache.spark.serializer.KryoSerializer" \
--conf spark.shuffle.memoryFraction=0.3
import java.io.{BufferedReader, InputStreamReader}
import java.util.zip.ZipInputStream
import org.apache.spark.input.PortableDataStream
import java.util.Date
val dataAndPortableRDD = sc.binaryFiles("hdfs:path/merged_20210827_1630042463.zip").repartition(2000)
dataAndPortableRDD.map { case (name: String, content: PortableDataStream) => name + content.getPath }.toDF().show(20, false) // hangs after it finishes; press Enter to get the prompt back
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|value |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|hdfs://hacluster/AppData/BIProd/ADS/Rcm/Newsfeed/obsStream/tmp/20210816_1629085113.ziphdfs://hacluster/AppData/BIProd/ADS/Rcm/Newsfeed/obsStream/tmp/20210816_1629085113.zip|
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
val dataRDD = dataAndPortableRDD.flatMap { case (name: String, content: PortableDataStream) =>
  val zis = new ZipInputStream(content.open)
  Stream.continually(zis.getNextEntry)
    .takeWhile(_ != null)
    .flatMap { _ =>
      val br = new BufferedReader(new InputStreamReader(zis))
      Stream.continually(br.readLine()).takeWhile(_ != null)
    } #::: { zis.close; Stream.empty[String] }
}
// dataRDD3: same as dataRDD, but reads each entry with scala.io.Source.getLines and skips directories
val dataRDD3 = dataAndPortableRDD.flatMap { case (name: String, content: PortableDataStream) =>
  val zis = new ZipInputStream(content.open)
  Stream.continually(zis.getNextEntry)
    .takeWhile(_ != null)
    .flatMap { zipEn =>
      if (!zipEn.isDirectory) {
        scala.io.Source.fromInputStream(zis, "UTF-8").getLines()
      } else {
        None
      }
    } #::: { zis.close; Stream.empty[String] }
}
// dataRDD2: same as dataRDD but never closes zis (kept for comparison)
val dataRDD2 = dataAndPortableRDD.flatMap { case (name: String, content: PortableDataStream) =>
  val zis = new ZipInputStream(content.open)
  Stream.continually(zis.getNextEntry)
    .takeWhile(_ != null)
    .flatMap { _ =>
      val br = new BufferedReader(new InputStreamReader(zis))
      Stream.continually(br.readLine()).takeWhile(_ != null)
    }
}
dataRDD.partitions.length
dataRDD.toDebugString
var start_time =new Date().getTime
println(dataRDD3.count())
var end_time =new Date().getTime
println((end_time-start_time)/1000+"s")
// getTime is in milliseconds
391s // default parallelism
383s // --conf spark.default.parallelism=2000: made little difference
592s // dataRDD3 (getLines) is somewhat slower
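A small convenience sketch (using the same millisecond clock as the Date-based code above) so the start/end bookkeeping does not have to be repeated for each measurement; timed is a hypothetical helper, not part of the original test.
// Measure an action and report elapsed seconds, like the Date-based timing above.
def timed[T](label: String)(body: => T): T = {
  val start = System.currentTimeMillis()
  val result = body
  println(s"$label: ${(System.currentTimeMillis() - start) / 1000}s")
  result
}

// e.g. timed("dataRDD3.count")(dataRDD3.count())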
val dataAndPortableRDD = sc.binaryFiles("hdfs://hacluster/AppData/BIProd/ADS/Rcm/Newsfeed/obsStream/tmp/merged_20210827_1630042463.zip").repartition(2000)
// read only the first entry of each zip and report its name#size#time
val dataRDD3 = dataAndPortableRDD.flatMap { case (name: String, content: PortableDataStream) =>
  val zis = new ZipInputStream(content.open)
  val entry = zis.getNextEntry
  if (entry != null) Array(entry.getName + "#" + entry.getSize + "#" + entry.getTime)
  else { zis.close; Array.empty[String] }
}
dataRDD3.toDF().show(20,false)
+--------------------------------------------+
|value |
+--------------------------------------------+
|jobs_merged/merged_20210827/#0#1630031077000|
+--------------------------------------------+
// dataRDD4: list every entry's name#size#time
// (entries are already non-null after takeWhile; zis is not closed in this listing)
val dataRDD4 = dataAndPortableRDD.flatMap { case (name: String, content: PortableDataStream) =>
  val zis = new ZipInputStream(content.open)
  Stream.continually(zis.getNextEntry)
    .takeWhile(_ != null)
    .map { entry =>
      if (entry != null) Array(entry.getName + "#" + entry.getSize + "#" + entry.getTime)
      else { zis.close; Array.empty[String] }
    }
}
dataRDD4.toDF().show(20,false)
+-------------------------------------------------------------+
|value                                                         |
+-------------------------------------------------------------+
|[jobs_merged/merged_20210827/#0#1630031077000] |
|[jobs_merged/merged_20210827/33.json#943379573#1630042433000]|
|[jobs_merged/merged_20210827/13.json#950255271#1630042418000]|
|[jobs_merged/merged_20210827/1.json#950155148#1630042394000] |
|[jobs_merged/merged_20210827/0.json#944714928#1630042444000] |
|[jobs_merged/merged_20210827/8.json#947567194#1630042399000] |
|[jobs_merged/merged_20210827/45.json#944732732#1630042368000]|
|[jobs_merged/merged_20210827/27.json#941420005#1630042380000]|
|[jobs_merged/merged_20210827/23.json#945034057#1630042387000]|
|[jobs_merged/merged_20210827/35.json#946570789#1630042412000]|
|[jobs_merged/merged_20210827/57.json#942500714#1630042396000]|
|[jobs_merged/merged_20210827/54.json#943293945#1630042343000]|
|[jobs_merged/merged_20210827/28.json#950335030#1630042417000]|
|[jobs_merged/merged_20210827/55.json#947131627#1630042413000]|
|[jobs_merged/merged_20210827/15.json#942653802#1630042421000]|
|[jobs_merged/merged_20210827/21.json#945301316#1630042428000]|
|[jobs_merged/merged_20210827/38.json#944542166#1630042386000]|
|[jobs_merged/merged_20210827/12.json#949415970#1630042420000]|
|[jobs_merged/merged_20210827/41.json#946474541#1630042439000]|
|[jobs_merged/merged_20210827/5.json#941914600#1630042368000] |
+-------------------------------------------------------------+
only showing top 20 rows
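Since each entry's getSize is available while listing, a quick follow-up sketch (totalBytes is a hypothetical name, reusing dataAndPortableRDD from above) to estimate how much data the zip decompresses to:
// Sum the uncompressed sizes of all non-directory entries in each zip.
// ZipEntry.getSize returns -1 when the size is unknown, so filter those out.
val totalBytes = dataAndPortableRDD.flatMap { case (_, content) =>
  val zis = new ZipInputStream(content.open())
  val sizes = Iterator.continually(zis.getNextEntry)
    .takeWhile(_ != null)
    .filterNot(_.isDirectory)
    .map(_.getSize)
    .toVector
  zis.close()
  sizes
}.filter(_ >= 0).sum
println(s"${(totalBytes / 1024 / 1024).toLong} MB uncompressed (approx.)")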
import com.cotdp.hadoop.ZipFileInputFormat // third-party library
import org.apache.hadoop.io.BytesWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapreduce.Job
val zipFileRDD = sc.newAPIHadoopFile(
  "hdfs://path/20210816_1629085113.zip",
  classOf[ZipFileInputFormat],
  classOf[Text],
  classOf[BytesWritable],
  new Job().getConfiguration())
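A hedged usage sketch for the third-party route, assuming the key is the entry name and the value holds the entry's bytes (as in the cotdp examples); treating the content as UTF-8 text and the entryTextRDD name are assumptions here.
// Turn each zip entry into (entryName, text).
// copyBytes() returns exactly getLength bytes (getBytes can be padded).
val entryTextRDD = zipFileRDD.map { case (entryName, bytes) =>
  (entryName.toString, new String(bytes.copyBytes(), "UTF-8"))
}
entryTextRDD.keys.take(10).foreach(println)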