After optimization, the total job runtime is 1 hour 20 minutes = 80 min.
Efficiency gain: 1 - 80/130 ≈ 38% (the runtime before optimization was 130 min).
In other words, performance improved by roughly 38%.
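As a quick sanity check on that figure (a minimal sketch; the 130-minute baseline comes from the formula above):

val beforeMinutes = 130.0   // runtime before optimization, per the formula above
val afterMinutes  = 80.0    // runtime after optimization (1 h 20 min)
val improvement   = 1.0 - afterMinutes / beforeMinutes
println(f"runtime reduced by ${improvement * 100}%.1f%%")   // prints: runtime reduced by 38.5%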
With executor-cores left unconfigured it defaults to 4 in this environment, so under the configuration below only 19 * 4 = 76 cores end up being used (19 executors at 4 cores each).
The submit script was:
spark-submit --deploy-mode cluster --master yarn \
  --driver-memory 10G --executor-memory 20G \
  --conf spark.executor.memoryOverhead=4096 \
  --conf spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version=2 \
  --conf spark.network.timeout=300s \
  --conf spark.executor.heartbeatInterval=100s \
  --conf spark.driver.maxResultSize=3G \
  --class com.yeahmobi.dmp.mediacode.InstallListMake \
  s3://taobao/leif/dmp_tags-1.0-SNAPSHOT.jar 20190821 0
Note that the cluster has 16 * 19 = 304 cores in total, so core utilization is only 76/304 = 25%.
In other words, three quarters of the cores sit idle, which is a huge waste of resources.
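A back-of-the-envelope check of that utilization figure, assuming 19 executors at the default 4 cores on a cluster with 16 * 19 = 304 cores:

val totalCores  = 16 * 19              // 304 cores available in total
val usedCores   = 19 * 4               // 19 executors * 4 cores each (the default)
val utilization = usedCores.toDouble / totalCores
println(f"core utilization: ${utilization * 100}%.0f%%")   // prints: core utilization: 25%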
But executor-cores cannot simply be increased: all the tasks running in an executor share that executor's 20 GB heap, so more cores per executor means less memory per concurrent task, and individual tasks start running out of memory.
To shrink the memory footprint of a single task instead, the idea is to hash the big table on ifa into smaller slices. Each slice needs less memory per task, which in turn lets the job use all of the cores (a sketch of the routing UDF follows). Let's get to it.
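The optimized SQL below relies on a getRouteCode UDF whose implementation is not shown in this post. A minimal sketch, assuming it just hashes the ifa string into one of two routes, could look like this:

// Hypothetical registration of the routing UDF; the real implementation is not shown here.
// It buckets each ifa into route 0 or 1 so the big table can be processed in two halves.
ss.udf.register("getRouteCode", (ifa: String) =>
  if (ifa == null) 0 else Math.floorMod(ifa.hashCode, 2))

With two routes, each pass of the loop in the optimized code only aggregates roughly half of the data, which is what keeps per-task memory small even with more cores per executor.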
The code before optimization:
import java.net.URI
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
import org.joda.time.DateTime
// DayFormater is assumed to be a Joda-Time formatter defined elsewhere in the class,
// e.g. DateTimeFormat.forPattern("yyyyMMdd"), matching day arguments like 20190821.

def joinAndWriteData(ss: SparkSession, Day: String) : Unit = {
val OneDayAgo = DateTime.parse(Day, DayFormater).minusDays(1).toString(DayFormater)
val SevenDayAgo = DateTime.parse(Day, DayFormater).minusDays(7).toString(DayFormater)
ss.sql("set spark.sql.shuffle.partitions=3800")
val sql =
"""
SELECT
ifa, array_distinct(flatten(collect_list(bundle))) AS bundles ,array_distinct(flatten(collect_list(countrys))) AS countrys, max(updateday) as updateday
FROM
(
SELECT ifa, bundles AS bundle, countrys, updateday from dmp.t_dmp_idfa_bundle_country_array_middle_tbl where day='%s'
UNION ALL
SELECT ifa, media_id AS bundle, array() AS countrys, '%s' as updateday FROM dmp.t_dmp_idfa_bundle_every_tbl
UNION ALL
SELECT ifa, array() AS bundle, country AS countrys, '%s' as updateday FROM dmp.t_dmp_idfa_country_every_tbl
) x GROUP BY ifa HAVING size(bundles)<2000 and size(bundles)>0
""".format(OneDayAgo, Day,Day)
val result = ss.sql(sql).persist(StorageLevel.MEMORY_AND_DISK_SER)
result.printSchema()
println("start write data to middle tbl")
ss.sql(s"alter table dmp.t_dmp_idfa_bundle_country_array_middle_tbl drop if exists partition (day ='$Day')")
val middleTable = s"s3://xiandmpdata.yeahtargeter.com/hive_dataware/dmp/t_dmp_idfa_bundle_country_array_middle_tbl/day=$Day"
FileSystem.get(new URI("s3://taobao.com"), ss.sparkContext.hadoopConfiguration).delete(new Path(middleTable), true)
result.write.format("orc").save(middleTable)
ss.sql(s"alter table dmp.t_dmp_idfa_bundle_country_array_middle_tbl add if not exists partition (day ='$Day')")
println("write data to middle tbl success")
// Drop the daily data (and its partition) from seven days ago
ss.sql(s"alter table dmp.t_dmp_idfa_bundle_country_array_middle_tbl drop if exists partition (day ='$SevenDayAgo')")
val SevenDaysSgoMidDirPath = s"s3://taobao.com/hive_dataware/dmp/t_dmp_idfa_bundle_country_array_middle_tbl/day=$SevenDayAgo"
FileSystem.get(new URI("s3://taobao.com"), ss.sparkContext.hadoopConfiguration).delete(new Path(SevenDaysSgoMidDirPath), true)
println(s"delete day $SevenDayAgo t_dmp_idfa_bundle_country_array_middle_tbl data successed")
println("start write t_dmp_idfa_bundle_country_array_tbl tbl")
val resultTable = s"s3://taobao.com/hive_dataware/dmp/t_dmp_idfa_bundle_country_array_tbl"
FileSystem.get(new URI("s3://taobao.com"), ss.sparkContext.hadoopConfiguration).delete(new Path(resultTable), true)
result.write.format("orc").save(resultTable)
println("write idfa country array success")
}

The code after optimization:

def joinAndWriteData(ss: SparkSession, Day: String) : Unit = {
val OneDayAgo = DateTime.parse(Day, DayFormater).minusDays(1).toString(DayFormater)
val SevenDayAgo = DateTime.parse(Day, DayFormater).minusDays(7).toString(DayFormater)
ss.sql("set spark.sql.shuffle.partitions=3800")
// Use the hash UDF to split the data into two routes, roughly halving the memory a single task needs
for(routeCode <- 0 to 1){
val hashSql =
s"""
SELECT
ifa, array_distinct(flatten(collect_list(bundle))) AS bundles ,array_distinct(flatten(collect_list(countrys))) AS countrys, max(updateday) as updateday
FROM
(
SELECT ifa, bundles AS bundle, countrys, updateday from dmp.t_dmp_idfa_bundle_country_array_middle_tbl where day='$OneDayAgo' and $routeCode=getRouteCode(ifa)
UNION ALL
SELECT ifa, media_id AS bundle, array() AS countrys, '$Day' as updateday FROM dmp.t_dmp_idfa_bundle_every_tbl where $routeCode=getRouteCode(ifa)
UNION ALL
SELECT ifa, array() AS bundle, country AS countrys, '$Day' as updateday FROM dmp.t_dmp_idfa_country_every_tbl where $routeCode=getRouteCode(ifa)
) x GROUP BY ifa HAVING size(bundles)<2000 and size(bundles)>0
"""
val result = ss.sql(hashSql).persist(StorageLevel.MEMORY_AND_DISK_SER)
result.printSchema()
println("start write data to middle tbl")
ss.sql(s"alter table dmp.t_dmp_idfa_bundle_country_array_middle_tbl_dylan_$routeCode drop if exists partition (day ='$Day')")
val middleTable = s"s3://taobao.com/hive_dataware/dmp/t_dmp_idfa_bundle_country_array_middle_tbl_dylan_$routeCode/day=$Day"
FileSystem.get(new URI("s3://taobao.com"), ss.sparkContext.hadoopConfiguration).delete(new Path(middleTable), true)
result.write.format("orc").save(middleTable)
ss.sql(s"alter table dmp.t_dmp_idfa_bundle_country_array_middle_tbl_dylan_$routeCode add if not exists partition (day ='$Day')")
println(s"write data to middle tbl_$routeCode success")
// Drop the daily data (and its partition) from seven days ago
ss.sql(s"alter table dmp.t_dmp_idfa_bundle_country_array_middle_tbl_dylan_$routeCode drop if exists partition (day ='$SevenDayAgo')")
val SevenDaysSgoMidDirPath = s"s3://taobao.com/hive_dataware/dmp/t_dmp_idfa_bundle_country_array_middle_tbl_dylan_$routeCode/day=$SevenDayAgo"
FileSystem.get(new URI("s3://taobao.com"), ss.sparkContext.hadoopConfiguration).delete(new Path(SevenDaysSgoMidDirPath), true)
println(s"delete day $SevenDayAgo t_dmp_idfa_bundle_country_array_middle_tbl_dylan_$routeCode data successed")
result.unpersist(true)
}  // end of the routeCode loop
}
The submit command after optimization:
spark-submit --deploy-mode cluster --master yarn \
  --driver-memory 10G --executor-memory 20G --executor-cores 12 \
  --conf spark.executor.memoryOverhead=4096 \
  --conf spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version=2 \
  --conf spark.network.timeout=300s \
  --conf spark.executor.heartbeatInterval=100s \
  --conf spark.driver.maxResultSize=3G \
  --class com.yeahmobi.dmp.mediacode.InstallListMake6 \
  s3://taobao/dmp_tags-1.0-SNAPSHOT.jar 20190822 0
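For reference, assuming the same 19 executors as before, raising --executor-cores to 12 changes core usage as follows (a rough estimate, not a measured figure):

val usedCoresAfter   = 19 * 12                        // 228 cores now in use
val utilizationAfter = usedCoresAfter.toDouble / (16 * 19)
println(f"core utilization after tuning: ${utilizationAfter * 100}%.0f%%")   // prints: 75%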