由于使用HIVE查询出来的报表数据会因为源数据的异常导致不可用,这个时候需要根据数据情况修改代码
,下面是利用spark针对几个检查项,将检查结果插入到表中
# -*- coding: utf-8 -*-
from pyspark import SparkContext
from pyspark import HiveContext
from pyspark.sql import Row
import math
import os,sys
import datetime
import time
reload(sys)
sys.setdefaultencoding('utf-8')
def f_sort(x):
#python的排序函数
sort_temp=sorted(x,key=lambda x:x[2],reverse=False)
return sort_temp
def rate_violate(x):
result=[1,""]
if len(x)<=1:
result=[1,""]
else:
for i in range(1,len(x)-1):
if math.floor((float(x[i][1])-float(x[i-1][1]))*1000/float(x[i-1][1]))<>math.floor(float(x[i][0])*10):
return [0,x[i][2]]
return result
if __name__ == "__main__":
sc = SparkContext(appName="GfrobFengYunZuheYield")
sqlContext = HiveContext(sc)
sqlContext.sql('use gfrobdb2')
sqlContext.sql('SET hive.exec.dynamic.partition.mode=nonstrict')
str="""
select distinct strategyid
,busidate
,daychangerate
,totalchangerate
,type from gfrobdb2.double_weekend_yield_overview_daily_v6_test
where busidate>date_add('2016-11-22',-130)
and busidate<='2016-11-21'
"""
str_trading="""
select normalday
,istradingday
,nexttradingday
,lasttradingday
from odata_msg.bg_c_pub_tradingday
where exchangecode='101'
and normalday>date_add('2016-11-22',-130)
and normalday<='2016-11-21'
"""
a=sqlContext.sql(str).map(lambda item:item.asDict()).filter(lambda item:item["daychangerate"]<>"0.0")
#there should have no changerate in non-tradingday
yield_temp=a.map(lambda item:(item["busidate"],item["daychangerate"]))
b=sqlContext.sql(str_trading).map(lambda item:item.asDict()).filter(lambda item:item["istradingday"]=="0").map(lambda item:(item["normalday"],item["istradingday"]))
wrong_data=b.join(yield_temp).toDF()
wrong_data.registerTempTable("non_tradingday")
str_insert="""
insert into table stat_error
select '2016-11-21' as busidate
,'double_weekend_yield_overview_daily_v6' as table_name
,'non-tradingday should not have changerate' as error_msg
,count(1) as num_total
from non_tradingday
"""
sqlContext.sql(str_insert)
#the tradingday,but the changerate record is not complete
yield_strategy=a.map(lambda item:(item["strategyid"],item["busidate"]))
yield_strategy_out=yield_strategy.map(lambda item:(item,1))
strategy=yield_strategy.map(lambda item:(1,item[0])).distinct()
tradingday=sqlContext.sql(str_trading).map(lambda item:item.asDict()).filter(lambda item:item["istradingday"]=="1").map(lambda item:(1,item["normalday"]))
complete_data=strategy.fullOuterJoin(tradingday).map(lambda item:item[1]).map(lambda item:(item,1))
strategy_result=complete_data.leftOuterJoin(yield_strategy_out).filter(lambda item:item[1][1]==None).map(lambda item:(item[0][0],item[0][1])).toDF()
strategy_result.registerTempTable("yield_empty")
str_insert="""
insert into table stat_error
select '2016-11-21' as busidate
,'double_weekend_yield_overview_daily_v6' as table_name
,'tradingday have no changerate' as error_msg
,count(1) as num_total
from yield_empty
"""
sqlContext.sql(str_insert)
#the totalchangerate(t) and totalchangerate(t-1) shuld apply the relation
a=sqlContext.sql(str).map(lambda item:item.asDict()).map(lambda item:((item["strategyid"]),(item["daychangerate"],item["totalchangerate"],item["busidate"])))
a1=a.groupByKey().mapValues(list).map(lambda item:(item[0],f_sort(item[1])))
a2=a1.map(lambda item:(item[0],rate_violate(item[1]))).filter(lambda item:item[1][0]==0)
a3=a2.map(lambda item:item).collect()
#for i in a3:
# print i[0].encode('utf-8') z转换编码打印
# print i
if not a2.isEmpty():
a2=a2.toDF()
a2.registerTempTable("rate_violate")
str_insert="""
insert into table stat_error
select '2016-11-21' as busidate
,'double_weekend_yield_overview_daily_v6' as table_name
,'totalchangerate(t) and totalchangerate(t-1) is violated' as error_msg
,count(1) as num_total
from rate_violate
"""
sqlContext.sql(str_insert)