使用spark检查数据质量

由于使用HIVE查询出来的报表数据会因为源数据的异常导致不可用,这个时候需要根据数据情况修改代码

,下面是利用spark针对几个检查项,将检查结果插入到表中


# -*- coding: utf-8 -*-

from pyspark import SparkContext
from pyspark import HiveContext
from pyspark.sql import Row
import math
import os,sys
import datetime
import time
reload(sys)
sys.setdefaultencoding('utf-8')




def f_sort(x): #python的排序函数
 sort_temp=sorted(x,key=lambda x:x[2],reverse=False)
 return sort_temp


def rate_violate(x):  
 result=[1,""]
 if len(x)<=1:
    result=[1,""]
 else:
    for i in range(1,len(x)-1):
        if math.floor((float(x[i][1])-float(x[i-1][1]))*1000/float(x[i-1][1]))<>math.floor(float(x[i][0])*10):
            return [0,x[i][2]]
 return result
    




if __name__ == "__main__":


    sc = SparkContext(appName="GfrobFengYunZuheYield")
    sqlContext = HiveContext(sc)
    sqlContext.sql('use gfrobdb2')
    sqlContext.sql('SET hive.exec.dynamic.partition.mode=nonstrict')
    str="""
        select distinct strategyid
        ,busidate
        ,daychangerate
        ,totalchangerate
        ,type from gfrobdb2.double_weekend_yield_overview_daily_v6_test
        where busidate>date_add('2016-11-22',-130)
        and busidate<='2016-11-21'
        """
    str_trading="""
        select normalday
        ,istradingday
        ,nexttradingday
        ,lasttradingday 
        from odata_msg.bg_c_pub_tradingday
        where exchangecode='101'
        and normalday>date_add('2016-11-22',-130)
        and normalday<='2016-11-21'
               """
    a=sqlContext.sql(str).map(lambda item:item.asDict()).filter(lambda item:item["daychangerate"]<>"0.0")
    #there should have no changerate in non-tradingday
    yield_temp=a.map(lambda item:(item["busidate"],item["daychangerate"]))
    b=sqlContext.sql(str_trading).map(lambda item:item.asDict()).filter(lambda item:item["istradingday"]=="0").map(lambda item:(item["normalday"],item["istradingday"]))
    wrong_data=b.join(yield_temp).toDF()
    wrong_data.registerTempTable("non_tradingday")
    str_insert="""
        insert into table stat_error
        select '2016-11-21' as busidate
        ,'double_weekend_yield_overview_daily_v6' as table_name
        ,'non-tradingday should not have changerate' as error_msg
        ,count(1) as num_total
        from  non_tradingday
              """
    sqlContext.sql(str_insert)


    #the tradingday,but the changerate record is not complete
    yield_strategy=a.map(lambda item:(item["strategyid"],item["busidate"]))
    yield_strategy_out=yield_strategy.map(lambda item:(item,1))
    strategy=yield_strategy.map(lambda item:(1,item[0])).distinct()
    tradingday=sqlContext.sql(str_trading).map(lambda item:item.asDict()).filter(lambda item:item["istradingday"]=="1").map(lambda item:(1,item["normalday"]))
    complete_data=strategy.fullOuterJoin(tradingday).map(lambda item:item[1]).map(lambda item:(item,1))
    strategy_result=complete_data.leftOuterJoin(yield_strategy_out).filter(lambda item:item[1][1]==None).map(lambda item:(item[0][0],item[0][1])).toDF()
    strategy_result.registerTempTable("yield_empty")
    str_insert="""
        insert into table stat_error
        select '2016-11-21' as busidate
        ,'double_weekend_yield_overview_daily_v6' as table_name
        ,'tradingday have no changerate' as error_msg
        ,count(1) as num_total
        from yield_empty
        """
    sqlContext.sql(str_insert)




    #the totalchangerate(t) and totalchangerate(t-1) shuld apply the relation 
    a=sqlContext.sql(str).map(lambda item:item.asDict()).map(lambda item:((item["strategyid"]),(item["daychangerate"],item["totalchangerate"],item["busidate"])))
    a1=a.groupByKey().mapValues(list).map(lambda item:(item[0],f_sort(item[1])))
    a2=a1.map(lambda item:(item[0],rate_violate(item[1]))).filter(lambda item:item[1][0]==0)
    a3=a2.map(lambda item:item).collect()
    #for i in a3:
     #   print i[0].encode('utf-8')  z转换编码打印
     #   print i
    if not a2.isEmpty():
        a2=a2.toDF()
        a2.registerTempTable("rate_violate")
        str_insert="""
            insert into table stat_error
            select '2016-11-21' as busidate
            ,'double_weekend_yield_overview_daily_v6' as table_name
            ,'totalchangerate(t) and totalchangerate(t-1) is violated' as error_msg
            ,count(1) as num_total
            from rate_violate
                 """
        sqlContext.sql(str_insert)

你可能感兴趣的:(spark)