df.withColumn

# df is a DataFrame
def lowerCase(string):
    """Return ``string`` with surrounding whitespace removed and lower-cased.

    Args:
        string: The text to normalize, or ``None``.

    Returns:
        The stripped, lower-cased text, or ``None`` when ``string`` is
        ``None``.
    """
    # When this function is wrapped as a Spark UDF, null cells arrive as
    # None; pass them through instead of failing on None.strip().
    if string is None:
        return None
    return string.strip().lower()

# Wrap the plain Python normalizer as a UDF that returns strings.
lowerCaseUDF = udf(lowerCase, StringType())

# Rewrite every string-typed column in place (same column name) with its
# normalized value; columns of any other type are left untouched.
for columnName, kind in df.dtypes:
    if kind != "string":
        continue
    df = df.withColumn(columnName, lowerCaseUDF(df[columnName]))

# Display the distinct values of one column to inspect the result.
df.select("Tipo_unidad").distinct().show()

def recent_six_months(clrq, now=None):
    """Classify a ``YYYY-MM-DD`` date string as within six months of ``now``.

    Six months is approximated as 180 days (6 * 30).

    Args:
        clrq: Date string in ``%Y-%m-%d`` format.
        now: Naive ``datetime.datetime`` to measure from. Defaults to
            2018-05-01, the fixed reference date this function has always
            used, so existing single-argument callers are unaffected.

    Returns:
        ``'in-6'`` when ``clrq`` is no more than 180 days before ``now``
        (dates after ``now`` also count as ``'in-6'``), otherwise
        ``'not-in-6'``. Values that cannot be parsed (wrong format,
        ``None``, non-string input) yield ``'not-in-6'``.
    """
    if now is None:
        now = datetime.datetime(2018, 5, 1)

    # Only parsing is expected to fail; catching just these two exception
    # types keeps real programming errors (e.g. a missing import) visible.
    try:
        clrq_date_time = datetime.datetime.strptime(clrq, '%Y-%m-%d')
    except (TypeError, ValueError):
        return 'not-in-6'

    days_elapsed = (now - clrq_date_time).days
    # Whole-day comparison avoids depending on how `/` divides integers.
    return 'in-6' if days_elapsed <= 180 else 'not-in-6'
# def is_valid_date(str):
#   '''Check whether the given string is a valid date string'''
#   try:
#     time.strptime(str, "%Y-%m-%d")
#     return True
#   except:
#     return False

参考:Stack Overflow 链接

你可能感兴趣的:(python,spark海量数据分析)