Method 1: write a DataFrame into a Hive table (insertInto)
def log2Hive():
    # build a one-row log DataFrame; types is "INFO" or "ERROR"
    log = hiveContext.createDataFrame([{"dt": dt, "types": types, "message": msg, "currtime": currTime}]).coalesce(1)
    log.write.mode("append").insertInto("app.app_tion_log")
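For reference, a minimal self-contained sketch of Method 1, assuming a Spark 1.x HiveContext and that the target table app.app_tion_log already exists; the sample values and the column order in the final select are illustrative assumptions:
from pyspark import SparkContext
from pyspark.sql import HiveContext

sc = SparkContext(appName="log2hive_demo")
hiveContext = HiveContext(sc)

# one log row as a dict; createDataFrame infers the schema from the values
row = {"dt": "2016-01-01", "types": "INFO", "message": "job finished", "currtime": "2016-01-01 12:00:00"}
log = hiveContext.createDataFrame([row]).coalesce(1)

# insertInto matches columns by position, so reorder them to the target table's schema first
log.select("dt", "types", "message", "currtime").write.mode("append").insertInto("app.app_tion_log")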
Method 2: write a DataFrame into a Hive table (saveAsTable)
temp_vl_org = self.vl_data.join(self.order_slice, ["po_no"]).coalesce(10)
# remove any leftover table directory and metadata before re-creating the table
os.system("hadoop fs -rm -r -skipTrash dev.db/" + str.lower(temp_vl))
hiveContext.sql("drop table if exists dev." + temp_vl)
temp_vl_org.write.saveAsTable("dev." + temp_vl)
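As a design note, the manual hadoop fs -rm plus drop table steps can usually be replaced by an overwrite, at least for Hive-managed tables; a sketch reusing the temp_vl_org and temp_vl names from above:
# mode("overwrite") drops and re-creates the managed table in one step
temp_vl_org.write.mode("overwrite").saveAsTable("dev." + temp_vl)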
1. Get a DataFrame from a Hive table:
order_slice = hiveContext.table("app.app_rage").coalesce(10).where(col("dt")==self.order_new_dt).select("po_no", "goods_no", col("distribution_no").alias("dc_id")).distinct().coalesce(10)
2. Build a DataFrame from a list of dicts:
log = hiveContext.createDataFrame([{"dt": dt, "types": types, "message": msg, "currtime": currTime}]).coalesce(1)  # types: "INFO", "ERROR"
3. Get the latest partition of a table:
hiveContext.sql("show partitions app.app_rage").select(func.max("partition").alias("partition")).rdd.map(lambda x: str(x[0])).take(1)[0]
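Item 3 above returns the newest partition spec as a string such as dt=2016-01-01; a sketch of how that value can be split and used to filter a table (reusing hiveContext from the notes; table and column names follow the snippets above, the rest is illustrative):
from pyspark.sql import functions as func
from pyspark.sql.functions import col

# newest partition spec, e.g. "dt=2016-01-01" (lexicographic max is fine for zero-padded dates)
latest = hiveContext.sql("show partitions app.app_rage")\
    .select(func.max("partition").alias("partition"))\
    .rdd.map(lambda x: str(x[0])).take(1)[0]
latest_dt = latest.split("=")[1]  # keep only the value part

# read only the newest partition
order_slice = hiveContext.table("app.app_rage").where(col("dt") == latest_dt)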
# Delete a table's data directory on HDFS
os.system("hadoop fs -rm -r -skipTrash dev.db/" + str.lower(temp_vl))
# Copy partition data from the cluster-1 path localFolder into the cluster-2 Hive table partition directory tbFolder
localFolder = "app.db/app_filter/"
partitionName = "dt=" + self.table_new_dt
tbFolder = "hdfs://102.1.1.1:8080/user/cm_pc/app.db/app_filter/"
os.system("hadoop fs -rm -r -skipTrash " + tbFolder + partitionName)
os.system("hadoop distcp " + localFolder + partitionName + " " + tbFolder + partitionName)
# localFolder+partitionName is the source path; tbFolder+partitionName is the destination path
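Since os.system only hands back a shell exit status, a variant that passes the arguments as a list and fails loudly when distcp does not succeed may be safer (a sketch reusing localFolder, tbFolder and partitionName from above; the error message is illustrative):
import subprocess

src = localFolder + partitionName
dst = tbFolder + partitionName

# passing the arguments as a list avoids quoting problems in a concatenated shell string
ret = subprocess.call(["hadoop", "distcp", src, dst])
if ret != 0:
    raise Exception("distcp from %s to %s failed with exit code %d" % (src, dst, ret))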
# The following returns a DataFrame
df = hc.sql(sql).coalesce(5)
# Convert the DataFrame to an RDD (key on the first two columns, keep the rest as the value)
r2 = df.rdd.map(lambda row: ((row[0], row[1]), (row[2], row[3], row[4], row[5]))).groupByKey().map(lambda (k, v): sub_process(k, v))
cond = ["rowkey", "top_10", "dt"]
# Convert the RDD back to a DataFrame
result = r2.toDF(cond)
# Write the DataFrame into a Hive table
result.write.mode("append").insertInto(table_name)
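sub_process is not shown in these notes; below is a self-contained sketch of the same DataFrame -> RDD -> DataFrame round trip, with a made-up aggregation standing in for it (Python 2 tuple-unpacking lambdas, as in the snippet above; all data and names are illustrative):
from pyspark import SparkContext
from pyspark.sql import HiveContext

sc = SparkContext(appName="rdd_roundtrip_demo")
hc = HiveContext(sc)

df = hc.createDataFrame([("r1", "2016-01-01", 3), ("r1", "2016-01-01", 5)], ["rowkey", "dt", "qty"])

# stand-in for sub_process: sum the grouped values for each (rowkey, dt) key
def sub_process(key, values):
    return (key[0], sum(values), key[1])

r2 = df.rdd.map(lambda row: ((row[0], row[1]), row[2]))\
    .groupByKey()\
    .map(lambda (k, v): sub_process(k, v))

result = r2.toDF(["rowkey", "total_qty", "dt"])
result.show()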
Ways to get a DataFrame
Method 1:
# from an RDD
result = r2.toDF(cond)
Method 2:
# from a Hive table
order_slice = hiveContext.table("app.apprage").coalesce(10).where(col("dt")==self.ord_new_dt).select("po_no", "goods_no", col("distribution_no").alias("dc_id")).distinct().coalesce(10)
Method 3:
# from a list of dicts
log = hiveContext.createDataFrame([{"dt": dt, "types": types, "message": msg, "currtime": currTime}]).coalesce(1)  # types: "INFO", "ERROR"
Method 4:
# from a SQL statement (here, show partitions)
hiveContext.sql("show partitions app.apprage")
Method 5:
Read data from HDFS into a DataFrame
# input_path is an HDFS path; fields are separated by \x01
textFile = sc.textFile(input_path).map(lambda x: x.split("\x01"))
df = textFile.toDF(["seller_no", "seller_name", "dept_no", "goods_no", "shop_id", "sp_goods_no", "dc_id", "erp_warehouse_no", "real_outstore_qty", "sp_create_time"])\
    .groupby(["seller_no", "seller_name", "dept_no", "goods_no", "dc_id", "erp_warehouse_no", "sp_create_time"])\
    .agg(func.sum("real_outstore_qty").alias("real_outstore_qty"))\
    .where((col("sp_create_time") < endDate) & (col("sp_create_time") > "2015-01-01"))
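A variant with an explicit schema can be useful when column types matter (a sketch reusing sc, hiveContext and input_path from above; the chosen fields and their types are assumptions):
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

# explicit schema so real_outstore_qty is numeric instead of a string
schema = StructType([
    StructField("seller_no", StringType()),
    StructField("goods_no", StringType()),
    StructField("real_outstore_qty", DoubleType()),
    StructField("sp_create_time", StringType()),
])

rows = sc.textFile(input_path)\
    .map(lambda x: x.split("\x01"))\
    .map(lambda f: (f[0], f[3], float(f[8]), f[9]))  # pick and cast the needed fields

df = hiveContext.createDataFrame(rows, schema)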
dataDf = hiveContext.table(self.table_name).where((col("dt")==self.table_new_dt) & (col("so_status") != "10028") & (col("so_status") != "10009"))\
    .select("seller_no", "seller_name", "dept_no",
            "goods_no", col("shop_id").cast("int"), func.when(func.isnull("sp_goods_no"), -1).otherwise(col("sp_goods_no").cast("int")).alias("sp_goods_no"),
            col("distribution_no").cast("int").alias("dc_id"), col("erp_warehouse_no").cast("int"),
            col("sp_create_time").cast("date"), func.when(func.isnull("apply_out_qty"), 1).otherwise(col("apply_out_qty")).alias("apply_qty")).coalesce(10)
Add columns to a DataFrame
Methods 1 and 2:
# Method 1: .agg(...) adds two aliased columns; Method 2: .withColumn(...) adds a literal dt column
self.groupedDf = dataDf.groupBy("seller_no", "dept_no", "goods_no", "shop_id", "sp_goods_no", "dc_id", "erp_warehouse_no", "sp_create_time")\
    .agg(func.max("seller_name").alias("seller_name"), func.sum("apply_outstore_qty").alias("apply_outstore_qty"))\
    .withColumn("dt", lit(self.table_new_dt))
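A minimal standalone sketch of the withColumn pattern, reusing hiveContext from the notes and using illustrative data:
from pyspark.sql.functions import lit, col

df = hiveContext.createDataFrame([("g1", 2), ("g2", 5)], ["goods_no", "qty"])

# add a constant partition column and a derived column
df2 = df.withColumn("dt", lit("2016-01-01")).withColumn("qty_double", col("qty") * 2)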
if __name__ == "__main__":
    main()

# check the _SUCCESS flag file to confirm the job wrote its output (hadoop fs -test -e returns 0 when the path exists)
r = os.system("hadoop fs -test -e /tmp/for/result/e_cast/_SUCCESS")
if r != 0:
    raise Exception("1")
else:
    print("The Spark job that fetches the eclp forecast data succeeded!")
def push2hive(self):
    hiveContext.sql("LOAD DATA INPATH '/tmp/for/result/ec_forecast' OVERWRITE INTO TABLE app.app_ver1 PARTITION (dt = '" + _today + "')")

# read the loaded partition back from HDFS and group by the first four fields
hdfs_path_ver1 = "/user/cmo_ipc/app.db/app_ver1/dt=" + _today
df1 = sc.textFile(hdfs_path_ver1).map(lambda x: x.split("\t")).map(lambda line: ((line[0], line[1], line[2], line[3]), line[8])).groupByKey().map(lambda (k, v): sales_nation_ver1(k, v))
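Note that LOAD DATA INPATH moves the source files (it does not copy them) into the partition's directory under the table's warehouse location, which is why the same data can then be read back from hdfs_path_ver1.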