pyspark.sql.dataframe.DataFrame 自带 .toPandas() 方法。下面是别人写的替代实现,但 .collect() 那一行会报错;直接用自带的 toPandas() 则报:
Total size of serialized results of 14 tasks (1060.1 MB) is bigger than spark.driver.maxResultSize (1024.0 MB)。已尝试在 load 之后加上 .repartition(2000):
def _map_to_pandas(self,rdds):
“”" Needs to be here due to pickling issues “”"
return [pd.DataFrame(list(rdds))]
def toPandas(self, df, n_partitions=None):
    """Collect `df` into a local pandas.DataFrame, partition by partition.

    Each Spark partition is converted to a pandas.DataFrame on the
    executors and the pieces are concatenated on the driver, which is
    faster than the stock ``DataFrame.toPandas()`` for large results.

    :param df: pyspark.sql.DataFrame to collect
    :param n_partitions: optional int; when given, `df` is repartitioned
        first so no single task's serialized result grows too large
    :return: pandas.DataFrame with the same columns as `df`
    """
    if n_partitions is not None:
        df = df.repartition(n_partitions)
    pieces = df.rdd.mapPartitions(self._map_to_pandas).collect()
    local = pd.concat(pieces)
    local.columns = df.columns
    return local
# NOTE(review): `from __future__` imports must be the very first statement
# of a module; appearing after the code above, this line is a SyntaxError.
# The whole bootstrap/import section belongs at the top of the file.
from __future__ import absolute_import
import sys
# Python 2 only: reload() as a builtin and sys.setdefaultencoding() do not
# exist in Python 3 -- this script targets Python 2 (Spark 2.0.1 era).
reload(sys)
sys.setdefaultencoding('utf8')
import numpy
import pandas as pd
from conf.path_conf import *
from sklearn.externals import joblib
import sys, os
# Path for spark source folder
os.environ['SPARK_HOME'] = "/opt/spark-2.0.1-bin-hadoop2.7"
# Append pyspark to Python Path
sys.path.append("/opt/spark-2.0.1-bin-hadoop2.7/python/")
sys.path.append("/opt/spark-2.0.1-bin-hadoop2.7/python/lib/")
from biz.biz_utils.table_info.feature.shixin_cox_model_input import ShixinCoxModelInputTableInfo
from biz.biz_utils.spark_session_utils.spark_session_utils import SparkSessionUtils
class ShixinCoxModelEvaluate(SparkSessionUtils):
    """Score companies with a pre-trained shixin (dishonesty) Cox survival
    model: load the model-input table, predict survival curves for each
    company, and save the transposed results to HDFS."""

    # Input table for the shixin Cox model (class-level descriptor;
    # replaced by the loaded Spark DataFrame inside load_file()).
    shixin_cox_model_input_tb = ShixinCoxModelInputTableInfo.table_info

    def set_table_info(self):
        # Nothing extra to register; required by the SparkSessionUtils
        # template -- TODO confirm base-class contract.
        pass

    def load_file(self):
        """Read the model-input CSV from its HDFS location and register it
        as a temp view; repartition(2000) keeps each task's serialized
        result below spark.driver.maxResultSize."""
        self.shixin_cox_model_input_tb = self.session.read.load(
            self.shixin_cox_model_input_tb.location, format="csv",
            schema=self.shixin_cox_model_input_tb.schema,
            delimiter=',').repartition(2000)
        self.shixin_cox_model_input_tb.createOrReplaceTempView("shixin_cox_model_input_tb")

    def predict_shixin(self):
        """Predict each company's survival function with the pickled Cox
        model and persist the curves (one row per company) to HDFS."""
        clf = joblib.load('/home/scdata/hadoop_cox_data/' + 'shixin_cox_model.pkl')
        # `input_df` avoids shadowing the `input` builtin.
        input_df = self.shixin_cox_model_input_tb.toPandas()
        # BUG FIX: the original `input['judge_cnt', 'net_fr_judge_cnt']`
        # indexes with a tuple and raises KeyError; a list of column names
        # selects the two feature columns.
        input_x = input_df[['judge_cnt', 'net_fr_judge_cnt']]
        # Predict over the next 12 months at 0.05-month steps.
        times = numpy.arange(0, 12, 0.05)
        ypred = clf.predict_survival_function(input_x, times)
        # ypred's columns are the row indices of input_x; map each index
        # back to the company name from the source table.
        name_by_index = {}
        for i in list(input_df.index):
            print(input_df.loc[i, "company_name"])
            name_by_index[i] = input_df.loc[i, "company_name"]
        ypred.rename(columns=name_by_index, inplace=True)
        # Transpose so each row is a company and each column a time point.
        # (`.values` replaces the deprecated `as_matrix(columns=None)`.)
        rst = pd.DataFrame(ypred.values.T, index=ypred.columns, columns=list(ypred.index))
        # Insert the company name as the first column, then renumber rows 0..n-1.
        rst.insert(0, 'company_name', rst.index)
        rst.index = range(0, len(rst))
        # Persist the prediction results to HDFS.
        self._save(rst, PC.HDFS_RISK_MODEL_AUTO_FEATURE + 'shixin_cox_model_predict_results.csv',
                   header=True, delimiter=',', mode="overwrite", quote='"')

    def run_task(self):
        """Entry point: load the input table, then run the predictions."""
        self.load_file()
        self.predict_shixin()
if __name__ == '__main__':
    # '''/hdfs/riskModelAuto/batch_1/v2_1/feature/'''
    # Run the full pipeline: load input table -> predict -> save to HDFS.
    ShixinCoxModelEvaluate().run_task()
    # print(PC.HDFS_RISK_MODEL_AUTO_FEATURE)