Converting a pyspark.sql.dataframe.DataFrame to a pandas.DataFrame

pyspark.sql.dataframe.DataFrame has a built-in .toPandas() method.
The helper below is someone else's code, but its .collect() call failed for me, and the built-in .toPandas() raised:
Total size of serialized results of 14 tasks (1060.1 MB) is bigger than spark.driver.maxResultSize (1024.0 MB). As a workaround, I appended .repartition(2000) to the load.
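Another option (a minimal sketch, not from the original post; it assumes you control how the SparkSession is built) is to raise spark.driver.maxResultSize directly:

from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .appName("to_pandas_demo")                   # hypothetical app name
         .config("spark.driver.maxResultSize", "4g")  # default is 1g
         .getOrCreate())

The repartition-based helper: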
def _map_to_pandas(self, rdds):
    """ Needs to be here due to pickling issues """
    return [pd.DataFrame(list(rdds))]

def toPandas(self,df, n_partitions=None):
    """
    Returns the contents of `df` as a local `pandas.DataFrame` in a speedy fashion. The DataFrame is
    repartitioned if `n_partitions` is passed.
    :param df:              pyspark.sql.DataFrame
    :param n_partitions:    int or None
    :return:                pandas.DataFrame
    """
    if n_partitions is not None: df = df.repartition(n_partitions)
    df_pand = df.rdd.mapPartitions(self._map_to_pandas).collect()
    df_pand = pd.concat(df_pand)
    df_pand.columns = df.columns
    return df_pand
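A hypothetical usage sketch (spark is an existing SparkSession, and the two methods above live on a class instance, as in the script below):

sdf = spark.read.csv("hdfs:///some/big/table.csv", header=True)  # hypothetical input path
pdf = self.toPandas(sdf, n_partitions=2000)  # one small pandas frame per partition, concatenated on the driver
print(pdf.shape)

The full evaluation script from the post: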
from __future__ import absolute_import
import sys
import os

reload(sys)  # Python 2 only: reload re-exposes sys.setdefaultencoding
sys.setdefaultencoding('utf8')

import numpy
import pandas as pd
from conf.path_conf import *
from sklearn.externals import joblib  # in modern scikit-learn, use the standalone joblib package

# Path to the Spark installation
os.environ['SPARK_HOME'] = "/opt/spark-2.0.1-bin-hadoop2.7"
# Append pyspark to the Python path
sys.path.append("/opt/spark-2.0.1-bin-hadoop2.7/python/")
sys.path.append("/opt/spark-2.0.1-bin-hadoop2.7/python/lib/")


from biz.biz_utils.table_info.feature.shixin_cox_model_input import ShixinCoxModelInputTableInfo
from biz.biz_utils.spark_session_utils.spark_session_utils import SparkSessionUtils


class ShixinCoxModelEvaluate(SparkSessionUtils):
    # Input data table for the shixin (dishonest-debtor) Cox model
    shixin_cox_model_input_tb = ShixinCoxModelInputTableInfo.table_info

    # Output data table for the shixin Cox model
    def set_table_info(self):
        pass

    def load_file(self):
        # Read the CSV with an explicit schema, then repartition to 2000 so each
        # task's serialized result stays small when the data is later collected.
        self.shixin_cox_model_input_tb = self.session.read.load(self.shixin_cox_model_input_tb.location, format="csv",
                                                                schema=self.shixin_cox_model_input_tb.schema,
                                                                delimiter=',').repartition(2000)
        self.shixin_cox_model_input_tb.createOrReplaceTempView("shixin_cox_model_input_tb")
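        # e.g., a hypothetical sanity check on the freshly registered view:
        # self.session.sql("SELECT COUNT(*) FROM shixin_cox_model_input_tb").show()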

    def predict_shixin(self):
        clf = joblib.load(
            '/home/scdata/hadoop_cox_data/' + 'shixin_cox_model.pkl'
        )  # local copy: /home/sc/PycharmProjects/sc/model-feature-engine/evaluate_model/shixin_cox_evaluate_model/shixin_cox_model.pkl
        input = self.shixin_cox_model_input_tb
        input = input.toPandas()
        # Select the two feature columns. Note the double brackets: the original
        # input['judge_cnt', 'net_fr_judge_cnt'] is a tuple lookup and raises a KeyError.
        input_x = input[['judge_cnt', 'net_fr_judge_cnt']]
        dict1 = {}
        # Predict the next 12 months at 0.05-month intervals
        times = numpy.arange(0, 12, 0.05)
        ypred = clf.predict_survival_function(input_x, times)

        # ypred's columns are the row indices of input_x, and those indices map
        # back to company names in the original table
        for i in list(input.index):
            print(input.loc[i, "company_name"])
            dict1[i] = input.loc[i, "company_name"]
        # Rename ypred's columns to the corresponding company names
        ypred.rename(columns=dict1, inplace=True)
        # Transpose (companies become rows, time points become columns);
        # .as_matrix() is the old pandas API -- .values is the modern equivalent
        kk = ypred.as_matrix(columns=None).T
        # Rebuild a DataFrame from the transposed matrix with proper index and column names
        rst = pd.DataFrame(kk, index=ypred.columns, columns=list(ypred.index))

        # Add a company_name column in the first position
        rst.insert(0, 'company_name', rst.index)

        # Reset the index to consecutive integers
        rst.index = range(0, len(rst))

        # Save the results to HDFS
        self._save(rst, PC.HDFS_RISK_MODEL_AUTO_FEATURE + 'shixin_cox_model_predict_results.csv', header=True, delimiter=',',
                   mode="overwrite", quote='"')

    def run_task(self):
        self.load_file()
        self.predict_shixin()


if __name__ == '__main__':
    # PC.HDFS_RISK_MODEL_AUTO_FEATURE resolves to something like /hdfs/riskModelAuto/batch_1/v2_1/feature/
    ShixinCoxModelEvaluate().run_task()
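To make the reshaping in predict_shixin concrete, here is a minimal standalone sketch. It assumes the pickled model is a lifelines CoxPHFitter (the post never says which library produced shixin_cox_model.pkl), and the training data and company names below are made up:

import numpy
import pandas as pd
from lifelines import CoxPHFitter

# Hypothetical training data: two features, a duration in months, and an event flag.
train = pd.DataFrame({
    'judge_cnt':        [0, 1, 3, 5, 2, 4],
    'net_fr_judge_cnt': [1, 0, 2, 4, 1, 3],
    'months':           [10, 8, 6, 2, 9, 4],
    'defaulted':        [0, 1, 1, 1, 0, 1],
})
# A small penalizer keeps the toy fit numerically stable.
clf = CoxPHFitter(penalizer=0.1).fit(train, duration_col='months', event_col='defaulted')

X = train[['judge_cnt', 'net_fr_judge_cnt']]
times = numpy.arange(0, 12, 0.05)
# ypred's rows are time points; its columns are the row indices of X.
ypred = clf.predict_survival_function(X, times=times)

# Same reshaping as predict_shixin: transpose so each company is one row,
# prepend an identifier column, then reset the index.
rst = ypred.T
rst.insert(0, 'company_name', ['company_%d' % i for i in rst.index])
rst.index = range(len(rst))
print(rst.iloc[:, :5])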
