id,checking,duration,history,purpose,amount,savings,employed,installp,marital,coapp,resident,property,age,other,housing,existcr,job,depends,telephon,foreign,good_bad
1,1,6,4,3,1169,5,5,4,3,1,4,1,67,3,2,2,3,1,2,1,0
2,2,48,2,3,5951,1,3,2,2,1,2,1,22,3,2,1,3,1,1,1,1
3,4,12,4,6,2096,1,4,2,3,1,3,1,49,3,2,1,2,2,1,1,0
4,1,42,2,2,7882,1,4,2,3,3,4,2,45,3,3,1,3,2,1,1,0
5,1,24,3,0,4870,1,3,3,3,1,4,4,53,3,3,2,3,2,1,1,1
6,4,36,2,6,9055,5,3,2,3,1,4,4,35,3,3,1,2,2,2,1,0
7,4,24,2,2,2835,3,5,3,3,1,4,2,53,3,2,1,3,1,1,1,0
8,2,36,2,1,6948,1,3,2,3,1,2,3,35,3,1,1,4,1,2,1,0
9,4,12,2,3,3059,4,4,2,1,1,4,1,61,3,2,1,2,1,1,1,0
10,2,30,4,0,5234,1,1,4,4,1,2,3,28,3,2,2,4,1,1,1,1
11,2,12,2,0,1295,1,2,3,2,1,1,3,25,3,1,1,3,1,1,1,1
12,1,48,2,9,4308,1,2,3,2,1,4,2,24,3,1,1,3,1,1,1,1
13,2,12,2,3,1567,1,3,1,2,1,1,3,22,3,2,1,3,1,2,1,0
14,1,24,4,0,1199,1,5,4,3,1,4,3,60,3,2,2,2,1,1,1,1
15,1,15,2,0,1403,1,3,2,2,1,4,3,28,3,1,1,3,1,1,1,0
16,1,24,2,3,1282,2,3,4,2,1,2,3,32,3,2,1,2,1,1,1,1
17,4,24,4,3,2424,5,5,4,3,1,4,2,53,3,2,2,3,1,1,1,0
Code:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, MinMaxScaler

df = spark.read.csv("/home/jerry/geoplatform/gai_platform/data/feature_filter/res.csv", header=True, inferSchema=True)
# Assemble "duration" into a vector column, then rescale it to [0, 1]
vecAssembler = VectorAssembler(inputCols=["duration"], outputCol="c2_new")
mmScaler = MinMaxScaler(inputCol="c2_new", outputCol="mm_c2")
pipeline = Pipeline(stages=[vecAssembler, mmScaler])
pipeline_fit = pipeline.fit(df)
df_min_max = pipeline_fit.transform(df)
df_min_max.show(10)
Output:
+---+--------+--------+-------+-------+------+-------+--------+--------+-------+-----+--------+--------+---+-----+-------+-------+---+-------+--------+-------+--------+------+--------------------+
| id|checking|duration|history|purpose|amount|savings|employed|installp|marital|coapp|resident|property|age|other|housing|existcr|job|depends|telephon|foreign|good_bad|c2_new| mm_c2|
+---+--------+--------+-------+-------+------+-------+--------+--------+-------+-----+--------+--------+---+-----+-------+-------+---+-------+--------+-------+--------+------+--------------------+
| 1| 1| 6| 4| 3| 1169| 5| 5| 4| 3| 1| 4| 1| 67| 3| 2| 2| 3| 1| 2| 1| 0| [6.0]|[0.02941176470588...|
| 2| 2| 48| 2| 3| 5951| 1| 3| 2| 2| 1| 2| 1| 22| 3| 2| 1| 3| 1| 1| 1| 1|[48.0]|[0.6470588235294118]|
| 3| 4| 12| 4| 6| 2096| 1| 4| 2| 3| 1| 3| 1| 49| 3| 2| 1| 2| 2| 1| 1| 0|[12.0]|[0.11764705882352...|
| 4| 1| 42| 2| 2| 7882| 1| 4| 2| 3| 3| 4| 2| 45| 3| 3| 1| 3| 2| 1| 1| 0|[42.0]|[0.5588235294117647]|
| 5| 1| 24| 3| 0| 4870| 1| 3| 3| 3| 1| 4| 4| 53| 3| 3| 2| 3| 2| 1| 1| 1|[24.0]|[0.29411764705882...|
| 6| 4| 36| 2| 6| 9055| 5| 3| 2| 3| 1| 4| 4| 35| 3| 3| 1| 2| 2| 2| 1| 0|[36.0]|[0.47058823529411...|
| 7| 4| 24| 2| 2| 2835| 3| 5| 3| 3| 1| 4| 2| 53| 3| 2| 1| 3| 1| 1| 1| 0|[24.0]|[0.29411764705882...|
| 8| 2| 36| 2| 1| 6948| 1| 3| 2| 3| 1| 2| 3| 35| 3| 1| 1| 4| 1| 2| 1| 0|[36.0]|[0.47058823529411...|
| 9| 4| 12| 2| 3| 3059| 4| 4| 2| 1| 1| 4| 1| 61| 3| 2| 1| 2| 1| 1| 1| 0|[12.0]|[0.11764705882352...|
| 10| 2| 30| 4| 0| 5234| 1| 1| 4| 4| 1| 2| 3| 28| 3| 2| 2| 4| 1| 1| 1| 1|[30.0]|[0.38235294117647...|
+---+--------+--------+-------+-------+------+-------+--------+--------+-------+-----+--------+--------+---+-----+-------+-------+---+-------+--------+-------+--------+------+--------------------+
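For reference, MinMaxScaler rescales each value as (x - min) / (max - min) over the column. Judging from the output above, duration spans roughly 4 to 72 in this dataset (the min/max are inferred from the scaled values, not shown in the code), so 48 maps to (48 - 4) / (72 - 4) ≈ 0.647, matching mm_c2.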
Code:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, Normalizer

df = spark.read.csv("/home/jerry/geoplatform/gai_platform/data/feature_filter/res.csv", header=True, inferSchema=True)
# Assemble "id" and "duration" into a vector, then normalize each row vector to unit L2 norm
vecAssembler = VectorAssembler(inputCols=["id", "duration"], outputCol="norm_features")
normalizer = Normalizer(p=2.0, inputCol="norm_features", outputCol="norm_test")
pipeline = Pipeline(stages=[vecAssembler, normalizer])
pipeline_fit = pipeline.fit(df)
df = pipeline_fit.transform(df)
# Resulting schema: DataFrame[id: int, duration: int, norm_features: vector, norm_test: vector]
df.select(["id", "duration", "norm_features", "norm_test"]).show(10)
Output:
+---+--------+-------------+--------------------+
| id|duration|norm_features| norm_test|
+---+--------+-------------+--------------------+
| 1| 6| [1.0,6.0]|[0.16439898730535...|
| 2| 48| [2.0,48.0]|[0.04163054471218...|
| 3| 12| [3.0,12.0]|[0.24253562503633...|
| 4| 42| [4.0,42.0]|[0.09480909262799...|
| 5| 24| [5.0,24.0]|[0.20395425411200...|
| 6| 36| [6.0,36.0]|[0.16439898730535...|
| 7| 24| [7.0,24.0]| [0.28,0.96]|
| 8| 36| [8.0,36.0]|[0.21693045781865...|
| 9| 12| [9.0,12.0]| [0.6,0.8]|
| 10| 30| [10.0,30.0]|[0.31622776601683...|
+---+--------+-------------+--------------------+
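For reference, Normalizer with p=2.0 divides each row vector by its L2 norm. For id=9 the assembled vector is [9.0, 12.0], its norm is sqrt(9^2 + 12^2) = 15, so the normalized result is [9/15, 12/15] = [0.6, 0.8], exactly as shown in norm_test.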
Code:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder

df = spark.read.csv("/home/jerry/geoplatform/gai_platform/data/feature_filter/res.csv", header=True, inferSchema=True)
# Index the "amount" column, then one-hot encode the resulting indices
stringindexer = StringIndexer(inputCol="amount", outputCol="one_hot_amount")
encoder = OneHotEncoder(dropLast=False, inputCol="one_hot_amount", outputCol="one_hot_test")
pipeline = Pipeline(stages=[stringindexer, encoder])
pipeline_fit = pipeline.fit(df)
df = pipeline_fit.transform(df)
# Encode multiple columns at once
String-index every column (generally used to encode non-numeric features; the same idea as sklearn's LabelEncoder):
for i in df.columns:
    # handleInvalid controls how invalid values (e.g. nulls in the column) are treated:
    # "error" raises an exception, "skip" drops those rows and continues
    # si = StringIndexer(inputCol=i, outputCol=i + "-indexed", handleInvalid="skip")
    stringindexer = StringIndexer(inputCol=i, outputCol=i + "_new")
    model = stringindexer.fit(df)
    td = model.transform(df)
    df = td.drop(i).withColumnRenamed(i + "_new", i)
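As an alternative to re-fitting inside the loop, the per-column indexers can be collected into a single Pipeline and fitted once. A minimal sketch under the same assumptions as above; the fresh read into raw, the "_new" suffix, and the handleInvalid="skip" choice are only illustrative:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer

raw = spark.read.csv("/home/jerry/geoplatform/gai_platform/data/feature_filter/res.csv", header=True, inferSchema=True)
# One StringIndexer stage per column; "skip" drops rows with null/unseen values
stages = [StringIndexer(inputCol=c, outputCol=c + "_new", handleInvalid="skip") for c in raw.columns]
indexed = Pipeline(stages=stages).fit(raw).transform(raw)
indexed.select([c + "_new" for c in raw.columns]).show(5)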
Code:
from pyspark.ml.feature import PCA
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
df=spark.read.csv("/home/jerry/geoplatform/gai_platform/data/feature_filter/res.csv",header=True,inferSchema=True)
input_col=["id","amount","checking","duration","history"]
vecAssembler = VectorAssembler(inputCols=input_col,outputCol="features")
pca=PCA(k=5,inputCol="features",outputCol="pca_test")
pipeline = Pipeline(stages=[vecAssembler,pca])
pipeline_fit = pipeline.fit(df)
df = pipeline_fit.transform(df)
df.select(['id', 'amount', 'checking', 'duration', 'history','features','pca_test']).show(2)
Output:
+---+------+--------+--------+-------+--------------------+--------------------+
| id|amount|checking|duration|history| features| pca_test|
+---+------+--------+--------+-------+--------------------+--------------------+
|947| 3349| 1| 24| 1|[947.0,3349.0,1.0...|[-3350.3696120054...|
| 17| 2424| 4| 24| 4|[17.0,2424.0,4.0,...|[-2424.0766224788...|
+---+------+--------+--------+-------+--------------------+--------------------+
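Since k=5 equals the number of assembled input columns, this PCA only rotates the data onto its principal components rather than reducing dimensionality; in practice k is usually chosen smaller than the input width. The variance captured by each component can be read off the fitted PCAModel, as in this minimal sketch reusing pipeline_fit from above:
# The last pipeline stage is the fitted PCAModel
pca_model = pipeline_fit.stages[-1]
# Fraction of variance explained by each principal component
print(pca_model.explainedVariance)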
Code:
from pyspark.ml.feature import QuantileDiscretizer

df = spark.read.csv("/home/jerry/geoplatform/gai_platform/data/feature_filter/res.csv", header=True, inferSchema=True)
# Bin "amount" into 4 quantile-based buckets; relativeError=0 computes exact quantiles
quantileDiscretizer = QuantileDiscretizer(numBuckets=4, inputCol="amount", outputCol="quantile_amount", relativeError=0, handleInvalid="error")
quantileDiscretizer_model = quantileDiscretizer.fit(df)
df = quantileDiscretizer_model.transform(df)
df.show(20)
Output:
+---+------+---+--------+--------+-------+-------+-------+--------+--------+-------+-----+--------+--------+-----+-------+-------+---+-------+--------+-------+------------------+-----------+---------------+
| id|amount|age|checking|duration|history|purpose|savings|employed|installp|marital|coapp|resident|property|other|housing|existcr|job|depends|telephon|foreign| sum_money|count_money|quantile_amount|
+---+------+---+--------+--------+-------+-------+-------+--------+--------+-------+-----+--------+--------+-----+-------+-------+---+-------+--------+-------+------------------+-----------+---------------+
|947| 3349| 30| 1| 24| 1| 2| 3| 2| 4| 3| 1| 4| 4| 3| 3| 1| 3| 2| 2| 1| 42055.55| 10| 2.0|
| 17| 2424| 53| 4| 24| 4| 3| 5| 5| 4| 3| 1| 4| 2| 3| 2| 2| 3| 1| 1| 1| 51659.56000000001| 8| 2.0|
|646| 7980| 27| 4| 36| 3| 9| 5| 2| 4| 3| 1| 4| 3| 3| 1| 2| 3| 1| 2| 1| 30752.23| 5| 3.0|
|674| 2080| 24| 4| 6| 4| 0| 3| 3| 1| 4| 1| 2| 3| 3| 2| 1| 3| 1| 1| 1| 49446.53| 10| 1.0|
+---+------+---+--------+--------+-------+-------+-------+--------+--------+-------+-----+--------+--------+-----+-------+-------+---+-------+--------+-------+------------------+-----------+---------------+
Rules for handling invalid values (i.e., nulls in the column) during binning.
Keep nulls:
df = df.drop("quantile_amount")
df = quantileDiscretizer.setHandleInvalid("keep").fit(df).transform(df)
df.select(["id", "amount", "age", "checking", "count_money", "quantile_amount"]).show(20)
Skip nulls:
df = df.drop("quantile_amount")
df = quantileDiscretizer.setHandleInvalid("skip").fit(df).transform(df)
df.select(["id", "amount", "age", "checking", "count_money", "quantile_amount"]).show(20)