import pandas as pd
import numpy as np
# Load the SegData CSV into a DataFrame and preview its first five rows.
SegData = pd.read_csv('../data/SegData.csv')
print(SegData.head(n=5))
输出结果:
age gender income house store_exp ... Q7 Q8 Q9 Q10 segment
0 57 Female 120963.400958 Yes 529.134363 ... 1 4 2 4 Price
1 63 Female 122008.104950 Yes 478.005781 ... 1 4 1 4 Price
2 59 Male 114202.295294 Yes 490.810731 ... 1 4 1 4 Price
3 60 Male 113616.337078 Yes 347.809004 ... 1 4 2 4 Price
4 51 Male 124252.552787 Yes 379.625940 ... 1 4 2 4 Price
# The summary statistics from describe() show that the maximum of `age`
# is 300, i.e. the column contains an outlier.
print(SegData.describe().transpose())
输出结果:
count mean ... 75% max
age 1000.0 38.840000 ... 53.000000 300.000000
income 816.0 113543.065222 ... 124572.400926 319704.337941
store_exp 1000.0 1356.850523 ... 597.293077 50000.000000
online_exp 1000.0 2120.181187 ... 2440.774823 9479.442310
store_trans 1000.0 5.350000 ... 7.000000 20.000000
online_trans 1000.0 13.546000 ... 20.000000 36.000000
Q1 1000.0 3.101000 ... 4.000000 5.000000
Q2 1000.0 1.823000 ... 2.000000 5.000000
Q3 1000.0 1.992000 ... 3.000000 5.000000
Q4 1000.0 2.763000 ... 4.000000 5.000000
Q5 1000.0 2.945000 ... 4.000000 5.000000
Q6 1000.0 2.448000 ... 4.000000 5.000000
Q7 1000.0 3.434000 ... 4.000000 5.000000
Q8 1000.0 2.396000 ... 3.000000 5.000000
Q9 1000.0 3.085000 ... 4.000000 5.000000
Q10 1000.0 2.320000 ... 3.000000 5.000000
# Locate the rows whose `age` is abnormal (> 100) together with the values.
# Row filter and column selection go through a single .loc call rather
# than chained indexing (`.loc[:, 'age'][mask]`).
r1 = SegData.loc[SegData['age'] > 100, 'age']
print(r1)
输出结果:
287 300
Name: age, dtype: int64
# Turn the abnormal ages (> 100) into missing values.
# The rows are selected by condition instead of the hard-coded row label 287,
# so this keeps working if the data is reordered or gains further outliers.
# Series.mask returns a new float column with NaN where the condition holds,
# which avoids writing NaN in place into the int64 `age` column.
SegData['age'] = SegData['age'].mask(SegData['age'] > 100)
import pandas as pd
# Read the normalization example workbook; header=None means no row is used
# as column names, so the columns are labelled 0, 1, 2, ...
data = pd.read_excel('../data/normalization_data.xls', header=None)
print(data.head(n=5))
输出结果:
0 1 2 3
0 78 521 602 2863
1 144 -600 -521 2245
2 95 -457 468 -1283
3 69 596 695 1054
4 190 527 691 2051
# Standard score (z-score): subtract each column's mean and divide by the
# column's standard deviation.
r = data.sub(data.mean()).div(data.std())
print(r)
输出结果:
0 1 2 3
0 -0.905383 0.635863 0.464531 0.798149
1 0.604678 -1.587675 -2.193167 0.369390
2 -0.516428 -1.304030 0.147406 -2.078279
3 -1.111301 0.784628 0.684625 -0.456906
4 1.657146 0.647765 0.675159 0.234796
5 -0.379150 0.401807 0.152139 0.537286
6 0.650438 0.421642 0.069308 0.595564