import numpy as np
import pandas as pd
df = pd.read_csv("D:/pythonproject/train.csv");
label = df['TARGET']
df = df.drop(['ID','TARGET'],axis=1)
missSet=[np.nan,9999999999,-999999]
len(df.iloc[:,0].unique())
208
count_un=df.iloc[:,0:3].apply(lambda x:len(x.unique()))
print count_un
var3 208
var15 100
imp_ent_var16_ult1 596
dtype: int64
np.sum(df.iloc[:,0]==0)
75
count_zero = df.iloc[:,0:3].apply(lambda x:sum(x==0))
print count_zero
var3 75
var15 0
imp_ent_var16_ult1 72301
dtype: int64
np.mean(df.iloc[:,0])
-1523.1992765061825
df.iloc[:,0][~np.isin(df.iloc[:,0],missSet)]
0 2
1 2
2 2
3 2
4 2
5 2
6 2
7 2
8 2
9 2
10 2
11 2
12 2
13 2
14 2
15 2
16 2
17 2
18 229
19 2
20 2
21 2
22 2
23 2
24 2
25 2
26 2
27 2
28 2
29 2
...
75990 2
75991 2
75992 2
75993 2
75994 2
75995 2
75996 2
75997 2
75998 2
75999 2
76000 2
76001 2
76002 2
76003 2
76004 2
76005 2
76006 2
76007 2
76008 2
76009 2
76010 2
76011 2
76012 4
76013 2
76014 2
76015 2
76016 2
76017 2
76018 2
76019 2
Name: var3, Length: 75904, dtype: int64
np.mean(df.iloc[:,0][~np.isin(df.iloc[:,0],missSet)])
2.717577466273187
df_mean=df.iloc[:,0:3].apply(lambda x:np.mean(x[~np.isin(x,missSet)]))
print df_mean
var3 2.717577
var15 33.212865
imp_ent_var16_ult1 86.208265
dtype: float64
print(np.__version__)
1.13.1
np.median(df.iloc[:,0][~np.isin(df.iloc[:,0],missSet)])
2.0
df_median=df.iloc[:,0:3].apply(lambda x:np.median(x[~np.isin(x,missSet)]))
print df_median
var3 2.0
var15 28.0
imp_ent_var16_ult1 0.0
dtype: float64
from scipy import stats
stats.mode(df.iloc[:,0][~np.isin(df.iloc[:,0],missSet)])
ModeResult(mode=array([2], dtype=int64), count=array([74165]))
df_mode = df.iloc[:,0:3].apply(lambda x:stats.mode(x[~np.isin(x,missSet)])[0][0])
print df_mode
var3 2.0
var15 23.0
imp_ent_var16_ult1 0.0
dtype: float64
df_mode_count = df.iloc[:,0:3].apply(lambda x:stats.mode(x[~np.isin(x,missSet)])[1][0])
print df_mode_count
var3 74165
var15 20170
imp_ent_var16_ult1 72301
dtype: int64
df_mode_percentage = df_mode_count/df.shape[0]
print df_mode_percentage
var3 0.975599
var15 0.265325
imp_ent_var16_ult1 0.951079
dtype: float64
np.min(df.iloc[:,0])
-999999
np.min(df.iloc[:,0][~np.isin(df.iloc[:,0],missSet)])
0
df_min = df.iloc[:,0:3].apply(lambda x:np.min(x[~np.isin(x,missSet)]))
print df_min
var3 0.0
var15 5.0
imp_ent_var16_ult1 0.0
dtype: float64
df_max= df.iloc[:,0:3].apply(lambda x:np.max(x[~np.isin(x,missSet)]))
print df_max
var3 238.0
var15 105.0
imp_ent_var16_ult1 210000.0
dtype: float64
np.percentile(df.iloc[:,0],(1,5,25,50,75,95,99))
array([ 2., 2., 2., 2., 2., 2., 11.])
df_percentile = np.percentile(df.iloc[:,0][~np.isin(df.iloc[:,0],missSet)],(1,5,25,50,75,95,99))
print df_percentile
[ 2. 2. 2. 2. 2. 2. 11.]
json_quan = {}
for i,name in enumerate(df.iloc[:,0:3].columns):
json_quan[name]=np.percentile(df[name][~np.isin(df[name],missSet)],(1,5,25,50,75,95,99))
print("the {} is {} ={}").format(i,name,json_quan[name])
the 0 is var3 =[ 2. 2. 2. 2. 2. 2. 11.]
the 1 is var15 =[ 22. 23. 23. 28. 40. 60. 77.]
the 2 is imp_ent_var16_ult1 =[ 0. 0. 0. 0. 0. 0. 1500.]
df_quan = pd.DataFrame(json_quan)[df.iloc[:,0:3].columns].T
print df_quan
0 1 2 3 4 5 6
var3 2.0 2.0 2.0 2.0 2.0 2.0 11.0
var15 22.0 23.0 23.0 28.0 40.0 60.0 77.0
imp_ent_var16_ult1 0.0 0.0 0.0 0.0 0.0 0.0 1500.0
df.iloc[:,0].value_counts().iloc[0:5,]
2 74165
8 138
-999999 116
9 110
3 108
Name: var3, dtype: int64
df.iloc[:,0][~np.isin(df.iloc[:,0],missSet)].value_counts().iloc[0:5,]
2 74165
8 138
9 110
3 108
1 105
Name: var3, dtype: int64
json_fre_name={}
json_fre_counts ={}
def fill_fre_top_5(x):
if(len(x))<=5:
new_array = np.full(5,np.nan)
new_array[0:len(x)]=x
return new_array
df['ind_var1_0'].value_counts()
df['imp_sal_var16_ult1'].value_counts()
0.00 75924
3000.00 9
1500.00 5
600.00 4
450.00 3
750.00 3
150.00 3
3600.00 2
3300.00 2
1620.00 2
45.00 2
300.00 2
12000.00 2
210.00 2
30.00 2
2100.00 2
6000.00 2
165.00 1
900.00 1
2244.00 1
15.00 1
360.00 1
1809.00 1
8850.00 1
390.00 1
120.00 1
99.00 1
6270.00 1
10500.00 1
1350.00 1
...
29250.00 1
1800.00 1
2550.00 1
4200.00 1
1650.00 1
3570.00 1
1020.00 1
907.14 1
60.00 1
630.00 1
6060.00 1
1770.00 1
1509.75 1
7050.00 1
105.00 1
2010.00 1
2460.00 1
4608.00 1
1227.00 1
1.50 1
1470.00 1
6900.00 1
86.01 1
5850.00 1
58048.05 1
2872.50 1
5570.13 1
8187.00 1
574.56 1
540.00 1
Name: imp_sal_var16_ult1, Length: 66, dtype: int64
json_fre_name={}
json_fre_count ={}
for i,name in enumerate(df[['ind_var1_0','imp_sal_var16_ult1']].columns):
index_name = df[name][~np.isin(df[name],missSet)].value_counts().iloc[0:5,].index.values
index_name = fill_fre_top_5(index_name)
json_fre_name[name] =index_name
value_count = df[name][~np.isin(df[name],missSet)].value_counts().iloc[0:5,].values
value_count = fill_fre_top_5(value_count)
json_fre_count[name]=value_count
df_fre_name = pd.DataFrame(json_fre_name)[df[['ind_var1_0','imp_sal_var16_ult1']].columns].T
df_fre_count = pd.DataFrame(json_fre_count)[df[['ind_var1_0','imp_sal_var16_ult1']].columns].T
df_fre = pd.concat([df_fre_name,df_fre_count],axis=1)
print df_fre
0 1 2 3 4 0 1 2 \
ind_var1_0 0.0 1.0 NaN NaN NaN 75149.0 871.0 NaN
imp_sal_var16_ult1 0.0 3000.0 1500.0 600.0 450.0 75924.0 9.0 5.0
3 4
ind_var1_0 NaN NaN
imp_sal_var16_ult1 4.0 3.0
df_miss = df.iloc[:,0:3].apply(lambda x:np.sum(np.isin(x,missSet)))
print df_miss
var3 116
var15 0
imp_ent_var16_ult1 0
dtype: int64