# Null-value data cleaning (数据清洗)

import pandas as pd

# Load the data set from a local CSV file (raw string keeps the backslashes).
survival = pd.read_csv(r'C:\DATA\titanic.csv')

# Column labels as a plain list. The original read `food.columns`, but no
# `food` object is defined in this script; `survival` is the only frame here.
colname = survival.columns.tolist()

age = survival['age']
age_is_null = pd.isnull(age)  # boolean mask: True where age is missing

# Drop every column that contains at least one missing value.
drop_na_columns = survival.dropna(axis=1)

# Drop every row that is missing a value in any of age, sex or price.
new_data = survival.dropna(axis=0, subset=['age', 'sex', 'price'])

age_null_true = age[age_is_null]  # the missing age entries
len(age_null_true)  # number of missing ages (only displayed in a notebook/REPL)

# Invert the mask with `~` rather than comparing to False.
good_age = age[~age_is_null]

# Using the apply function

def hundredth_row(column):
    """Return the item stored under index label 99 of *column*.

    The lookup uses ``.loc``, so it is label-based: it yields the hundredth
    item only when the index is the default 0-based integer range.

    Args:
        column: An object supporting ``.loc[...]`` (e.g. a pandas Series).

    Returns:
        The value found at label ``99``.

    Raises:
        KeyError: If label ``99`` is not present in the index.
    """
    hundredth_item = column.loc[99]
    return hundredth_item
# Run hundredth_row once per column (axis=0 is apply's default direction),
# giving one value per column.
hundredth = survival.apply(hundredth_row, axis=0)
hundredth

# Count the null values in each column

def not_null_count(column):
    """Return how many entries of *column* are null.

    NOTE: despite the name, this counts the null (missing) entries, not the
    non-null ones. The name is kept so existing callers keep working.

    Args:
        column: A pandas Series (or other array-like accepted by
            ``pd.isnull``).

    Returns:
        int: The number of null entries.
    """
    # Summing the boolean mask counts the True (null) positions directly,
    # without materialising the filtered Series first.
    return int(pd.isnull(column).sum())
# One null count per column: apply walks the frame column by column (axis=0).
column_null_count = survival.apply(not_null_count, axis=0)
print(column_null_count)

# Discretize the values of one column (离散化)

def which_class(row):
    """Translate the ``rank`` value of *row* into a class label.

    Args:
        row: A mapping-like object (e.g. a DataFrame row) with a ``'rank'``
            entry.

    Returns:
        ``'unknow'`` when the rank is null, ``'First class'``,
        ``'second class'`` or ``'third class'`` for ranks 1, 2 and 3, and
        ``None`` for any other value.
    """
    pclass = row['rank']
    if pd.isnull(pclass):
        # Label spelling kept exactly as in the original output.
        return 'unknow'
    elif pclass == 1:
        return 'First class'
    elif pclass == 2:
        return 'second class'
    elif pclass == 3:
        return 'third class'
    # Unrecognised rank: the original fell through and returned None implicitly.
    return None
# Label every row: axis='columns' (the same as axis=1) hands each row to
# which_class.
classes = survival.apply(which_class, axis='columns')
print(classes)

# Plotting (画图)

# Create a figure laid out as a 2x2 grid and add axes at positions 1, 2 and 4
# (position 3, the bottom-left cell, is left empty).
fig = plt.figure()
ax1, ax2, ax4 = (fig.add_subplot(2, 2, position) for position in (1, 2, 4))
plt.show()

你可能感兴趣的:(dataprepare)