# Field type conversion and value replacement
############################################################
# Check what kind of values the Distance column holds.
print('Distance 类型:',dfoff['Distance'].unique())
# Python type of one of the unique values.
# NOTE(review): index 2 is hard-coded; presumably that is where the missing
# value appeared in the author's data -- confirm.
type(dfoff['Distance'].unique()[2])
# NaN never compares equal to anything (not even itself), so `== np.nan` is
# always False. Use pd.isna to test whether the value is missing.
pd.isna(dfoff['Distance'].unique()[2])
# np.nan itself is a plain Python float.
type(np.nan)
## Convert a float column to int
# Step 1: replace missing values first (fill with 0, or with a placeholder).
dfoff['Date_received']=dfoff['Date_received'].fillna(0)
# A quoted fill value is treated as a string, so pass the number unquoted.
dfoff['distance'] = dfoff['Distance'].fillna(-1)
# String-placeholder variant. The fillna(0) above already removed every NaN
# from Date_received, so at this point this call leaves the column unchanged.
dfoff['Date_received']=dfoff['Date_received'].fillna('null')
# Step 2: convert types / replace values.
# Replace every 'null' value in the data with -1.
# NOTE(review): t4 is not defined in the visible part of this file.
t4.replace('null',-1,inplace=True)
t4.distance = t4.distance.astype('int')
# Then replace every -1 in the data with NaN.
# NOTE(review): this touches every column of t4, and putting NaN back into
# `distance` presumably turns it into a float column again -- confirm that the
# int dtype is not needed afterwards.
t4.replace(-1,np.nan,inplace=True)
# Same outcome as the fillna(-1) above, written with replace.
dfoff['distance'] = dfoff['Distance'].replace(np.nan, -1)
# replace() returns a new Series; assign it back so the change is kept
# (the bare call discarded its result and had no effect).
dfoff['weekday'] = dfoff['weekday'].replace('null', np.nan)
dfoff['Date_received']=dfoff['Date_received'].astype('int')
# Convert to string type.
dfoff['Date']=dfoff['Date'].astype('str')
# Show every distinct value present in the Discount_rate column.
print('Discount_rate 类型:\n', dfoff.loc[:, 'Discount_rate'].unique())
# Tally how many rows carry each value of the label column.
print(dfoff.loc[:, 'label'].value_counts())
# Build the column names weekday_1 .. weekday_7 by string formatting.
weekdaycols = ['weekday_{}'.format(day) for day in range(1, 8)]
###################################################################
# Filtering rows by a date column
# Raw string: the backslashes in the Windows path are kept literally instead of
# being parsed as (invalid) escape sequences such as \d, \O and \c.
off_train = pd.read_csv(r'C:\data\O2O_tianchi\ccf_offline_stage1_train.csv',header=0)
off_train.head()
off_train.info()
# Float dates are awkward to filter, so the date column is converted to int.
# The conversion fails when the column still has missing values, so those are
# replaced with 0 first; 0 then marks "no Date" in the filter below.
off_train['date']=off_train['Date'].fillna(0)
off_train['date']=off_train['date'].astype('int')
# Keep rows whose date lies in 20160315..20160630 (inclusive), plus rows with
# no date (date == 0) whose Date_received lies in that same range.
feature3 = off_train[
    ((off_train['date'] >= 20160315) & (off_train['date'] <= 20160630))
    | (
        (off_train['date'] == 0)
        & (off_train['Date_received'] >= 20160315)
        & (off_train['Date_received'] <= 20160630)
    )
]
feature3.head()