查看缺失值
dataframe.isnull( )
填补缺失值
向前填充法(用上方/前一个非缺失值填充) .ffill( )
向后填充法(用下方/后一个非缺失值填充) .bfill( )
将某个特定的值x填充空白值 .fillna(x)
删除含有缺失值的样本 .dropna( )
trips1.csv
start_id,end_id,start_date
0,55,55,'8/29/2013 14:13'
1,55,55,'8/29/2013 14:13'
2,55,55,'8/29/2013 14:13'
3,55,55,'8/29/2013 14:13'
4,55,55,'8/29/2013 14:13'
5,55,55,'8/29/2013 14:13'
trips2.csv
start_id,end_id,start_date
55,,'8/29/2013 15:13'
55,,'8/29/2013 16:13'
55,,'8/29/2013 17:13'
55,,'8/29/2013 18:13'
55,,'8/29/2013 19:13'
55,,'8/29/2013 20:13'
trips3.csv
start_id,end_id,start_date
55,56,'8/29/2013 14:13'
55,55,'8/29/2013 14:14'
55,55,'8/29/2013 14:15'
55,55,'8/29/2013 14:16'
55,55,'8/29/2013 14:17'
55,55,'8/29/2013 14:18'
stations.csv
id,name,address
55,'#GuoMao#','Beijing'
55,'#SUN#','ShangHai'
55,'#Park#','Beijing'
55,'#Light#','Shanghai'
55,'#Dark#','ShanDong'
代码如下:
import pandas as pd
# Load the station table and preview its first rows.
stations = pd.read_csv('stations.csv', encoding='utf-8')
print(stations.head())

# Load the three trip files; each name stays bound to its own DataFrame.
trips1, trips2, trips3 = (
    pd.read_csv(path, encoding='utf-8')
    for path in ('trips1.csv', 'trips2.csv', 'trips3.csv')
)

# Stack trips1, trips2 and trips3 into a single DataFrame named trips.
trips = pd.concat([trips1, trips2, trips3])

# Prefix every stations column with 'start_' so that 'id' becomes 'start_id',
# the key column shared with trips.
stations.columns = ['start_' + col for col in stations.columns]

# Join trips and stations on the starting station id. The default join is
# an inner join, so only rows whose start_id matches in both tables are kept.
print('----合并后----')
trips_stations = pd.merge(trips, stations, on='start_id')
print(trips_stations)

# Export the merged table to 'trips_stations.csv'.
trips_stations.to_csv('trips_stations.csv')
# Build a boolean mask of trips_stations: True marks a missing (NaN) cell.
isNa_trips_stations = trips_stations.isnull()
print(isNa_trips_stations)
# Print only the rows that contain at least one missing value.
print(trips_stations[isNa_trips_stations.any(axis=1)])
print(trips_stations)
# Forward fill: replace each missing end_id with the value from the row above.
# The result is assigned back to the column; rebinding trips_stations to the
# returned Series would discard every other column and break the later steps.
trips_stations['end_id'] = trips_stations['end_id'].ffill()
print(trips_stations)
# Backward fill: replace each remaining missing end_id with the value from the
# row below (after the forward fill only leading gaps can still be empty).
trips_stations['end_id'] = trips_stations['end_id'].bfill()
print(trips_stations)
# Fill missing start_docks values with the column median. The filled values
# go back into start_docks so that end_id is not overwritten with dock counts.
docks_median = trips_stations['start_docks'].median()
trips_stations['start_docks'] = trips_stations['start_docks'].fillna(docks_median)
# Drop every row that still contains a missing value.
trips_stations = trips_stations.dropna()
print(trips_stations)
去除空格值
去除字段左边的空格 .lstrip( )
去除字段右边的空格 .rstrip( )
去除左右两边的空格 .strip( )
删除左右两边的指定字符:将要删除的字符填入括号即可
# Strip the '#' characters from both ends of every station name.
trips_stations['start_name'] = trips_stations['start_name'].str.strip(to_strip='#')
拆分字段
split( )
不带参数的时候,按照空白字符(空格、制表符、换行符等)进行拆分;带参数的时候,按照该参数指定的分隔符进行拆分。
插入某一列
dataframe.insert( )
参数loc:插入列的位置。
参数column:插入列的名称。
参数value: 插入列的内容。
# Split 'start_date' at the first space into a date part and a time part.
# n and expand are passed by keyword: in pandas 2.x every argument of
# str.split() after the pattern is keyword-only, so the positional form
# split(' ', 1, True) raises TypeError there.
new_col = trips_stations['start_date'].str.split(' ', n=1, expand=True)
new_col.columns = ['start_date', 'start_time']
# Keep only the date part in the existing column ...
trips_stations['start_date'] = new_col['start_date']
# ... and insert the time part as a new column at position 3.
trips_stations.insert(loc=3, column='start_time', value=new_col['start_time'])
数据分组
pandas.cut( )
参数 bins 为分组边界。
在dataframe添加一列,确定添加列的位置时,可以将dataframe的columns先转换为list,再使用 .index( ) 函数获得所添加列的位置索引。
# Bin edges for start_docks. The outer edges are open-ended (-inf / +inf)
# instead of being derived from the data's min and max: data-derived edges
# make the list non-monotonic (pd.cut raises ValueError) whenever the smallest
# value is 14 or more or the largest is below 25, and the builtin min()/max()
# can return NaN if the column still holds missing values.
bins = [float('-inf'), 13, 15, 17, 19, 21, 23, 25, float('inf')]
labels = ['13以下','13到15','15到17','17到19','19到21','21到23','23到25','25以上']
# right=False makes every bin closed on the left and open on the right,
# e.g. [13, 15), so a value of exactly 13 is labelled '13到15'.
cut = pd.cut(trips_stations['start_docks'], bins, right=False, labels=labels)
# Look up the position of 'start_docks' so the new column lands right after it.
col_name = trips_stations.columns.tolist()
trips_stations.insert(
    loc=col_name.index('start_docks') + 1,
    column='start_docks_classification',
    value=cut,
)
print(trips_stations.head())
虚拟变量
又称哑变量,通常取值为0或1。引入哑变量可以使问题描述更加简明。
pd.get_dummies( )
参数columns:欲转换为虚拟变量的列名(列表)。
参数prefix:定义新生成的虚拟变量列名称的前缀。
# Add dummy (indicator) variables.
# Encode start_subscription_type as dummy columns; get_dummies drops the
# encoded source column and appends the new indicator columns at the end of
# the DataFrame, each named 'start_subscription_type_<value>'.
trips_stations_dummies = pd.get_dummies(
    trips_stations,
    columns=['start_subscription_type'],
    prefix='start_subscription_type',
    prefix_sep='_',
)
# Put the original categorical column back as the last column so that it
# stays available next to its dummy encoding.
trips_stations_dummies['start_subscription_type'] = trips_stations['start_subscription_type']
print(trips_stations_dummies.head())