笔记参照课程唐宇迪python数据分析与机器学习实战
笔记方便自己今后回顾和查看,需要详细了解各种Pandas操作,建议学习上述课程
pandas是基于numpy的数据处理库。其数据的基本结构从小到大依次为numpy.array,pandas.core.series.Series以及pandas.core.frame.DataFrame. 简单的pandas特有的结构为series和DataFrame。DataFrame相当于一张广义的表格,其中的每一行以及每一列都是一个series,而series里的内容就是numpy.array。Series 下的结构都可以用.values来提取里面的内容
默认第一行为列名,第二行为第一行数据
import pandas as pa
# Read the CSV; by default the file's first row is used as the column names.
flower_infor=pa.read_csv('Iris.csv')
若指定header=None,则第一行被当作数据(不作为列名)
# header=None: no row is treated as a header, so the first row is read as data.
flower_infor = pa.read_csv('Iris.csv', header=None)
显示前n行数据,默认为5行
n=10
# First n rows (head() returns 5 rows when n is omitted).
first_row=flower_infor.head(n)
显示尾n行数据,默认为5行
n=10
# Last n rows (tail() returns 5 rows when n is omitted).
last_row=flower_infor.tail(n)
# Column labels of the frame.
flower_infor.columns
# Tuple of (number of rows, number of columns).
flower_infor.shape
注意下述区别:若使用loc,切片按标签选取且包含首尾两端(类似matlab),区别在于索引从0开始
若直接对DataFrame使用切片(如[3:6]),则遵循python的惯例,不包含末端
# Label-based slice: both end labels are included, so rows 3, 4, 5 and 6 come back.
flower_infor.loc[3:6]
output:
s_len s_wid p_len p_wid type
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa
5 5.4 3.9 1.7 0.4 Iris-setosa
6 4.6 3.4 1.4 0.3 Iris-setosa
# Plain slice: the end is excluded, so only rows 3, 4 and 5 come back.
flower_infor[3:6]
output:
s_len s_wid p_len p_wid type
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa
5 5.4 3.9 1.7 0.4 Iris-setosa
# Select several columns at once by indexing with a list of column names.
columns = ['s_len', 'p_len']
print(flower_infor[columns])
将列名集转化为list
# Column labels as a plain Python list.
flower_infor.columns.tolist()
将列名集转化为np.array
# Column labels as a numpy array.
flower_infor.columns.values
# Element type of that array (reported as object in the output below).
flower_infor.columns.values.dtype
output
object
# Pick rows 3..6 (inclusive) and two columns in a single .loc call instead of
# chaining two indexing operations.
flower_infor.loc[3:6, ['p_len', 's_len']]
output
p_len s_len
3 1.5 4.6
4 1.4 5.0
5 1.7 5.4
6 1.4 4.6
# Column labels as a numpy array; flower_infor.columns.tolist() works equally well.
col_names = flower_infor.columns.values
print(col_names)
# Keep only the columns whose name starts with 's' (s_len and s_wid here).
len_columns = [c for c in col_names if c.startswith('s')]
len_infor = flower_infor[len_columns]
print(len_infor.head())
output
['s_len' 's_wid' 'p_len' 'p_wid' 'type']
s_len s_wid
0 5.1 3.5
1 4.9 3.0
2 4.7 3.2
3 4.6 3.1
4 5.0 3.6
Series和一个数之间的运算默认是逐元素运算(广播),相当于matlab里的bsxfun
# Dividing a Series by a scalar applies the division to every element.
num_data = flower_infor.shape[0]  # number of rows
s_len_div_num = flower_infor['s_len'] / num_data
s_len_div_num.head()
output
0 0.034000
1 0.032667
2 0.031333
3 0.030667
4 0.033333
维度相同的Series之间的运算相当于matlab中的点运算(逐元素运算)
# Element-wise average of the two length columns.
s_p_len_ave=(flower_infor['s_len']+flower_infor['p_len'])/2
建立新的列
# Assigning to a new key adds the Series to the frame as a new column.
flower_infor['s_p_len_ave'] = s_p_len_ave
print(flower_infor.head())
s_p_len_ave.max()
inplace:是否直接在原DataFrame上排序(True则修改原数据;False则返回排序后的新对象,原数据不变)
# inplace=True sorts flower_infor itself (by s_len, ties broken by p_len);
# that is why the head() printed right after comes out reordered.
flower_infor.sort_values(['s_len', 'p_len'], inplace=True, ascending=True)
print(flower_infor.head())
s_len s_wid p_len p_wid type s_p_len_ave
13 4.3 3.0 1.1 0.1 Iris-setosa 2.70
42 4.4 3.2 1.3 0.2 Iris-setosa 2.85
38 4.4 3.0 1.3 0.2 Iris-setosa 2.85
8 4.4 2.9 1.4 0.2 Iris-setosa 2.90
41 4.5 2.3 1.3 0.3 Iris-setosa 2.90
# inplace=False returns a sorted copy (discarded here) and leaves flower_infor
# itself untouched, so head() still shows the rows in their existing order.
flower_infor.sort_values('s_len', inplace=False)
print(flower_infor.head())
output:
s_len s_wid p_len p_wid type
0 5.1 3.5 1.4 0.2 Iris-setosa
1 4.9 3.0 1.4 0.2 Iris-setosa
2 4.7 3.2 1.3 0.2 Iris-setosa
3 4.6 3.1 1.5 0.2 Iris-setosa
4 5.0 3.6 1.4 0.2 Iris-setosa
import pandas as pa
import numpy as np
# Load titanic_train.csv into a DataFrame.
titanic_survial=pa.read_csv('titanic_train.csv')
用布尔值(逻辑值)作为索引来筛选数据最方便,最直接
# Boolean-mask indexing: build a True/False mask, then index the Series with it
# to keep only the entries where the mask is True.
age = titanic_survial['Age']
age_is_null = age.isnull()
age_null_true = age[age_is_null]
# Number of passengers whose Age is missing.
age_null_count = len(age_null_true)
因为有缺失值,所以结果为nan
# Naive mean: the NaN entries propagate through the sum, so the result is nan.
mean_age = sum(titanic_survial['Age'] / len(titanic_survial['Age']))
print(mean_age)
output
nan
# Drop the missing values before averaging (~ inverts the null mask).
good_ages = titanic_survial['Age'][~pa.isnull(age)]
correct_mean_age = sum(good_ages) / len(good_ages)
print(correct_mean_age)
29.6991176471
# Series.mean() skips NaN by default, so no manual filtering is needed.
correct_mean_age = titanic_survial['Age'].mean()
print(correct_mean_age)
29.6991176471
# Average fare for each passenger class, computed by hand.
passenger_classes = [1, 2, 3]
fares_by_class = {}
for this_class in passenger_classes:
    # Boolean mask selects the rows belonging to this class.
    pclass_rows = titanic_survial[titanic_survial['Pclass'] == this_class]
    pclass_fares = pclass_rows['Fare']
    fares_by_class[this_class] = pclass_fares.mean()
print(fares_by_class)
output
{1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}
# pivot_table groups the rows by Pclass and takes the mean of Survived in each
# group, replacing a hand-written per-class loop.
passenger_survial = titanic_survial.pivot_table(index='Pclass', values='Survived', aggfunc=np.mean)
print(passenger_survial)
output
Pclass
1 0.629630
2 0.472826
3 0.242363
Name: Survived, dtype: float64
# Same pivot as before; .values extracts the underlying numpy array.
passenger_survial = titanic_survial.pivot_table(index='Pclass', values='Survived', aggfunc=np.mean)
print(passenger_survial.values)
output
[ 0.62962963 0.47282609 0.24236253]
# Several value columns can be aggregated at once: per embarkation port, sum
# both Fare and Survived.
port_stats = titanic_survial.pivot_table(index='Embarked', values=['Fare', 'Survived'], aggfunc=np.sum)
print(port_stats)
output
Fare Survived
Embarked
C 10072.2962 93
Q 1022.2543 30
S 17439.3988 217
dropna
# Drop every column that contains at least one missing value. dropna returns a
# new frame; titanic_survial itself is not modified.
titanic_survial.dropna(axis=1)
# Drop every row whose Age or Sex is missing.
new_titanic_survial=titanic_survial.dropna(axis=0,subset=['Age','Sex'])
填充用fillna
# Boolean mask marking the rows whose Age equals 38.
t=titanic_survial['Age']==38
# Positions at which the mask is True.
np.where(t)[0]
output
array([ 1, 25, 61, 108, 224, 332, 357, 465, 471, 716, 822])
reset_index
# Sort by Age, oldest first; the rows keep their original index labels.
new_titanic_survial = titanic_survial.sort_values('Age', ascending=False)
print(new_titanic_survial.head(2))
print('----------------------------')
# drop=True discards the old labels and renumbers the rows from 0.
titanic_reindexed = new_titanic_survial.reset_index(drop=True)
print(titanic_reindexed.head(2))
output
PassengerId Survived Pclass Name \
630 631 1 1 Barkworth, Mr. Algernon Henry Wilson
851 852 0 3 Svensson, Mr. Johan
Sex Age SibSp Parch Ticket Fare Cabin Embarked age_labels
630 male 80.0 0 0 27042 30.000 A23 S adult
851 male 74.0 0 0 347060 7.775 NaN S adult
----------------------------
PassengerId Survived Pclass Name Sex \
0 631 1 1 Barkworth, Mr. Algernon Henry Wilson male
1 852 0 3 Svensson, Mr. Johan male
Age SibSp Parch Ticket Fare Cabin Embarked age_labels
0 80.0 0 0 27042 30.000 A23 S adult
1 74.0 0 0 347060 7.775 NaN S adult
在这里apply相当于将函数用于frame里面的每一列
def hunderdth_row(column):
    """Return the hundredth item of a Series.

    The lookup is by index label 99, so with a default 0-based integer index
    this is the hundredth entry.

    Args:
        column: a pandas Series (one column of a DataFrame).

    Returns:
        The value stored under label 99.

    Raises:
        KeyError: if the Series has no label 99.
    """
    return column.loc[99]
# apply() calls the function once per column, handing it that column as a Series.
hundredth_Row = titanic_survial.apply(hunderdth_row)
print(hundredth_Row.values)
[100 0 2 'Kantor, Mr. Sinai' 'male' 34.0 1 0 '244367' 26.0 nan 'S' 'adult']
如果要求函数只用于某几列
def not_null_count(column):
    """Return the number of null (NaN/None) entries in ``column``.

    NOTE: despite its name, this counts the *missing* values, not the
    non-missing ones. The name is kept so existing calls keep working.

    Args:
        column: a pandas Series.

    Returns:
        int: how many entries of ``column`` are null.
    """
    null_mask = pa.isnull(column)
    null_entries = column[null_mask]
    return len(null_entries)
# apply() loops over the columns of the frame and calls the function on each.
# The double brackets keep a one-column DataFrame, so only Age is visited.
column_null_count = titanic_survial[['Age']].apply(not_null_count)
print(column_null_count)
相当于把这个操作在每列进行
# The same null count written out directly for the single Age column.
len(titanic_survial['Age'][pa.isnull(titanic_survial['Age']).values])
output
177
如果某函数是针对每一行的(注意这里的axis=1,表示把每一行作为参数传给函数)
##数据转换
def which_class(row):
    """Map a row's numeric ``Pclass`` to a human-readable class name.

    Args:
        row: a mapping-like row (e.g. a Series passed by ``apply(axis=1)``)
            that has a ``'Pclass'`` entry.

    Returns:
        'Unknown' when Pclass is null, 'First Class' / 'Second Class' /
        'Third Class' for 1 / 2 / 3, and None for any other value.
    """
    pclass = row['Pclass']
    if pa.isnull(pclass):
        return 'Unknown'
    elif pclass == 1:
        return 'First Class'
    elif pclass == 2:
        return 'Second Class'
    elif pclass == 3:
        return 'Third Class'
    # Any other value falls through, exactly as in the original snippet.
    return None
# axis=1 makes apply() pass each row (rather than each column) to the function.
classes = titanic_survial.apply(which_class, axis=1)
print(classes.head(5))
0 Third Class
1 First Class
2 Third Class
3 First Class
4 Third Class
dtype: object
def is_minor(row):
    """Return True when the row's ``Age`` is below 18, otherwise False.

    A NaN age compares as not-less-than 18, so it yields False.

    Args:
        row: a mapping-like row (e.g. a Series passed by ``apply(axis=1)``)
            that has an ``'Age'`` entry.

    Returns:
        bool: True for ages under 18, False otherwise.
    """
    # bool() keeps the plain Python True/False the original branches returned.
    return bool(row['Age'] < 18)
# Row-wise apply (axis=1): is_minor is called once per row.
minors=titanic_survial.apply(is_minor,axis=1)
def generate_age_label(row):
    """Label a row by age group.

    Unlike a bare ``Age < 18`` test, missing ages get their own label instead
    of being lumped in with the adults.

    Args:
        row: a mapping-like row (e.g. a Series passed by ``apply(axis=1)``)
            that has an ``'Age'`` entry.

    Returns:
        'Unknown' when Age is null, 'minor' when Age < 18, else 'adult'.
    """
    age = row['Age']
    if pa.isnull(age):
        return 'Unknown'
    if age < 18:
        return 'minor'
    return 'adult'
age_labels = titanic_survial.apply(generate_age_label, axis=1)
print(age_labels)
# Store the labels as a new column so they can serve as the pivot index.
titanic_survial['age_labels'] = age_labels
# Mean of Survived within each age group.
age_group_survial = titanic_survial.pivot_table(index='age_labels', values='Survived', aggfunc=np.mean)
print(age_group_survial)
age_labels
Unknown 0.293785
adult 0.381032
minor 0.539823
Name: Survived, dtype: float64