# References (参考资料):
# The snippets in this script rely on the conventional `np` / `pd` aliases.
import numpy as np
import pandas as pd

# Create a DataFrame from a 2-D NumPy array. No labels are supplied, so the
# rows and the columns are both labelled 0, 1, 2, ... as the output shows.
t = pd.DataFrame(np.arange(12).reshape(3, 4))
print(t)
#    0  1   2   3
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11
# Give the DataFrame explicit row labels (index) and column labels (columns).
t1 = np.arange(12).reshape(3, 4)
t2 = pd.DataFrame(
    data=t1,
    index=["1", "2", "3"],
    columns=["a", "b", "c", "d"],
)
print(t2)
#    a  b   c   d
# 1  0  1   2   3
# 2  4  5   6   7
# 3  8  9  10  11
# Create a DataFrame from a list of dicts: every dict becomes one row and the
# dict keys become the columns. A key that a row does not provide shows up as
# NaN in that row (see the output below).
d1 = {"name": "张三", "age": 18, "del": 10010}
d2 = {"name": "李四", "del": 10086}
d3 = {"age": 17}
L = [d1, d2, d3]
t = pd.DataFrame(data=L)
print(t)
#   name   age      del
# 0   张三  18.0  10010.0
# 1   李四   NaN  10086.0
# 2  NaN  17.0      NaN
# Load a CSV file into a DataFrame and print it. In the output below the
# hidden rows and columns are shown as "..", "..." and the last line reports
# the full size of the frame.
t = pd.read_csv(filepath_or_buffer="./train.csv")
print(t)
# PassengerId Survived Pclass ... Fare Cabin Embarked
# 0 1 0 3 ... 7.2500 NaN S
# 1 2 1 1 ... 71.2833 C85 C
# 2 3 1 3 ... 7.9250 NaN S
# 3 4 1 1 ... 53.1000 C123 S
# 4 5 0 3 ... 8.0500 NaN S
# .. ... ... ... ... ... ... ...
# 886 887 0 2 ... 13.0000 NaN S
# 887 888 1 1 ... 30.0000 B42 S
# 888 889 0 3 ... 23.4500 NaN S
# 889 890 1 1 ... 30.0000 C148 C
# 890 891 0 3 ... 7.7500 NaN Q
#
# [891 rows x 12 columns]
# Basic attributes and summary helpers of a DataFrame, shown on the CSV data.
t = pd.read_csv(filepath_or_buffer="./train.csv")

print(t.index)  # row labels of the DataFrame
# RangeIndex(start=0, stop=891, step=1)

print(t.columns)  # column labels of the DataFrame
# Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
# 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
# dtype='object')

print(t.shape)  # (number of rows, number of columns)
# (891, 12)

print(t.dtypes)  # data type of each column
# PassengerId int64
# Survived int64
# Pclass int64
# Name object
# Sex object
# Age float64
# SibSp int64
# Parch int64
# Ticket object
# Fare float64
# Cabin object
# Embarked object
# dtype: object

print(t.ndim)  # number of dimensions
# 2

print(t.head(n=5))  # first rows of the DataFrame (5 is the default)
# PassengerId Survived Pclass ... Fare Cabin Embarked
# 0 1 0 3 ... 7.2500 NaN S
# 1 2 1 1 ... 71.2833 C85 C
# 2 3 1 3 ... 7.9250 NaN S
# 3 4 1 1 ... 53.1000 C123 S
# 4 5 0 3 ... 8.0500 NaN S
#
# [5 rows x 12 columns]

print(t.tail(n=5))  # last rows of the DataFrame (5 is the default)
# PassengerId Survived Pclass ... Fare Cabin Embarked
# 886 887 0 2 ... 13.00 NaN S
# 887 888 1 1 ... 30.00 B42 S
# 888 889 0 3 ... 23.45 NaN S
# 889 890 1 1 ... 30.00 C148 C
# 890 891 0 3 ... 7.75 NaN Q
#
# [5 rows x 12 columns]

print(t.describe())  # summary statistics (count, mean, std, min, quartiles, max)
# PassengerId Survived Pclass ... SibSp Parch Fare
# count 891.000000 891.000000 891.000000 ... 891.000000 891.000000 891.000000
# mean 446.000000 0.383838 2.308642 ... 0.523008 0.381594 32.204208
# std 257.353842 0.486592 0.836071 ... 1.102743 0.806057 49.693429
# min 1.000000 0.000000 1.000000 ... 0.000000 0.000000 0.000000
# 25% 223.500000 0.000000 2.000000 ... 0.000000 0.000000 7.910400
# 50% 446.000000 0.000000 3.000000 ... 0.000000 0.000000 14.454200
# 75% 668.500000 1.000000 3.000000 ... 1.000000 0.000000 31.000000
# max 891.000000 1.000000 3.000000 ... 8.000000 6.000000 512.329200
#
# [8 rows x 7 columns]
# Sort the rows by the "Fare" column. ascending=False puts the largest fares
# first, i.e. this is a DESCENDING sort, as the output below confirms.
t = pd.read_csv("train.csv")
print(t.sort_values(by="Fare", ascending=False))
# PassengerId Survived Pclass ... Fare Cabin Embarked
# 258 259 1 1 ... 512.3292 NaN C
# 737 738 1 1 ... 512.3292 B101 C
# 679 680 1 1 ... 512.3292 B51 B53 B55 C
# 88 89 1 1 ... 263.0000 C23 C25 C27 S
# 27 28 0 1 ... 263.0000 C23 C25 C27 S
# .. ... ... ... ... ... ... ...
# 633 634 0 1 ... 0.0000 NaN S
# 413 414 0 2 ... 0.0000 NaN S
# 822 823 0 1 ... 0.0000 NaN S
# 732 733 0 2 ... 0.0000 NaN S
# 674 675 0 2 ... 0.0000 NaN S
#
# [891 rows x 12 columns]
# Plain brackets with an integer slice select ROWS; the result is still a
# DataFrame.
t = pd.DataFrame(np.arange(12).reshape(3, 4))
first_rows = t[:2]  # evaluate the slice once and reuse it for both prints
print(first_rows)
print(type(first_rows))
#    0  1  2  3
# 0  0  1  2  3
# 1  4  5  6  7
# <class 'pandas.core.frame.DataFrame'>
# Plain brackets with a single label select a COLUMN; the result is a Series
# named after that column. Here the label 1 picks the second column.
t = pd.DataFrame(np.arange(12).reshape(3, 4))
second_col = t[1]  # evaluate the lookup once and reuse it for both prints
print(second_col)
print(type(second_col))
# 0    1
# 1    5
# 2    9
# Name: 1, dtype: int32
# <class 'pandas.core.series.Series'>
# NOTE: the dtype line depends on NumPy's default integer type for the
# platform; it reads int64 where that default is 64-bit.
# Label-based selection with .loc[row_labels, column_labels].
t = pd.DataFrame(
    data=np.arange(12).reshape(3, 4),
    index=["a", "b", "c"],
    columns=["W", "X", "Y", "Z"],
)
print(t)
#    W  X   Y   Z
# a  0  1   2   3
# b  4  5   6   7
# c  8  9  10  11

print(t.loc["a", :])  # one row, every column -> Series
# W    0
# X    1
# Y    2
# Z    3
# Name: a, dtype: int32

print(t.loc["b", "X"])  # a single cell -> scalar
# 5

# With label slices the label to the right of the colon IS included.
print(t.loc["a":"c", "W":"X"])
#    W  X
# a  0  1
# b  4  5
# c  8  9
# Position-based selection with .iloc[row_positions, column_positions].
t = pd.DataFrame(
    data=np.arange(12).reshape(3, 4),
    index=["a", "b", "c"],
    columns=["W", "X", "Y", "Z"],
)
print(t)
#    W  X   Y   Z
# a  0  1   2   3
# b  4  5   6   7
# c  8  9  10  11

print(t.iloc[0, :])  # first row, every column -> Series
# W    0
# X    1
# Y    2
# Z    3
# Name: a, dtype: int32

print(t.iloc[1, 1])  # a single cell -> scalar
# 5

# With position slices the position to the right of the colon is NOT included.
print(t.iloc[:2, :1])
#    W
# a  0
# b  4
# Boolean indexing: keep only the rows where both conditions hold. Each
# comparison yields a boolean Series, and `&` combines them element-wise.
t = pd.read_csv("train.csv")
fare_over_200 = t["Fare"] > 200
embarked_is_s = t["Embarked"] == 'S'
print(t[fare_over_200 & embarked_is_s])
# PassengerId Survived Pclass ... Fare Cabin Embarked
# 27 28 0 1 ... 263.0000 C23 C25 C27 S
# 88 89 1 1 ... 263.0000 C23 C25 C27 S
# 341 342 1 1 ... 263.0000 C23 C25 C27 S
# 438 439 0 1 ... 263.0000 C23 C25 C27 S
# 527 528 0 1 ... 221.7792 C95 S
# 689 690 1 1 ... 211.3375 B5 S
# 730 731 1 1 ... 211.3375 B5 S
# 779 780 1 1 ... 211.3375 B3 S
#
# [8 rows x 12 columns]
# Working with missing values (NaN).
#
# The frame is created with a float dtype up front: NaN is a float value, and
# assigning it into integer columns relies on an implicit upcast that recent
# pandas versions warn about. The printed result is the same either way.
t = pd.DataFrame(
    np.arange(12, dtype=float).reshape(3, 4),
    index=list("abc"),
    columns=list("WXYZ"),
)
t.loc['b', 'W':'X'] = np.nan
t.loc['c', :] = np.nan
print(t)
#      W    X    Y    Z
# a  0.0  1.0  2.0  3.0
# b  NaN  NaN  6.0  7.0
# c  NaN  NaN  NaN  NaN

# --- Detect NaN ---
print(t.isnull())  # True where the cell is NaN
#        W      X      Y      Z
# a  False  False  False  False
# b   True   True  False  False
# c   True   True   True   True
print(t.notnull())  # the inverse: True where the cell holds a value
#        W      X      Y      Z
# a   True   True   True   True
# b  False  False   True   True
# c  False  False  False  False

# --- Drop the rows that contain NaN ---
print(t.dropna(how="any"))  # drop a row if ANY of its values is NaN
#      W    X    Y    Z
# a  0.0  1.0  2.0  3.0
print(t.dropna(how="all"))  # drop a row only if ALL of its values are NaN
#      W    X    Y    Z
# a  0.0  1.0  2.0  3.0
# b  NaN  NaN  6.0  7.0

# --- Fill NaN ---
print(t.fillna(0))  # replace every NaN with a constant
#      W    X    Y    Z
# a  0.0  1.0  2.0  3.0
# b  0.0  0.0  6.0  7.0
# c  0.0  0.0  0.0  0.0
print(t.fillna(t.mean()))  # replace NaN with the mean of its own column
#      W    X    Y    Z
# a  0.0  1.0  2.0  3.0
# b  0.0  1.0  6.0  7.0
# c  0.0  1.0  4.0  5.0
print(t["W"].fillna(t["W"].mean()))  # fill a single column only
# a    0.0
# b    0.0
# c    0.0
# Name: W, dtype: float64