pandas自学笔记__DataFrame

参考资料:

  • 黑马程序员相关教程

创建

# Every snippet in these notes relies on these two aliases.
import numpy as np
import pandas as pd

# Create a DataFrame from a 2-D ndarray; row and column labels default to 0..n-1.
t = pd.DataFrame(np.arange(12).reshape(3, 4))
print(t)
#    0  1   2   3
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11

# Supply explicit row labels (index) and column labels (columns).
t1 = np.arange(12).reshape(3, 4)
t2 = pd.DataFrame(t1, index=list("123"), columns=list("abcd"))
print(t2)
#    a  b   c   d
# 1  0  1   2   3
# 2  4  5   6   7
# 3  8  9  10  11

# Create a DataFrame from a list of dicts: each dict becomes one row, the
# union of all keys becomes the columns, and a key missing from a row is
# filled with NaN (which is why the numeric columns print as floats).
# NOTE(review): the key 'del' is presumably a typo for 'tel' -- confirm.
d1 = {'name': "张三", 'age': 18, 'del': 10010}
d2 = {'name': '李四', 'del': 10086}
d3 = {'age': 17}
L = [d1, d2, d3]
t = pd.DataFrame(L)
print(t)
#   name   age      del
# 0   张三  18.0  10010.0
# 1   李四   NaN  10086.0
# 2  NaN  17.0      NaN

读取csv文件

# Load a CSV file into a DataFrame. When a large frame is printed, pandas
# abbreviates it: "..." stands for the rows/columns that were left out and
# the last line reports the full shape.
t = pd.read_csv("./train.csv")
print(t)
#      PassengerId  Survived  Pclass  ...     Fare Cabin  Embarked
# 0              1         0       3  ...   7.2500   NaN         S
# 1              2         1       1  ...  71.2833   C85         C
# 2              3         1       3  ...   7.9250   NaN         S
# 3              4         1       1  ...  53.1000  C123         S
# 4              5         0       3  ...   8.0500   NaN         S
# ..           ...       ...     ...  ...      ...   ...       ...
# 886          887         0       2  ...  13.0000   NaN         S
# 887          888         1       1  ...  30.0000   B42         S
# 888          889         0       3  ...  23.4500   NaN         S
# 889          890         1       1  ...  30.0000  C148         C
# 890          891         0       3  ...   7.7500   NaN         Q
#
# [891 rows x 12 columns]

DataFrame的属性与常用方法

# index, columns, shape, dtypes and ndim are attributes (no parentheses);
# head(), tail() and describe() are methods and must be called.
t = pd.read_csv("./train.csv")
print(t.index)  # row labels (index) of the DataFrame
# RangeIndex(start=0, stop=891, step=1)
print(t.columns)    # column labels of the DataFrame
# Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
#        'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
#       dtype='object')
print(t.shape)      # shape of the DataFrame as (rows, columns)
# (891, 12)
print(t.dtypes)     # data type of each column
# PassengerId      int64
# Survived         int64
# Pclass           int64
# Name            object
# Sex             object
# Age            float64
# SibSp            int64
# Parch            int64
# Ticket          object
# Fare           float64
# Cabin           object
# Embarked        object
# dtype: object
print(t.ndim)       # number of dimensions of the DataFrame
# 2
print(t.head())     # first rows of the DataFrame (5 by default)
#    PassengerId  Survived  Pclass  ...     Fare Cabin  Embarked
# 0            1         0       3  ...   7.2500   NaN         S
# 1            2         1       1  ...  71.2833   C85         C
# 2            3         1       3  ...   7.9250   NaN         S
# 3            4         1       1  ...  53.1000  C123         S
# 4            5         0       3  ...   8.0500   NaN         S
#
# [5 rows x 12 columns]
print(t.tail())     # last rows of the DataFrame (5 by default)
#      PassengerId  Survived  Pclass  ...   Fare Cabin  Embarked
# 886          887         0       2  ...  13.00   NaN         S
# 887          888         1       1  ...  30.00   B42         S
# 888          889         0       3  ...  23.45   NaN         S
# 889          890         1       1  ...  30.00  C148         C
# 890          891         0       3  ...   7.75   NaN         Q
#
# [5 rows x 12 columns]
print(t.describe())     # summary statistics; only the 7 numeric (int64/float64) columns are included
#        PassengerId    Survived      Pclass  ...       SibSp       Parch        Fare
# count   891.000000  891.000000  891.000000  ...  891.000000  891.000000  891.000000
# mean    446.000000    0.383838    2.308642  ...    0.523008    0.381594   32.204208
# std     257.353842    0.486592    0.836071  ...    1.102743    0.806057   49.693429
# min       1.000000    0.000000    1.000000  ...    0.000000    0.000000    0.000000
# 25%     223.500000    0.000000    2.000000  ...    0.000000    0.000000    7.910400
# 50%     446.000000    0.000000    3.000000  ...    0.000000    0.000000   14.454200
# 75%     668.500000    1.000000    3.000000  ...    1.000000    0.000000   31.000000
# max     891.000000    1.000000    3.000000  ...    8.000000    6.000000  512.329200
#
# [8 rows x 7 columns]

DataFrame的排序

# Sort the rows by the "Fare" column in DESCENDING order (ascending=False),
# so the most expensive tickets come first. sort_values returns a new frame
# and keeps the original row labels, which is why the index below is shuffled.
t = pd.read_csv("train.csv")
print(t.sort_values(by="Fare", ascending=False))
#      PassengerId  Survived  Pclass  ...      Fare        Cabin  Embarked
# 258          259         1       1  ...  512.3292          NaN         C
# 737          738         1       1  ...  512.3292         B101         C
# 679          680         1       1  ...  512.3292  B51 B53 B55         C
# 88            89         1       1  ...  263.0000  C23 C25 C27         S
# 27            28         0       1  ...  263.0000  C23 C25 C27         S
# ..           ...       ...     ...  ...       ...          ...       ...
# 633          634         0       1  ...    0.0000          NaN         S
# 413          414         0       2  ...    0.0000          NaN         S
# 822          823         0       1  ...    0.0000          NaN         S
# 732          733         0       2  ...    0.0000          NaN         S
# 674          675         0       2  ...    0.0000          NaN         S
#
# [891 rows x 12 columns]

索引

行索引

# A slice inside [] selects rows by position; the result is still a DataFrame.
t = pd.DataFrame(np.arange(12).reshape(3, 4))
print(t[:2])
print(type(t[:2]))
#    0  1  2  3
# 0  0  1  2  3
# 1  4  5  6  7
# <class 'pandas.core.frame.DataFrame'>

列索引

# A single label inside [] selects a column; the result is a Series whose
# name is the column label (here the column labelled 1, i.e. values 1, 5, 9).
t = pd.DataFrame(np.arange(12).reshape(3, 4))
print(t[1])
print(type(t[1]))
# 0    1
# 1    5
# 2    9
# Name: 1, dtype: int32
# <class 'pandas.core.series.Series'>

通过标签索引

# .loc selects by label: t.loc[row_labels, column_labels].
t = pd.DataFrame(np.arange(12).reshape(3, 4), index=list("abc"), columns=list("WXYZ"))
print(t)
#    W  X   Y   Z
# a  0  1   2   3
# b  4  5   6   7
# c  8  9  10  11
# One row label with all columns -> a Series named after the row.
print(t.loc['a', :])
# W    0
# X    1
# Y    2
# Z    3
# Name: a, dtype: int32
# One row label and one column label -> a single scalar value.
print(t.loc['b', 'X'])
# 5
print(t.loc['a':'c', 'W':'X'])		# label slices include the right-hand endpoint
#    W  X
# a  0  1
# b  4  5
# c  8  9

通过位置索引

# .iloc selects by integer position: t.iloc[row_positions, column_positions].
t = pd.DataFrame(np.arange(12).reshape(3, 4), index=list("abc"), columns=list("WXYZ"))
print(t)
#    W  X   Y   Z
# a  0  1   2   3
# b  4  5   6   7
# c  8  9  10  11
# First row, all columns -> a Series named after the row label.
print(t.iloc[0, :])
# W    0
# X    1
# Y    2
# Z    3
# Name: a, dtype: int32
# Second row, second column -> a single scalar value.
print(t.iloc[1, 1])
# 5
print(t.iloc[0:2, 0:1])		# positional slices exclude the right-hand endpoint
#    W
# a  0
# b  4

布尔索引

# Boolean indexing: keep only the rows for which both conditions are True.
# Each condition needs its own parentheses because & binds more tightly than
# the comparison operators > and ==.
t = pd.read_csv("train.csv")
print(t[(t["Fare"] > 200) & (t["Embarked"] == 'S')])
#      PassengerId  Survived  Pclass  ...      Fare        Cabin  Embarked
# 27            28         0       1  ...  263.0000  C23 C25 C27         S
# 88            89         1       1  ...  263.0000  C23 C25 C27         S
# 341          342         1       1  ...  263.0000  C23 C25 C27         S
# 438          439         0       1  ...  263.0000  C23 C25 C27         S
# 527          528         0       1  ...  221.7792          C95         S
# 689          690         1       1  ...  211.3375           B5         S
# 730          731         1       1  ...  211.3375           B5         S
# 779          780         1       1  ...  211.3375           B3         S
#
# [8 rows x 12 columns]

缺失数据处理

# Build the sample frame with a float dtype up front. NaN is a float value,
# and assigning it into integer columns relies on implicit upcasting, which
# recent pandas versions deprecate. The printed result is unchanged.
t = pd.DataFrame(
    np.arange(12, dtype=float).reshape(3, 4),
    index=list("abc"),
    columns=list("WXYZ"),
)
t.loc['b', 'W':'X'] = np.nan  # blank out columns W..X of row 'b'
t.loc['c', :] = np.nan  # blank out the whole of row 'c'
print(t)
#      W    X    Y    Z
# a  0.0  1.0  2.0  3.0
# b  NaN  NaN  6.0  7.0
# c  NaN  NaN  NaN  NaN

判断数据是否为NaN

# isnull() returns a boolean frame of the same shape: True where the cell is NaN.
print(t.isnull())
#        W      X      Y      Z
# a  False  False  False  False
# b   True   True  False  False
# c   True   True   True   True
# notnull() is the exact inverse: True where the cell holds a value.
print(t.notnull())
#        W      X      Y      Z
# a   True   True   True   True
# b  False  False   True   True
# c  False  False  False  False

删除NaN所在的行或列(dropna默认axis=0,删除行;指定axis=1时删除列)

# how="any": drop a row as soon as it contains at least one NaN.
print(t.dropna(how="any"))
#      W    X    Y    Z
# a  0.0  1.0  2.0  3.0
# how="all": drop a row only when every value in it is NaN (row 'b' survives).
print(t.dropna(how="all"))
#      W    X    Y    Z
# a  0.0  1.0  2.0  3.0
# b  NaN  NaN  6.0  7.0

填充NaN

# Replace every NaN with the constant 0. fillna returns a new object; t itself is not modified.
print(t.fillna(0))
#      W    X    Y    Z
# a  0.0  1.0  2.0  3.0
# b  0.0  0.0  6.0  7.0
# c  0.0  0.0  0.0  0.0
# t.mean() gives one mean per column (NaN values are skipped), so each NaN is
# replaced by the mean of its own column: W=0, X=1, Y=4, Z=5.
print(t.fillna(t.mean()))
#      W    X    Y    Z
# a  0.0  1.0  2.0  3.0
# b  0.0  1.0  6.0  7.0
# c  0.0  1.0  4.0  5.0
# Fill a single column (a Series) with that column's own mean.
print(t["W"].fillna(t["W"].mean()))
# a    0.0
# b    0.0
# c    0.0
# Name: W, dtype: float64

你可能感兴趣的:(python,数据挖掘,数据分析)