# 我的视频学习笔记 (My video-course study notes)
# pandas 基础 (pandas basics)
# 如果numpy是一个列表 那么pandas更像是一个字典 字典化的numpy
import pandas as pd
import numpy as np
# Build a Series from a plain list; pandas supplies a default 0..n-1 index.
# np.nan is the floating-point "not a number" marker pandas uses for a missing
# value. It is a float (not Python's None), which is why the integers in the
# list end up stored as float64.
s = pd.Series(data=[1, 3, 6, np.nan, 44, 1])
# 0     1.0
# 1     3.0
# 2     6.0
# 3     NaN
# 4    44.0
# 5     1.0
# dtype: float64
# Six consecutive calendar days starting at 2020-03-14.
dates = pd.date_range(start='20200314', periods=6)
# DatetimeIndex(['2020-03-14', '2020-03-15', '2020-03-16', '2020-03-17',
#                '2020-03-18', '2020-03-19'],
#               dtype='datetime64[ns]', freq='D')

# A 6x4 matrix of standard-normal samples: rows are labelled with the dates
# above, columns with the letters a-d.
df = pd.DataFrame(data=np.random.randn(6, 4), index=dates, columns=list('abcd'))
# Example output (the numbers differ on every run):
#                    a         b         c         d
# 2020-03-14 -2.253409 -0.948332  1.925934 -0.255811
# 2020-03-15  0.011358  1.133696 -0.614601  0.387414
# 2020-03-16 -0.265261 -0.891482 -0.913461 -0.471914
# 2020-03-17 -1.734964 -0.305402 -1.263270 -0.710429
# 2020-03-18  0.696185  0.345513  0.245048  0.479076
# 2020-03-19 -0.641163 -0.995509 -1.546538 -0.704261
# Without explicit row/column labels pandas falls back to 0..n-1 for both axes.
df1 = pd.DataFrame(data=np.arange(12).reshape(3, 4))
#    0  1   2   3
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11
# Build a frame column by column from a mapping. Scalars ('A', 'B', 'F') are
# broadcast to the length of the longest column (4 rows here).
df2 = pd.DataFrame(dict(
    A=1.,
    B=pd.Timestamp('20200314'),
    C=pd.Series(1, index=list(range(4)), dtype='float32'),
    D=np.full(4, 3, dtype='int32'),
    E=pd.Categorical(['test', 'train', 'test', 'train']),
    F='foo',
))
#      A          B    C  D      E    F
# 0  1.0 2020-03-14  1.0  3   test  foo
# 1  1.0 2020-03-14  1.0  3  train  foo
# 2  1.0 2020-03-14  1.0  3   test  foo
# 3  1.0 2020-03-14  1.0  3  train  foo

# The outputs quoted below were recorded in the original notes; exact reprs
# (index class names, string dtype, ...) can vary between pandas versions.

# Data type of every column.
c = df2.dtypes
# A           float64
# B    datetime64[ns]
# C           float32
# D             int32
# E          category
# F            object
# dtype: object

# Row labels: Int64Index([0, 1, 2, 3], dtype='int64')
c = df2.index

# Column labels: Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
c = df2.columns

# All cell values as a NumPy array.
c = df2.values
# [[1.0 Timestamp('2020-03-14 00:00:00') 1.0 3 'test' 'foo']
#  [1.0 Timestamp('2020-03-14 00:00:00') 1.0 3 'train' 'foo']
#  [1.0 Timestamp('2020-03-14 00:00:00') 1.0 3 'test' 'foo']
#  [1.0 Timestamp('2020-03-14 00:00:00') 1.0 3 'train' 'foo']]

# Summary statistics; the recorded output covers the numeric columns only.
c = df2.describe()
#          A    C    D
# count  4.0  4.0  4.0
# mean   1.0  1.0  3.0
# std    0.0  0.0  0.0
# min    1.0  1.0  3.0
# 25%    1.0  1.0  3.0
# 50%    1.0  1.0  3.0
# 75%    1.0  1.0  3.0
# max    1.0  1.0  3.0

# Transpose: rows become columns and vice versa.
c = df2.transpose()

# Sort by labels. axis=1 orders the column labels, axis=0 would order the row
# labels (3 2 1 0 when descending). ascending=False means descending order,
# ascending=True ascending order.
c = df2.sort_index(axis='columns', ascending=False)
#      F      E  D    C          B    A
# 0  foo   test  3  1.0 2020-03-14  1.0
# 1  foo  train  3  1.0 2020-03-14  1.0
# 2  foo   test  3  1.0 2020-03-14  1.0
# 3  foo  train  3  1.0 2020-03-14  1.0

# Sort the rows by the values found in column E.
c = df2.sort_values('E')
#      A          B    C  D      E    F
# 0  1.0 2020-03-14  1.0  3   test  foo
# 2  1.0 2020-03-14  1.0  3   test  foo
# 1  1.0 2020-03-14  1.0  3  train  foo
# 3  1.0 2020-03-14  1.0  3  train  foo
# 选择数据 (Selecting data)
# A 6x4 frame holding 0..23, indexed by six consecutive days, columns A-D.
dates = pd.date_range(start='20200314', periods=6)
df = pd.DataFrame(np.arange(24).reshape(6, 4), index=dates, columns=list('ABCD'))

# Attribute access and bracket access both return column A as a Series.
c = df.A
c = df['A']
# 2020-03-14     0
# 2020-03-15     4
# 2020-03-16     8
# 2020-03-17    12
# 2020-03-18    16
# 2020-03-19    20
# Freq: D, Name: A, dtype: int32   (as recorded in the notes)

# Slicing the frame directly selects rows: by position ...
c = df[0:3]  # the first three rows
# ... or by label (a label slice includes both end points).
c = df['20200314':'20200316']
#             A  B   C   D
# 2020-03-14  0  1   2   3
# 2020-03-15  4  5   6   7
# 2020-03-16  8  9  10  11

# Selection by label: loc
c = df.loc['20200314']              # the row labelled 2020-03-14
c = df.loc[:, ['A', 'B']]           # every row, columns A and B only
c = df.loc['20200314', ['A', 'B']]  # columns A and B of the 2020-03-14 row

# Selection by position: iloc (positions are zero-based)
c = df.iloc[3]                      # row at position 3
c = df.iloc[3, 1]                   # row position 3, column position 1
c = df.iloc[3:5, 1:3]               # row positions 3-4, column positions 1-2
c = df.iloc[[1, 3, 5], [2, 3]]      # explicit lists of row/column positions

# Boolean indexing: keep only the rows whose A value is below 8.
c = df[df['A'] < 8]
#             A  B  C  D
# 2020-03-14  0  1  2  3
# 2020-03-15  4  5  6  7
# 设置值 (Setting values)
# Assigning values. Works on the frame `df` built earlier in the script
# (presumably the 6x4 arange frame with columns A-D indexed by dates - the
# tables below assume that starting point).

# By position: row position 2, column position 2 becomes 111.
df.iloc[2, 2] = 111
# By label: row 2020-03-14, column B becomes 222.
df.loc['20200314', 'B'] = 222

# Alternative (kept disabled): `df[df.A > 4] = 0` zeroes EVERY column of the
# rows whose A value is greater than 4, which would give:
#               A    B  C  D
# 2020-03-14    0  222  2  3
# 2020-03-15    4    5  6  7
# 2020-03-16    0    0  0  0
# 2020-03-17    0    0  0  0
# 2020-03-18    0    0  0  0
# 2020-03-19    0    0  0  0

# Zero only column A in the rows whose A value is greater than 4.
# A single .loc call is used instead of the chained `df.A[df.A > 4] = 0`:
# chained assignment writes through an intermediate object, raises
# SettingWithCopyWarning and does not modify `df` under Copy-on-Write.
df.loc[df.A > 4, 'A'] = 0
#             A    B    C   D
# 2020-03-14  0  222    2   3
# 2020-03-15  4    5    6   7
# 2020-03-16  0    9  111  11
# 2020-03-17  0   13   14  15
# 2020-03-18  0   17   18  19
# 2020-03-19  0   21   22  23

# Add a new column F filled with NaN.
df['F'] = np.nan
#             A    B    C   D   F
# 2020-03-14  0  222    2   3 NaN
# 2020-03-15  4    5    6   7 NaN
# 2020-03-16  0    9  111  11 NaN
# 2020-03-17  0   13   14  15 NaN
# 2020-03-18  0   17   18  19 NaN
# 2020-03-19  0   21   22  23 NaN

# Add column E from a Series. Values are aligned on the index labels, not on
# position: if the Series started at 2020-03-15 instead, the 2020-03-14 row
# would get NaN in E and the values 1-5 would fill the remaining rows.
df['E'] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('2020-03-14', periods=6))
# 处理丢失数据 (Handling missing data)
# Handling missing values on a 6x4 frame (0..23) indexed by six consecutive days.
dates = pd.date_range('20200314', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])

# NaN is a float and cannot live in an integer column. Cast the two columns
# that are about to receive NaN explicitly instead of relying on silent
# upcasting during assignment (deprecated in recent pandas versions).
df = df.astype({'B': 'float64', 'C': 'float64'})
df.iloc[0, 1] = np.nan  # row position 0, column B
df.iloc[1, 2] = np.nan  # row position 1, column C

# Drop every row (axis=0) that contains at least one NaN.
# how='any' drops on a single NaN; how='all' drops only when the whole
# row is NaN. axis=1 would drop columns instead of rows.
c = df.dropna(axis=0, how='any')

# Replace every NaN with 1.
c = df.fillna(value=1)

# Element-wise mask: True where the value is NaN.
c = df.isnull()

# Does the frame contain any NaN at all?
# The builtin any(df.isnull()) would iterate over the COLUMN LABELS
# ('A', 'B', ...), which are truthy, and so always answer True. Reduce over
# the boolean values instead.
c = bool(df.isnull().values.any())
# pandas导入导出 (pandas import / export)
# Read a UTF-8 CSV file. header=None tells pandas the file has no header row,
# so the first line is treated as data and columns are numbered 0..n-1.
data = pd.read_csv('Students.csv', encoding='utf-8', header=None)
# Write the frame to disk in pickle format.
data.to_pickle('Students.pickle')
# Concat
# Concatenation
# Three 3x4 frames filled with 0, 1 and 2 respectively, sharing columns a-d.
df1 = pd.DataFrame(np.zeros((3, 4)), columns=list('abcd'))
df2 = pd.DataFrame(np.ones((3, 4)), columns=list('abcd'))
df3 = pd.DataFrame(np.full((3, 4), 2.0), columns=list('abcd'))
# Stack them vertically (axis=0); ignore_index=True renumbers the rows 0..8
# instead of repeating 0, 1, 2 three times.
res = pd.concat(objs=[df1, df2, df3], axis=0, ignore_index=True)

# join: 'inner' or 'outer'
# Two frames that only partly share their columns (b, c, d are common).
df1 = pd.DataFrame(np.zeros((3, 4)), index=[0, 1, 2], columns=list('abcd'))
df2 = pd.DataFrame(np.ones((3, 4)), index=[1, 2, 3], columns=list('bcde'))

# inner: keep only the columns present in both frames (intersection).
res = pd.concat([df1, df2], ignore_index=True, join='inner')
#      b    c    d
# 0  0.0  0.0  0.0
# 1  0.0  0.0  0.0
# 2  0.0  0.0  0.0
# 3  1.0  1.0  1.0
# 4  1.0  1.0  1.0
# 5  1.0  1.0  1.0

# outer: keep every column (union) and fill the gaps with NaN.
res = pd.concat([df1, df2], ignore_index=True, join='outer', sort=True)
#      a    b    c    d    e
# 0  0.0  0.0  0.0  0.0  NaN
# 1  0.0  0.0  0.0  0.0  NaN
# 2  0.0  0.0  0.0  0.0  NaN
# 3  NaN  1.0  1.0  1.0  1.0
# 4  NaN  1.0  1.0  1.0  1.0
# 5  NaN  1.0  1.0  1.0  1.0

# Side-by-side concatenation (axis=1) restricted to df1's index, i.e. dropping
# row label 3. The notes used `join_axes=[df1.index]`, an argument that was
# removed in pandas 1.0; the equivalent today is:
# res = pd.concat([df1, df2], axis=1).reindex(df1.index)
# Appending rows
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported way to add rows and gives the same results.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 1, index=[2, 3, 4], columns=['b', 'c', 'd', 'e'])

# Add the rows of df2 below df1 (vertical is the default direction).
res = pd.concat([df1, df2], ignore_index=True)

# Several frames at once; columns missing from a frame are filled with NaN
# and sort=False keeps the columns in order of first appearance.
res = pd.concat([df1, df2, df3], sort=False, ignore_index=True)

# Add a single row held in a Series: turn the Series into a one-row frame
# (its index labels a-d become the column names) and concatenate it.
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
res = pd.concat([df1, s1.to_frame().T], ignore_index=True)
#      a    b    c    d
# 0  0.0  0.0  0.0  0.0
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  1.0  2.0  3.0  4.0
# merge
# Merging two frames on key column(s), much like a database join.
# Simplest case: a single shared key column.
left = pd.DataFrame({
    'Key': [f'K{i}' for i in range(4)],
    'A': [f'A{i}' for i in range(4)],
    'B': [f'B{i}' for i in range(4)],
})
right = pd.DataFrame({
    'Key': [f'K{i}' for i in range(4)],
    'C': [f'C{i}' for i in range(4)],
    'D': [f'D{i}' for i in range(4)],
})
# Rows are paired up wherever the 'Key' values match.
res = left.merge(right, on='Key')
# Merging on two key columns: a row pair matches only when BOTH keys agree.
left = pd.DataFrame({
    'Key1': ['K0', 'K0', 'K1', 'K2'],
    'Key2': ['K0', 'K1', 'K0', 'K1'],
    'A': [f'A{i}' for i in range(4)],
    'B': [f'B{i}' for i in range(4)],
})
right = pd.DataFrame({
    'Key1': ['K0', 'K1', 'K1', 'K2'],
    'Key2': ['K0', 'K0', 'K0', 'K0'],
    'C': [f'C{i}' for i in range(4)],
    'D': [f'D{i}' for i in range(4)],
})
# how is one of 'inner', 'outer', 'left', 'right'

# inner (the default): keep only key pairs found in both frames.
res = left.merge(right, how='inner', on=['Key1', 'Key2'])
#   Key1 Key2   A   B   C   D
# 0   K0   K0  A0  B0  C0  D0
# 1   K1   K0  A2  B2  C1  D1
# 2   K1   K0  A2  B2  C2  D2

# outer: keep the key pairs of both frames, filling the gaps with NaN.
res = left.merge(right, how='outer', on=['Key1', 'Key2'])

# right: keep every key pair of `right`, NaN where `left` has no match
# ('left' works the same way with the roles swapped).
res = left.merge(right, how='right', on=['Key1', 'Key2'])
# indicator: record where each merged row came from.
df1 = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']})
df2 = pd.DataFrame({'col1': [1, 2, 2], 'col_right': [2, 2, 2]})

# indicator=True adds a `_merge` column holding left_only / right_only / both.
res = df1.merge(df2, how='outer', on='col1', indicator=True)
#    col1 col_left  col_right      _merge
# 0     0        a        NaN   left_only
# 1     1        b        2.0        both
# 2     2      NaN        2.0  right_only
# 3     2      NaN        2.0  right_only

# Passing a string instead of True uses it as the name of that column.
res = df1.merge(df2, how='outer', on='col1', indicator='indicator_column')
#    col1 col_left  col_right indicator_column
# Merging on the index instead of on a column.
left = pd.DataFrame(
    {'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']},
    index=['K0', 'K1', 'K2'],
)
right = pd.DataFrame(
    {'C': ['C0', 'C1', 'C2'], 'D': ['D0', 'D1', 'D2']},
    index=['K0', 'K2', 'K3'],
)
# left_index / right_index make the row labels of each frame act as the join
# key; `how` behaves just as it does when merging on key columns.
res = left.merge(right, how='outer', left_index=True, right_index=True)
# Overlapping column names: both frames have a non-key column called 'age'.
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
# Join on 'k'; the clashing 'age' columns are told apart by the suffixes,
# giving age_boy (from `boys`) and age_girl (from `girls`).
res = boys.merge(girls, how='inner', on='k', suffixes=['_boy', '_girl'])
#     k  age_boy  age_girl
# 0  K0        1         4
# 1  K0        1         5
# plot 画图 (Plotting)
import matplotlib.pyplot as plt
# Plotting data

# Series: 1000 standard-normal samples, accumulated with cumsum() so the
# curve shows the running total.
data = pd.Series(np.random.randn(1000), index=np.arange(1000)).cumsum()
data.plot()
plt.show()

# DataFrame: 1000 rows x 4 columns (A-D), each column accumulated separately;
# plot() draws one line per column.
data = pd.DataFrame(
    np.random.randn(1000, 4),
    index=np.arange(1000),
    columns=['A', 'B', 'C', 'D'],
).cumsum()
data.plot()
plt.show()

# Other plot methods:
# 'bar' (bar chart), 'hist', 'box', 'kde', 'area', 'scatter', 'pie', ...
# Two scatter series on the same axes: the first call returns the Axes, the
# second draws onto it through ax=ax.
ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label='Class1')
data.plot.scatter(x='A', y='C', color='DarkGreen', label='Class2', ax=ax)
plt.show()