请先看:
机器学习三剑客之Numpy常用算法总结
机器学习三剑客之Pandas常用算法总结上
import pandas as pd
import numpy as np
# Build a 3x4 integer frame indexed by three consecutive days.
dates = pd.date_range('20191222', periods=3)
df = pd.DataFrame(np.arange(12).reshape((3, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)
# Expected output:
#             A  B   C   D
# 2019-12-22  0  1   2   3
# 2019-12-23  4  5   6   7
# 2019-12-24  8  9  10  11

# Assign by integer position (row 1, column 1) with .iloc.
df.iloc[1, 1] = 1111
print(df)
# Expected output:
#             A     B   C   D
# 2019-12-22  0     1   2   3
# 2019-12-23  4  1111   6   7
# 2019-12-24  8     9  10  11

# Assign by label (row '20191224', column 'C') with .loc.
df.loc['20191224', 'C'] = 2222
print(df)
# Expected output:
#             A     B     C   D
# 2019-12-22  0     1     2   3
# 2019-12-23  4  1111     6   7
# 2019-12-24  8     9  2222  11

# Mixed addressing: row by label, column by position.
# The original used DataFrame.ix, which was removed in pandas 1.0, so the
# column position is translated to its label and the write goes through .loc.
df.loc['20191222', df.columns[1]] = 3333
print(df)
# Expected output:
#             A     B     C   D
# 2019-12-22  0  3333     2   3
# 2019-12-23  4  1111     6   7
# 2019-12-24  8     9  2222  11
# Build a 6x4 integer frame indexed by six consecutive days.
dates2 = pd.date_range('20191222', periods=6)
df2 = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates2, columns=['A', 'B', 'C', 'D'])
print(df2)
# Expected output:
#              A   B   C   D
# 2019-12-22   0   1   2   3
# 2019-12-23   4   5   6   7
# 2019-12-24   8   9  10  11
# 2019-12-25  12  13  14  15
# 2019-12-26  16  17  18  19
# 2019-12-27  20  21  22  23

# --- batch processing with boolean masks ---

# A boolean Series used as the key selects whole rows: every row whose A
# value is greater than 12 is overwritten with 0 in all columns.
df2[df2.A > 12] = 0
print(df2)
# Expected output:
#              A   B   C   D
# 2019-12-22   0   1   2   3
# 2019-12-23   4   5   6   7
# 2019-12-24   8   9  10  11
# 2019-12-25  12  13  14  15
# 2019-12-26   0   0   0   0
# 2019-12-27   0   0   0   0

# Restrict the write to a single column. The original used chained
# indexing (df2.A[mask] = 0), which may assign into a temporary copy;
# .loc[mask, column] writes to df2 itself.
df2.loc[df2.A > 8, 'A'] = 0
print(df2)
# Expected output:
#             A   B   C   D
# 2019-12-22  0   1   2   3
# 2019-12-23  4   5   6   7
# 2019-12-24  8   9  10  11
# 2019-12-25  0  13  14  15
# 2019-12-26  0   0   0   0
# 2019-12-27  0   0   0   0

# The mask may come from one column (A) while the write targets another (B).
df2.loc[df2.A > 2, 'B'] = 0
print(df2)
# Expected output:
#             A   B   C   D
# 2019-12-22  0   1   2   3
# 2019-12-23  4   0   6   7
# 2019-12-24  8   0  10  11
# 2019-12-25  0  13  14  15
# 2019-12-26  0   0   0   0
# 2019-12-27  0   0   0   0

# Add a column filled with a scalar (broadcast to every row).
df2['F'] = np.nan
print(df2)
# Expected output:
#             A   B   C   D   F
# 2019-12-22  0   1   2   3 NaN
# 2019-12-23  4   0   6   7 NaN
# 2019-12-24  8   0  10  11 NaN
# 2019-12-25  0  13  14  15 NaN
# 2019-12-26  0   0   0   0 NaN
# 2019-12-27  0   0   0   0 NaN

# Add a column from a Series; the Series is built on the same index as df2.
df2['E'] = pd.Series([1, 2, 3, 4, 5, 6], index=dates2)
print(df2)
# Expected output:
#             A   B   C   D   F  E
# 2019-12-22  0   1   2   3 NaN  1
# 2019-12-23  4   0   6   7 NaN  2
# 2019-12-24  8   0  10  11 NaN  3
# 2019-12-25  0  13  14  15 NaN  4
# 2019-12-26  0   0   0   0 NaN  5
# 2019-12-27  0   0   0   0 NaN  6
一些总结说明
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
# Build a 4x4 integer frame indexed by four consecutive days.
dates = pd.date_range('20191222', periods=4)
df = pd.DataFrame(np.arange(16).reshape((4, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)
# Expected output:
#              A   B   C   D
# 2019-12-22   0   1   2   3
# 2019-12-23   4   5   6   7
# 2019-12-24   8   9  10  11
# 2019-12-25  12  13  14  15

# Insert NaN values. NaN is a float, so the columns that will receive it
# (B and C) are cast to float first; writing NaN straight into an int64
# column relies on implicit upcasting, which recent pandas deprecates.
df = df.astype({'B': 'float64', 'C': 'float64'})
df.iloc[0, 1] = np.nan
df.iloc[2, 2] = np.nan
print(df)
# Expected output:
#              A     B     C   D
# 2019-12-22   0   NaN   2.0   3
# 2019-12-23   4   5.0   6.0   7
# 2019-12-24   8   9.0   NaN  11
# 2019-12-25  12  13.0  14.0  15

# dropna with how='any': drop every row (axis=0) or column (axis=1) that
# contains at least one NaN.
print(df.dropna(axis=0, how='any'))
# Expected output:
#              A     B     C   D
# 2019-12-23   4   5.0   6.0   7
# 2019-12-25  12  13.0  14.0  15
print(df.dropna(axis=1, how='any'))
# Expected output:
#              A   D
# 2019-12-22   0   3
# 2019-12-23   4   7
# 2019-12-24   8  11
# 2019-12-25  12  15

# Before/after comparison for how='all': a column is dropped only when
# every value in it is NaN. No column qualifies yet, so nothing is dropped.
print(df.dropna(axis=1, how='all'))
# Expected output:
#              A     B     C   D
# 2019-12-22   0   NaN   2.0   3
# 2019-12-23   4   5.0   6.0   7
# 2019-12-24   8   9.0   NaN  11
# 2019-12-25  12  13.0  14.0  15

# Make column B entirely NaN, then how='all' removes it.
df.iloc[1, 1] = np.nan
df.iloc[2, 1] = np.nan
df.iloc[3, 1] = np.nan
print(df)
# Expected output:
#              A   B     C   D
# 2019-12-22   0 NaN   2.0   3
# 2019-12-23   4 NaN   6.0   7
# 2019-12-24   8 NaN   NaN  11
# 2019-12-25  12 NaN  14.0  15
print(df.dropna(axis=1, how='all'))
# Expected output:
#              A     C   D
# 2019-12-22   0   2.0   3
# 2019-12-23   4   6.0   7
# 2019-12-24   8   NaN  11
# 2019-12-25  12  14.0  15

# fillna accepts many parameters (see the pandas documentation); here
# every NaN is simply replaced by one constant value.
print(df.fillna(value=222.22))
# Expected output:
#              A       B       C   D
# 2019-12-22   0  222.22    2.00   3
# 2019-12-23   4  222.22    6.00   7
# 2019-12-24   8  222.22  222.22  11
# 2019-12-25  12  222.22   14.00  15

# Detect NaN: isnull() returns a same-shaped boolean frame that is True
# wherever the value is NaN. Mostly useful when the table is large.
print(df.isnull())
# Expected output:
#                 A     B      C      D
# 2019-12-22  False  True  False  False
# 2019-12-23  False  True  False  False
# 2019-12-24  False  True   True  False
# 2019-12-25  False  True  False  False

# Reduce the boolean frame to one flag: True when the frame contains at
# least one NaN. The result is already a boolean, so comparing it with
# "== True" (as the original did) is redundant.
has_nan = df.isnull().values.any()
print(has_nan)
print(bool(has_nan))
# Expected output:
# True
# True
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
# Three 2x3 frames with identical column labels, filled with 0, 1 and 22.
df1 = pd.DataFrame(np.ones((2, 3))*0, columns=['a', 'b', 'c'])
df2 = pd.DataFrame(np.ones((2, 3))*1, columns=['a', 'b', 'c'])
df3 = pd.DataFrame(np.ones((2, 3))*22, columns=['a', 'b', 'c'])
print(df1)
print(df2)
print(df3)
# Expected output:
#      a    b    c
# 0  0.0  0.0  0.0
# 1  0.0  0.0  0.0
#      a    b    c
# 0  1.0  1.0  1.0
# 1  1.0  1.0  1.0
#       a     b     c
# 0  22.0  22.0  22.0
# 1  22.0  22.0  22.0

# Vertical concatenation (axis=0): rows are stacked and each frame keeps
# its original index labels, so 0 and 1 repeat.
df_vertical = pd.concat([df1, df2, df3], axis=0)
print(df_vertical)
# Expected output:
#       a     b     c
# 0   0.0   0.0   0.0
# 1   0.0   0.0   0.0
# 0   1.0   1.0   1.0
# 1   1.0   1.0   1.0
# 0  22.0  22.0  22.0
# 1  22.0  22.0  22.0

# ignore_index=True discards the old labels and renumbers the rows 0..n-1.
df_vertical_rightindex = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
print(df_vertical_rightindex)
# Expected output:
#       a     b     c
# 0   0.0   0.0   0.0
# 1   0.0   0.0   0.0
# 2   1.0   1.0   1.0
# 3   1.0   1.0   1.0
# 4  22.0  22.0  22.0
# 5  22.0  22.0  22.0

# Horizontal concatenation (axis=1): columns are placed side by side and
# the column labels a, b, c repeat.
df_horizontal = pd.concat([df1, df2, df3], axis=1)
print(df_horizontal)
# Expected output:
#      a    b    c    a    b    c     a     b     c
# 0  0.0  0.0  0.0  1.0  1.0  1.0  22.0  22.0  22.0
# 1  0.0  0.0  0.0  1.0  1.0  1.0  22.0  22.0  22.0

# With axis=1, ignore_index=True renumbers the *columns* 0..8.
df_horizontal_rightindex = pd.concat([df1, df2, df3], axis=1, ignore_index=True)
print(df_horizontal_rightindex)
# Expected output:
#      0    1    2    3    4    5     6     7     8
# 0  0.0  0.0  0.0  1.0  1.0  1.0  22.0  22.0  22.0
# 1  0.0  0.0  0.0  1.0  1.0  1.0  22.0  22.0  22.0

# Give the nine numbered columns unique letter labels. The original mapping
# also had an entry 9: 'g' for a column that does not exist (and that would
# have duplicated the label 'g'); it has been dropped.
df_horizontal_rightindex.rename(
    columns={0: 'a', 1: 'b', 2: 'c', 3: 'd', 4: 'e',
             5: 'f', 6: 'g', 7: 'h', 8: 'i'},
    inplace=True)
print(df_horizontal_rightindex)
# Expected output:
#      a    b    c    d    e    f     g     h     i
# 0  0.0  0.0  0.0  1.0  1.0  1.0  22.0  22.0  22.0
# 1  0.0  0.0  0.0  1.0  1.0  1.0  22.0  22.0  22.0
一些总结:
# Frames whose column labels and index labels only partly overlap: use the
# 'join' parameter of concat, and reindex the result to pick the final index.
df4 = pd.DataFrame(np.ones((3, 4))*0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
df5 = pd.DataFrame(np.ones((3, 4))*1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
print(df4)
print(df5)
# Expected output:
#      a    b    c    d
# 1  0.0  0.0  0.0  0.0
# 2  0.0  0.0  0.0  0.0
# 3  0.0  0.0  0.0  0.0
#      b    c    d    e
# 2  1.0  1.0  1.0  1.0
# 3  1.0  1.0  1.0  1.0
# 4  1.0  1.0  1.0  1.0

# join='outer' keeps the union of the columns; cells that do not exist in
# one of the inputs are filled with NaN.
df_outer = pd.concat([df4, df5], join='outer', ignore_index=True)
print(df_outer)
# Expected output:
#      a    b    c    d    e
# 0  0.0  0.0  0.0  0.0  NaN
# 1  0.0  0.0  0.0  0.0  NaN
# 2  0.0  0.0  0.0  0.0  NaN
# 3  NaN  1.0  1.0  1.0  1.0
# 4  NaN  1.0  1.0  1.0  1.0
# 5  NaN  1.0  1.0  1.0  1.0

# join='inner' keeps only the columns present in both inputs (b, c, d).
df_inner = pd.concat([df4, df5], join='inner')
print(df_inner)
# Expected output:
#      b    c    d
# 1  0.0  0.0  0.0
# 2  0.0  0.0  0.0
# 3  0.0  0.0  0.0
# 2  1.0  1.0  1.0
# 3  1.0  1.0  1.0
# 4  1.0  1.0  1.0

df_inner_index = pd.concat([df4, df5], join='inner', ignore_index=True)
print(df_inner_index)
# Expected output:
#      b    c    d
# 0  0.0  0.0  0.0
# 1  0.0  0.0  0.0
# 2  0.0  0.0  0.0
# 3  1.0  1.0  1.0
# 4  1.0  1.0  1.0
# 5  1.0  1.0  1.0

# Horizontal concat (axis=1) aligns on the row index. df4 has rows 1-3 and
# df5 has rows 2-4, so rows missing from one side are filled with NaN.
df_h_different_index = pd.concat([df4, df5], axis=1)
print(df_h_different_index)
# Expected output:
#      a    b    c    d    b    c    d    e
# 1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
# 2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
# 3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
# 4  NaN  NaN  NaN  NaN  1.0  1.0  1.0  1.0

# Use df4's index for the combined frame, so rows that are not in df4
# (row 4) are dropped. The original passed join_axes=[df4.index]; that
# argument was removed in pandas 1.0, so the result is reindexed instead.
df_h_different_index_df4 = pd.concat([df4, df5], axis=1).reindex(df4.index)
print(df_h_different_index_df4)
# Expected output:
#      a    b    c    d    b    c    d    e
# 1  0.0  0.0  0.0  0.0  NaN  NaN  NaN  NaN
# 2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
# 3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0

# Same idea with df5's index: row 1 is dropped and row 4 is kept.
df_h_different_index_df5 = pd.concat([df4, df5], axis=1).reindex(df5.index)
print(df_h_different_index_df5)
# Expected output:
#      a    b    c    d    b    c    d    e
# 2  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
# 3  0.0  0.0  0.0  0.0  1.0  1.0  1.0  1.0
# 4  NaN  NaN  NaN  NaN  1.0  1.0  1.0  1.0
一些总结:
import numpy as np
import pandas as pd
# Three 2x3 frames with identical column labels, filled with 0, 11 and 2.
df1 = pd.DataFrame(np.ones((2, 3))*0, columns=['a', 'b', 'c'])
df2 = pd.DataFrame(np.ones((2, 3))*11, columns=['a', 'b', 'c'])
df3 = pd.DataFrame(np.ones((2, 3))*2, columns=['a', 'b', 'c'])
print(df1)
print(df2)
print(df3)
# Expected output:
#      a    b    c
# 0  0.0  0.0  0.0
# 1  0.0  0.0  0.0
#       a     b     c
# 0  11.0  11.0  11.0
# 1  11.0  11.0  11.0
#      a    b    c
# 0  2.0  2.0  2.0
# 1  2.0  2.0  2.0

# DataFrame.append was removed in pandas 2.0; pd.concat with
# ignore_index=True is the replacement used for every "append" below.

# Append one frame below another.
df_append1 = pd.concat([df1, df2], ignore_index=True)
print(df_append1)
# Expected output:
#       a     b     c
# 0   0.0   0.0   0.0
# 1   0.0   0.0   0.0
# 2  11.0  11.0  11.0
# 3  11.0  11.0  11.0

# Append several frames at once.
df_append2 = pd.concat([df1, df2, df3], ignore_index=True)
print(df_append2)
# Expected output:
#       a     b     c
# 0   0.0   0.0   0.0
# 1   0.0   0.0   0.0
# 2  11.0  11.0  11.0
# 3  11.0  11.0  11.0
# 4   2.0   2.0   2.0
# 5   2.0   2.0   2.0

# Append a Series as one extra row. The Series index (a, b, c) supplies the
# column labels, so it is turned into a one-row frame before concatenating.
s1 = pd.Series([12, 24, 33], index=['a', 'b', 'c'])
df_appned_s = pd.concat([df1, s1.to_frame().T], ignore_index=True)
print(df_appned_s)
# Expected output:
#       a     b     c
# 0   0.0   0.0   0.0
# 1   0.0   0.0   0.0
# 2  12.0  24.0  33.0
一些总结:
# -*- coding: utf-8 -*-
import pandas as pd
# Two frames that share the key column 'connect', with identical key values.
left = pd.DataFrame({
    'connect': ['con0', 'con1', 'con2', 'con3'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'connect': ['con0', 'con1', 'con2', 'con3'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})
print(left)
print(right)
# NOTE(review): the tables in this section are copied from the original run;
# newer pandas keeps dict insertion order, so 'connect' may print first.
# Confirm against your environment.
# Expected output:
#     A   B connect
# 0  A0  B0    con0
# 1  A1  B1    con1
# 2  A2  B2    con2
# 3  A3  B3    con3
#     C   D connect
# 0  C0  D0    con0
# 1  C1  D1    con1
# 2  C2  D2    con2
# 3  C3  D3    con3

# Merge the two frames on their common column. This is the simple case
# where the 'connect' values are exactly the same on both sides.
merge_l_r_same = left.merge(right, on='connect')
print(merge_l_r_same)
# Same merge with the operands swapped: `right` now plays the left role.
merge_l_r_same = right.merge(left, on='connect')
print(merge_l_r_same)
# Expected output:
#     A   B connect   C   D
# 0  A0  B0    con0  C0  D0
# 1  A1  B1    con1  C1  D1
# 2  A2  B2    con2  C2  D2
# 3  A3  B3    con3  C3  D3
#     C   D connect   A   B
# 0  C0  D0    con0  A0  B0
# 1  C1  D1    con1  A1  B1
# 2  C2  D2    con2  A2  B2
# 3  C3  D3    con3  A3  B3
#
# Comparing the two results shows what `on`, `left` and `right` do: rows are
# matched on the column named by `on`; the remaining columns of the left
# operand are placed on the left of the result, and the remaining columns of
# the right operand on the right.
# Merge on several columns whose value combinations overlap only partly.
left = pd.DataFrame({
    'key1': ['K1', 'K0', 'K1', 'K2'],
    'key2': ['K0', 'K0', 'K1', 'K1'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key1': ['K0', 'K0', 'K1', 'K1'],
    'key2': ['K0', 'K1', 'K1', 'K1'],
    'C': ['C0', 'C1', 'C2', 'C3'],
    'D': ['D0', 'D1', 'D2', 'D3'],
})
print(left)
print(right)
# NOTE(review): the tables in this section are copied from the original run;
# column and row order may differ on newer pandas. Confirm against your
# environment.
# Expected output:
#     A   B key1 key2
# 0  A0  B0   K1   K0
# 1  A1  B1   K0   K0
# 2  A2  B2   K1   K1
# 3  A3  B3   K2   K1
#     C   D key1 key2
# 0  C0  D0   K0   K0
# 1  C1  D1   K0   K1
# 2  C2  D2   K1   K1
# 3  C3  D3   K1   K1

# Several key columns are passed to `on` as a list.
# `how` is one of 'inner', 'outer', 'left', 'right'; the default is 'inner'.
join_keys = ['key1', 'key2']

# Default join type (inner).
merge_l_r_different = left.merge(right, on=join_keys)
print(merge_l_r_different)
# Expected output:
#     A   B key1 key2   C   D
# 0  A1  B1   K0   K0  C0  D0
# 1  A2  B2   K1   K1  C2  D2
# 2  A2  B2   K1   K1  C3  D3

# Explicit how='inner': same result as the default.
merge_inner = left.merge(right, on=join_keys, how='inner')
print(merge_inner)
# Expected output:
#     A   B key1 key2   C   D
# 0  A1  B1   K0   K0  C0  D0
# 1  A2  B2   K1   K1  C2  D2
# 2  A2  B2   K1   K1  C3  D3

# how='outer': compare with the left/right tables printed above.
merge_outer = left.merge(right, on=join_keys, how='outer')
print(merge_outer)
# Expected output:
#      A    B key1 key2    C    D
# 0   A0   B0   K1   K0  NaN  NaN
# 1   A1   B1   K0   K0   C0   D0
# 2   A2   B2   K1   K1   C2   D2
# 3   A2   B2   K1   K1   C3   D3
# 4   A3   B3   K2   K1  NaN  NaN
# 5  NaN  NaN   K0   K1   C1   D1

# how='left'.
merge_left = left.merge(right, on=join_keys, how='left')
print(merge_left)
# Expected output:
#     A   B key1 key2    C    D
# 0  A0  B0   K1   K0  NaN  NaN
# 1  A1  B1   K0   K0   C0   D0
# 2  A2  B2   K1   K1   C2   D2
# 3  A2  B2   K1   K1   C3   D3
# 4  A3  B3   K2   K1  NaN  NaN

# how='right': compare with the left/right tables printed above.
merge_right = left.merge(right, on=join_keys, how='right')
print(merge_right)
# Expected output:
#      A    B key1 key2   C   D
# 0   A1   B1   K0   K0  C0  D0
# 1   A2   B2   K1   K1  C2  D2
# 2   A2   B2   K1   K1  C3  D3
# 3  NaN  NaN   K0   K1  C1  D1
这里主要考察merge函数中how参数的用法,以及不同取值对应的操作。
下面举两个例子重点分析:
"""
A B key1 key2
0 A0 B0 K1 K0
1 A1 B1 K0 K0
2 A2 B2 K1 K1
3 A3 B3 K2 K1
C D key1 key2
0 C0 D0 K0 K0
1 C1 D1 K0 K1
2 C2 D2 K1 K1
3 C3 D3 K1 K1
"""
result:
"""
A B key1 key2 C D
0 A1 B1 K0 K0 C0 D0
1 A2 B2 K1 K1 C2 D2
2 A2 B2 K1 K1 C3 D3
"""
上面的结果对应 on=['key1', 'key2'], how='inner'
请试着和我一起分析:
首先,因为是通过key1和key2两个column进行合并,所以要找出left和right中[key1, key2]组合完全相同的行。
可以明显观察到只有[K0, K0]和[K1, K1]在两边都出现;因为采用inner方式,所以只保留这些两边都有的组合。
先看[K0, K0]:left中对应的数据(A1, B1)放在左边,right中对应的数据(C0, D0)放在右边。
再看[K1, K1]:left中只有一行(A2, B2),而right中有两行(C2, D2)和(C3, D3),
所以right的这两行依次放在右边,左边则需要把left中对应的这一行重复一次,
这样就得到两行[K1, K1]的结果。
如果how='outer',则left和right中出现过的所有[key1, key2]组合(行)都会保留,只是某一边没有对应数据的部分用NaN填充,可返回代码部分对照输出,即可明白。下面的表格是how='left'的结果:
"""
A B key1 key2 C D
0 A0 B0 K1 K0 NaN NaN
1 A1 B1 K0 K0 C0 D0
2 A2 B2 K1 K1 C2 D2
3 A2 B2 K1 K1 C3 D3
4 A3 B3 K2 K1 NaN NaN
"""
how='left'的意思估计大家也能猜到了:以left中所有的[key1, key2]组合为准,right只是
迎合left的结构。分析如下:
首先left中[K1, K0]对应的数据放在左边,而right中不存在该组合,所以右边为NaN NaN;之后[K0, K0]在right中存在对应的元素;[K1, K1]和上面inner的分析基本一致;最后[K2, K1]和[K1, K0]的情况相同,右边同样为NaN。
大家可以根据以上思路自行分析'outer'和'right'的情况。
import pandas as pd
# The `indicator` parameter of merge.
left = pd.DataFrame({
    'key': [0, 1],
    'left': ['a', 'b'],
})
right = pd.DataFrame({
    'key': [1, 2, 2],
    'right': [2, 2, 2],
})
print(left)
print(right)
# Expected output:
#    key left
# 0    0    a
# 1    1    b
#    key  right
# 0    1      2
# 1    2      2
# 2    2      2

# indicator=True adds a column (named "_merge" by default) recording where
# each result row came from: left_only, right_only or both.
res_indicator = left.merge(right, on='key', how='outer', indicator=True)
print(res_indicator)
# Expected output:
#    key left  right      _merge
# 0    0    a    NaN   left_only
# 1    1    b    2.0        both
# 2    2  NaN    2.0  right_only
# 3    2  NaN    2.0  right_only

# Passing a string instead of True uses that string as the name of the
# indicator column (here "idct_name") in place of the default "_merge".
res_indicator2 = left.merge(right, on='key', how='outer', indicator="idct_name")
print(res_indicator2)
# Expected output:
#    key left  right   idct_name
# 0    0    a    NaN   left_only
# 1    1    b    2.0        both
# 2    2  NaN    2.0  right_only
# 3    2  NaN    2.0  right_only
merge的参数"indicator"的主要作用是标明merge结果中每一行的来源:left_only表示该行的key只在left中有value而right中没有,both表示两边都有,right_only与left_only类似,表示只在right中有。
# Redefine left and right, this time with meaningful index labels.
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                     'B': ['B0', 'B1', 'B2']},
                    index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                      'D': ['D0', 'D2', 'D3']},
                     index=['K0', 'K2', 'K3'])
print(left)
print(right)
# Expected output:
#      A   B
# K0  A0  B0
# K1  A1  B1
# K2  A2  B2
#      C   D
# K0  C0  D0
# K2  C2  D2
# K3  C3  D3

# Case (left_index, right_index): match rows on the index of both frames.
res = pd.merge(left, right, left_index=True, right_index=True, how='outer')
print(res)  # was the Python 2 statement `print res`
# Expected output:
#       A    B    C    D
# K0   A0   B0   C0   D0
# K1   A1   B1  NaN  NaN
# K2   A2   B2   C2   D2
# K3  NaN  NaN   C3   D3

# Case (left_index, right_on): left's index labels (K0, K1, K2) are matched
# against the values of right's column D (D0, D2, D3). None of them are
# equal, so the outer join keeps all six rows with NaN on the other side.
res = pd.merge(left, right, left_index=True, how='outer', right_on='D')
print(res)  # was the Python 2 statement `print res`
# Expected output (copied from the original run; index labels and row
# order may differ on newer pandas):
#       A    B    C   D
# K3   A0   B0  NaN  K0
# K3   A1   B1  NaN  K1
# K3   A2   B2  NaN  K2
# K0  NaN  NaN   C0  D0
# K2  NaN  NaN   C2  D2
# K3  NaN  NaN   C3  D3
# Two frames whose non-key column ('age') has the same name on both sides.
boys = pd.DataFrame({
    'people': ['name1', 'name2', 'name3'],
    'age': [1, 2, 3],
})
girls = pd.DataFrame({
    'people': ['name1', 'name4', 'name5'],
    'age': [4, 5, 6],
})
print(boys)
print(girls)
# Expected output (copied from the original run; column order may differ
# on newer pandas):
#    age people
# 0    1  name1
# 1    2  name2
# 2    3  name3
#    age people
# 0    4  name1
# 1    5  name4
# 2    6  name5

# Inner merge on 'people'. Both inputs carry an 'age' column, so `suffixes`
# is appended to the two clashing names, giving 'ageboys' and 'agegirls'.
res = boys.merge(girls, on='people', how='inner', suffixes=('boys', 'girls'))
print(res)
# Expected output:
#    ageboys people  agegirls
# 0        1  name1         4
场景为:
分别统计男生和女生的name和age,两个表中名字的列都叫people,年龄的列都叫age。合并两个表格的时候,发现有重名的同学(name1),但他们并不是同一个人,不能看作一个;同时两个表的age列也重名了(overlapping)。解决方案是,
利用suffixes参数给重名的age列加上后缀,分别重命名为ageboys和agegirls。