1.主要问题
主要记录dataframe构造,非连续时间做X轴的处理以及pandas中dataframe的连接操作。
2.Demo代码
# coding:utf8
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import matplotlib.dates as mdates
# Method 1: the DataFrame index is the X axis by default -- gives the expected result
def p1():
    """Draw the 'data' column in subplot 311, using the DataFrame index as X.

    Reads the module-level ``fig``, ``df_days`` and ``xlabels``. Every point
    is annotated with its value, and the ticks sit on the index values but
    are labelled with ``xlabels``.
    """
    ax = fig.add_subplot(311)
    series = df_days['data']
    ax.plot(series, label='data')  # a Series is plotted against its index
    as_text = '{:,.2f}'.format
    for pos, value in series.items():
        ax.text(pos, value, as_text(value), ha='center', fontsize=14, va='bottom', color='blue')
    # X ticks and labels, option 1: plt.xticks sets tick positions and tick
    # labels in one call. This two-argument form exists only on pyplot; an
    # Axes object has no ``xticks`` attribute (AttributeError).
    plt.xticks(df_days.index.to_list(), xlabels, rotation=90)
    # Option 2 (not used here): on an Axes the positions and the labels are
    # set separately, via ax.set_xticks(...) followed by
    # ax.set_xticklabels(...).
    ax.grid()
    ax.legend()
# Method 2: use a non-index date column (or its mdates.date2num numbers) as the
# X axis -- the removed weekends remain as gaps, which is NOT the desired result
def p2():
    """Draw the 'data' column in subplot 312 against the 'trd_date' dates.

    Reads the module-level ``fig``, ``df_days`` and ``xlabels``. The dates
    are converted to matplotlib date numbers before plotting. Plotting the
    raw date column would be equivalent, because matplotlib handles dates
    as day-count numbers relative to its date epoch.
    """
    ax = fig.add_subplot(312)
    dates = df_days["trd_date"]
    values = df_days['data']
    day_numbers = dates.map(mdates.date2num)
    plt.plot(day_numbers, values, label='data')
    for x_pos, value in zip(day_numbers, values):
        ax.text(x_pos, value, '{:,.2f}'.format(value), ha='center', fontsize=14, va='bottom')
    plt.xticks(dates, xlabels, rotation=45)
    ax.grid()
    ax.legend()
# Method 3: use a non-index, non-date column (string labels) as the X axis --
# gives the expected result (no gaps for the removed days)
def p3():
    """Draw the 'data' column in subplot 313 against the string labels.

    Reads the module-level ``fig``, ``df_days`` and ``xlabels``. Plotting
    against strings makes the X axis categorical, so the points are evenly
    spaced regardless of the gaps between the underlying dates.
    """
    ax = fig.add_subplot(313)
    values = df_days['data']
    ax.plot(xlabels, values, label='data')
    # The string categories are expected at positions 0..n-1 (assuming the
    # labels are unique). Use those positions explicitly instead of the
    # DataFrame index, which only coincides with them for a default index.
    positions = list(range(len(values)))
    for pos, value in zip(positions, values):
        ax.text(pos, value, '{:,.2f}'.format(value), ha='center', fontsize=14, va='bottom')
    # Fix the tick locations before assigning the labels; set_xticklabels
    # alone does not guarantee which tick each label ends up on.
    ax.set_xticks(positions)
    ax.set_xticklabels(xlabels, rotation=35)
    ax.grid()
    ax.legend()
def get_df_without_56(days=29):
    """Build a demo DataFrame of random values for recent weekdays.

    The date window is the ``days`` consecutive days that end yesterday;
    Saturdays and Sundays are dropped from it.

    Args:
        days: Length of the date window in days, before filtering.

    Returns:
        A DataFrame with a 'trd_date' column (the remaining dates) and a
        'data' column of random floats in [0, 100).
    """
    # First day of the window
    start_date = datetime.date.today() - datetime.timedelta(days=days)
    # Daily date sequence
    day_list = pd.date_range(start_date, periods=days, freq='D')
    # Drop the weekend directly on the sequence (weekday 5 = Sat, 6 = Sun)
    day_list = day_list[day_list.weekday < 5]
    # One random sample per remaining date. The count must come from the
    # filtered sequence: how many weekend days fall inside the window depends
    # on the weekday it starts on, so it cannot be derived from ``days`` alone.
    data_list = np.random.rand(len(day_list)) * 100
    # DataFrame construction, way 1: two steps (create, then add a column)
    df_days = pd.DataFrame(data=day_list, columns=['trd_date'])
    df_days['data'] = data_list
    return df_days
def get_df_without_01(days=29):
    """Build a demo DataFrame of random values, skipping Mondays and Tuesdays.

    The date window is the ``days`` consecutive days that end yesterday;
    Mondays and Tuesdays are dropped from it.

    Args:
        days: Length of the date window in days, before filtering.

    Returns:
        A DataFrame with a 'trd_date' column (the remaining dates) and a
        'data' column of random floats in [0, 100).
    """
    # First day of the window
    start_date = datetime.date.today() - datetime.timedelta(days=days)
    # Daily date sequence
    day_list = pd.date_range(start_date, periods=days, freq='D')
    # Drop Monday (weekday 0) and Tuesday (weekday 1) directly on the sequence
    day_list = day_list[day_list.weekday > 1]
    # One random sample per remaining date, sized from the filtered sequence
    # rather than estimated from ``days`` and left for zip() to truncate.
    data_list = np.random.rand(len(day_list)) * 100
    # DataFrame construction, way 2: zip the columns and build it in one step
    df_days = pd.DataFrame(list(zip(day_list, data_list)), columns=['trd_date', 'data'])
    return df_days
def df_merge_demo(left, right):
    """Print the result of several ways of combining two DataFrames.

    Demonstrates ``pd.merge`` (default, outer, and left/right/inner/outer on
    'trd_date'), ``pd.concat`` and ``DataFrame.join``, printing each result
    under a label that names the call.

    Args:
        left: DataFrame with 'trd_date' and 'data' columns.
        right: DataFrame with 'trd_date' and 'data' columns.

    Returns:
        None. The caller's frames are not modified; ``set_index`` returns
        new frames that are only bound to the local names.
    """

    def show(label, frame):
        # Print the label, then at most the first 100 rows of the result.
        print(label)
        print(frame.head(100))

    # With no keys given, the overlapping column names are the join keys; here
    # that equals pd.merge(left, right, on=['trd_date', 'data'], how='inner').
    show("\n测试:defaultmerge=pd.merge(left, right)", pd.merge(left, right))
    # Outer merge on all shared columns. For the demo data (no row present in
    # both frames) it holds the same rows as the pd.concat result printed next;
    # row order may differ between pandas versions.
    show("\n测试:defaultouter = pd.merge(left, right, how='outer')",
         pd.merge(left, right, how='outer'))
    pdconcat = pd.concat([left, right]).reset_index(drop=True)
    print("\n测试:pdconcat= pd.concat([left, right]).reset_index(drop=True)")
    print(pdconcat)
    # Joins on a single key. Notes on the key arguments:
    #   * if the key columns are named differently in the two frames, name them
    #     separately: pd.merge(left, right, left_on='lkey', right_on='rkey')
    #   * for a multi-key join pass a list: pd.merge(left, right, on=['key1', 'key2'])
    show("\n测试:leftjoin = pd.merge(left, right, on='trd_date', how='left')",
         pd.merge(left, right, on='trd_date', how='left'))
    show("\n测试:rightjoin = pd.merge(left, right, on='trd_date', how='right')",
         pd.merge(left, right, on='trd_date', how='right'))
    # how='inner' is the default of pd.merge
    show("\n测试:innerjoin = pd.merge(left, right, on='trd_date', how='inner')",
         pd.merge(left, right, on='trd_date', how='inner'))
    # Label fixed: this result is the outer join (it used to say "innerjoin").
    show("\n测试:outerjoin = pd.merge(left, right, on='trd_date', how='outer')",
         pd.merge(left, right, on='trd_date', how='outer'))
    # DataFrame.join aligns on the index, so make 'trd_date' the index first
    left = left.set_index('trd_date')
    right = right.set_index('trd_date')
    # Join, then turn the index back into a regular column
    dfjoin = left.join(right, lsuffix='_l', rsuffix='_r', sort=False).reset_index()
    show("\n测试:dfjoin = left.join(right).reset_index()", dfjoin)
    # Rename a column in place
    dfjoin.rename(columns={'trd_date': '交易日期'}, inplace=True)
    show("\n测试:dfjoin.rename(columns={'trd_date': '交易日期'}, inplace=True) # 改列名", dfjoin)
if __name__ == '__main__':
    # Demo data: one frame without weekends, one without Monday/Tuesday
    df_days = get_df_without_56(15)
    df_days_01 = get_df_without_01(15)

    # DataFrame merge / concat / join demonstration
    df_merge_demo(df_days, df_days_01)

    fig = plt.figure(figsize=(20, 10), dpi=80)
    # X-axis labels as 'month-day' strings, one per row of df_days. A list
    # comprehension over df_days["trd_date"] calling strftime would work too.
    xlabels = df_days["trd_date"].dt.strftime('%m-%d')

    # The three plotting functions read the module-level fig, df_days and xlabels
    for draw in (p1, p2, p3):
        draw()
    plt.show()
    plt.close()
3.执行结果
测试:defaultmerge=pd.merge(left, right)
Empty DataFrame
Columns: [trd_date, data]
Index: []
测试:defaultouter = pd.merge(left, right, how='outer')
trd_date data
0 2020-02-20 28.519733
1 2020-02-21 21.957296
2 2020-02-24 81.709042
3 2020-02-25 49.141968
4 2020-02-26 75.962113
5 2020-02-27 34.297047
6 2020-02-28 8.599335
7 2020-03-02 59.895028
8 2020-03-03 36.150423
9 2020-03-04 41.961246
10 2020-03-05 78.180868
11 2020-02-20 81.522147
12 2020-02-21 66.379312
13 2020-02-22 68.038422
14 2020-02-23 82.831858
15 2020-02-26 44.112500
16 2020-02-27 9.492410
17 2020-02-28 53.929783
18 2020-02-29 19.431358
19 2020-03-01 59.103261
20 2020-03-04 66.754922
21 2020-03-05 66.497167
测试:pdconcat= pd.concat([left, right]).reset_index(drop=True)
trd_date data
0 2020-02-20 28.519733
1 2020-02-21 21.957296
2 2020-02-24 81.709042
3 2020-02-25 49.141968
4 2020-02-26 75.962113
5 2020-02-27 34.297047
6 2020-02-28 8.599335
7 2020-03-02 59.895028
8 2020-03-03 36.150423
9 2020-03-04 41.961246
10 2020-03-05 78.180868
11 2020-02-20 81.522147
12 2020-02-21 66.379312
13 2020-02-22 68.038422
14 2020-02-23 82.831858
15 2020-02-26 44.112500
16 2020-02-27 9.492410
17 2020-02-28 53.929783
18 2020-02-29 19.431358
19 2020-03-01 59.103261
20 2020-03-04 66.754922
21 2020-03-05 66.497167
测试:leftjoin = pd.merge(left, right, on='trd_date', how='left')
trd_date data_x data_y
0 2020-02-20 28.519733 81.522147
1 2020-02-21 21.957296 66.379312
2 2020-02-24 81.709042 NaN
3 2020-02-25 49.141968 NaN
4 2020-02-26 75.962113 44.112500
5 2020-02-27 34.297047 9.492410
6 2020-02-28 8.599335 53.929783
7 2020-03-02 59.895028 NaN
8 2020-03-03 36.150423 NaN
9 2020-03-04 41.961246 66.754922
10 2020-03-05 78.180868 66.497167
测试:rightjoin = pd.merge(left, right, on='trd_date', how='right')
trd_date data_x data_y
0 2020-02-20 28.519733 81.522147
1 2020-02-21 21.957296 66.379312
2 2020-02-26 75.962113 44.112500
3 2020-02-27 34.297047 9.492410
4 2020-02-28 8.599335 53.929783
5 2020-03-04 41.961246 66.754922
6 2020-03-05 78.180868 66.497167
7 2020-02-22 NaN 68.038422
8 2020-02-23 NaN 82.831858
9 2020-02-29 NaN 19.431358
10 2020-03-01 NaN 59.103261
测试:innerjoin = pd.merge(left, right, on='trd_date', how='inner')
trd_date data_x data_y
0 2020-02-20 28.519733 81.522147
1 2020-02-21 21.957296 66.379312
2 2020-02-26 75.962113 44.112500
3 2020-02-27 34.297047 9.492410
4 2020-02-28 8.599335 53.929783
5 2020-03-04 41.961246 66.754922
6 2020-03-05 78.180868 66.497167
测试:innerjoin = pd.merge(left, right, on='trd_date', how='outer')
trd_date data_x data_y
0 2020-02-20 28.519733 81.522147
1 2020-02-21 21.957296 66.379312
2 2020-02-24 81.709042 NaN
3 2020-02-25 49.141968 NaN
4 2020-02-26 75.962113 44.112500
5 2020-02-27 34.297047 9.492410
6 2020-02-28 8.599335 53.929783
7 2020-03-02 59.895028 NaN
8 2020-03-03 36.150423 NaN
9 2020-03-04 41.961246 66.754922
10 2020-03-05 78.180868 66.497167
11 2020-02-22 NaN 68.038422
12 2020-02-23 NaN 82.831858
13 2020-02-29 NaN 19.431358
14 2020-03-01 NaN 59.103261
测试:dfjoin = left.join(right).reset_index()
trd_date data_l data_r
0 2020-02-20 28.519733 81.522147
1 2020-02-21 21.957296 66.379312
2 2020-02-24 81.709042 NaN
3 2020-02-25 49.141968 NaN
4 2020-02-26 75.962113 44.112500
5 2020-02-27 34.297047 9.492410
6 2020-02-28 8.599335 53.929783
7 2020-03-02 59.895028 NaN
8 2020-03-03 36.150423 NaN
9 2020-03-04 41.961246 66.754922
10 2020-03-05 78.180868 66.497167
测试:dfjoin.rename(columns={'trd_date': '交易日期'}, inplace=True) # 改列名
交易日期 data_l data_r
0 2020-02-20 28.519733 81.522147
1 2020-02-21 21.957296 66.379312
2 2020-02-24 81.709042 NaN
3 2020-02-25 49.141968 NaN
4 2020-02-26 75.962113 44.112500
5 2020-02-27 34.297047 9.492410
6 2020-02-28 8.599335 53.929783
7 2020-03-02 59.895028 NaN
8 2020-03-03 36.150423 NaN
9 2020-03-04 41.961246 66.754922
10 2020-03-05 78.180868 66.497167