Sim_Jackson | 2023
导入必要的第三方库
# 导入需要的第三方库
import pandas as pd
import os
import warnings
# NOTE: this silences every warning, including the pandas deprecation and
# SettingWithCopy warnings that the cells below could otherwise surface.
warnings.filterwarnings('ignore')
dir_ = r'D:\科研论文\Python\数据分析'
os.chdir(dir_)
# os.listdir returns names in arbitrary order; sort them so that positional
# indexing into `files` is reproducible across runs and machines.
files = sorted(os.listdir(dir_))
files
['data.xlsx', 'google.csv', 'reaseach_data.xlsx']
# Load the data.
# The workbook is selected by name instead of by its position in `files`,
# because the order of a directory listing is not guaranteed.
df = pd.read_excel(os.path.join(dir_, 'reaseach_data.xlsx'))
df
time | X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020-01-01 | 7200.17 | NaN | NaN | 96.51 | NaN | NaN | 2.19 | 3 | 10 | 26 |
1 | 2020-01-02 | 6985.47 | 3257.85 | 1527.10 | 96.81 | 61.18 | 12.47 | 2.13 | 4 | 49 | 72 |
2 | 2020-01-03 | 7344.88 | 3234.85 | 1548.75 | 96.91 | 63.04 | 14.02 | 2.11 | 8 | 47 | 65 |
3 | 2020-01-04 | 7410.66 | NaN | NaN | NaN | NaN | NaN | NaN | 5 | 58 | 0 |
4 | 2020-01-05 | 7411.32 | NaN | NaN | NaN | NaN | NaN | NaN | 5 | 36 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1090 | 2022-12-27 | 16717.17 | 3829.25 | NaN | 104.20 | 79.87 | 21.65 | 5.21 | 47 | 58 | 0 |
1091 | 2022-12-28 | 16552.57 | 3783.22 | 1803.35 | 104.53 | 78.86 | 22.14 | 4.71 | 50 | 40 | 18 |
1092 | 2022-12-29 | 16642.34 | 3849.28 | 1813.75 | 103.97 | 78.71 | 21.44 | 4.56 | 51 | 67 | 44 |
1093 | 2022-12-30 | 16602.59 | 3839.50 | NaN | 103.49 | 80.51 | 21.67 | 4.43 | 42 | 53 | 25 |
1094 | 2022-12-31 | 16547.50 | NaN | NaN | NaN | NaN | NaN | NaN | 34 | 46 | 0 |
1095 rows × 11 columns
df.isnull().sum()  # number of missing values in each column
time 0
X1 0
X2 338
X3 344
X4 312
X5 319
X6 334
X7 315
X8 0
X9 0
X10 0
dtype: int64
# Fill missing X2 values with 0 (per the original note, X2 is the S&P 500
# series and is empty on non-trading days).
# Assign the result back: calling fillna(..., inplace=True) on a column
# selection acts on a temporary object and does not update `df` when pandas
# Copy-on-Write is active.
df['X2'] = df['X2'].fillna(0)
df
time | X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020-01-01 | 7200.17 | 0.00 | NaN | 96.51 | NaN | NaN | 2.19 | 3 | 10 | 26 |
1 | 2020-01-02 | 6985.47 | 3257.85 | 1527.10 | 96.81 | 61.18 | 12.47 | 2.13 | 4 | 49 | 72 |
2 | 2020-01-03 | 7344.88 | 3234.85 | 1548.75 | 96.91 | 63.04 | 14.02 | 2.11 | 8 | 47 | 65 |
3 | 2020-01-04 | 7410.66 | 0.00 | NaN | NaN | NaN | NaN | NaN | 5 | 58 | 0 |
4 | 2020-01-05 | 7411.32 | 0.00 | NaN | NaN | NaN | NaN | NaN | 5 | 36 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1090 | 2022-12-27 | 16717.17 | 3829.25 | NaN | 104.20 | 79.87 | 21.65 | 5.21 | 47 | 58 | 0 |
1091 | 2022-12-28 | 16552.57 | 3783.22 | 1803.35 | 104.53 | 78.86 | 22.14 | 4.71 | 50 | 40 | 18 |
1092 | 2022-12-29 | 16642.34 | 3849.28 | 1813.75 | 103.97 | 78.71 | 21.44 | 4.56 | 51 | 67 | 44 |
1093 | 2022-12-30 | 16602.59 | 3839.50 | NaN | 103.49 | 80.51 | 21.67 | 4.43 | 42 | 53 | 25 |
1094 | 2022-12-31 | 16547.50 | 0.00 | NaN | NaN | NaN | NaN | NaN | 34 | 46 | 0 |
1095 rows × 11 columns
# Keep only trading days, i.e. rows whose X2 was not filled with 0 above.
# .copy() makes df1 an independent frame so that later in-place edits to df1
# are well defined and never touch (or warn about) the parent `df`.
df1 = df[df['X2'] != 0].copy()
df1
time | X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 | |
---|---|---|---|---|---|---|---|---|---|---|---|
1 | 2020-01-02 | 6985.47 | 3257.85 | 1527.10 | 96.81 | 61.18 | 12.47 | 2.13 | 4 | 49 | 72 |
2 | 2020-01-03 | 7344.88 | 3234.85 | 1548.75 | 96.91 | 63.04 | 14.02 | 2.11 | 8 | 47 | 65 |
5 | 2020-01-06 | 7769.22 | 3246.28 | 1573.10 | 96.62 | 62.83 | 13.85 | 2.15 | 7 | 46 | 14 |
6 | 2020-01-07 | 8163.69 | 3237.18 | 1567.85 | 96.96 | 62.69 | 13.79 | 2.16 | 7 | 35 | 17 |
7 | 2020-01-08 | 8079.86 | 3253.05 | 1571.95 | 97.34 | 59.98 | 13.45 | 2.15 | 10 | 53 | 57 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1086 | 2022-12-23 | 16796.95 | 3844.82 | NaN | 104.32 | 79.35 | 20.87 | 5.12 | 44 | 65 | 12 |
1090 | 2022-12-27 | 16717.17 | 3829.25 | NaN | 104.20 | 79.87 | 21.65 | 5.21 | 47 | 58 | 0 |
1091 | 2022-12-28 | 16552.57 | 3783.22 | 1803.35 | 104.53 | 78.86 | 22.14 | 4.71 | 50 | 40 | 18 |
1092 | 2022-12-29 | 16642.34 | 3849.28 | 1813.75 | 103.97 | 78.71 | 21.44 | 4.56 | 51 | 67 | 44 |
1093 | 2022-12-30 | 16602.59 | 3839.50 | NaN | 103.49 | 80.51 | 21.67 | 4.43 | 42 | 53 | 25 |
757 rows × 11 columns
df1.isnull().sum()  # number of missing values per column on trading days
time 0
X1 0
X2 0
X3 22
X4 0
X5 0
X6 1
X7 2
X8 0
X9 0
X10 0
dtype: int64
# Display df1 again; it has not been modified since the trading-day filter.
df1
time | X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 | |
---|---|---|---|---|---|---|---|---|---|---|---|
1 | 2020-01-02 | 6985.47 | 3257.85 | 1527.10 | 96.81 | 61.18 | 12.47 | 2.13 | 4 | 49 | 72 |
2 | 2020-01-03 | 7344.88 | 3234.85 | 1548.75 | 96.91 | 63.04 | 14.02 | 2.11 | 8 | 47 | 65 |
5 | 2020-01-06 | 7769.22 | 3246.28 | 1573.10 | 96.62 | 62.83 | 13.85 | 2.15 | 7 | 46 | 14 |
6 | 2020-01-07 | 8163.69 | 3237.18 | 1567.85 | 96.96 | 62.69 | 13.79 | 2.16 | 7 | 35 | 17 |
7 | 2020-01-08 | 8079.86 | 3253.05 | 1571.95 | 97.34 | 59.98 | 13.45 | 2.15 | 10 | 53 | 57 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1086 | 2022-12-23 | 16796.95 | 3844.82 | NaN | 104.32 | 79.35 | 20.87 | 5.12 | 44 | 65 | 12 |
1090 | 2022-12-27 | 16717.17 | 3829.25 | NaN | 104.20 | 79.87 | 21.65 | 5.21 | 47 | 58 | 0 |
1091 | 2022-12-28 | 16552.57 | 3783.22 | 1803.35 | 104.53 | 78.86 | 22.14 | 4.71 | 50 | 40 | 18 |
1092 | 2022-12-29 | 16642.34 | 3849.28 | 1813.75 | 103.97 | 78.71 | 21.44 | 4.56 | 51 | 67 | 44 |
1093 | 2022-12-30 | 16602.59 | 3839.50 | NaN | 103.49 | 80.51 | 21.67 | 4.43 | 42 | 53 | 25 |
757 rows × 11 columns
# Preview only: show df1 with every remaining NaN replaced by 0.
# The result is not assigned, so df1 itself keeps its missing values.
df1.fillna(value=0)
time | X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 | |
---|---|---|---|---|---|---|---|---|---|---|---|
1 | 2020-01-02 | 6985.47 | 3257.85 | 1527.10 | 96.81 | 61.18 | 12.47 | 2.13 | 4 | 49 | 72 |
2 | 2020-01-03 | 7344.88 | 3234.85 | 1548.75 | 96.91 | 63.04 | 14.02 | 2.11 | 8 | 47 | 65 |
5 | 2020-01-06 | 7769.22 | 3246.28 | 1573.10 | 96.62 | 62.83 | 13.85 | 2.15 | 7 | 46 | 14 |
6 | 2020-01-07 | 8163.69 | 3237.18 | 1567.85 | 96.96 | 62.69 | 13.79 | 2.16 | 7 | 35 | 17 |
7 | 2020-01-08 | 8079.86 | 3253.05 | 1571.95 | 97.34 | 59.98 | 13.45 | 2.15 | 10 | 53 | 57 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1086 | 2022-12-23 | 16796.95 | 3844.82 | 0.00 | 104.32 | 79.35 | 20.87 | 5.12 | 44 | 65 | 12 |
1090 | 2022-12-27 | 16717.17 | 3829.25 | 0.00 | 104.20 | 79.87 | 21.65 | 5.21 | 47 | 58 | 0 |
1091 | 2022-12-28 | 16552.57 | 3783.22 | 1803.35 | 104.53 | 78.86 | 22.14 | 4.71 | 50 | 40 | 18 |
1092 | 2022-12-29 | 16642.34 | 3849.28 | 1813.75 | 103.97 | 78.71 | 21.44 | 4.56 | 51 | 67 | 44 |
1093 | 2022-12-30 | 16602.59 | 3839.50 | 0.00 | 103.49 | 80.51 | 21.67 | 4.43 | 42 | 53 | 25 |
757 rows × 11 columns
# Impute the remaining gaps in X3, X6 and X7 with each column's own mean
# (Series.mean skips NaN by default).
trans = {col: df1[col].mean() for col in ("X3", "X6", "X7")}
# Rebind df1 to the filled result instead of using inplace=True: df1 was
# obtained by filtering `df`, and mutating such a frame in place triggers
# SettingWithCopyWarning.
df1 = df1.fillna(value=trans)
df1
time | X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 | |
---|---|---|---|---|---|---|---|---|---|---|---|
1 | 2020-01-02 | 6985.47 | 3257.85 | 1527.100000 | 96.81 | 61.18 | 12.47 | 2.13 | 4 | 49 | 72 |
2 | 2020-01-03 | 7344.88 | 3234.85 | 1548.750000 | 96.91 | 63.04 | 14.02 | 2.11 | 8 | 47 | 65 |
5 | 2020-01-06 | 7769.22 | 3246.28 | 1573.100000 | 96.62 | 62.83 | 13.85 | 2.15 | 7 | 46 | 14 |
6 | 2020-01-07 | 8163.69 | 3237.18 | 1567.850000 | 96.96 | 62.69 | 13.79 | 2.16 | 7 | 35 | 17 |
7 | 2020-01-08 | 8079.86 | 3253.05 | 1571.950000 | 97.34 | 59.98 | 13.45 | 2.15 | 10 | 53 | 57 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1086 | 2022-12-23 | 16796.95 | 3844.82 | 1789.400408 | 104.32 | 79.35 | 20.87 | 5.12 | 44 | 65 | 12 |
1090 | 2022-12-27 | 16717.17 | 3829.25 | 1789.400408 | 104.20 | 79.87 | 21.65 | 5.21 | 47 | 58 | 0 |
1091 | 2022-12-28 | 16552.57 | 3783.22 | 1803.350000 | 104.53 | 78.86 | 22.14 | 4.71 | 50 | 40 | 18 |
1092 | 2022-12-29 | 16642.34 | 3849.28 | 1813.750000 | 103.97 | 78.71 | 21.44 | 4.56 | 51 | 67 | 44 |
1093 | 2022-12-30 | 16602.59 | 3839.50 | 1789.400408 | 103.49 | 80.51 | 21.67 | 4.43 | 42 | 53 | 25 |
757 rows × 11 columns
ffill:用缺失值前面最近的一个有效值来填充该缺失值(向前填充)
backfill/bfill:用缺失值后面最近的一个有效值来填充该缺失值(向后填充)
df.head(n=10)  # first ten rows, before forward filling
time | X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020-01-01 | 7200.17 | 0.00 | NaN | 96.51 | NaN | NaN | 2.19 | 3 | 10 | 26 |
1 | 2020-01-02 | 6985.47 | 3257.85 | 1527.10 | 96.81 | 61.18 | 12.47 | 2.13 | 4 | 49 | 72 |
2 | 2020-01-03 | 7344.88 | 3234.85 | 1548.75 | 96.91 | 63.04 | 14.02 | 2.11 | 8 | 47 | 65 |
3 | 2020-01-04 | 7410.66 | 0.00 | NaN | NaN | NaN | NaN | NaN | 5 | 58 | 0 |
4 | 2020-01-05 | 7411.32 | 0.00 | NaN | NaN | NaN | NaN | NaN | 5 | 36 | 0 |
5 | 2020-01-06 | 7769.22 | 3246.28 | 1573.10 | 96.62 | 62.83 | 13.85 | 2.15 | 7 | 46 | 14 |
6 | 2020-01-07 | 8163.69 | 3237.18 | 1567.85 | 96.96 | 62.69 | 13.79 | 2.16 | 7 | 35 | 17 |
7 | 2020-01-08 | 8079.86 | 3253.05 | 1571.95 | 97.34 | 59.98 | 13.45 | 2.15 | 10 | 53 | 57 |
8 | 2020-01-09 | 7879.07 | 3274.70 | 1550.75 | 97.42 | 59.59 | 12.54 | 2.17 | 6 | 54 | 16 |
9 | 2020-01-10 | 8166.55 | 3265.35 | 1553.60 | 97.38 | 59.12 | 12.56 | 2.21 | 4 | 60 | 10 |
# Forward fill: replace each NaN with the last valid value above it in the
# same column. Leading NaNs (no earlier value) stay NaN.
# DataFrame.ffill replaces the deprecated fillna(method='ffill') spelling.
# The result is only displayed; `df` itself is not modified.
df.ffill(axis=0)
time | X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020-01-01 | 7200.17 | 0.00 | NaN | 96.51 | NaN | NaN | 2.19 | 3 | 10 | 26 |
1 | 2020-01-02 | 6985.47 | 3257.85 | 1527.10 | 96.81 | 61.18 | 12.47 | 2.13 | 4 | 49 | 72 |
2 | 2020-01-03 | 7344.88 | 3234.85 | 1548.75 | 96.91 | 63.04 | 14.02 | 2.11 | 8 | 47 | 65 |
3 | 2020-01-04 | 7410.66 | 0.00 | 1548.75 | 96.91 | 63.04 | 14.02 | 2.11 | 5 | 58 | 0 |
4 | 2020-01-05 | 7411.32 | 0.00 | 1548.75 | 96.91 | 63.04 | 14.02 | 2.11 | 5 | 36 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1090 | 2022-12-27 | 16717.17 | 3829.25 | 1800.70 | 104.20 | 79.87 | 21.65 | 5.21 | 47 | 58 | 0 |
1091 | 2022-12-28 | 16552.57 | 3783.22 | 1803.35 | 104.53 | 78.86 | 22.14 | 4.71 | 50 | 40 | 18 |
1092 | 2022-12-29 | 16642.34 | 3849.28 | 1813.75 | 103.97 | 78.71 | 21.44 | 4.56 | 51 | 67 | 44 |
1093 | 2022-12-30 | 16602.59 | 3839.50 | 1813.75 | 103.49 | 80.51 | 21.67 | 4.43 | 42 | 53 | 25 |
1094 | 2022-12-31 | 16547.50 | 0.00 | 1813.75 | 103.49 | 80.51 | 21.67 | 4.43 | 34 | 46 | 0 |
1095 rows × 11 columns
df.head(n=10)  # first ten rows, before backward filling
time | X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020-01-01 | 7200.17 | 0.00 | NaN | 96.51 | NaN | NaN | 2.19 | 3 | 10 | 26 |
1 | 2020-01-02 | 6985.47 | 3257.85 | 1527.10 | 96.81 | 61.18 | 12.47 | 2.13 | 4 | 49 | 72 |
2 | 2020-01-03 | 7344.88 | 3234.85 | 1548.75 | 96.91 | 63.04 | 14.02 | 2.11 | 8 | 47 | 65 |
3 | 2020-01-04 | 7410.66 | 0.00 | NaN | NaN | NaN | NaN | NaN | 5 | 58 | 0 |
4 | 2020-01-05 | 7411.32 | 0.00 | NaN | NaN | NaN | NaN | NaN | 5 | 36 | 0 |
5 | 2020-01-06 | 7769.22 | 3246.28 | 1573.10 | 96.62 | 62.83 | 13.85 | 2.15 | 7 | 46 | 14 |
6 | 2020-01-07 | 8163.69 | 3237.18 | 1567.85 | 96.96 | 62.69 | 13.79 | 2.16 | 7 | 35 | 17 |
7 | 2020-01-08 | 8079.86 | 3253.05 | 1571.95 | 97.34 | 59.98 | 13.45 | 2.15 | 10 | 53 | 57 |
8 | 2020-01-09 | 7879.07 | 3274.70 | 1550.75 | 97.42 | 59.59 | 12.54 | 2.17 | 6 | 54 | 16 |
9 | 2020-01-10 | 8166.55 | 3265.35 | 1553.60 | 97.38 | 59.12 | 12.56 | 2.21 | 4 | 60 | 10 |
# Backward fill: replace each NaN with the next valid value below it in the
# same column. The zeros already written into X2 are not NaN, so they are
# left unchanged.
# DataFrame.bfill replaces the deprecated fillna(method='bfill') spelling.
df.bfill(axis=0).head(10)
time | X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020-01-01 | 7200.17 | 0.00 | 1527.10 | 96.51 | 61.18 | 12.47 | 2.19 | 3 | 10 | 26 |
1 | 2020-01-02 | 6985.47 | 3257.85 | 1527.10 | 96.81 | 61.18 | 12.47 | 2.13 | 4 | 49 | 72 |
2 | 2020-01-03 | 7344.88 | 3234.85 | 1548.75 | 96.91 | 63.04 | 14.02 | 2.11 | 8 | 47 | 65 |
3 | 2020-01-04 | 7410.66 | 0.00 | 1573.10 | 96.62 | 62.83 | 13.85 | 2.15 | 5 | 58 | 0 |
4 | 2020-01-05 | 7411.32 | 0.00 | 1573.10 | 96.62 | 62.83 | 13.85 | 2.15 | 5 | 36 | 0 |
5 | 2020-01-06 | 7769.22 | 3246.28 | 1573.10 | 96.62 | 62.83 | 13.85 | 2.15 | 7 | 46 | 14 |
6 | 2020-01-07 | 8163.69 | 3237.18 | 1567.85 | 96.96 | 62.69 | 13.79 | 2.16 | 7 | 35 | 17 |
7 | 2020-01-08 | 8079.86 | 3253.05 | 1571.95 | 97.34 | 59.98 | 13.45 | 2.15 | 10 | 53 | 57 |
8 | 2020-01-09 | 7879.07 | 3274.70 | 1550.75 | 97.42 | 59.59 | 12.54 | 2.17 | 6 | 54 | 16 |
9 | 2020-01-10 | 8166.55 | 3265.35 | 1553.60 | 97.38 | 59.12 | 12.56 | 2.21 | 4 | 60 | 10 |