Sim_Jackson | 2023
# Import the required libraries (pandas is third-party; os is part of the standard library)
import pandas as pd
import os
# Folder that holds the data files used in this notebook.
dir_ = r'D:\科研论文\Python\数据分析'
# Switch the working directory so the relative file names below resolve here.
os.chdir(dir_)
# os.listdir() returns names in arbitrary order; sort them so the listing
# (and any positional indexing into it) is reproducible across runs.
files = sorted(os.listdir(dir_))
files
['data.xlsx', 'google.csv', 'reaseach_data.xlsx']
# Load the data.
# Open the workbook by name instead of files[-1]: positional indexing picks the
# wrong file as soon as the listing order changes or another file is added to
# the folder. The relative name resolves against the working directory set above.
df = pd.read_excel('reaseach_data.xlsx')
df
time | X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020-01-01 | 7200.17 | NaN | NaN | 96.51 | NaN | NaN | 2.19 | 3 | 10 | 26 |
1 | 2020-01-02 | 6985.47 | 3257.85 | 1527.10 | 96.81 | 61.18 | 12.47 | 2.13 | 4 | 49 | 72 |
2 | 2020-01-03 | 7344.88 | 3234.85 | 1548.75 | 96.91 | 63.04 | 14.02 | 2.11 | 8 | 47 | 65 |
3 | 2020-01-04 | 7410.66 | NaN | NaN | NaN | NaN | NaN | NaN | 5 | 58 | 0 |
4 | 2020-01-05 | 7411.32 | NaN | NaN | NaN | NaN | NaN | NaN | 5 | 36 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1090 | 2022-12-27 | 16717.17 | 3829.25 | NaN | 104.20 | 79.87 | 21.65 | 5.21 | 47 | 58 | 0 |
1091 | 2022-12-28 | 16552.57 | 3783.22 | 1803.35 | 104.53 | 78.86 | 22.14 | 4.71 | 50 | 40 | 18 |
1092 | 2022-12-29 | 16642.34 | 3849.28 | 1813.75 | 103.97 | 78.71 | 21.44 | 4.56 | 51 | 67 | 44 |
1093 | 2022-12-30 | 16602.59 | 3839.50 | NaN | 103.49 | 80.51 | 21.67 | 4.43 | 42 | 53 | 25 |
1094 | 2022-12-31 | 16547.50 | NaN | NaN | NaN | NaN | NaN | NaN | 34 | 46 | 0 |
1095 rows × 11 columns
# Preview only: fillna returns a new Series with NaN replaced by 0; df itself is not modified.
df['X2'].fillna(value=0)
0 0.00
1 3257.85
2 3234.85
3 0.00
4 0.00
...
1090 3829.25
1091 3783.22
1092 3849.28
1093 3839.50
1094 0.00
Name: X2, Length: 1095, dtype: float64
# Show the column as stored in df: it still holds NaN because the fillna call above returned a new Series.
df['X2']
0 NaN
1 3257.85
2 3234.85
3 NaN
4 NaN
...
1090 3829.25
1091 3783.22
1092 3849.28
1093 3839.50
1094 NaN
Name: X2, Length: 1095, dtype: float64
# Replace the column in df with its NaN-filled version.
# The result is assigned back explicitly rather than calling
# df['X2'].fillna(0, inplace=True): that form mutates an intermediate Series,
# is deprecated, and leaves df unchanged under pandas Copy-on-Write.
df['X2'] = df['X2'].fillna(0)
df
time | X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020-01-01 | 7200.17 | 0.00 | NaN | 96.51 | NaN | NaN | 2.19 | 3 | 10 | 26 |
1 | 2020-01-02 | 6985.47 | 3257.85 | 1527.10 | 96.81 | 61.18 | 12.47 | 2.13 | 4 | 49 | 72 |
2 | 2020-01-03 | 7344.88 | 3234.85 | 1548.75 | 96.91 | 63.04 | 14.02 | 2.11 | 8 | 47 | 65 |
3 | 2020-01-04 | 7410.66 | 0.00 | NaN | NaN | NaN | NaN | NaN | 5 | 58 | 0 |
4 | 2020-01-05 | 7411.32 | 0.00 | NaN | NaN | NaN | NaN | NaN | 5 | 36 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1090 | 2022-12-27 | 16717.17 | 3829.25 | NaN | 104.20 | 79.87 | 21.65 | 5.21 | 47 | 58 | 0 |
1091 | 2022-12-28 | 16552.57 | 3783.22 | 1803.35 | 104.53 | 78.86 | 22.14 | 4.71 | 50 | 40 | 18 |
1092 | 2022-12-29 | 16642.34 | 3849.28 | 1813.75 | 103.97 | 78.71 | 21.44 | 4.56 | 51 | 67 | 44 |
1093 | 2022-12-30 | 16602.59 | 3839.50 | NaN | 103.49 | 80.51 | 21.67 | 4.43 | 42 | 53 | 25 |
1094 | 2022-12-31 | 16547.50 | 0.00 | NaN | NaN | NaN | NaN | NaN | 34 | 46 | 0 |
1095 rows × 11 columns
# Keep only the rows whose X2 is non-zero. Missing X2 values were filled with 0
# earlier, so this drops the dates that had no X2 value (per the original note,
# the remaining rows are the trading days).
df1 = df.loc[df['X2'].ne(0)]
df1
time | X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 | |
---|---|---|---|---|---|---|---|---|---|---|---|
1 | 2020-01-02 | 6985.47 | 3257.85 | 1527.10 | 96.81 | 61.18 | 12.47 | 2.13 | 4 | 49 | 72 |
2 | 2020-01-03 | 7344.88 | 3234.85 | 1548.75 | 96.91 | 63.04 | 14.02 | 2.11 | 8 | 47 | 65 |
5 | 2020-01-06 | 7769.22 | 3246.28 | 1573.10 | 96.62 | 62.83 | 13.85 | 2.15 | 7 | 46 | 14 |
6 | 2020-01-07 | 8163.69 | 3237.18 | 1567.85 | 96.96 | 62.69 | 13.79 | 2.16 | 7 | 35 | 17 |
7 | 2020-01-08 | 8079.86 | 3253.05 | 1571.95 | 97.34 | 59.98 | 13.45 | 2.15 | 10 | 53 | 57 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1086 | 2022-12-23 | 16796.95 | 3844.82 | NaN | 104.32 | 79.35 | 20.87 | 5.12 | 44 | 65 | 12 |
1090 | 2022-12-27 | 16717.17 | 3829.25 | NaN | 104.20 | 79.87 | 21.65 | 5.21 | 47 | 58 | 0 |
1091 | 2022-12-28 | 16552.57 | 3783.22 | 1803.35 | 104.53 | 78.86 | 22.14 | 4.71 | 50 | 40 | 18 |
1092 | 2022-12-29 | 16642.34 | 3849.28 | 1813.75 | 103.97 | 78.71 | 21.44 | 4.56 | 51 | 67 | 44 |
1093 | 2022-12-30 | 16602.59 | 3839.50 | NaN | 103.49 | 80.51 | 21.67 | 4.43 | 42 | 53 | 25 |
757 rows × 11 columns
# Convert the time column to pandas datetime values so it can be compared with date strings.
df['time'] = pd.to_datetime(df['time'])
# Boolean mask: True for rows strictly after 2021-01-01 (the string '2021' is read as that date).
df['time'].gt('2021')
0 False
1 False
2 False
3 False
4 False
...
1090 True
1091 True
1092 True
1093 True
1094 True
Name: time, Length: 1095, dtype: bool
# Rows dated 2021-01-01 or later.
df.loc[df['time'].ge('2021')]
time | X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 | |
---|---|---|---|---|---|---|---|---|---|---|---|
365 | 2021-01-01 | 29374.15 | 0.00 | NaN | 89.94 | NaN | NaN | 2.55 | 48 | 45 | 20 |
366 | 2021-01-02 | 32127.27 | 0.00 | NaN | NaN | NaN | NaN | NaN | 47 | 59 | 0 |
367 | 2021-01-03 | 32782.02 | 0.00 | NaN | NaN | NaN | NaN | NaN | 48 | 41 | 0 |
368 | 2021-01-04 | 31971.91 | 3700.65 | 1943.20 | 89.88 | 47.33 | 26.97 | 2.59 | 73 | 62 | 0 |
369 | 2021-01-05 | 33992.43 | 3726.86 | 1940.35 | 89.47 | 49.99 | 25.34 | 2.69 | 80 | 40 | 24 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1090 | 2022-12-27 | 16717.17 | 3829.25 | NaN | 104.20 | 79.87 | 21.65 | 5.21 | 47 | 58 | 0 |
1091 | 2022-12-28 | 16552.57 | 3783.22 | 1803.35 | 104.53 | 78.86 | 22.14 | 4.71 | 50 | 40 | 18 |
1092 | 2022-12-29 | 16642.34 | 3849.28 | 1813.75 | 103.97 | 78.71 | 21.44 | 4.56 | 51 | 67 | 44 |
1093 | 2022-12-30 | 16602.59 | 3839.50 | NaN | 103.49 | 80.51 | 21.67 | 4.43 | 42 | 53 | 25 |
1094 | 2022-12-31 | 16547.50 | 0.00 | NaN | NaN | NaN | NaN | NaN | 34 | 46 | 0 |
730 rows × 11 columns
# Rows dated up to and including 2021-01-01 ('2021' is read as that date, and the bound is inclusive).
df.loc[df['time'].le('2021')]
time | X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020-01-01 | 7200.17 | 0.00 | NaN | 96.51 | NaN | NaN | 2.19 | 3 | 10 | 26 |
1 | 2020-01-02 | 6985.47 | 3257.85 | 1527.10 | 96.81 | 61.18 | 12.47 | 2.13 | 4 | 49 | 72 |
2 | 2020-01-03 | 7344.88 | 3234.85 | 1548.75 | 96.91 | 63.04 | 14.02 | 2.11 | 8 | 47 | 65 |
3 | 2020-01-04 | 7410.66 | 0.00 | NaN | NaN | NaN | NaN | NaN | 5 | 58 | 0 |
4 | 2020-01-05 | 7411.32 | 0.00 | NaN | NaN | NaN | NaN | NaN | 5 | 36 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
361 | 2020-12-28 | 27084.81 | 3735.36 | NaN | 90.28 | 47.69 | 21.70 | 2.33 | 77 | 67 | 0 |
362 | 2020-12-29 | 27362.44 | 3727.04 | 1874.30 | 90.00 | 47.99 | 23.08 | 2.47 | 81 | 85 | 0 |
363 | 2020-12-30 | 28840.95 | 3732.04 | 1887.60 | 89.60 | 48.31 | 22.77 | 2.42 | 74 | 65 | 97 |
364 | 2020-12-31 | 29001.72 | 3756.07 | NaN | 89.95 | 48.41 | 22.75 | 2.55 | 72 | 64 | 15 |
365 | 2021-01-01 | 29374.15 | 0.00 | NaN | 89.94 | NaN | NaN | 2.55 | 48 | 45 | 20 |
366 rows × 11 columns
# Boolean mask: True for rows dated on or before 2021-07-20 (inclusive bound).
df['time'].le('2021-07-20')
0 True
1 True
2 True
3 True
4 True
...
1090 False
1091 False
1092 False
1093 False
1094 False
Name: time, Length: 1095, dtype: bool
# Boolean mask: True for rows dated on or after 2020-07-20 (inclusive bound).
df['time'].ge('2020-07-20')
0 False
1 False
2 False
3 False
4 False
...
1090 True
1091 True
1092 True
1093 True
1094 True
Name: time, Length: 1095, dtype: bool
选择位于2020年7月20日至2021年7月20日之间(含首尾两天)的数据。
# Rows dated from 2020-07-20 through 2021-07-20; between() includes both end points by default.
df.loc[df['time'].between('2020-07-20', '2021-07-20')]
time | X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 | |
---|---|---|---|---|---|---|---|---|---|---|---|
201 | 2020-07-20 | 9164.23 | 3251.84 | 1815.65 | 95.82 | 40.80 | 24.46 | 1.64 | 81 | 50 | 12 |
202 | 2020-07-21 | 9374.89 | 3257.30 | 1842.55 | 95.16 | 41.58 | 24.84 | 1.67 | 93 | 62 | 32 |
203 | 2020-07-22 | 9525.36 | 3276.02 | 1852.40 | 94.97 | 41.89 | 24.32 | 1.69 | 89 | 60 | 82 |
204 | 2020-07-23 | 9581.07 | 3235.66 | 1878.30 | 94.79 | 41.06 | 26.08 | 1.78 | 88 | 57 | 0 |
205 | 2020-07-24 | 9536.89 | 3215.63 | 1902.10 | 94.35 | 41.34 | 25.84 | 1.80 | 79 | 43 | 54 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
561 | 2021-07-16 | 31421.54 | 4327.16 | 1824.30 | 92.71 | 71.17 | 18.45 | 3.68 | 82 | 19 | 0 |
562 | 2021-07-17 | 31533.07 | 0.00 | NaN | NaN | NaN | NaN | NaN | 67 | 14 | 0 |
563 | 2021-07-18 | 31796.81 | 0.00 | NaN | NaN | NaN | NaN | NaN | 62 | 23 | 0 |
564 | 2021-07-19 | 30817.83 | 4258.49 | 1814.90 | 92.83 | 66.53 | 22.50 | 3.77 | 94 | 30 | 33 |
565 | 2021-07-20 | 29807.35 | 4323.06 | 1823.05 | 92.96 | 66.41 | 19.73 | 3.93 | 89 | 21 | 23 |
365 rows × 11 columns