Pandas第二次作业20200904

练习1 读取 catNames2.csv 文件,完成需求如下:

• 找到所有的使用次数超过800的猫的名字
• 获取使用次数最高的名字

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#@Time           : 2020/9/4 14:55
#@Author         : GodSpeed
#@File           : Pandas第二次作业01.py
#@Software       : PyCharm

import pandas as pd
import numpy as np

'''
练习1
读取 catNames2.csv 文件,完成需求如下:
• 找到所有的使用次数超过800的猫的名字
• 获取用次数最高的名字
'''


# Load the cat-name usage table from catNames2.csv.
df_catNames2 = pd.read_csv('catNames2.csv')
print(df_catNames2)
# Sample output:
#       Row_Labels  Count_AnimalName
# 0              1                 1
# 1              2                 2
# 2          40804                 1
# 3          90201                 1
# 4          90203                 1
# ...          ...               ...
# 16215      37916                 1
# 16216      38282                 1
# 16217      38583                 1
# 16218      38948                 1
# 16219      39743                 1
#
# [16220 rows x 2 columns]

# Names used more than 800 times: build a boolean mask over the count
# column and keep only the rows where it is True.
used_over_800 = df_catNames2['Count_AnimalName'] > 800
Row_Labels_b_800 = df_catNames2[used_over_800]

# Only the name column of the filtered rows is printed, as a plain array.
print(Row_Labels_b_800['Row_Labels'].values)
# ['BELLA' 'CHARLIE' 'COCO' 'MAX' 'ROCKY']


# Find the name with the highest usage count.
# Method 1: look up the row that holds the maximum count.
#
# Series.idxmax() returns the index *label* of the first maximum, which is
# exactly what the label-based .loc lookup below expects.
# np.argmax() returns the *position* of the maximum instead; that only
# coincides with the label while the frame keeps its default 0..n-1 index,
# and would have to be paired with .iloc rather than .loc.
index = df_catNames2['Count_AnimalName'].idxmax()

print(df_catNames2.loc[index, 'Row_Labels'])
# BELLA

# Method 2: sort by usage count in descending order and take the first row.
ranked_by_count = df_catNames2.sort_values(by='Count_AnimalName', ascending=False)
print(ranked_by_count)
# Sample output:
#       Row_Labels  Count_AnimalName
# 1156       BELLA              1195
# 9140         MAX              1153
# 2660     CHARLIE               856
# 3251        COCO               852
# 12368      ROCKY               823
# ...          ...               ...
# 6884        J-LO                 1
# 6888       JOANN                 1
# 6890        JOAO                 1
# 6891     JOAQUIN                 1
# 16219      39743                 1
#
# [16220 rows x 2 columns]

# First row, first column of the sorted frame is the most-used name.
print(ranked_by_count.iloc[0, 0])
# BELLA

练习2

2.1 读取 五粮液2020.xlsx 数据,指定第 0 列作为行索引

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#@Time           : 2020/9/4 17:16
#@Author         : GodSpeed
#@File           : Pandas第二次作业02.py
#@Software       : PyCharm



import pandas as pd
import numpy as np

# 2.1 Read 五粮液2020.xlsx and use the column at position 0 as the row index.
df_wly = pd.read_excel(
    '五粮液2020.xlsx',
    index_col=[0],
)
# Print df_wly here to inspect the loaded frame.
'''
       ts_code  trade_date    open    high     low   close  pre_close
0    000858.SZ    20200903  235.40  243.00  235.19  238.64     235.00
1    000858.SZ    20200902  235.20  239.78  233.80  235.00     236.24
2    000858.SZ    20200901  237.48  239.80  233.10  236.24     240.00
3    000858.SZ    20200831  242.00  246.70  240.00  240.00     240.50
4    000858.SZ    20200828  235.98  244.86  233.20  240.50     238.08
..         ...         ...     ...     ...     ...     ...        ...
159  000858.SZ    20200108  128.99  129.76  128.05  128.89     129.37
160  000858.SZ    20200107  129.50  131.07  129.00  129.37     129.20
161  000858.SZ    20200106  130.00  130.25  128.52  129.20     130.55
162  000858.SZ    20200103  131.60  132.07  129.61  130.55     132.08
163  000858.SZ    20200102  132.00  133.50  129.59  132.08     133.01

[164 rows x 7 columns]

'''

2.2 查看 该数据的基本信息

# 2.2 Inspect the basic properties of the data.

print(df_wly.shape)                                            # (rows, columns) as a tuple
# (164, 6)

print(df_wly.dtypes)                                           # dtype of every column
# trade_date      int64
# open          float64
# high          float64
# low           float64
# close         float64
# pre_close     float64
# dtype: object

print(df_wly.ndim)                                             # number of dimensions, an int
# 2

print(df_wly.index)                                            # row index
# Index(['000858.SZ', '000858.SZ', '000858.SZ', '000858.SZ', '000858.SZ',
#        '000858.SZ', '000858.SZ', '000858.SZ', '000858.SZ', '000858.SZ',
#        ...
#        '000858.SZ', '000858.SZ', '000858.SZ', '000858.SZ', '000858.SZ',
#        '000858.SZ', '000858.SZ', '000858.SZ', '000858.SZ', '000858.SZ'],
#       dtype='object', name='ts_code', length=164)

print(df_wly.columns)                                          # column index
# Index(['trade_date', 'open', 'high', 'low', 'close', 'pre_close'], dtype='object')

# Other quick looks: df_wly.values (the raw values), df_wly.head() and
# df_wly.tail() (first / last rows, 5 by default).

# info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line.
df_wly.info()                                                  # summary of the frame
# <class 'pandas.core.frame.DataFrame'>
# Index: 164 entries, 000858.SZ to 000858.SZ
# Data columns (total 6 columns):
#  #   Column      Non-Null Count  Dtype
# ---  ------      --------------  -----
#  0   trade_date  164 non-null    int64
#  1   open        164 non-null    float64
#  2   high        164 non-null    float64
#  3   low         164 non-null    float64
#  4   close       164 non-null    float64
#  5   pre_close   164 non-null    float64
# dtypes: float64(5), int64(1)
# memory usage: 8.3+ KB

2.3 计算每一天各指标的差异值

# DataFrame.diff() takes the first-order difference of the frame: each row
# minus the row above it.  It is equivalent to df - df.shift(1), where
# shift(1) moves the data down by one row.
#
# Full signature: DataFrame.diff(periods=1, axis=0).  `periods` is the number
# of rows to shift by; `axis` chooses whether to difference along rows or
# columns.
row_deltas = df_wly.diff(periods=1)
print(row_deltas)
'''
           trade_date  open  high   low  close  pre_close
ts_code                                                  
000858.SZ         NaN   NaN   NaN   NaN    NaN        NaN
000858.SZ        -1.0 -0.20 -3.22 -1.39  -3.64       1.24
000858.SZ        -1.0  2.28  0.02 -0.70   1.24       3.76
000858.SZ       -70.0  4.52  6.90  6.90   3.76       0.50
000858.SZ        -3.0 -6.02 -1.84 -6.80   0.50      -2.42
...               ...   ...   ...   ...    ...        ...
000858.SZ        -1.0 -0.01 -1.44 -0.20  -1.88       0.48
000858.SZ        -1.0  0.51  1.31  0.95   0.48      -0.17
000858.SZ        -1.0  0.50 -0.82 -0.48  -0.17       1.35
000858.SZ        -3.0  1.60  1.82  1.09   1.35       1.53
000858.SZ        -1.0  0.40  1.43 -0.02   1.53       0.93

[164 rows x 6 columns]
'''

问题:
trade_date 是以 YYYYMMDD 形式保存的整数(int64),并不是真正的日期类型,所以 diff 会把它当作普通整数相减(例如 20200831 - 20200901 = -70),得不到两个交易日之间相隔的天数

解决方案:
可以运用函数映射,先用 pd.to_datetime 把 trade_date 转换为日期类型,再计算差值,代码如下:

# Convert trade_date (stored as YYYYMMDD integers) into real datetimes so
# that diff() yields day intervals instead of plain integer differences.
def f2(x):
    """Parse YYYYMMDD value(s) -- a scalar or a whole Series -- as datetime."""
    return pd.to_datetime(x, format="%Y%m%d")


df_col = df_wly['trade_date']
# Convert the whole column in one vectorised call rather than calling
# to_datetime once per row.  The integers are turned into text first so they
# are matched against the format string.
df_wly['trade_date'] = f2(df_col.astype(str))

print(df_wly.diff())
'''
          trade_date  open  high   low  close  pre_close
ts_code                                                 
000858.SZ        NaT   NaN   NaN   NaN    NaN        NaN
000858.SZ    -1 days -0.20 -3.22 -1.39  -3.64       1.24
000858.SZ    -1 days  2.28  0.02 -0.70   1.24       3.76
000858.SZ    -1 days  4.52  6.90  6.90   3.76       0.50
000858.SZ    -3 days -6.02 -1.84 -6.80   0.50      -2.42
...              ...   ...   ...   ...    ...        ...
000858.SZ    -1 days -0.01 -1.44 -0.20  -1.88       0.48
000858.SZ    -1 days  0.51  1.31  0.95   0.48      -0.17
000858.SZ    -1 days  0.50 -0.82 -0.48  -0.17       1.35
000858.SZ    -3 days  1.60  1.82  1.09   1.35       1.53
000858.SZ    -1 days  0.40  1.43 -0.02   1.53       0.93

[164 rows x 6 columns]
'''

2.4 计算其 pre_close 的增长率

2.5 将 pre_close 的增长率添加至 wly_data 数据(即代码中的 df_wly)中

# 2.4 Growth rate of pre_close: the fractional change of each row relative
# to the row directly above it (the first row has no predecessor -> NaN).
# NOTE(review): the sample data is listed newest-first, so "the row above"
# is the later trading day -- confirm this is the intended direction.
pre_close_growth = df_wly['pre_close'].pct_change()
print(pre_close_growth)

# 2.5 Store the growth rate in the frame as a new `pct_change` column.
df_wly['pct_change'] = pre_close_growth

print(df_wly)
'''
          trade_date    open    high     low   close  pre_close       pct_change
ts_code                                                                  
000858.SZ 2020-09-03  235.40  243.00  235.19  238.64     235.00       NaN
000858.SZ 2020-09-02  235.20  239.78  233.80  235.00     236.24  0.005277
000858.SZ 2020-09-01  237.48  239.80  233.10  236.24     240.00  0.015916
000858.SZ 2020-08-31  242.00  246.70  240.00  240.00     240.50  0.002083
000858.SZ 2020-08-28  235.98  244.86  233.20  240.50     238.08 -0.010062
...              ...     ...     ...     ...     ...        ...       ...
000858.SZ 2020-01-08  128.99  129.76  128.05  128.89     129.37  0.003724
000858.SZ 2020-01-07  129.50  131.07  129.00  129.37     129.20 -0.001314
000858.SZ 2020-01-06  130.00  130.25  128.52  129.20     130.55  0.010449
000858.SZ 2020-01-03  131.60  132.07  129.61  130.55     132.08  0.011720
000858.SZ 2020-01-02  132.00  133.50  129.59  132.08     133.01  0.007041

[164 rows x 7 columns]
'''
# 2.6 Replace the NaN in the pct_change column with 0.
# The filled column is assigned back to the frame.  Calling
# fillna(..., inplace=True) on df_wly['pct_change'] operates on an
# intermediate object; pandas warns about this pattern and, with
# Copy-on-Write enabled, it leaves df_wly unchanged.
df_wly['pct_change'] = df_wly['pct_change'].fillna(0)
print(df_wly)
'''
          trade_date    open    high     low   close  pre_close  pct_change
ts_code                                                                    
000858.SZ 2020-09-03  235.40  243.00  235.19  238.64     235.00    0.000000
000858.SZ 2020-09-02  235.20  239.78  233.80  235.00     236.24    0.005277
000858.SZ 2020-09-01  237.48  239.80  233.10  236.24     240.00    0.015916
000858.SZ 2020-08-31  242.00  246.70  240.00  240.00     240.50    0.002083
000858.SZ 2020-08-28  235.98  244.86  233.20  240.50     238.08   -0.010062
...              ...     ...     ...     ...     ...        ...         ...
000858.SZ 2020-01-08  128.99  129.76  128.05  128.89     129.37    0.003724
000858.SZ 2020-01-07  129.50  131.07  129.00  129.37     129.20   -0.001314
000858.SZ 2020-01-06  130.00  130.25  128.52  129.20     130.55    0.010449
000858.SZ 2020-01-03  131.60  132.07  129.61  130.55     132.08    0.011720
000858.SZ 2020-01-02  132.00  133.50  129.59  132.08     133.01    0.007041

[164 rows x 7 columns]
'''

# 2.7 Correlation between pre_close and pct_change.  This must run while
# pct_change is still numeric, i.e. before the formatting step below.
close_vs_growth = df_wly['pre_close'].corr(df_wly['pct_change'])
print(close_vs_growth)
# -0.02569885399397465


# 2.8 Turn pct_change into a percentage: multiply by 100, keep two decimals.
def f3(x):
    """Format a fraction as a percent string, e.g. 0.005277 -> '0.53%'."""
    scaled = x * 100
    return "%.02f" % scaled + r"%"


# After this step the column holds strings, not numbers.
df_wly['pct_change'] = df_wly['pct_change'].apply(f3)

print(df_wly)
'''
          trade_date    open    high     low   close  pre_close pct_change
ts_code                                                                   
000858.SZ 2020-09-03  235.40  243.00  235.19  238.64     235.00      0.00%
000858.SZ 2020-09-02  235.20  239.78  233.80  235.00     236.24      0.53%
000858.SZ 2020-09-01  237.48  239.80  233.10  236.24     240.00      1.59%
000858.SZ 2020-08-31  242.00  246.70  240.00  240.00     240.50      0.21%
000858.SZ 2020-08-28  235.98  244.86  233.20  240.50     238.08     -1.01%
...              ...     ...     ...     ...     ...        ...        ...
000858.SZ 2020-01-08  128.99  129.76  128.05  128.89     129.37      0.37%
000858.SZ 2020-01-07  129.50  131.07  129.00  129.37     129.20     -0.13%
000858.SZ 2020-01-06  130.00  130.25  128.52  129.20     130.55      1.04%
000858.SZ 2020-01-03  131.60  132.07  129.61  130.55     132.08      1.17%
000858.SZ 2020-01-02  132.00  133.50  129.59  132.08     133.01      0.70%

[164 rows x 7 columns]

Process finished with exit code 0

'''

你可能感兴趣的:(Python数据分析专栏)