查看数据偏离程度

import numpy as np
import pylab
import scipy.stats as stats

# Sample records; column 1 holds the values whose distribution is inspected.
# A plain ndarray is used because np.mat was removed in NumPy 2.0 and the
# matrix class is discouraged for new code.
data = np.array([[1, 200, 105, 3, False], [2, 165, 80, 2, False],
                 [3, 184.5, 120, 2, False], [4, 116, 70.8, 1, False], [5, 270, 150, 4, True]])

# Collect column 1 as plain floats, echoing each value as it is read.
col1 = []
for value in data[:, 1]:
    print(value)
    col1.append(float(value))

# Probability (Q-Q) plot of the column against a normal distribution.
stats.probplot(col1, plot=pylab)
pylab.show()

image.png

余弦相似度：

https://baike.baidu.com/item/%E4%BD%99%E5%BC%A6%E7%9B%B8%E4%BC%BC%E5%BA%A6/17509249

箱线图：

https://baijiahao.baidu.com/s?id=1591167651227320027&wfr=spider&for=pc

最小二乘法

import numpy as np
from matplotlib import pyplot as plt

# Project the point C onto the line through the origin and A, then draw the result.
A = np.array([[5], [4]])
C = np.array([[4], [6]])

# Normal equation: coef = (A^T A)^-1 A^T C; P = A * coef is the projection of C onto A.
B = A.T.dot(C)
AA = np.linalg.inv(A.T.dot(A))
coef = AA.dot(B)
P = A.dot(coef)

# Points on the line spanned by A; x is a row vector so A.dot(x) has shape (2, 10).
x = np.linspace(-2, 2, 10).reshape(1, 10)
xx = A.dot(x)

# Unpack the 2x1 column vectors into plain floats. Passing one-element arrays
# where matplotlib expects scalars relies on an array-to-scalar conversion
# that NumPy has deprecated.
a_x, a_y = (float(v) for v in A.ravel())
c_x, c_y = (float(v) for v in C.ravel())
p_x, p_y = (float(v) for v in P.ravel())

fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(xx[0, :], xx[1, :])
ax.plot(a_x, a_y, 'ko')

# Residual segment C-P (red) and the vector O-C (magenta).
ax.plot([c_x, p_x], [c_y, p_y], 'r-o')
ax.plot([0, c_x], [0, c_y], 'm-o')

ax.axvline(x=0, color='black')
ax.axhline(y=0, color='black')

# Offset the labels slightly so they do not sit on top of the markers.
margin = 0.1
ax.text(a_x + margin, a_y + margin, r"A", fontsize=20)
ax.text(c_x + margin, c_y + margin, r"C", fontsize=20)
ax.text(p_x + margin, p_y + margin, r"P", fontsize=20)
ax.text(0 + margin, 0 + margin, r"O", fontsize=20)
ax.text(0 + margin, 4 + margin, r"y", fontsize=20)
ax.text(4 + margin, 0 + margin, r"x", fontsize=20)
plt.xticks(np.arange(-2, 3))
plt.yticks(np.arange(-2, 3))

ax.axis('equal')
plt.show()

最小二乘法2

# Stochastic gradient descent for the linear model
#     h(x) = theta0 * x[0] + theta1 * x[1] + theta2 * x[2]
# Each row of x is one sample's feature vector; y holds the matching targets.
x = [(2, 0, 3), (1, 0, 3), (1, 1, 3), (1, 4, 2), (1, 2, 4)]
y = [5, 6, 8, 10, 11]

# Stop once the error changes by less than this between two consecutive epochs.
epsilon = 0.002

alpha = 0.02      # learning rate
diff = [0, 0]     # diff[0] holds the residual of the sample being processed
max_itor = 1000   # hard cap on epochs so the loop always terminates
error0 = 0        # error after the previous epoch
error1 = 0        # error after the current epoch
cnt = 0           # number of epochs run
m = len(x)

theta0 = 0
theta1 = 0
theta2 = 0

while cnt < max_itor:
    cnt += 1

    # One pass over the samples, updating the parameters after every sample.
    for xi, yi in zip(x, y):
        diff[0] = (theta0 * xi[0] + theta1 * xi[1] + theta2 * xi[2]) - yi
        theta0 -= alpha * diff[0] * xi[0]
        theta1 -= alpha * diff[0] * xi[1]
        theta2 -= alpha * diff[0] * xi[2]

    # Half the sum of squared residuals, evaluated with the SAME hypothesis as
    # the update step (theta0 is multiplied by x[0], which is not always 1).
    error1 = sum(
        (yi - (theta0 * xi[0] + theta1 * xi[1] + theta2 * xi[2])) ** 2 / 2
        for xi, yi in zip(x, y)
    )
    if abs(error1 - error0) < epsilon:
        break
    error0 = error1

print('theta0 : %f, theta1 : %f, theta2 : %f, error1 : %f' % (theta0, theta1, theta2, error1))
print('Done: theta0 : %f, theta1 : %f, theta2 : %f' % (theta0, theta1, theta2))
print('迭代次数: %d' % cnt)

上面为什么要除以2？因为对平方误差求导时会多出系数 2，预先除以 2 可与之相消，使梯度表达式更简洁；这只是常数缩放，不改变最小值的位置。

image.png

最小二乘法3

image.png

https://www.derivative-calculator.net/

image.png

拟合

from gm.api import *
import datetime
import numpy as np
import statsmodels.api as sm
from matplotlib import pyplot as plt

# NOTE(review): the API token is hard-coded in source. Prefer loading it from an
# environment variable or a config file kept out of version control.
set_token("e8978d765c4822e5a85fcaa73e044065cf17b58b")
# Today's date (YYYY-MM-DD), used as the end of the history window.
day_time = datetime.datetime.now().strftime('%Y-%m-%d')

# Fetch the last 7 daily bars (close and open) for the symbol as a DataFrame.
symbol = "SZSE.000651"
data = history_n(symbol, frequency="1d", end_time=day_time, count=7, fields="close,open", df=True)

close = data["close"].values
open0 = data["open"].values

# Design matrix [1, open]; named so that it does not shadow the builtin open().
open_with_const = sm.add_constant(open0)

print('open:', open_with_const)
print('close:', close)

# Ordinary least squares: close ~ bias + weight * open.
model = sm.OLS(close, open_with_const)

results = model.fit()
bias, weight = results.params

print(bias, " ", weight)

plt.plot(open0, close)

# Draw the fitted line over the observed range of open prices, so it stays
# on-screen whatever the current price level is.
x = np.linspace(open0.min(), open0.max(), 5)
plt.plot(x, x * weight + bias)
plt.show()

image.png

指定可变长度参数

*args：`*` 表示将传入的非关键字参数收集为一个元组
**kwargs：`**` 表示将传入的关键字参数收集为一个字典

image.png

for 引用index

# Walk the positional arguments together with their position.
for position, item in enumerate(args):
    # %s already applies str() to its operand.
    print("args %d-->%s" % (position, item))

zip

    # Pair up two lists of different lengths; the recorded output below shows
    # that the unmatched extra colour ('yellow') is dropped.
    names = ['raymond', 'rachel', 'matthew']
    colors = ['red', 'green', 'blue', 'yellow']

    print(zip(names, colors)) # zip() returns a zip object, so this prints the object rather than the pairs
    print(list(zip(names, colors))) # list() materializes the pairs: [('raymond', 'red'), ('rachel', 'green'), ('matthew', 'blue')]

    # Manual version: index both lists up to the length of the shorter one.
    n = min(len(names), len(colors))
    for i in range(n):
        print(names[i], '--->', colors[i])
    # Recommended, more concise approach
    for name, color in zip(names, colors):
        print(name, '--->', color)

根据多线程和多进程分别在CPU密集型和I/O密集型任务的执行效果可知，由于python GIL限制，多线程更适合I/O密集型应用。而对于CPU密集型的应用，为了实现更好的并行性，可使用多进程方式让CPU的其他内核加入执行。

       # CPU-bound task
        def count(n):
            """Busy-loop that decrements n until it is no longer positive."""
            while n > 0:
                n -= 1

        # I/O-bound task
        # NOTE(review): same name as the function above; presumably the two are
        # alternatives, since defining both in one scope keeps only this one.
        def count():
            """Sleep for 0.01 seconds to stand in for a blocking I/O wait."""
            time.sleep(0.01)

广播

    # Numpy 广播特性——标量
    print(array_4x3_a+5)

range

range(4) 是简写，等价于 range(0, 4, 1)
np.arange(4) # 步长可以为小数

range和arange性能对比

    import functools,time
    # Three-level decorator factory that measures the run time of the decorated function.
    def timeit_test(number=3, repeat=3):
        """Build a decorator that times the wrapped function.

        Args:
            number: how many times the function is called inside one timed round.
            repeat: how many timed rounds are run; one line is printed per round.

        Returns:
            A decorator. The decorated function prints the elapsed time of each
            round and returns the value produced by the last call, or None when
            no call was made.
        """
        def decorator(func):
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                result = None
                for i in range(repeat):
                    start = time.perf_counter()
                    for _ in range(number):
                        # Keep the return value so that decorating a function
                        # does not silently discard its result.
                        result = func(*args, **kwargs)
                    elapsed = (time.perf_counter() - start)
                    print('Time of {} used: {} '.format(i, elapsed))
                return result

            return wrapper
        return decorator

        #测试Numpy数组和等价的Python列表性能差距

    # Compare a NumPy array with the equivalent Python list.
    # The constant condition is a manual switch between two experiments:
    #   first branch  - cost of creating one million integers;
    #   second branch - cost of creating them and multiplying each by 2.0.
    # The trailing comments record the timings printed in the author's run.
    if True:
        # Experiment 1, list side: materialize a range as a list.
        @timeit_test(number=1, repeat=1)
        def list_test():
            my_list = list(range(1000000))

        # Experiment 1, ndarray side: np.arange builds the array directly.
        @timeit_test(number=1, repeat=1)
        def ndarray_test():
            my_arr = np.arange(1000000)

        list_test() # Time of 0 used: 0.04712673199999998
        ndarray_test() # Time of 0 used: 0.0014547089999999985

    else:
        # Experiment 2, list side: element-by-element append in a Python loop.
        @timeit_test(number=1, repeat=1)
        def list_test():
            my_list = []
            for num in range(1000000):
                my_list.append(num * 2.0)

        # Experiment 2, ndarray side: one vectorized multiplication.
        @timeit_test(number=1, repeat=1)
        def ndarray_test():
            my_arr = np.arange(1000000)
            my_arr = my_arr * 2.0

        list_test() # Time of 0 used: 0.15243656000000003
        ndarray_test() # Time of 0 used: 0.009769811999999989

image.png

    # Example 3.14: create a 2x4 array filled with ones.
    print("*****example-3.14*****")
    # ones(shape, dtype=None, order='C')
    array_one = np.ones(shape=(2, 4))
    print(array_one)
    """
    [[1. 1. 1. 1.]
     [1. 1. 1. 1.]]
    """
    print("*********************\n")

    # Example 3.15: create a 2x4 array filled with a chosen value.
    print("*****example-3.15*****")
    # np.full(shape, fill_value, dtype=None, order='C')
    array_full = np.full(shape=(2, 4), fill_value=10)
    print(array_full)
    """
    [[10 10 10 10]
     [10 10 10 10]]    
    """

    # np.linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None) builds an arithmetic sequence
    # start: first value; stop: end value; num: number of elements; endpoint: whether the sequence includes stop
    array_linspace = np.linspace(start=0, stop=5, num=10, endpoint=False)
    print(array_linspace)

image.png

np.dot(): 两个矩阵的点积
np.linalg.inv(): 求矩阵的乘法逆矩阵
np.linalg.solve(): 求解线性矩阵方程

image.png

dataframe

df可以看成是共享同一个index的Series的集合。

image.png

        s_list = pd.Series([-1.55666192,0.127451231,"str-AA",-1.37775038],
                           index=['2019-01-11','2019-01-12','2019-01-13','2019-01-14'])
        print(s_list) # 列表中包含多种数据类型
        """
        2019-01-11    -1.55666
        2019-01-12    0.127451
        2019-01-13      str-AA
        2019-01-14    -1.37775
        dtype: object
        """

        #data = dict
        s_dict = pd.Series({'2019-01-11' : 0., '2019-01-12' : 1., '2019-01-13' : 2., '2019-01-14' : 3.})
        print(s_dict)
        """
        2019-01-11    0.0
        2019-01-12    1.0
        2019-01-13    2.0
        2019-01-14    3.0
        dtype: float64
        """

series访问

# 创建被访问对象
        series_access = pd.Series([10.23, 11.24, 12.25, 13.26],
                           index=['2019-01-11','2019-01-12','2019-01-13','2019-01-14'])
        print(series_access)
        """
        2019-01-11    10.23
        2019-01-12    11.24
        2019-01-13    12.25
        2019-01-14    13.26
        dtype: float64
        """

        # 访问Series全部元素数值
        print(series_access.values)
        # [10.23 11.24 12.25 13.26]

        # 访问Series全部索引值
        print(series_access.index)
        # Index(['2019-01-11', '2019-01-12', '2019-01-13', '2019-01-14'], dtype='object')

        # 访问'2019-01-11'索引的元素值
        print(series_access['2019-01-11'])
        # 10.23

        # 访问'2019-01-11'和'2019-01-13'索引的元素值
        print(series_access[['2019-01-11', '2019-01-13']])
        """
        2019-01-11    10.23
        2019-01-13    12.25
        dtype: float64
        """

        # 访问前两个数据
        print(series_access[:2])
        """
        2019-01-11    10.23
        2019-01-12    11.24
        dtype: float64        
        """

df生成

        # Creating DataFrames
        # Build a DataFrame from a dict of lists: each key becomes a column
        df_list_dict = pd.DataFrame({'Close': [1., 2., 3., 5], 'Open': [1., 2., 3., 4.]},
                                    index=['2019-01-11', '2019-01-12', '2019-01-13', '2019-01-14'])
        print(df_list_dict) # a table with 4 rows and 2 columns
        """
                    Close  Open
        2019-01-11    1.0   1.0
        2019-01-12    2.0   2.0
        2019-01-13    3.0   3.0
        2019-01-14    5.0   4.0
        """
        # 以嵌套列表形式创建DataFrame
        df_list_list = pd.DataFrame([[1., 2., 3., 5],[1., 2., 3., 4.]],
                                    index=['2019-01-11', '2019-01-12'],
                                    columns=['Close','Open','Low','High'])
        print(df_list_list)
        """
                    Close  Open  Low  High
        2019-01-11    1.0   2.0  3.0   5.0
        2019-01-12    1.0   2.0  3.0   4.0
        """

        # Build a DataFrame from a one-dimensional structured ndarray: each named field becomes a column.
        # Field types: 32-bit integer, 32-bit float and 10-byte string.
        # 'S10' is used because the equivalent 'a10' alias was removed in NumPy 2.0.
        ndarray_data = np.zeros((2), dtype=[('Close', 'i4'),('Open', 'f4'),('Low', 'S10')])
        print(ndarray_data)
        """
        [(0, 0., b'') (0, 0., b'')]
        """
        ndarray_data[:] = [(1,2.,'11.2'), (2,3.,"12.3")]
        # Column labels default to the field names; passing columns= instead selects and orders the columns explicitly.
        df_ndarray = pd.DataFrame(data=ndarray_data, index=['2019-01-11', '2019-01-12'])
        print(df_ndarray)
        """
                    Close  Open      Low
        2019-01-11      1   2.0  b'11.2'
        2019-01-12      2   3.0  b'12.3'
        """

        # 以Series组成的字典形式创建DataFrame
        series_data = {'Close' : pd.Series([1., 2., 3.], index=['2019-01-11', '2019-01-12', '2019-01-13']),
                       'Open' : pd.Series([1., 2., 3., 4.], index=['2019-01-11', '2019-01-12', '2019-01-13', '2019-01-14'])}
        df_series = pd.DataFrame(series_data)
        print(df_series)
        """
                    Close  Open
        2019-01-11    1.0   1.0
        2019-01-12    2.0   2.0
        2019-01-13    3.0   3.0
        2019-01-14    NaN   4.0
        """

        df_dict_list = pd.DataFrame([{'Close': 1, 'Open': 2}, {'Close': 5, 'Open': 10, 'High': 20}],
                                    index=['2019-01-11', '2019-01-12'])
        # 如果不指定行索引index DataFrame会自动加上行索引
        print(df_dict_list)
        """
                    Close  High  Open
        2019-01-11      1   NaN     2
        2019-01-12      5  20.0    10
        """

df 访问

       # 创建被访问DataFrame对象
        series_data = {'Close' : pd.Series([10.51, 10.52, 10.53, 10.54], index=['2019-01-11','2019-01-12','2019-01-13','2019-01-14']),
                       'Open' : pd.Series([12.31, 12.32, 12.33, 12.34], index=['2019-01-11', '2019-01-12','2019-01-13','2019-01-14'])}
        df_access = pd.DataFrame(series_data)
        print(df_access)
        """
                    Close   Open
        2019-01-11  10.51  12.31
        2019-01-12  10.52  12.32
        2019-01-13  10.53  12.33
        2019-01-14  10.54  12.34
        """

        # DataFrame的访问
        print("***********************访问全部元素 某行/列元素*******************")
        # 访问DataFrame全部的行索引
        print(df_access.index)
        # Index(['2019-01-11', '2019-01-12', '2019-01-13', '2019-01-14'], dtype='object')

        # 访问DataFrame全部的列索引
        print(df_access.columns)
        # Index(['Close', 'Open'], dtype='object')

        # 访问DataFrame全部的行和列索引
        print(df_access.axes)
        # [Index(['2019-01-11', '2019-01-12', '2019-01-13', '2019-01-14'], dtype='object'), Index(['Close', 'Open'], dtype='object')]

        # 访问DataFrame全部元素数值
        print(df_access.values)
        """
        [[10.51 12.31]
         [10.52 12.32]
         [10.53 12.33]
         [10.54 12.34]]
        """

        # 访问某列内容
        print(df_access['Open'])
        print(df_access.Open)
        """
        2019-01-11    12.31
        2019-01-12    12.32
        2019-01-13    12.33
        2019-01-14    12.34
        Name: Open, dtype: float64        
        """
        print(type(df_access['Open'])) # inspect the type of a single column
        # <class 'pandas.core.series.Series'>

        # 访问某一行内容
        print(df_access[0:1])
        """
                    Close   Open
        2019-01-11  10.51  12.31    
        """
        print(type(df_access[0:1]))  # inspect the type of a row slice
        # <class 'pandas.core.frame.DataFrame'>

        print("***************************DataFrame.iloc***************************")
        # 选取了'2019-01-11'行对应的'Close','Open'这两列的元素内容
        print(df_access.loc[['2019-01-11',],['Close','Open']])
        """
                    Close   Open
        2019-01-11  10.51  12.31
        """

        # 选取了所有的行以及列索引为'Close','Open'的元素内容
        print(df_access.loc[:,['Close','Open']])
        """
                    Close   Open
        2019-01-11  10.51  12.31
        2019-01-12  10.52  12.32
        2019-01-13  10.53  12.33
        2019-01-14  10.54  12.34
        """

        # 访问到'2019-01-11'这行的元素
        print(df_access.loc['2019-01-11'])
        """
        Close    10.51
        Open     12.31
        Name: 2019-01-11, dtype: float64
        """

        # 选取了前两行，第一列的元素。
        print(df_access.iloc[0:2,0:1])
        """
                    Close
        2019-01-11  10.51
        2019-01-12  10.52
        """
        # 选取了前两行，所有列的元素
        print(df_access.iloc[0:2])
        """
                    Close   Open
        2019-01-11  10.51  12.31
        2019-01-12  10.52  12.32
        """
        # 除了指定某个范围方式选取外，还可自由选取行和列的位置所对应的数据元素，访问第0行和第2行，第一列和第二列的元素
        print(df_access.iloc[[0,2],[0,1]])
        """
                    Close   Open
        2019-01-11  10.51  12.31
        2019-01-13  10.53  12.33
        """
        # Mixed position/label access: take rows 0 and 2 from the 'Open' column.
        # Left commented out because the .ix indexer was deprecated and later removed
        # from pandas (NOTE(review): removed in 1.0 as far as I recall - confirm);
        # the loc/iloc statements that follow produce the same result.
        #print(df_access.ix[[0, 2], ['Open']])
        """
                     Open
        2019-01-11  12.31
        2019-01-13  12.33
        """

        print(df_access.index[[0, 2]])
        # Index(['2019-01-11', '2019-01-13'], dtype='object')
        print(df_access.loc[df_access.index[[0, 2]], ['Open']])
        """
                     Open
        2019-01-11  12.31
        2019-01-13  12.33
        """

        print(df_access.columns.get_indexer(['Open'])) # [1]
        print(df_access.columns.get_loc('Open')) # 1
        print(df_access.iloc[[0, 2], df_access.columns.get_indexer(['Open'])])
        """
                     Open
        2019-01-11  12.31
        2019-01-13  12.33
        """

        print(df_access.index.get_loc('2019-01-12')) # 1

        print("***************************条件表达式访问元素***************************")

        print(df_access.Open > df_access.Open.mean())
        """
        2019-01-11    False
        2019-01-12    False
        2019-01-13     True
        2019-01-14     True
        Name: Open, dtype: bool
        """

        print(df_access[df_access.Open > df_access.Open.mean()])
        """
                    Close   Open
        2019-01-13  10.53  12.33
        2019-01-14  10.54  12.34
        """
        print(df_access.loc[df_access.Open > df_access.Open.mean(),'Close'])
        """
        2019-01-13    10.53
        2019-01-14    10.54
        Name: Close, dtype: float64
        """

降升采样

    print(ts_d.resample('5D', closed='left', label='left').sum())  # 左闭右开 1 - 5
    print(ts_d.resample('5D', closed='right', label='right').sum())  # 左开右闭 2 - 6

    ts_12h_asfreq = ts_d.resample('12H').asfreq()
    print(ts_12h_asfreq)

    ts_12h_ffill = ts_d.resample('12H').ffill()
    print(ts_12h_ffill)

roll循环右移

    print("########## deal with data #####################################################")
    # Fix the seed so every run draws the same numbers, which keeps debugging reproducible.
    np.random.seed(1)
    # Close prices: 1000 samples drawn from a normal distribution (mean 10, std 1).
    close_data = np.random.normal(10.0, 1.0, 1000)
    print(f"close_data：\n {close_data[:10]}")  # first 10 values
    # Recorded output:
    # close_data：
    #  [11.62434536  9.38824359  9.47182825  8.92703138 10.86540763  7.6984613
    #  11.74481176  9.2387931  10.3190391   9.75062962]

    # Open prices: the close series rotated right by one position, so the last
    # close wraps around to become the first open.
    open_data = np.roll(close_data, shift=1)
    print(f"open_data：\n {open_data[:10]}")  # first 10 values
    # Recorded output:
    # open_data：
    #  [ 9.81304498 11.62434536  9.38824359  9.47182825  8.92703138 10.86540763
    #   7.6984613  11.74481176  9.2387931  10.3190391 ]

    # High prices: element-wise, whichever of open and close is larger.
    is_open_higher = open_data > close_data
    high_data = np.where(is_open_higher, open_data, close_data)
    print(f"high_data：\n {high_data[:10]}")  # first 10 values

date/period range

date_index =pd.date_range('2010-01-01',freq='D',periods=1000)
period_index=pd.period_range('2010-01-01',freq='D',periods=1000)

np.where

high_data = np.where((open_data > close_data),open_data,close_data)
low_data = np.where((open_data <= close_data),open_data,close_data)

apply map

df_stock_object = df_stock.applymap(lambda x:'%0.2f'%x)#保留2位小数
df_stock = df_stock.round(2)#保留2位小数

dropna/fillna

# Drop every row that contains at least one NaN, modifying df_stock in place.
df_stock.dropna(axis=0, how='any', inplace=True)
# Fill each NaN with the next valid value further down the same column.
# DataFrame.bfill replaces fillna(method='bfill'), whose `method` argument is deprecated.
df_fillna = df_stock.bfill(axis=0)

plot

    import matplotlib.pyplot as plt
    # 可视化DataFrame数据
    df_visual = df_stock.loc['2010-01-01':'2012-01-01',['close']].plot(linewidth=1, figsize=(8, 6))
    df_visual.set_xlabel('Time')
    df_visual.set_ylabel('Close price')
    df_visual.set_title('From 2010-01-01 to 2012-01-01')
    df_visual.legend()
    plt.show()

df合并处理

concat: 沿着一条轴（行方向/列方向）将多个对象拼接到一起
merge: 根据一个或者多个键将两个df对象横向合并
join: 以行索引为连接键将两个df对象横向合并

df遍历

#for in遍历方式
    def forin_looping(df):
        """Return a copy of *df* with a ``pct_change`` column filled row by row.

        Each value is ``(high - low) / open`` for that row, so *df* must have
        'high', 'low' and 'open' columns. The input frame is not modified.

        The explicit positional loop is kept on purpose: presumably this is the
        slow baseline that the vectorized variants are timed against.
        """
        # Start the column as float so the float results written below match
        # its dtype; an integer column would need an upcast on assignment,
        # which newer pandas warns about or rejects.
        df = df.assign(pct_change=0.0)
        for i in np.arange(0, df.shape[0]):
            df.iloc[i,df.columns.get_loc('pct_change')] = (df.iloc[i]['high'] - df.iloc[i]['low'])/df.iloc[i]['open']
        return df

#iterrows()遍历方式
    def iterrows_loopiter(df):
        """Return a copy of *df* with a ``pct_change`` column filled via iterrows().

        Each value is ``(high - low) / open`` for that row, so *df* must have
        'high', 'low' and 'open' columns. The input frame is not modified.

        NOTE(review): rows are written back by index label, so this assumes the
        index labels are unique - confirm against the callers.
        """
        # Start the column as float so the float results written below match
        # its dtype; an integer column would need an upcast on assignment,
        # which newer pandas warns about or rejects.
        df = df.assign(pct_change=0.0)
        for index,row in df.iterrows():
            df.loc[index, 'pct_change'] = (row['high']-row['low'])/row['open']
        return df

    #apply()遍历方式
    df_concat['pct_change'] = df_concat.apply(lambda row: ((row['high']-row['low'])/row['open']), axis = 1)

    #Pandas series 的矢量化方式
    df_concat['pct_change'] = (df_concat['high']-df_concat['low'])/df_concat['open']

    #Numpy arrays的矢量化方式
    df_concat['pct_change'] = (df_concat['high'].values-df_concat['low'].values)/df_concat['open'].values

forin_test() # Time of 0 used: 8.462902736
iterloop_test() # Time of 0 used: 4.0023713690000005
apply_test() # Time of 0 used: 0.25229068800000043
series_test() # Time of 0 used: 0.0036549980000000204
ndarray_test() # Time of 0 used: 0.0018982859999994162

【pandas】[2] 移动窗口rolling的理解

https://blog.csdn.net/xiezhen_zheng/article/details/82319183

python学习笔记

查看数据偏离程度

余弦相似度：

箱线图：

最小二乘法

最小二乘法2

最小二乘法3

拟合

指定可变长度参数

for 引用index

zip

广播

range

range和arange性能对比

dataframe

series访问

df生成

df 访问

降升采样

roll循环右移

date/period range

np.where

apply map

dropna/fillna

plot

df合并处理

df遍历

【pandas】[2] 移动窗口rolling的理解

你可能感兴趣的:(python学习笔记)