利用Python计算数据的Pearson相关系数

步骤一:读取数据

# _*_ coding: utf-8 _*_
import pandas as pd
import numpy as np
# Load the raw records; the 'user_id' column becomes the row index so that
# all rows belonging to one user can be selected by label.
df = pd.read_csv(
    filepath_or_buffer="D:data1.csv",
    index_col="user_id",
)

步骤二:异常数据处理(若数据无需预处理,可跳过此部分),然后计算Pearson相关系数

运用箱型图(四分位距)方法,在长度为35的滑动窗口内识别异常值(超出上下四分位数1.5倍四分位距的数据),并用其前后相邻两个数据的平均值替换

def data_Process(df, n_users=1454, window=35, stop=608):
    """Replace box-plot outliers in each user's series by the mean of their neighbours.

    For every user id ``1..n_users`` the second data column (position 1) of
    that user's rows is taken as a 1-D series.  A window of ``window``
    consecutive samples slides over the series; inside each window the
    quartiles are read from the sorted samples, the fences are placed
    1.5 * IQR beyond them, and every sample outside the fences is overwritten
    with the average of its immediate neighbours.

    Replacement happens in place on the working copy, so a corrected value is
    what later windows (and later neighbours) see.  ``df`` itself is never
    modified.

    Args:
        df: DataFrame indexed by user id, holding one row per sample.
        n_users: Number of users to process; ids are assumed to be the
            consecutive integers ``1..n_users``.
        window: Number of samples per sliding window.
        stop: Exclusive upper bound for the window end position.  With the
            default of 608 the samples at positions 607 and above are never
            examined.  The bound is clamped to the series length.

    Returns:
        A list with one float ``numpy`` array per user, ordered by user id.

    Raises:
        ValueError: If ``window`` is smaller than 1.
        KeyError: If a user id in ``1..n_users`` is missing from ``df``.
    """
    if window < 1:
        raise ValueError("window must be a positive integer")

    # Positions of the lower / upper quartile inside a sorted window.
    # For the default window of 35 these evaluate to 8 and 25.
    lower_idx = (window - 1) // 4
    upper_idx = 3 * (window - 1) // 4

    load_a = []
    for user_id in range(1, n_users + 1):
        # .loc[[...]] always yields a DataFrame (even for a single row), and
        # np.array(...) copies, so the caller's frame stays untouched.
        series = np.array(df.loc[[user_id]].iloc[:, 1], dtype=float)
        last = len(series) - 1

        for m in range(window, min(stop, len(series) + 1)):
            start = m - window

            # NOTE: samples are truncated to int before ranking, so the
            # fences are derived from integer-valued quartiles.
            ranked = sorted(int(value) for value in series[start:m])
            load_l = ranked[lower_idx]
            load_u = ranked[upper_idx]
            iqr = load_u - load_l
            ud = load_u + 1.5 * iqr
            ld = load_l - 1.5 * iqr

            for q in range(start, m):
                if series[q] > ud or series[q] < ld:
                    # Use only neighbours that actually exist: a negative
                    # index would silently wrap around to the series' end.
                    neighbours = []
                    if q > 0:
                        neighbours.append(series[q - 1])
                    if q < last:
                        neighbours.append(series[q + 1])
                    if neighbours:
                        series[q] = sum(neighbours) / len(neighbours)

        load_a.append(series)
    return load_a
# Clean every user's series, then stack the results into a
# (1454 series, 609 samples) matrix.
list_re = data_Process(df)
load_ar = np.array(list_re).reshape(1454, 609)
# Transpose so that each series to be correlated becomes one column.
load_art = load_ar.T

# Column labels 'A1'..'A1454': one per series taking part in the correlation.
index1 = 'A' + pd.Series(np.arange(1, 1455)).astype(str)

# Row labels 'B1'..'B609': one per sample within a series.
index2 = 'B' + pd.Series(np.arange(1, 610)).astype(str)

# Pairwise Pearson correlation between the columns.
load_pcp = pd.DataFrame(load_art, index=index2, columns=index1)
load_pc = load_pcp.corr(method='pearson')
# The call form with a single argument works on both Python 2 and Python 3;
# the bare `print x` statement is a SyntaxError on Python 3.
print(load_pc)

输出效果部分截图如下:

利用Python计算数据的Pearson相关系数_第1张图片

你可能感兴趣的:(Python,机器学习)