1. Correlation
First, let's walk through a few basic similarity algorithms and the corresponding library APIs.
Euclidean distance
from scipy import spatial
import math

# Euclidean distance
def d_euclidean(*args):  # args collects all the vectors passed in, effectively a 2-D array
    """The first vector is the reference; every other vector is compared against it.

    Returns a list of the Euclidean distances between the first vector and each of the others.
    Euclidean distance formula: d(x, y) = ((y1 - x1)^2 + (y2 - x2)^2 + (y3 - x3)^2 + ...)^(0.5)
    """
    base_vector = args[0]
    d_values = []
    for o_vector in args[1:]:
        d_value = 0
        for index in range(len(base_vector)):
            d_value += pow(o_vector[index] - base_vector[index], 2)
        d_values.append(math.sqrt(d_value))
    if len(d_values) > 0:
        return d_values
    else:
        return -1

vector1 = [0, 1, 2, 3]
vector2 = [1, 2, 3, 4]
vector3 = [4, 3, 2, 1]
print(d_euclidean(vector1, vector2, vector3))
# Result: [2.0, 4.898979485566356]
print(spatial.distance.euclidean(vector1, vector2))
print(spatial.distance.euclidean(vector1, vector3))
# 2.0
# 4.898979485566356
# Same results as the hand-rolled version
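The same distances can also be computed in one shot with NumPy broadcasting instead of explicit loops; here's a minimal sketch, using only standard NumPy and the vectors defined above:

import numpy as np
# Stack the comparison vectors, subtract the base vector (broadcasting),
# then take the row-wise L2 norm.
base = np.array(vector1)
others = np.array([vector2, vector3])
print(np.linalg.norm(others - base, axis=1))
# same values as above: [2.0, 4.898979485566356]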
Manhattan distance
# Manhattan distance
import numpy as np
from scipy.spatial.distance import pdist

def d_manhattan(*args):
    """This one is really simple: take the absolute difference element by element,
    then add those absolute values up and you're done."""
    base_vector = args[0]
    d_values = []
    for o_vector in args[1:]:
        d_value = 0
        for index in range(len(base_vector)):
            d_value += abs(o_vector[index] - base_vector[index])
        d_values.append(d_value)
    if len(d_values) > 0:
        return d_values
    else:
        return -1

vector1 = [0, 1, 2, 3]
vector2 = [1, 2, 3, 4]
vector3 = [4, 3, 2, 1]
print(d_manhattan(vector1, vector2, vector3))
# [4, 8]
print(d_manhattan(vector2, vector3))
# [8]
print(pdist(np.vstack([vector1, vector2, vector3]), 'cityblock'))  # this API returns every pairwise distance as an ndarray
# Result: [4. 8. 8.]
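pdist returns the pairwise distances in "condensed" form (the flattened upper triangle). If the full symmetric matrix is easier to read, SciPy's squareform, also in scipy.spatial.distance, expands it; a quick sketch:

from scipy.spatial.distance import squareform
# Expand the condensed distance vector into the full 3x3 symmetric matrix;
# entry [i][j] is the distance between vector i and vector j.
d = pdist(np.vstack([vector1, vector2, vector3]), 'cityblock')
print(squareform(d))
# [[0. 4. 8.]
#  [4. 0. 8.]
#  [8. 8. 0.]]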
Cosine similarity
# Cosine similarity; note that what we actually compute below is the cosine
# *distance* (1 - similarity), matching what scipy's pdist returns
import numpy as np
from scipy.spatial.distance import pdist

def d_cos(*args):
    """Cosine distance between the first vector and each of the others."""
    base_vector = args[0]
    a_vals = [np.sum(np.square(x)) for x in args]  # squared L2 norm of every vector
    b_val = a_vals[0]
    d_values = []
    for i in range(1, len(args)):
        o_vector = args[i]
        d_value = 0
        for index in range(len(base_vector)):
            d_value += o_vector[index] * base_vector[index]  # dot product with the base vector
        d_values.append(1 - d_value / np.sqrt(b_val * a_vals[i]))
    if len(d_values) > 0:
        return d_values
    else:
        return -1

vector1 = [0, 1, 2, 3]
vector2 = [1, 2, 3, 4]
vector3 = [4, 3, 2, 1]
print(d_cos(vector1, vector2, vector3))
# [0.024099927051466796, 0.5120499635257334]
print(d_cos(vector2, vector3))
# [0.33333333333333337]
print(pdist(np.vstack([vector1, vector2, vector3]), 'cosine'))
# [0.02409993 0.51204996 0.33333333]
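If you want the similarity itself rather than the distance, it's just dot(a, b) / (||a|| * ||b||); a minimal NumPy sketch with the vectors above:

# Cosine similarity via dot product and norms; cosine distance is 1 minus this.
a, b = np.array(vector1), np.array(vector2)
sim = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
print(sim)      # ~0.97590
print(1 - sim)  # ~0.02410, matching d_cos / pdist above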
Pearson correlation coefficient
import numpy as np
from scipy.stats import pearsonr

def get_pearsonr(b, o):
    # Computational formula:
    # r = (sum(b*o) - sum(b)*sum(o)/n) / (sqrt(sum(b^2) - sum(b)^2/n) * sqrt(sum(o^2) - sum(o)^2/n))
    b_s = np.sum(b)
    o_s = np.sum(o)
    b_ss = np.sum(np.square(b))
    o_ss = np.sum(np.square(o))
    t_bo = 0
    n = len(b)
    for i in range(len(b)):
        t_bo += b[i] * o[i]
    denominator = np.sqrt(b_ss - np.square(b_s) / n) * np.sqrt(o_ss - np.square(o_s) / n)
    if denominator:
        return (t_bo - (o_s * b_s) / n) / denominator
    else:
        return 0

def d_pearsonr(*args):
    # Same pattern as the other d_* helpers: correlate the first vector with each of the rest
    base_vector = args[0]
    return [get_pearsonr(base_vector, o) for o in args[1:]]
vector1 = np.random.normal(1, 100, 50)
vector2 = np.random.normal(2, 10, 50)
vector3 = [4, 3, 2, 1]
print(get_pearsonr(vector1, vector2))
# Random input, so the value changes on every run; e.g. one run gave -0.10288209400426439
print(pearsonr(vector1, vector2))
# (-0.10288209400426439, 0.4770990724691394)
# The first value usually matches ours exactly, but the naive sum-of-squares formula
# above can lose precision to floating-point cancellation, so the last digits
# occasionally differ from scipy's result
scipy's pearsonr returns two values: the first is the Pearson coefficient, and the second is a p-value. The p-value measures statistical significance rather than correlation strength: roughly, how likely a correlation at least this strong would show up by chance if the vectors were actually uncorrelated.
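As a cross-check, NumPy can also compute the coefficient without scipy: np.corrcoef returns the full correlation matrix, and the off-diagonal entry is the Pearson coefficient. A minimal sketch with the vectors above:

# np.corrcoef gives a 2x2 correlation matrix for two inputs;
# entry [0, 1] (or [1, 0]) is the Pearson coefficient between them.
print(np.corrcoef(vector1, vector2)[0, 1])
# same coefficient as get_pearsonr / pearsonr above (but no p-value)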