| 优化思想 | 出发点 |
| --- | --- |
| 最小二乘法 | 最小均方差 |
| 最大似然法 | 最大似然概率 |
分解后先是求 $x-y$,因为一共需要求 $n_1\times n_2$ 个距离,我分别把两个矩阵扩展到这个大小,直接相减后再reshape。以测试集为例,大小为 $n_1\times d$,先是reshape为 $(n_1, 1, d)$ 的张量,这样用tile扩展的时候就可以形成连续相同样本。
$$a = array([[1,2,3],[4,5,6]])\quad (2\times 3)$$
$$b = array([[1,2,1],[6,7,9],[8,2,0],[7,1,4],[9,3,1]])\quad (5\times 3)$$
$$tile(a, (5,1))=array([[1,2,3],[4,5,6],[1,2,3],[4,5,6],[1,2,3],[4,5,6],[1,2,3],[4,5,6],[1,2,3],[4,5,6]])\quad (10\times 3)$$
转换为 $array([[[1,2,3]], [[4,5,6]]])$ 后
$$tile(a, (1,5,1))=array([[[1,2,3],[1,2,3],[1,2,3],[1,2,3],[1,2,3]],[[4,5,6],[4,5,6],[4,5,6],[4,5,6],[4,5,6]]])\quad (2\times 5\times 3)$$
这样直接扩展训练集为 $(n_1 * n_2)\times d$,相减再reshape就可以得到 $n_1\times n_2$ 的矩阵,每一行是一个测试样本与所有训练样本的距离。
def distant(self, x_tr, x_test, dis_type):
    """Compute the pairwise distance matrix between test and training samples.

    Args:
        x_tr: Training samples, array-like of shape (n_train, d).
        x_test: Test samples, array-like of shape (n_test, d).
        dis_type: Metric name. ``'l2'`` gives the Euclidean distance,
            ``'cosine'`` gives the cosine distance ``1 - cos(x, y)``.

    Returns:
        ndarray of shape (n_test, n_train). Row ``i`` holds the distances
        from test sample ``i`` to every training sample. For both metrics a
        smaller value means a closer sample, so the result can be ranked
        with an ascending ``argsort``.

    Raises:
        ValueError: If ``dis_type`` is not a supported metric.
    """
    x_tr = np.asarray(x_tr, dtype=float)
    x_test = np.asarray(x_test, dtype=float)

    if dis_type == 'l2':
        # l2 = sqrt(sum((x - y)**2)). Broadcasting (n_test, 1, d) against
        # (1, n_train, d) yields every pairwise difference without building
        # explicit tiled copies of both matrices.
        diff = x_test[:, np.newaxis, :] - x_tr[np.newaxis, :, :]
        return np.sqrt(np.sum(diff ** 2, axis=2))

    if dis_type == 'cosine':
        # cos = A.dot(B) / (|A| * |B|)
        dot = np.dot(x_test, x_tr.T)
        norm_te = np.sqrt(np.sum(x_test ** 2, axis=1)).reshape((-1, 1))
        norm_tr = np.sqrt(np.sum(x_tr ** 2, axis=1)).reshape((1, -1))
        denom = norm_te * norm_tr
        # A zero vector has no direction; treat its similarity as 0 instead
        # of producing NaN from 0 / 0.
        similarity = np.divide(
            dot, denom, out=np.zeros_like(dot), where=denom != 0)
        # Return a distance, not a similarity: callers rank neighbours in
        # ascending order, so the most similar sample must have the smallest
        # value.
        return 1.0 - similarity

    raise ValueError(
        "unsupported dis_type: %r (expected 'l2' or 'cosine')" % (dis_type,))
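initialize prediction list
for each test sample (row of the distance matrix):
    initialize vote dictionary
    for i in range(k):
        look up the label of the i-th nearest training sample
        add one vote for that label in the dictionary
    sort dictionary by vote count (descending)
    append the top-voted label to the prediction list
return prediction list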
def predict(self, x_tr, y_tr, x_test, y_test, k, dis_type='cosine'):
    """Predict a label for every test sample by k-nearest-neighbour voting.

    Args:
        x_tr: Training samples, shape (n_train, d).
        y_tr: Training labels, shape (n_train,) or (n_train, 1). Labels are
            converted with ``int()`` before voting.
        x_test: Test samples, shape (n_test, d).
        y_test: Unused; kept so existing callers keep working.
        k: Number of nearest neighbours that vote, ``1 <= k <= n_train``.
        dis_type: Metric name forwarded to ``self.distant``. Defaults to
            ``'cosine'``, the metric this method previously hard-coded.

    Returns:
        Float ndarray of shape (n_test, 1) with the winning label per test
        sample. Ties go to the label met first among the nearest neighbours.

    Raises:
        ValueError: If ``k`` is outside ``1..n_train``.
    """
    # Flatten so each lookup yields a scalar; int() on a 1-element array
    # is deprecated in recent NumPy.
    labels = np.asarray(y_tr).ravel()
    n_train = labels.shape[0]
    if not 1 <= k <= n_train:
        raise ValueError(
            "k must be between 1 and the number of training samples "
            "(%d), got %r" % (n_train, k))

    dist_mat = self.distant(x_tr, x_test, dis_type)
    # Ascending sort: the first k columns are the k closest training samples.
    nearest = np.argsort(dist_mat, axis=1)[:, :k]

    votes = []
    for neighbour_idx in nearest:
        class_count = {}
        for idx in neighbour_idx:
            vote_label = int(labels[idx])
            class_count[vote_label] = class_count.get(vote_label, 0) + 1
        # max() returns the first key with the highest count, which is the
        # same tie-break as a stable descending sort by count.
        votes.append(max(class_count, key=class_count.get))

    # Build the array once instead of np.append per sample (quadratic).
    return np.array(votes, dtype=float).reshape((-1, 1))
def evalution(self, y_predict, y):
    """Compute accuracy, precision and recall for binary 0/1 labels.

    Args:
        y_predict: Predicted labels, any shape; flattened internally.
        y: Ground-truth labels with the same number of elements.

    Returns:
        Tuple ``(accuracy, precision, recall)`` of floats, with label ``1``
        as the positive class. A ratio whose denominator is zero (for
        example no positive predictions) is reported as ``0.0``.

    Raises:
        ValueError: If the two label arrays differ in size.
    """
    # The parameter is named ``y``; the previous body read an undefined
    # ``y_test`` and raised UnboundLocalError on every call.
    y_true = np.asarray(y).ravel()
    y_pred = np.asarray(y_predict).ravel()
    if y_true.shape[0] != y_pred.shape[0]:
        raise ValueError(
            "y_predict and y must have the same number of elements, got "
            "%d and %d" % (y_pred.shape[0], y_true.shape[0]))

    num_test = y_true.shape[0]
    pred_pos = y_pred == 1
    true_pos = y_true == 1

    # Confusion-matrix counts taken directly from boolean masks.
    num_tp = int(np.sum(pred_pos & true_pos))
    num_fp = int(np.sum(pred_pos & ~true_pos))
    num_fn = int(np.sum(~pred_pos & true_pos))
    num_correct = int(np.sum(y_pred == y_true))

    accuracy = num_correct / num_test if num_test else 0.0
    precision = num_tp / (num_tp + num_fp) if (num_tp + num_fp) else 0.0
    recall = num_tp / (num_tp + num_fn) if (num_tp + num_fn) else 0.0
    return accuracy, precision, recall
这里我加了一个交叉验证,将训练集又分为训练集和验证集。主要加深了我对numpy的array结构和生成器yield的理解:可以将这个数组分为几组(整除情况下),取其中的一个组作为验证集,将剩下的concatenate起来作为训练集,然后用yield逐组返回,共返回fold组(n fold)。每一组里面分别是训练集数据、标签和验证集数据、标签。
def validation(self, x_train, y_train, fold):
    """Generate train/validation splits for n-fold cross-validation.

    The samples are cut into ``fold`` consecutive groups along axis 0. Each
    group is used once as the validation set while the remaining groups
    form the training set.

    Args:
        x_train: Samples, shape (n, d).
        y_train: Labels with n entries along axis 0.
        fold: Number of folds, at least 1.

    Yields:
        Tuples ``(x_tr, y_tr, x_val, y_val)``. ``x_tr`` has shape (-1, d)
        and ``y_tr`` has shape (-1, 1). ``x_val`` and ``y_val`` are the
        held-out rows, with the same trailing shape as the inputs.

    Raises:
        ValueError: If ``fold`` is smaller than 1.
    """
    if fold < 1:
        raise ValueError("fold must be at least 1, got %r" % (fold,))

    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)
    n_samples = x_train.shape[0]
    dims = x_train.shape[1]

    if n_samples % fold != 0:
        # Uneven split: still produce every fold, but warn that the groups
        # differ in size (array_split gives the leading groups one extra).
        print("Attention: vary size.")

    for val_idx in np.array_split(np.arange(n_samples), fold):
        # Everything outside the current validation group is training data.
        keep = np.ones(n_samples, dtype=bool)
        keep[val_idx] = False
        x_tr = x_train[keep].reshape((-1, dims))
        y_tr = y_train[keep].reshape((-1, 1))
        yield (x_tr, y_tr, x_train[val_idx], y_train[val_idx])
# Cross-validation driver: request 4 folds from model.validation and, for each
# split, predict labels for the validation part from the training part.
# NOTE(review): `model`, `x_train`, `y_train` and `k` are defined outside this
# snippet, and the fold index `i` is not used in the lines shown here.
for i, (x_tr, y_tr, x_val, y_val) in enumerate(model.validation(x_train, y_train, 4)):
    y_predict = model.predict(x_tr, y_tr, x_val, y_val, k)
our: 0.96875 0.9634146341463414 0.9875
sklearn: 0.96875 0.9634146341463414 0.9875
our: 0.96875 0.9767441860465116 0.9767441860465116
sklearn: 0.96875 0.9767441860465116 0.9767441860465116
our: 0.90625 0.9166666666666666 0.9390243902439024
sklearn: 0.90625 0.9166666666666666 0.9390243902439024
our: 0.8984375 0.8831168831168831 0.9444444444444444
sklearn: 0.8984375 0.8831168831168831 0.9444444444444444
our: 0.125 0.0 0.0
sklearn: 0.96875 0.9634146341463414 0.9875
our: 0.1171875 0.034482758620689655 0.011627906976744186
sklearn: 0.96875 0.9767441860465116 0.9767441860465116
our: 0.203125 0.0 0.0
sklearn: 0.90625 0.9166666666666666 0.9390243902439024
our: 0.25 0.038461538461538464 0.013888888888888888
sklearn: 0.8984375 0.8831168831168831 0.9444444444444444