【数据挖掘】k-邻近算法

#!/usr/bin/python
#-*-encoding:utf-8-*-
import numpy as np
import operator

def classify(vect,dataset,lables,k=5):
    """Classify ``vect`` by majority vote among its k nearest training samples.

    Distance is Euclidean, computed between ``vect`` and every row of
    ``dataset``.

    Args:
        vect: feature vector to classify; its length must match the number
            of columns of ``dataset``.
        dataset: 2-D numpy array with one training sample per row.
        lables: sequence of labels where ``lables[i]`` belongs to
            ``dataset[i]`` (parameter spelling kept so keyword callers
            keep working).
        k: number of nearest neighbours that vote. Values larger than the
            number of samples are clamped to the number of samples.

    Returns:
        The label that received the most votes.

    Raises:
        ValueError: if ``dataset`` has no rows or ``k`` is smaller than 1.
    """
    rows = dataset.shape[0]
    if rows == 0 or k < 1:
        raise ValueError("classify needs a non-empty dataset and k >= 1")
    # Asking for more neighbours than there are samples would index past
    # the end of the sorted indices, so vote with every sample instead.
    k = min(k, rows)

    # Broadcasting subtracts vect from every row; no explicit tiling needed.
    diffmat = dataset - np.asarray(vect)
    distances = (diffmat ** 2).sum(axis=1) ** 0.5
    # Indices of the samples ordered from nearest to farthest.
    sortedIndices = distances.argsort()

    classCount = {}
    for index in sortedIndices[:k]:
        votedLabel = lables[index]
        # Exactly one vote per neighbour.
        classCount[votedLabel] = classCount.get(votedLabel, 0) + 1

    # Order the labels by vote count, most votes first. items() works on
    # both Python 2 and Python 3 (iteritems() is Python 2 only).
    sortedClassCount = sorted(classCount.items(),
        key=operator.itemgetter(1),reverse=True)
    return sortedClassCount[0][0]

def load(filename,cols,max_rows=101):
    """Load a tab-separated sample file into a feature matrix and label lists.

    Each non-blank line must hold ``cols`` numeric fields followed by one
    label field, separated by tabs.

    Args:
        filename: path of the text file to read.
        cols: number of feature columns expected on every line.
        max_rows: maximum number of samples to load. The default of 101
            keeps the cap the function has always applied; pass ``None``
            to load the whole file.

    Returns:
        A tuple ``(mat, label, labelval)``:
        mat is a float numpy array of shape (loaded rows, cols);
        label is the list of label strings, one per loaded row;
        labelval is a list of floats, one per loaded row, where each
        distinct label is encoded as the 1-based position of the row on
        which it first appeared.

    Raises:
        ValueError: if a line does not contain exactly ``cols`` feature
            fields or a feature field is not a number.
    """
    features = []
    label = []
    labelval = []
    valdict = {}
    # The with-block guarantees the file is closed, even on a parse error.
    with open(filename) as fr:
        for line in fr:
            if max_rows is not None and len(label) >= max_rows:
                break
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no sample.
                continue
            listLine = line.split("\t")
            values = [float(field) for field in listLine[:-1]]
            if len(values) != cols:
                raise ValueError(
                    "expected %d feature columns but found %d in line %r"
                    % (cols, len(values), line))
            features.append(values)
            name = listLine[-1]
            label.append(name)
            if name not in valdict:
                # Code of a label = 1-based row number of its first use.
                valdict[name] = float(len(label))
            labelval.append(valdict[name])
    # Size the matrix to the rows actually loaded so that it always lines
    # up with label/labelval (no trailing all-zero rows).
    mat = np.array(features, dtype=float).reshape(len(features), cols)
    return mat,label,labelval
# Min-max normalisation of the data set
def normalize(dataset):
    """Rescale every column of ``dataset`` linearly into the range [0, 1].

    Each value becomes ``(value - column_min) / (column_max - column_min)``.
    A column whose values are all equal has a range of zero; such a column
    is mapped to all zeros instead of dividing by zero.

    Args:
        dataset: 2-D array-like of numbers with one sample per row.

    Returns:
        A new float numpy array with the same shape as ``dataset``; the
        input is not modified.
    """
    # Work on a float array so integer input is not floor-divided.
    data = np.asarray(dataset, dtype=float)
    # Column-wise minimum and value range (axis 0 runs down the rows).
    minval = data.min(0)
    ranges = data.max(0) - minval
    # A constant column would give 0/0; dividing by 1 yields 0 there.
    ranges = np.where(ranges == 0, 1.0, ranges)
    # Broadcasting applies the per-column min and range to every row.
    return (data - minval) / ranges

   

你可能感兴趣的:(【数据挖掘】k-邻近算法)