如果一个样本在特征空间中的k个最相似(即特征空间中最邻近)的样本中的大多数属于某一个类别,则该样本也属于这个类别。该方法在定类决策上只依据最邻近的一个或者几个样本的类别来决定待分样本所属的类别。
KNN方法虽然从原理上也依赖于极限定理,但在类别决策时,只与极少量的相邻样本有关。因此,采用这种方法可以较好地避免样本的不平衡问题。另外,由于KNN方法主要靠周围有限的邻近的样本,而不是靠判别类域的方法来确定所属类别的,因此对于类域的交叉或重叠较多的待分样本集来说, KNN方法较其他方法更为适合。
该方法的不足之处是计算量较大,因为对每一个待分类的样本都要计算它到全体已知样本的距离,才能求得它的K个最近邻点。目前常用的解决方法是事先对已知样本点进行剪辑,去除对分类作用不大的样本。另外还有一种Reverse KNN法,能降低KNN算法的计算复杂度,提高分类的效率。
该算法比较适用于样本容量比较大的类域的自动分类,而那些样本容量较小的类域采用这种算法比较容易产生误分。
算法步骤:
step.1---初始化距离为最大值
step.2---计算未知样本和每个训练样本的距离dist
step.3---得到目前K个最邻近样本中的最大距离maxdist
step.4---如果dist小于maxdist,则将该训练样本作为K-最近邻样本,并替换掉当前K个近邻中距离最大的那个样本
step.5---重复步骤2、3、4,直到未知样本和所有训练样本的距离都算完
step.6---统计K-最近邻样本中每个类标号出现的次数
step.7---选择出现频率最大的类标号作为未知样本的类标号
/****************************************************************************************\ * K-Nearest Neighbour Classifier * \****************************************************************************************/ // k Nearest Neighbors class CV_EXPORTS CvKNearest : public CvStatModel { public: CvKNearest(); virtual ~CvKNearest(); CvKNearest( const CvMat* _train_data, const CvMat* _responses, const CvMat* _sample_idx=0, bool _is_regression=false, int max_k=32 ); virtual bool train( const CvMat* _train_data, const CvMat* _responses, const CvMat* _sample_idx=0, bool is_regression=false, int _max_k=32, bool _update_base=false ); virtual float find_nearest( const CvMat* _samples, int k, CvMat* results=0, const float** neighbors=0, CvMat* neighbor_responses=0, CvMat* dist=0 ) const; #ifndef SWIG CvKNearest( const cv::Mat& _train_data, const cv::Mat& _responses, const cv::Mat& _sample_idx=cv::Mat(), bool _is_regression=false, int max_k=32 ); virtual bool train( const cv::Mat& _train_data, const cv::Mat& _responses, const cv::Mat& _sample_idx=cv::Mat(), bool is_regression=false, int _max_k=32, bool _update_base=false ); virtual float find_nearest( const cv::Mat& _samples, int k, cv::Mat* results=0, const float** neighbors=0, cv::Mat* neighbor_responses=0, cv::Mat* dist=0 ) const; #endif virtual void clear(); int get_max_k() const; int get_var_count() const; int get_sample_count() const; bool is_regression() const; protected: virtual float write_results( int k, int k1, int start, int end, const float* neighbor_responses, const float* dist, CvMat* _results, CvMat* _neighbor_responses, CvMat* _dist, Cv32suf* sort_buf ) const; virtual void find_neighbors_direct( const CvMat* _samples, int k, int start, int end, float* neighbor_responses, const float** neighbors, float* dist ) const; int max_k, var_count; int total; bool regression; CvVectors* samples; };
//源码引用自:http://www.mysjtu.com/page/M0/S914/914320.html #include "stdafx.h" #include <ml.h> #include <iostream> #include <highgui.h> #include <cv.h> #include <cxcore.h> using namespace cv; using namespace std; int main( int argc, char** argv ) { const int K = 20; int i, j, k, accuracy; float response; int train_sample_count = 100; CvRNG rng_state = cvRNG(-1);//初始化随机数生成器状态 CvMat* trainData = cvCreateMat( train_sample_count, 2, CV_32FC1 ); CvMat* trainClasses = cvCreateMat( train_sample_count, 1, CV_32FC1 ); IplImage* img = cvCreateImage( cvSize( 500, 500 ), 8, 3 ); float _sample[2]; CvMat sample = cvMat( 1, 2, CV_32FC1, _sample ); cvZero( img ); CvMat trainData1, trainData2, trainClasses1, trainClasses2; // form the training samples cvGetRows( trainData, &trainData1, 0, train_sample_count/2 ); //返回数组的一行或在一定跨度内的行 cvRandArr( &rng_state, &trainData1, CV_RAND_NORMAL, cvScalar(200,200), cvScalar(50,50) ); //用随机数填充数组并更新 RNG 状态 cvGetRows( trainData, &trainData2, train_sample_count/2, train_sample_count ); cvRandArr( &rng_state, &trainData2, CV_RAND_NORMAL, cvScalar(300,300), cvScalar(50,50) ); cvGetRows( trainClasses, &trainClasses1, 0, train_sample_count/2 ); cvSet( &trainClasses1, cvScalar(1) ); cvGetRows( trainClasses, &trainClasses2, train_sample_count/2, train_sample_count ); cvSet( &trainClasses2, cvScalar(2) ); // learn classifier CvKNearest knn( trainData, trainClasses, 0, false, K ); CvMat* nearests = cvCreateMat( 1, K, CV_32FC1); for( i = 0; i < img->height; i++ ) { for( j = 0; j < img->width; j++ ) { sample.data.fl[0] = (float)j; sample.data.fl[1] = (float)i; // estimates the response and get the neighbors' labels response = knn.find_nearest(&sample,K,0,0,nearests,0); // compute the number of neighbors representing the majority for( k = 0, accuracy = 0; k < K; k++ ) { if( nearests->data.fl[k] == response) accuracy++; } // highlight the pixel depending on the accuracy (or confidence) cvSet2D( img, i, j, response == 1 ? (accuracy > 5 ? 
CV_RGB(180,0,0) : CV_RGB(180,120,0)) : (accuracy > 5 ? CV_RGB(0,180,0) : CV_RGB(120,120,0)) ); } } //显示分类后训练集 // display the original training samples for( i = 0; i < train_sample_count/2; i++ ) { CvPoint pt; pt.x = cvRound(trainData1.data.fl[i*2]); pt.y = cvRound(trainData1.data.fl[i*2+1]); cvCircle( img, pt, 2, CV_RGB(255,0,0), CV_FILLED ); pt.x = cvRound(trainData2.data.fl[i*2]); pt.y = cvRound(trainData2.data.fl[i*2+1]); cvCircle( img, pt, 2, CV_RGB(0,255,0), CV_FILLED ); } cvNamedWindow( "classifier result", 1 ); cvShowImage( "classifier result", img ); cvWaitKey(0); cvReleaseMat( &trainClasses ); cvReleaseMat( &trainData ); return 0; }
参考:http://www.cnblogs.com/wengzilin/archive/2013/04/05/3001778.html
http://www.cnblogs.com/seacode/archive/2011/03/09/1979246.html
http://blog.csdn.net/yangtrees/article/details/7482890
http://www.cnblogs.com/xiangshancuizhu/archive/2011/08/06/2129355.html
http://blog.csdn.net/godenlove007/article/details/8084863
http://www.doc88.com/p-710937416243.html
http://blog.csdn.net/xlm289348/article/details/8876353