不废话,先上代码。
package com.pca;
public class Main {

    /** Separator line printed between the sections of the report. */
    private static final String SEPARATOR = "======";

    /**
     * Demo entry point: runs the PCA on a fixed 15 x 8 sample matrix and
     * prints, in order, the selected eigenvalue indices, all eigenvalues,
     * all eigenvectors, the principal eigenvalues, the principal
     * eigenvectors and finally the projected data.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) {
        // Raw data: one row per sample, one column per attribute.
        double[][] rawData = {
                { 40.4, 24.7, 7.2, 6.1, 8.3, 8.7, 2.442, 20.0 },
                { 25.0, 12.7, 11.2, 11.0, 12.9, 20.2, 3.542, 9.1 },
                { 13.2, 3.3, 3.9, 4.3, 4.4, 5.5, 0.578, 3.6 },
                { 22.3, 6.7, 5.6, 3.7, 6.0, 7.4, 0.176, 7.3 },
                { 34.3, 11.8, 7.1, 7.1, 8.0, 8.9, 1.726, 27.5 },
                { 35.6, 12.5, 16.4, 16.7, 22.8, 29.3, 3.017, 26.6 },
                { 22.0, 7.8, 9.9, 10.2, 12.6, 17.6, 0.847, 10.6 },
                { 48.4, 13.4, 10.9, 9.9, 10.9, 13.9, 1.772, 1.772 },
                { 40.6, 19.1, 19.8, 19.0, 29.7, 39.6, 2.449, 35.8 },
                { 24.8, 8.0, 9.8, 8.9, 11.9, 16.2, 0.789, 13.7 },
                { 12.5, 9.7, 4.2, 4.2, 4.6, 6.5, 0.874, 3.9 },
                { 1.8, 0.6, 0.7, 0.7, 0.8, 1.1, 0.056, 1.0 },
                { 32.3, 13.9, 9.4, 8.3, 9.8, 13.3, 2.126, 17.1 },
                { 38.5, 9.1, 11.3, 9.5, 12.2, 16.4, 1.327, 11.6 },
                { 26.2, 10.1, 5.6, 15.6, 7.7, 30.1, 0.126, 25.9 } };

        PrincipalComponentAnalysis pca = new PrincipalComponentAnalysis();
        pca.buildPCA(rawData);

        // Indices of the eigenvalues that were kept.
        for (int index : pca.getSelected()) {
            System.out.print(index + " ");
        }
        System.out.println();
        System.out.println(SEPARATOR);

        printRow(pca.getEigenValues());
        System.out.println(SEPARATOR);

        printMatrix(pca.getEigenVectors());
        System.out.println();
        System.out.println(SEPARATOR);

        printRow(pca.getPrincipalEigenValues());
        System.out.println(SEPARATOR);

        printMatrix(pca.getPrincipalEigenVectors());
        System.out.println();
        System.out.println(SEPARATOR);

        // The last section is not followed by a blank line or a separator.
        printMatrix(pca.getPrincipalData());
    }

    /** Prints every value followed by a single space, then ends the line. */
    private static void printRow(double[] values) {
        for (double value : values) {
            System.out.print(value + " ");
        }
        System.out.println();
    }

    /**
     * Prints the matrix one row per line, every value followed by a single
     * space. The width of the first row is used for all rows.
     */
    private static void printMatrix(double[][] matrix) {
        for (double[] row : matrix) {
            for (int col = 0; col < matrix[0].length; col++) {
                System.out.print(row[col] + " ");
            }
            System.out.println();
        }
    }
}
package com.pca;
import no.uib.cipr.matrix.DenseMatrix;
import no.uib.cipr.matrix.EVD;
import no.uib.cipr.matrix.Matrix;
import no.uib.cipr.matrix.NotConvergedException;
import org.apache.log4j.Logger;
/**
 * Principal component analysis based on the sample correlation matrix.
 *
 * <p>Usage: create an instance (optionally with the cumulative proportion of
 * the eigenvalue sum that the kept components must cover), call
 * {@link #buildPCA(double[][])} once, then read the results through the
 * getters. Every getter returns the internal array itself (no copy) and
 * returns {@code null} until {@code buildPCA} has completed.
 */
public class PrincipalComponentAnalysis {
    private static final Logger logger = Logger
            .getLogger(PrincipalComponentAnalysis.class);

    private double[][] rawData = null; // input data, rows = samples, columns = attributes
    private double[][] principalData = null; // data after projection onto the principal eigenvectors
    private double[] principalEigenValues = null; // eigenvalues that were kept
    private double[][] principalEigenVectors = null; // kept eigenvectors, one per column
    private double[] eigenValues = null; // all eigenvalues, in the order produced by the decomposition
    private double[][] eigenVectors = null; // all eigenvectors, one per column
    private int[] selected = null; // indices (into eigenValues) of the kept eigenvalues
    private double proportion = 0.9; // share of the eigenvalue sum to cover, 90% by default

    /**************** constructors ****************/

    /** Creates an analysis that uses the default proportion of 0.9. */
    public PrincipalComponentAnalysis() {
    }

    /**
     * Creates an analysis with a custom proportion.
     *
     * @param proportion share of the total eigenvalue sum used as the
     *                   selection threshold (the default constructor uses 0.9)
     */
    public PrincipalComponentAnalysis(double proportion) {
        this.proportion = proportion;
    }

    /**************** getters ****************/

    /** @return the matrix last passed to {@link #buildPCA(double[][])} */
    public double[][] getRawData() {
        return rawData;
    }

    /** @return the raw data multiplied by the principal eigenvector matrix */
    public double[][] getPrincipalData() {
        return principalData;
    }

    /** @return the kept eigenvalues, largest first */
    public double[] getPrincipalEigenValues() {
        return principalEigenValues;
    }

    /** @return the kept eigenvectors; column i belongs to principal eigenvalue i */
    public double[][] getPrincipalEigenVectors() {
        return principalEigenVectors;
    }

    /** @return all eigenvalues of the correlation matrix */
    public double[] getEigenValues() {
        return eigenValues;
    }

    /** @return all eigenvectors of the correlation matrix, one per column */
    public double[][] getEigenVectors() {
        return eigenVectors;
    }

    /** @return indices into {@link #getEigenValues()} of the kept eigenvalues */
    public int[] getSelected() {
        return selected;
    }

    /** @return the configured selection threshold */
    public double getProportion() {
        return proportion;
    }

    /**************** internal steps ****************/

    /**
     * Rejects input the algorithm cannot process, so that callers get a
     * descriptive exception instead of a NullPointerException or
     * ArrayIndexOutOfBoundsException from deep inside the computation.
     */
    private static void validateRawData(double[][] rawData) {
        if (rawData == null || rawData.length == 0 || rawData[0] == null
                || rawData[0].length == 0) {
            throw new IllegalArgumentException("There is no raw data.");
        }
        if (rawData.length < 2) {
            // The sample variance divides by (N - 1).
            throw new IllegalArgumentException(
                    "At least two samples are required, got " + rawData.length);
        }
        int width = rawData[0].length;
        for (int i = 1; i < rawData.length; i++) {
            if (rawData[i] == null || rawData[i].length != width) {
                throw new IllegalArgumentException("Row " + i
                        + " does not have " + width + " columns.");
            }
        }
    }

    /**
     * Standardizes every column to mean 0 and sample variance 1.
     *
     * <p>NOTE: a column whose values are all equal has variance 0, so the
     * division below yields NaN for that column.
     *
     * @param rawData validated N x p matrix
     * @return new N x p matrix holding the standardized values
     */
    private double[][] calcStandardlizer(double[][] rawData) {
        int n = rawData.length; // number of samples
        int p = rawData[0].length; // number of attributes
        double[][] standardData = new double[n][p];
        for (int col = 0; col < p; col++) {
            double sum = 0;
            for (int row = 0; row < n; row++) {
                sum += rawData[row][col];
            }
            double mean = sum / n;

            double squares = 0;
            for (int row = 0; row < n; row++) {
                double diff = rawData[row][col] - mean;
                squares += diff * diff;
            }
            double stdDev = Math.sqrt(squares / (n - 1));

            for (int row = 0; row < n; row++) {
                standardData[row][col] = (rawData[row][col] - mean) / stdDev;
            }
        }
        return standardData;
    }

    /**
     * Builds the p x p sample correlation matrix. Because the input columns
     * already have mean 0 and variance 1, the correlation of two columns is
     * their dot product divided by (n - 1).
     *
     * @param standardData standardized n x p matrix
     * @return p x p correlation matrix
     */
    private double[][] calcCoefficientOfAssociation(double[][] standardData) {
        int n = standardData.length;
        int p = standardData[0].length;
        double[][] assosiationMatrix = new double[p][p];
        for (int i = 0; i < p; i++) {
            for (int j = 0; j < p; j++) {
                double dot = 0;
                for (int k = 0; k < n; k++) {
                    dot += standardData[k][i] * standardData[k][j];
                }
                assosiationMatrix[i][j] = dot / (n - 1);
            }
        }
        return assosiationMatrix;
    }

    /**
     * Runs the eigenvalue decomposition once; both the eigenvalues and the
     * eigenvectors are read from the returned object.
     *
     * @param assosiationMatrix square matrix to decompose (copied, not modified)
     * @return the factored decomposition
     * @throws IllegalStateException if the decomposition does not converge
     */
    private EVD decompose(double[][] assosiationMatrix) {
        // DenseMatrix(double[][]) is given a fresh copy to factor.
        DenseMatrix association = new DenseMatrix(assosiationMatrix);
        EVD evd = new EVD(assosiationMatrix.length);
        try {
            evd.factor(association);
        } catch (NotConvergedException e) {
            logger.error("Eigenvalue decomposition did not converge.", e);
            throw new IllegalStateException(
                    "Eigenvalue decomposition did not converge.", e);
        }
        return evd;
    }

    /** Copies a matrix into a newly allocated two-dimensional array. */
    private static double[][] toArray(Matrix matrix) {
        double[][] values = new double[matrix.numRows()][matrix.numColumns()];
        for (int i = 0; i < matrix.numRows(); i++) {
            for (int j = 0; j < matrix.numColumns(); j++) {
                values[i][j] = matrix.get(i, j);
            }
        }
        return values;
    }

    /**
     * Picks the largest eigenvalues: walking through the eigenvalues from
     * largest to smallest, a component is added as long as the share of the
     * total covered by the components added so far is still
     * {@code <= proportion}.
     *
     * @param eigenValues all eigenvalues, in any order
     * @return indices into {@code eigenValues} of the kept ones, largest first
     */
    private int[] selectPrincipalComponent(double[] eigenValues) {
        int p = eigenValues.length;

        // Stable insertion sort of the indices by descending eigenvalue.
        // p is the number of attributes, so a quadratic sort is adequate.
        int[] order = new int[p];
        for (int i = 0; i < p; i++) {
            order[i] = i;
        }
        for (int i = 1; i < p; i++) {
            int index = order[i];
            double value = eigenValues[index];
            int j = i - 1;
            while (j >= 0 && eigenValues[order[j]] < value) {
                order[j + 1] = order[j];
                j--;
            }
            order[j + 1] = index;
        }

        double total = 0.0;
        for (int i = 0; i < p; i++) {
            total += eigenValues[order[i]];
        }

        int count = 0;
        double covered = 0.0;
        while (count < p && covered / total <= proportion) {
            covered += eigenValues[order[count]];
            count++;
        }

        int[] selected = new int[count];
        System.arraycopy(order, 0, selected, 0, count);
        return selected;
    }

    /**
     * @param eigenValues all eigenvalues
     * @param selected    indices of the eigenvalues to keep
     * @return the kept eigenvalues in the order given by {@code selected}
     */
    private double[] calcPrincipalEigenValues(double[] eigenValues,
            int[] selected) {
        double[] principalEigenValues = new double[selected.length];
        for (int i = 0; i < selected.length; i++) {
            principalEigenValues[i] = eigenValues[selected[i]];
        }
        return principalEigenValues;
    }

    /**
     * Builds the transformation matrix from the kept eigenvector columns.
     *
     * @param eigenVectors p x p matrix with one eigenvector per column
     * @param selected     indices of the columns to keep
     * @return p x selected.length matrix; column i is column selected[i] of the input
     */
    private double[][] calcPrincipalEigenVectors(double[][] eigenVectors,
            int[] selected) {
        int p = eigenVectors.length;
        double[][] principalEigenVectors = new double[p][selected.length];
        for (int row = 0; row < p; row++) {
            for (int i = 0; i < selected.length; i++) {
                principalEigenVectors[row][i] = eigenVectors[row][selected[i]];
            }
        }
        return principalEigenVectors;
    }

    /**
     * Projects the data onto the principal eigenvectors.
     *
     * <p>NOTE(review): the raw (not standardized) data is projected, although
     * the eigenvectors come from the correlation matrix of the standardized
     * data. Kept as is because callers may rely on the current output;
     * confirm whether the standardized data should be used instead.
     *
     * @param rawData      N x p input matrix
     * @param transformation p x k matrix of kept eigenvectors
     * @return N x k matrix rawData * transformation
     */
    private double[][] calcPrincipalComponent(double[][] rawData,
            double[][] transformation) {
        Matrix a = new DenseMatrix(rawData);
        Matrix b = new DenseMatrix(transformation);
        Matrix c = new DenseMatrix(rawData.length, transformation[0].length);
        a.mult(b, c); // c = a * b
        return toArray(c);
    }

    /**************** main flow ****************/

    /**
     * Runs the complete analysis and stores every intermediate result in
     * this object: standardize the columns, build the correlation matrix,
     * decompose it, select the largest eigenvalues and project the raw data
     * onto the matching eigenvectors.
     *
     * @param rawData rectangular matrix with at least two rows (samples) and
     *                at least one column (attribute)
     * @throws IllegalArgumentException if {@code rawData} is null, empty,
     *                                  has fewer than two rows or is not rectangular
     * @throws IllegalStateException    if the eigenvalue decomposition does
     *                                  not converge
     */
    public void buildPCA(double[][] rawData) {
        validateRawData(rawData);
        this.rawData = rawData;
        double[][] standardData = calcStandardlizer(rawData);
        double[][] assosiationMatrix = calcCoefficientOfAssociation(standardData);
        EVD evd = decompose(assosiationMatrix);
        this.eigenValues = evd.getRealEigenvalues();
        this.eigenVectors = toArray(evd.getLeftEigenvectors());
        this.selected = selectPrincipalComponent(eigenValues);
        this.principalEigenValues = calcPrincipalEigenValues(eigenValues,
                selected);
        this.principalEigenVectors = calcPrincipalEigenVectors(eigenVectors,
                selected);
        this.principalData = calcPrincipalComponent(rawData,
                principalEigenVectors);
    }
}
PCA的原理和过程不难理解,主要分为以下四步:
1. 先计算原始数据的相关系数矩阵
2. 求相关系数矩阵的特征值和特征向量
3. 选择特征值最大的n个特征值对应的特征向量作为转换矩阵
4. 原始数据矩阵和转换矩阵相乘,得到结果
本质上,PCA是一个坐标转换:将原来不好的坐标系转换为好的坐标系,把每条样本的原始属性变为新属性。新属性具有以下两个特点:
第一,各个新属性之间互不相关(注意:不相关并不等同于统计独立)
第二,各个新属性的方差较大,数据具有区分度