PCA主成分分析

不废话,先上代码。

package com.pca;

public class Main {
    public static void main(String[] args) {
        // Sample data set: 15 observations, 8 attributes each.
        // NOTE(review): the last row-8 value repeats 1.772 (columns 7 and 8)
        // — possibly a transcription typo in the original data; verify.
        double[][] rawData = new double[][] {
                { 40.4, 24.7, 7.2, 6.1, 8.3, 8.7, 2.442, 20.0 },
                { 25.0, 12.7, 11.2, 11.0, 12.9, 20.2, 3.542, 9.1 },
                { 13.2, 3.3, 3.9, 4.3, 4.4, 5.5, 0.578, 3.6 },
                { 22.3, 6.7, 5.6, 3.7, 6.0, 7.4, 0.176, 7.3 },
                { 34.3, 11.8, 7.1, 7.1, 8.0, 8.9, 1.726, 27.5 },
                { 35.6, 12.5, 16.4, 16.7, 22.8, 29.3, 3.017, 26.6 },
                { 22.0, 7.8, 9.9, 10.2, 12.6, 17.6, 0.847, 10.6 },
                { 48.4, 13.4, 10.9, 9.9, 10.9, 13.9, 1.772, 1.772 },
                { 40.6, 19.1, 19.8, 19.0, 29.7, 39.6, 2.449, 35.8 },
                { 24.8, 8.0, 9.8, 8.9, 11.9, 16.2, 0.789, 13.7 },
                { 12.5, 9.7, 4.2, 4.2, 4.6, 6.5, 0.874, 3.9 },
                { 1.8, 0.6, 0.7, 0.7, 0.8, 1.1, 0.056, 1.0 },
                { 32.3, 13.9, 9.4, 8.3, 9.8, 13.3, 2.126, 17.1 },
                { 38.5, 9.1, 11.3, 9.5, 12.2, 16.4, 1.327, 11.6 },
                { 26.2, 10.1, 5.6, 15.6, 7.7, 30.1, 0.126, 25.9 } };

        PrincipalComponentAnalysis pca = new PrincipalComponentAnalysis();
        pca.buildPCA(rawData);

        // Indices of the eigenvalues chosen as principal components.
        for (int index : pca.getSelected()) {
            System.out.print(index + " ");
        }
        System.out.println();
        System.out.println("======");

        // All eigenvalues of the correlation matrix.
        printRow(pca.getEigenValues());
        System.out.println("======");

        // All eigenvectors (one per column).
        printMatrix(pca.getEigenVectors());
        System.out.println();
        System.out.println("======");

        // Eigenvalues of the selected components.
        printRow(pca.getPrincipalEigenValues());
        System.out.println("======");

        // Eigenvectors of the selected components (the transform matrix).
        printMatrix(pca.getPrincipalEigenVectors());
        System.out.println();
        System.out.println("======");

        // The data projected onto the principal components.
        printMatrix(pca.getPrincipalData());
    }

    // Prints one vector as space-separated values followed by a newline.
    private static void printRow(double[] row) {
        for (double value : row) {
            System.out.print(value + " ");
        }
        System.out.println();
    }

    // Prints a matrix one row per line.
    private static void printMatrix(double[][] matrix) {
        for (double[] row : matrix) {
            printRow(row);
        }
    }
}

package com.pca;

import no.uib.cipr.matrix.DenseMatrix;
import no.uib.cipr.matrix.EVD;
import no.uib.cipr.matrix.Matrix;
import no.uib.cipr.matrix.NotConvergedException;

import org.apache.log4j.Logger;

/**
 * Principal Component Analysis over a samples-by-attributes data matrix.
 *
 * <p>Pipeline (see {@link #buildPCA(double[][])}): standardize each column to
 * zero mean / unit variance, build the correlation matrix, eigen-decompose it,
 * pick the largest eigenvalues until their cumulative share of the total
 * exceeds {@code proportion}, and project the data onto the corresponding
 * eigenvectors.
 */
public class PrincipalComponentAnalysis {
    private static Logger logger = Logger
            .getLogger(PrincipalComponentAnalysis.class);

    private double[][] rawData = null; // original data, one sample per row
    private double[][] principalData = null; // data projected onto the principal components
    private double[] principalEigenValues = null; // the selected (largest) eigenvalues
    private double[][] principalEigenVectors = null; // eigenvectors of the selected eigenvalues
    private double[] eigenValues = null; // all eigenvalues of the correlation matrix
    private double[][] eigenVectors = null; // all eigenvectors, one per column
    private int[] selected = null; // indices of the selected eigenvalues
    private double proportion = 0.9; // cumulative-variance threshold, default 90%

    /**************** Constructors ****************/
    public PrincipalComponentAnalysis() {

    }

    /**
     * @param proportion cumulative share of total variance the selected
     *                   components must exceed (e.g. 0.9 for 90%)
     */
    public PrincipalComponentAnalysis(double proportion) {
        this.proportion = proportion;
    }

    /**************** Getters ****************/
    public double[][] getRawData() {
        return rawData;
    }

    public double[][] getPrincipalData() {
        return principalData;
    }

    public double[] getPrincipalEigenValues() {
        return principalEigenValues;
    }

    public double[][] getPrincipalEigenVectors() {
        return principalEigenVectors;
    }

    public double[] getEigenValues() {
        return eigenValues;
    }

    public double[][] getEigenVectors() {
        return eigenVectors;
    }

    public int[] getSelected() {
        return selected;
    }

    public double getProportion() {
        return proportion;
    }

    /**************** Internal PCA steps ****************/
    /**
     * Standardizes every column of the raw data to zero mean and unit
     * variance (sample variance, divisor N-1). This lets the correlation
     * matrix be computed as a simple dot product later.
     *
     * @return an N x p standardized matrix, or {@code null} if the input is null
     */
    private double[][] calcStandardlizer(double[][] rawData) {
        double[][] standardData = null;
        if (rawData != null) {
            int N = rawData.length; // number of samples (rows)
            int p = rawData[0].length; // number of attributes (columns)
            double[] average = new double[p]; // per-column mean
            double[] var = new double[p]; // per-column sample variance
            standardData = new double[N][p];

            // Column means.
            for (int k = 0; k < p; k++) {
                double temp = 0;
                for (int i = 0; i < N; i++) {
                    temp += rawData[i][k];
                }
                average[k] = temp / N;
            }
            // Column sample variances (divisor N-1).
            for (int k = 0; k < p; k++) {
                double temp = 0;
                for (int i = 0; i < N; i++) {
                    temp += (rawData[i][k] - average[k])
                            * (rawData[i][k] - average[k]);
                }
                var[k] = temp / (N - 1);
            }
            // z-score each entry: mean 0, variance 1 per column.
            for (int i = 0; i < N; i++) {
                for (int j = 0; j < p; j++) {
                    standardData[i][j] = (rawData[i][j] - average[j])
                            / Math.sqrt(var[j]);
                }
            }
        } else {
            logger.info("There is no raw data.");
        }
        return standardData;
    }

    /**
     * Builds the p x p correlation matrix from standardized data. Because the
     * columns have mean 0 and variance 1, each correlation coefficient reduces
     * to the normalized dot product of two columns.
     *
     * @return the correlation matrix, or {@code null} if the input is null
     */
    private double[][] calcCoefficientOfAssociation(double[][] standardData) {
        double[][] assosiationMatrix = null;
        if (standardData != null) {
            int n = standardData.length; // number of samples
            int p = standardData[0].length; // number of attributes
            assosiationMatrix = new double[p][p];
            for (int i = 0; i < p; i++) {
                for (int j = 0; j < p; j++) {
                    double temp = 0;
                    for (int k = 0; k < n; k++) {
                        temp += standardData[k][i] * standardData[k][j];
                    }
                    assosiationMatrix[i][j] = temp / (n - 1);
                }
            }
        }
        return assosiationMatrix;
    }

    /**
     * Computes the eigenvalues of the (square, symmetric) correlation matrix.
     *
     * @return the real eigenvalues, or {@code null} if the input is null or
     *         the decomposition does not converge
     */
    private double[] calcEigenValue(double[][] assosiationMatrix) {
        double[] eigenValues = null;
        if (assosiationMatrix != null) {
            DenseMatrix Assosiation = new DenseMatrix(assosiationMatrix);
            int len = assosiationMatrix.length;
            EVD evd = new EVD(len);
            try {
                evd.factor(Assosiation);
                eigenValues = evd.getRealEigenvalues();
            } catch (NotConvergedException e) {
                e.printStackTrace();
                logger.error(e);
            }
        }
        return eigenValues;
    }

    /**
     * Computes the eigenvectors of the correlation matrix, one per column.
     * The correlation matrix is symmetric, so its left and right eigenvectors
     * coincide; left eigenvectors are used here.
     *
     * @return a len x len eigenvector matrix, or {@code null} if the input is
     *         null or the decomposition does not converge
     */
    private double[][] calcEigenVector(double[][] assosiationMatrix) {
        double[][] eigenVectors = null;
        if (assosiationMatrix != null) {
            DenseMatrix Assosiation = new DenseMatrix(assosiationMatrix);
            int len = assosiationMatrix.length;
            EVD evd = new EVD(len);
            try {
                evd.factor(Assosiation);
                DenseMatrix tempMatrix = evd.getLeftEigenvectors();
                eigenVectors = new double[len][len];
                for (int i = 0; i < len; i++) {
                    for (int j = 0; j < len; j++) {
                        eigenVectors[i][j] = tempMatrix.get(i, j);
                    }
                }
            } catch (NotConvergedException e) {
                e.printStackTrace();
                logger.error(e);
            }
        }
        return eigenVectors;
    }

    /**
     * Selects the indices of the largest eigenvalues whose cumulative share
     * of the total just exceeds {@code proportion}.
     *
     * @return the selected eigenvalue indices, largest first
     */
    private int[] selectPrincipalComponent(double[] eigenValues) {
        int p = eigenValues.length;
        int[] sortedEigenValueIndex = new int[p]; // index array co-sorted with values
        for (int i = 0; i < p; i++) {
            sortedEigenValueIndex[i] = i;
        }
        double[] temp = new double[p]; // working copy of the eigenvalues
        System.arraycopy(eigenValues, 0, temp, 0, p);

        // Descending bubble sort of the eigenvalues, carrying the indices.
        // FIX: the original bounds (i from 1, inner j < p - i - 1) skipped
        // the comparison of the last adjacent pair on every pass, so the
        // array could be left incompletely sorted and the wrong components
        // selected.
        for (int i = 0; i < p - 1; i++) {
            for (int j = 0; j < p - 1 - i; j++) {
                if (temp[j] < temp[j + 1]) {
                    double tmp = temp[j];
                    temp[j] = temp[j + 1];
                    temp[j + 1] = tmp;
                    int pos = sortedEigenValueIndex[j];
                    sortedEigenValueIndex[j] = sortedEigenValueIndex[j + 1];
                    sortedEigenValueIndex[j + 1] = pos;
                }
            }
        }

        // Total of all eigenvalues, the denominator of the cumulative share.
        double total = 0.0;
        for (int i = 0; i < p; i++) {
            total += temp[i];
        }

        // Accumulate largest-first until the cumulative share exceeds the
        // threshold; once it does, no further eigenvalue can be selected.
        int count = 0;
        double sum = 0.0;
        for (int i = 0; i < p; i++) {
            if (sum / total > proportion) {
                break;
            }
            sum += temp[i];
            count++;
        }
        int[] selected = new int[count];
        System.arraycopy(sortedEigenValueIndex, 0, selected, 0, count);
        return selected;
    }

    /**
     * Extracts the eigenvalues at the selected indices.
     */
    private double[] calcPrincipalEigenValues(double[] eigenValues,
            int[] selected) {
        double[] principalEigenValues = new double[selected.length];
        for (int i = 0; i < selected.length; i++) {
            principalEigenValues[i] = eigenValues[selected[i]];
        }
        return principalEigenValues;
    }

    /**
     * Extracts the eigenvector columns at the selected indices, forming the
     * p x k transform matrix (k = number of selected components).
     */
    private double[][] calcPrincipalEigenVectors(double[][] eigenVectors,
            int[] selected) {
        int p = eigenVectors.length;
        double[][] principalEigenVectors = new double[p][selected.length];
        for (int i = 0; i < selected.length; i++) {
            for (int j = 0; j < p; j++) {
                principalEigenVectors[j][i] = eigenVectors[j][selected[i]];
            }
        }
        return principalEigenVectors;
    }

    /**
     * Projects the data onto the principal components: C = data * transform.
     * NOTE(review): the raw (unstandardized) data is projected here even
     * though the eigenvectors come from the correlation matrix of the
     * standardized data; textbook PCA would project the standardized data.
     * Kept as-is to preserve the original output — confirm intent.
     */
    private double[][] calcPrincipalComponent(double[][] rawData) {
        Matrix A = new DenseMatrix(rawData);
        Matrix B = new DenseMatrix(principalEigenVectors);
        Matrix C = new DenseMatrix(rawData.length,
                principalEigenVectors[0].length);
        A.mult(B, C); // C = A * B

        double[][] principalData = new double[C.numRows()][C.numColumns()];
        for (int i = 0; i < C.numRows(); i++) {
            for (int j = 0; j < C.numColumns(); j++) {
                principalData[i][j] = C.get(i, j);
            }
        }
        return principalData;
    }

    /**************** Main PCA pipeline ****************/
    /**
     * Runs the full PCA pipeline and stores every intermediate result so the
     * getters can expose them.
     *
     * @param rawData N x p matrix, one sample per row, one attribute per column
     */
    public void buildPCA(double[][] rawData) {
        this.rawData = rawData;
        double[][] standardData = calcStandardlizer(rawData);
        double[][] assosiationMatrix = calcCoefficientOfAssociation(standardData);
        this.eigenValues = calcEigenValue(assosiationMatrix);
        this.eigenVectors = calcEigenVector(assosiationMatrix);
        // Guard: if the input was null or the eigen decomposition failed,
        // the originals below would throw NullPointerException.
        if (eigenValues == null || eigenVectors == null) {
            logger.error("PCA aborted: eigen decomposition unavailable.");
            return;
        }
        this.selected = selectPrincipalComponent(eigenValues);
        this.principalEigenValues = calcPrincipalEigenValues(eigenValues,
                selected);
        this.principalEigenVectors = calcPrincipalEigenVectors(eigenVectors,
                selected);
        this.principalData = calcPrincipalComponent(rawData);
    }
}

PCA的原理和过程不难理解
1. 先将原始数据标准化(每列零均值、单位方差),再计算标准化数据的相关系数矩阵
2. 求相关系数矩阵的特征值和特征向量
3. 选择特征值最大的n个特征值对应的特征向量作为转换矩阵
4. 原始数据矩阵和转换矩阵相乘,得到降维后的结果

本质上,PCA是一个坐标转换。将原来不好的坐标系转换为好的坐标系。将每条样本的原始属性变为新属性。
第一,各个属性之间独立、不相关
第二,使得各个属性方差较大,数据具有区分度

你可能感兴趣的:(数据挖掘)