文本分类(基于朴素贝叶斯分类器)

一. 概率论基础

1. 条件概率公式: $P(A|B) = \dfrac{P(AB)}{P(B)}$

2. 全概率公式: $P(B) = \sum_i P(B|A_i)\,P(A_i)$(其中 $A_1, A_2, \dots$ 构成完备事件组)

3. 由条件概率公式和全概率公式可以导出贝叶斯公式: $P(A_i|B) = \dfrac{P(B|A_i)\,P(A_i)}{\sum_j P(B|A_j)\,P(A_j)}$

二. 文本分类

要计算一篇文章D所属的类别c(D),相当于计算生成D的可能性最大的类别,即:

$c(D) = \arg\max_{c} P(c|D) = \arg\max_{c} \dfrac{P(D|c)\,P(c)}{P(D)}$

其中P(D)与C无关,故

$c(D) = \arg\max_{c} P(D|c)\,P(c)$

三. 朴素贝叶斯分类模型

朴素贝叶斯假设:在给定类别C的条件下,所有属性Di相互独立,即,

$P(D|c) = \prod_i P(d_i|c)$

根据朴素贝叶斯假设,可得

$c(D) = \arg\max_{c} P(c)\prod_i P(d_i|c)$,其中 $P(c) = \dfrac{N_c}{N}$,$P(d_i|c) = \dfrac{n(d_i,c)+1}{N_c}$(分子加 1 为拉普拉斯平滑,避免概率为零)

其中,

  $N_c$:类别c中的训练文本数

  $N$:总训练文本数

  $n(d_i,c)$:单词di在类别c中出现的次数

综上可得,

$c(D) = \arg\max_{c} \dfrac{N_c}{N}\prod_i \dfrac{n(d_i,c)+1}{N_c}$

四. 具体代码(源代码如下)

程序采用java语言进行编写,运用搜狗语料库进行训练。具体程序代码如下:

Main.java——主程序,负责读取待分类文章以及调用分类器

package classifierDemo;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;

public class Main {

    /**
     * Entry point: reads the article to classify from {@code article.txt}
     * (GBK-encoded) and hands it to the classifier.
     *
     * @param args unused command-line arguments
     * @throws IOException if the article file cannot be read
     */
    public static void main(String[] args) throws IOException {
        // A StringBuilder avoids the O(n^2) cost of String += in a loop.
        StringBuilder article = new StringBuilder();
        String path = "article.txt";
        // try-with-resources guarantees the reader is closed even if
        // readLine() throws (the original leaked the stream on error).
        try (BufferedReader br = new BufferedReader(new InputStreamReader(
                new FileInputStream(path), "gbk"))) {
            String temp;
            while ((temp = br.readLine()) != null) {
                article.append(temp);
            }
        }

        System.out.println(article + "\n");
        // Classify the article.
        TrainDataManager train = new TrainDataManager();
        train.execute(article.toString());
    }
}
View Code

TrainDataManager.java——分类器,对输入文本进行分类

package classifierDemo;

import java.io.File;
import java.io.IOException;
import java.util.Vector;

public class TrainDataManager {
    private static final String dirName = "trainingData/Sample"; // training-set root directory
    CountDirectory countDir = new CountDirectory();
    private int zoomFactor = 5; // scale factor: keeps the running product from underflowing to 0

    /**
     * Computes the prior probability p(ci) = (documents in class) / (total documents).
     *
     * @param className class name (a subdirectory of the training set)
     * @return prior probability of the class
     */
    public double priorProbability(String className) {
        return (double) countDir.countClass(className) / countDir.countSum();
    }

    /**
     * Classifies an article and prints the winning class.
     * Score per class c: P(c) * prod_w ((n(w,c)+1)/N_c * zoomFactor),
     * i.e. naive Bayes with Laplace smoothing, rescaled against underflow.
     *
     * @param article full text of the article to classify
     * @throws IOException if a training file cannot be read
     */
    public void execute(String article) throws IOException {
        // Segment the article into words (stop words already removed).
        // NOTE: generics restored — iterating a raw Vector with a String
        // loop variable does not compile.
        Vector<String> strs = ChineseSpliter.splitWords(article);

        File dir = new File(dirName);
        File[] files = dir.listFiles(); // one subdirectory per class
        Vector<Double> probability = new Vector<Double>();
        // Compute the (scaled) score of the article for every class.
        for (File f : files) {
            String className = f.getName();
            double countc = countDir.countClass(className);
            double product = 1; // reset per class, inside the loop
            for (String word : strs) {
                // +1 in the numerator is Laplace smoothing; zoomFactor
                // rescales each factor to delay floating-point underflow.
                product *= (countDir.countWordInClass(word, className) + 1)
                        / countc * zoomFactor;
            }
            probability.add(priorProbability(className) * product);
        }
        // Pick the class with the highest score (scores are >= 0,
        // so starting max at 0 is safe).
        double max = 0;
        int maxId = 0;
        for (int i = 0; i < probability.size(); i++) {
            if (max < probability.get(i)) {
                max = probability.get(i);
                maxId = i;
            }
        }
        System.out.println("文章所属分类为:" + files[maxId].getName());
    }
}
View Code

CountDirectory.java——用于计算训练集中的各种频次

package classifierDemo;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;

/**
 * 计算各种频次
 * 
 * @author Administrator
 * 
 */
/**
 * Counts document frequencies over the training set on disk:
 * total documents, documents per class, and documents in a class
 * that contain a given word.
 */
public class CountDirectory {
    private static final String dirName = "trainingData/Sample"; // training-set root directory

    /**
     * Counts the total number of training documents across all classes.
     *
     * @return total document count of the training set
     */
    public int countSum() {
        File dir = new File(dirName);
        File[] files = dir.listFiles(); // one subdirectory per class
        int sum = 0;
        for (int i = 0; i < files.length; i++) {
            sum += countClass(files[i].getName());
        }
        return sum;
    }

    /**
     * Counts the documents belonging to one class.
     *
     * @param className class name (subdirectory under the training root)
     * @return number of files in that class directory
     */
    public int countClass(String className) {
        File subDir = new File(dirName + "/" + className);
        // NOTE(review): listFiles() returns null if the directory is
        // missing; callers are expected to pass valid class names.
        File[] subFiles = subDir.listFiles();
        return subFiles.length;
    }

    /**
     * Counts how many documents of a class contain the given word
     * (document frequency, not term frequency).
     *
     * @param word      word to look for
     * @param className class name (subdirectory under the training root)
     * @return number of documents in the class containing the word
     * @throws IOException if a training file cannot be read
     */
    public int countWordInClass(String word, String className)
            throws IOException {
        int count = 0;
        File subDir = new File(dirName + "/" + className);
        File[] subFiles = subDir.listFiles();
        for (int i = 0; i < subFiles.length; i++) {
            // Read the whole document into one string; StringBuilder
            // avoids the O(n^2) cost of String += per line.
            StringBuilder line = new StringBuilder();
            BufferedReader br = new BufferedReader(new InputStreamReader(
                    new FileInputStream(subFiles[i].getAbsolutePath()), "gbk"));
            try {
                String temp;
                while ((temp = br.readLine()) != null) {
                    line.append(temp);
                }
            } finally {
                // close even if readLine() throws (original leaked here)
                br.close();
            }

            if (line.toString().contains(word)) {
                count++;
            }
        }
        return count;
    }

}
View Code

ChineseSpliter.java——中文分词器,对输入字串进行中文分词

package classifierDemo;

import java.io.IOException;
import java.util.Vector;

import jeasy.analysis.MMAnalyzer;

/**
 * 中文分词器 对输入文本进行分词处理
 * 
 */
/**
 * Chinese word splitter: segments input text via MMAnalyzer and
 * removes stop words.
 */
public class ChineseSpliter {
    private static String splitToken = "|"; // delimiter inserted between segmented words

    /**
     * Segments the given text into Chinese words and drops stop words.
     *
     * @param article text to segment
     * @return vector of words, stop words removed; empty on segmentation failure
     */
    public static Vector<String> splitWords(String article) {
        String result = null;
        MMAnalyzer analyzer = new MMAnalyzer();
        try {
            result = analyzer.segment(article, splitToken);
        } catch (IOException e) {
            e.printStackTrace();
        }
        if (result == null) {
            // Segmentation failed: the original fell through and threw an
            // NPE in stringToVector; return an empty vector instead.
            return new Vector<String>();
        }
        Vector<String> vector = stringToVector(result);
        // remove stop words
        StopWordsHandler stopWords = new StopWordsHandler();
        return stopWords.DropStopWords(vector);
    }

    /**
     * Splits a delimiter-joined string into a vector of tokens.
     *
     * @param str delimiter-joined string (e.g. "a|b|c")
     * @return vector of the individual tokens
     */
    public static Vector<String> stringToVector(String str) {
        Vector<String> vector = new Vector<String>();
        int index = str.indexOf(splitToken);
        while (index != -1) {
            vector.add(str.substring(0, index));
            str = str.substring(index + 1);
            index = str.indexOf(splitToken);
        }
        // BUG FIX: the original discarded everything after the last
        // delimiter, losing the final word of every article. Keep the
        // trailing token when it is non-empty (a trailing delimiter
        // still yields no spurious empty token).
        if (!str.isEmpty()) {
            vector.add(str);
        }
        return vector;
    }
}
View Code

StopWordsHandler.java——停用词处理

package classifierDemo;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import java.util.Vector;

/**
 * 
 * 停用词处理器
 * 
 */
/**
 * Stop-word handler: loads a stop-word list from a GBK-encoded file
 * and filters stop words out of token vectors.
 */
public class StopWordsHandler {
    private List<String> stopWordsList; // stop words loaded from file
    private String path = "chineseStopWords.txt";

    /**
     * Loads stop words from the default file ("chineseStopWords.txt").
     * On failure the handler logs and falls back to an empty list
     * (the original left the list null, so any later lookup threw NPE).
     */
    public StopWordsHandler() {
        try {
            stopWordsList = readStopWords();
        } catch (Exception e) {
            System.out.println(e);
            stopWordsList = new ArrayList<String>();
        }
    }

    /**
     * Loads stop words from the given file.
     *
     * @param path path of the stop-word file (one word per line, GBK)
     * @throws FileNotFoundException if the file does not exist
     * @throws IOException           if the file cannot be read
     */
    public StopWordsHandler(String path) throws FileNotFoundException,
            IOException {
        this.path = path;
        stopWordsList = readStopWords();
    }

    /**
     * Reads the stop-word file, one word per line.
     *
     * @return all stop words
     * @throws FileNotFoundException if the file does not exist
     * @throws IOException           if the file cannot be read
     */
    public List<String> readStopWords() throws FileNotFoundException,
            IOException {
        List<String> words = new ArrayList<String>();
        BufferedReader br = new BufferedReader(new InputStreamReader(
                new FileInputStream(path), "gbk"));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                words.add(line);
            }
        } finally {
            // close even if readLine() throws (original leaked here)
            br.close();
        }
        return words;
    }

    /**
     * Tests whether a word is a stop word (case-insensitive).
     *
     * @param word word to test
     * @return true if it is a stop word
     */
    public boolean IsStopWord(String word) {
        // Generics restored: with a raw List, get(i) returns Object and
        // equalsIgnoreCase(Object) does not compile.
        for (String stopWord : stopWordsList) {
            if (word.equalsIgnoreCase(stopWord)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Removes stop words from a token vector.
     *
     * @param oldWords segmented tokens
     * @return a new vector with stop words removed
     */
    public Vector<String> DropStopWords(Vector<String> oldWords) {
        Vector<String> kept = new Vector<String>();
        for (String word : oldWords) {
            if (!IsStopWord(word)) {
                kept.add(word);
            }
        }
        return kept;
    }
}
View Code

 

五. 参考文献:

基于朴素贝叶斯分类器的文本分类算法(洞庭散人) 

朴素贝叶斯分类器的研究与应用(王国才)

 

转载于:https://www.cnblogs.com/jiajia920716/p/3135407.html

你可能感兴趣的:(文本分类(基于朴素贝叶斯分类器))