让DeepLearning4j阅读小说并给出关联度最高的词

DeepLearning4j是一个java的神经网络框架,便于java程序员使用神经网络来完成一些机器学习工程。

不管什么机器学习框架,NLP是一个不能不谈的领域,DL4J也提供了nlp的相关实现。其中入门的例子就是从一大堆文字中找到最相关的词。

我们先来看看官方的demo,然后再模仿一个类似的程序,只不过是阅读中文的小说。

官方的demo叫Word2VecRawTextExample,我们直接新建一个java的maven项目,pom.xml如下:



    4.0.0

    com.tianyalei
    wolf_ml_mnist
    1.0-SNAPSHOT
    
    
        UTF-8
        1.0.0-beta
        1.0.0-beta
        1.0.0-beta
        1.1.7
        2.10
    

    
        
        
            org.deeplearning4j
            deeplearning4j-core
            ${dl4j.version}
        
        
        
        
            org.nd4j
            nd4j-native-platform
            
            ${nd4j.version}
        
        
            org.deeplearning4j
            deeplearning4j-nlp
            ${dl4j.version}
        
        
            org.deeplearning4j
            deeplearning4j-ui_2.11
            ${dl4j.version}
        
        
            ch.qos.logback
            logback-classic
            ${logback.version}
        
        
        
        
            net.sf.trove4j
            trove4j
            3.0.3
        
        
            commons-cli
            commons-cli
            1.2
        
        
    
package com.tianyalei.nlp;

import org.deeplearning4j.models.word2vec.Word2Vec;
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator;
import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
import org.nd4j.linalg.io.ClassPathResource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.Collection;

/**
 * Created by agibsonccc on 10/9/14.
 *
 * Neural net that processes text into wordvectors. See below url for an in-depth explanation.
 * https://deeplearning4j.org/word2vec.html
 */
public class Word2VecRawTextExample {

    private static Logger log = LoggerFactory.getLogger(Word2VecRawTextExample.class);

    public static void main(String[] args) throws Exception {

        // Gets Path to Text file
        String filePath = new ClassPathResource("raw_sentences.txt").getFile().getAbsolutePath();

        log.info("Load & Vectorize Sentences....");
        // Strip white space before and after for each line
        SentenceIterator iter = new BasicLineIterator(filePath);
        // Split on white spaces in the line to get words
        TokenizerFactory t = new DefaultTokenizerFactory();

        /*
            CommonPreprocessor will apply the following regex to each token: [\d\.:,"'\(\)\[\]|/?!;]+
            So, effectively all numbers, punctuation symbols and some special symbols are stripped off.
            Additionally it forces lower case for all tokens.
         */
        t.setTokenPreProcessor(new CommonPreprocessor());

        log.info("Building model....");
        Word2Vec vec = new Word2Vec.Builder()
                //是一个词在语料中必须出现的最少次数。本例中出现不到五次的词都不予学习。
                .minWordFrequency(5)
                //是网络在处理一批数据时允许更新系数的次数。迭代次数太少,网络可能来不及学习所有能学到的信息;迭代次数太多则会导致网络定型时间变长。
                .iterations(1)
                //指定词向量中的特征数量,与特征空间的维度数量相等。以500个特征值表示的词会成为一个500维空间中的点。
                .layerSize(100)
                .seed(42)
                .windowSize(5)
                //告知网络当前定型的是哪一批数据集
                .iterate(iter)
                //将当前一批的词输入网络
                .tokenizerFactory(t)
                .build();

        log.info("Fitting Word2Vec model....");
        vec.fit();

        log.info("Writing word vectors to text file....");

        // Prints out the closest 10 words to "day". An example on what to do with these Word Vectors.
        log.info("Closest Words:");
        Collection lst = vec.wordsNearestSum("day", 10);
        //Collection lst = vec.wordsNearest(Arrays.asList("king", "woman"), Arrays.asList("queen"), 10);
        log.info("10 Words closest to 'day': {}", lst);
    }
}

这就是NLP的helloworld级的入门项目,目标是从给定的raw_sentences.txt中找到与day最相近的词,将资源放到resource中,运行该程序即可。

运行结果:

可以看到,day的最相近的词有week、night、year等,还算非常靠谱。至于原理呢,大家可以在文档里去搜索day这个词,看看它的附近的词和用法,然后再去搜索week、night等词的旁边的词和用法,就大概知道怎么回事了。

该文用的相关资源去我项目里找https://github.com/tianyaleixiaowu/wolf_ml_mnist

从代码的注释中可以看看基本的概念,下面我们来让它学习一下中文的小说,并给出最接近的词。

和英文自带空格分词不同,中文是额外需要一个中文分词器的,不然中文全是一句一句的,根本分不开。所以我们在让机器学习读中文前,需要先将中文句子分成一个个的词语。让DeepLearning4j阅读小说并给出关联度最高的词_第1张图片

中文分词器很多,论使用的简易程度和效果,还是复旦的NLP比较靠谱,https://github.com/FudanNLP/fnlp。

GitHub上面有文档,讲怎么使用的,这里我直接简单说一下,下载models里的三个.m文件,和libs里的fnlp-code.jar,将jar添加为工程的依赖lib即可。至于复旦nlp额外需要的两个jar,我已经放在pom.xml里了。

让DeepLearning4j阅读小说并给出关联度最高的词_第2张图片

然后就可以使用fnlp来对文档进行分词了。我们选择的文档是天龙八部tlbb.txt,这是没分词时的样子。让DeepLearning4j阅读小说并给出关联度最高的词_第3张图片

分词的代码

package com.tianyalei.nlp.tlbb;

import java.io.*;

/**
 * 运行后将得到一个分词后的文档
 * @author wuweifeng wrote on 2018/6/29.
 */
public class FenCi {
    private FudanTokenizer tokenizer = new FudanTokenizer();

    public void processFile() throws Exception {
        String filePath = this.getClass().getClassLoader().getResource("text/tlbb.txt").getPath();
        BufferedReader in = new BufferedReader(new FileReader(filePath));

        File outfile = new File("/Users/wuwf/project/tlbb_t.txt");
        if (outfile.exists()) {
            outfile.delete();
        }
        FileOutputStream fop = new FileOutputStream(outfile);

        // 构建FileOutputStream对象,文件不存在会自动新建
        String line = in.readLine();
        OutputStreamWriter writer = new OutputStreamWriter(fop, "UTF-8");
        while (line != null) {
            line = tokenizer.processSentence(line);
            writer.append(line);
            line = in.readLine();
        }
        in.close();
        writer.close(); // 关闭写入流,同时会把缓冲区内容写入文件
        fop.close(); // 关闭输出流,释放系统资源
    }

    public static void main(String[] args) throws Exception {
         new FenCi().processFile();
    }
}
package com.tianyalei.nlp.tlbb;

import org.fnlp.ml.types.Dictionary;
import org.fnlp.nlp.cn.tag.CWSTagger;
import org.fnlp.nlp.corpus.StopWords;
import org.fnlp.util.exception.LoadModelException;

import java.io.IOException;
import java.util.List;

/**
 * @author wuweifeng wrote on 2018/6/29.
 */
public class FudanTokenizer {
    private CWSTagger tag;

    private StopWords stopWords;

    public FudanTokenizer() {
        String path = this.getClass().getClassLoader().getResource("").getPath();
        System.out.println(path);
        try {
            tag = new CWSTagger(path + "models/seg.m");
        } catch (LoadModelException e) {
            e.printStackTrace();
        }

    }

    public String processSentence(String context) {
        return tag.tag(context);
    }

    public String processSentence(String sentence, boolean english) {
        if (english) {
            tag.setEnFilter(true);
        }
        return tag.tag(sentence);
    }

    public String processFile(String filename) {
        return tag.tagFile(filename);
    }

    /**
     * 设置分词词典
     */
    public boolean setDictionary() {
        String dictPath = this.getClass().getClassLoader().getResource("models/dict.txt").getPath();

        Dictionary dict;
        try {
            dict = new Dictionary(dictPath);
        } catch (IOException e) {
            return false;
        }
        tag.setDictionary(dict);
        return true;
    }

    /**
     * 去除停用词
     */
    public List flitStopWords(String[] words) {
        try {
            return stopWords.phraseDel(words);
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

}    

然后运行一下,过一会就得到了分词后的文档tlbb_t.txt,将分词后的拷贝到resource下,将来机器学的就是分词后的文档。

让DeepLearning4j阅读小说并给出关联度最高的词_第4张图片

package com.tianyalei.nlp.tlbb;

import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
import org.deeplearning4j.models.word2vec.Word2Vec;
import org.deeplearning4j.text.sentenceiterator.BasicLineIterator;
import org.deeplearning4j.text.sentenceiterator.SentenceIterator;
import org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor;
import org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory;
import org.deeplearning4j.text.tokenization.tokenizerfactory.TokenizerFactory;
import org.nd4j.linalg.io.ClassPathResource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.util.Collection;

/**
 * @author wuweifeng wrote on 2018/6/29.
 */
public class Tlbb {
    private static Logger log = LoggerFactory.getLogger(Tlbb.class);

    public static void main(String[] args) throws IOException {
        String filePath = new ClassPathResource("text/tlbb_t.txt").getFile().getAbsolutePath();
        log.info("Load & Vectorize Sentences....");

        SentenceIterator iter = new BasicLineIterator(new File(filePath));

        TokenizerFactory t = new DefaultTokenizerFactory();
        t.setTokenPreProcessor(new CommonPreprocessor());

        log.info("Building model....");
        Word2Vec vec = new Word2Vec.Builder().
                minWordFrequency(5)
                .iterations(1)
                .layerSize(100)
                .seed(42)
                .windowSize(5)
                .iterate(iter)
                .tokenizerFactory(t)
                .build();
        log.info("Fitting Word2Vec model....");
        vec.fit();
        log.info("Writing word vectors to text file....");

        // Write word vectors to file
        log.info("Writing word vectors to text file....");

        WordVectorSerializer.writeWordVectors(vec, "tlbb_vectors.txt");
        WordVectorSerializer.writeFullModel(vec, "tlbb_model.txt");
        String[] names = {"萧峰", "乔峰", "段誉", "虚竹", "王语嫣", "阿紫", "阿朱", "木婉清"};
        log.info("Closest Words:");

        for (String name : names) {
            System.out.println(name + ">>>>>>");
            Collection lst = vec.wordsNearest(name, 10);
            System.out.println(lst);
        }


    }

}

代码和之前的demo区别不大,运行后,就能看到这几个人的关联度最高的词了。

让DeepLearning4j阅读小说并给出关联度最高的词_第5张图片


参考篇:https://blog.csdn.net/a398942089/article/details/51970691

你可能感兴趣的:(神经网络,机器学习入门)