词形还原-European Language Lemmatizer使用笔记

       所谓词形还原(lemmatization),即把任意形式的英语单词还原为其一般形式(词元);它与词干提取(stemming)不同,后者只是抽取单词的词根(词干)。

       最近在做跨语言文本分类的研究,需要词形还原,穷搜一番后找到了这个俄罗斯人写的工具,貌似还不错。现在把自己的探索笔记列在这里,以方便来人。

1、下载地址:http://lemmatizer.org/en/download.html

2、依次安装4个组件,其中有一个是俄文词典,可以不装。安装过程很简单:http://lemmatizer.org/en/setup.html。安装时要用到cmake,下载地址:http://www.cmake.org/

3、装英文词典时遇到问题,make报错,找不到turglem/txml.h,跑到usr/local/include/turglem下一看,果然没有,在usr/local/include/txml下有个txml.hpp,拷贝到前一个目录下,名字改成txml.h,再make。还有错:‘void txml::determination_object::load_from_file(const std::string&)’ 的原型不匹配类 ‘txml::determination_object’ 中的任何一个。大意是说txml.cpp的实现中,这个函数与头文件中的声明不一致。于是打开turglem-english-0.2/txml.cpp一看,也没有太大的问题,头文件中声明的方法参数是char *,cpp文件中是string&,于是自己把源代码txml.cpp中的实现改成char *版本。再make,顺利通过!

4、下载例子程序:Usage examples ,cmake、make,run,运行成功。

5、自己写个程序,直接改例子中的test1.c,命名为lemmatizer.cpp,放到例子文件夹下,再改下例子文件夹中的CMakeLists.txt,在最后加上一句:

# Build the `lemmatizer` executable from lemmatizer.cpp and link it against
# the libraries named by the TURGLEM_LIBRARY, MAFSA_LIBRARY and
# TURGLEM_ENGLISH_LIBRARY variables.
ADD_EXECUTABLE(lemmatizer lemmatizer.cpp)
TARGET_LINK_LIBRARIES(lemmatizer ${TURGLEM_LIBRARY} ${MAFSA_LIBRARY}
${TURGLEM_ENGLISH_LIBRARY})

cmake、make、run,运行成功!

6、测试句子:

Mrs. Clinton, speaking Wednesday afternoon in Seoul after conferring with President Lee Myung-bak, said the South's conclusion that one of its warships had been sunk by a North Korean torpedo was inescapable, and she endorsed Mr. Lee's moves to cut trade ties and redesignate the North its archenemy.

转换后:

Mrs. Clinton, SPEAKING WEDNESDAY AFTERNOON IN SEOUL AFTER CONFER WITH PRESIDENT LEE Myung-bak, SAY THE SOUTH CONCLUSION THAT ONE OF IT WARSHIP HAVE BE SINK BY A NORTH KOREAN TORPEDO BE inescapable, AND SHE ENDORSED Mr. LEE MOVE TO CUT TRADE TIE AND REDESIGNATE THE NORTH IT archenemy.

注意到,有一些词,比如speaking、endorsed没有被转换,前者可以被认为是名词形式,后者则可能是在词典里面没有。

总之European Language Lemmatizer功能比较完善,方便使用,而且本身支持输出一个单词的所有形式,如果再结合pos进行选择的话,效果会更好。

//附:英语文本词形还原程序lemmatizer.cpp

#include <turglem/lemmatizer.h>
#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <cctype>
#include <string>
#include <sstream>
#include <iostream>
#include <fstream>
#include <turglem/english/charset_adapters.h>
#include <MAFSA/charset_adapter.h>

using namespace std;

// Conversion callbacks between UTF-8 strings and MAFSA letter arrays.
// Both start out as 0; main() assigns the LEM_ENGLISH_conv_*_utf8 adapters
// to them before the first call to convert(), which invokes them.
MAFSA_conv_string_to_letters my_s2l = 0;
MAFSA_conv_letters_to_string my_l2s = 0;

string convert(turglem lem, string word)
{
const char * s = word.c_str();
MAFSA_letter l[32];
ssize_t ssz_src;
size_t sz_lem_res;
size_t sz;
int lem_res[32 * 2];

ssz_src = my_s2l(s, l, 32);
if (ssz_src < 0)
{
   //含有非英语字符,直接返回,不转换
   //printf("can't convert word <%s> to letters! Is it english word in UTF-8?/n", s);
   return word;
}
    
sz_lem_res = turglem_lemmatize(lem, l, ssz_src, lem_res, 32, ENGLISH_LETTER_DELIM, 1);
    if (sz_lem_res == 0)
{
   //没有转换结果,直接返回
   return word;
}

MAFSA_letter out_letters[32];
char buf[500];
sz = turglem_build_form(lem, l, ssz_src, out_letters, 32, lem_res[0], lem_res[1], 0);
sz = my_l2s(out_letters, sz, buf, 500);
return string(buf);
}

// Return a copy of `str` in which every byte classified as a letter by the
// current C locale is replaced by its lowercase form; all other bytes
// (digits, punctuation, bytes of multi-byte UTF-8 sequences) are left as is.
std::string strToLower(std::string str)
{
    for (std::string::size_type i = 0; i < str.size(); ++i)
    {
        // The <cctype> functions require an argument representable as
        // unsigned char (or EOF). Plain char may be negative for bytes
        // >= 0x80, so convert first to avoid undefined behaviour.
        const unsigned char ch = static_cast<unsigned char>(str[i]);
        if (std::isalpha(ch))
        {
            str[i] = static_cast<char>(std::tolower(ch));
        }
    }
    return str;
}


int main(int argc, char **argv)
{
turglem lem = 0;
int err_no;
int err_what;

if (argc != 3)
{
   cout << "command infile ofile" << endl;
   return 1;
}

    string inpath(argv[1]);
    string opath(argv[2]);

    fstream infile(inpath.c_str(), ios_base::in);
    fstream ofile(opath.c_str(), ios_base::out);
    if(!infile || !ofile)
    {
        cout << "open file error!" << endl;
        return 1;
    }

//加载词典
cout << "we try to load english dictionaries from /usr/local/share/turglem/english/n";
lem = turglem_load("/usr/local/share/turglem/english/dict_english.auto",
      "/usr/local/share/turglem/english/prediction_english.auto",
      "/usr/local/share/turglem/english/paradigms_english.bin",
      &err_no, &err_what);
if (0 == lem)
{
   printf("FAILED: err_no/err_what = %d/%d/n (error loading %s: %s: %s)/n",
    err_what, err_no,
    turglem_error_what_string(err_what),
    turglem_error_no_string(err_no),
    strerror(errno));
   return -1;
}

my_s2l = LEM_ENGLISH_conv_string_to_letters_utf8;
my_l2s = LEM_ENGLISH_conv_letters_to_string_utf8;

    string rmchars = ",./?<>[]{}/"/';:1234567890-+=_()";
    string line;
    //此处的文件格式为,每行一个文本,文本中各个单词之间用空格分开
    while(getline(infile, line))
    {
        line = strToLower(line);
        for(int j = 0; j < line.size(); j++)
        {
            for(int i = 0; i < rmchars.size(); i++)
            {
                if(line[j] == rmchars[i])
                {
                    line[j] = ' ';
                }
            }
        }

        istringstream iss(line);
        string word;
        while(iss >> word)
        {
            string normWord = convert(lem, word);
            ofile << strToLower(normWord) << " ";
        }
        ofile << endl;
    }
turglem_close(lem);
return 0;
}

你可能感兴趣的:(ios,String,object,command,File,library)