English Morphology

最近参与了一个小项目,需要编写一个针对英文单词的词干提取(stemming)算法。

1. 最为常见的词干提取算法是 The English (Porter2) stemming algorithm:http://snowball.tartarus.org/algorithms/english/stemmer.html

// This file was generated automatically by the Snowball to Java compiler



package org.tartarus.snowball.ext;



import org.tartarus.snowball.Among;



 /**

  * This class was automatically generated by a Snowball to Java compiler 

  * It implements the stemming algorithm defined by a snowball script.

  */



public class englishStemmer extends org.tartarus.snowball.SnowballStemmer {



private static final long serialVersionUID = 1L;



        private final static englishStemmer methodObject = new englishStemmer ();



                private final static Among a_0[] = {

                    new Among ( "arsen", -1, -1, "", methodObject ),

                    new Among ( "commun", -1, -1, "", methodObject ),

                    new Among ( "gener", -1, -1, "", methodObject )

                };



                private final static Among a_1[] = {

                    new Among ( "'", -1, 1, "", methodObject ),

                    new Among ( "'s'", 0, 1, "", methodObject ),

                    new Among ( "'s", -1, 1, "", methodObject )

                };



                private final static Among a_2[] = {

                    new Among ( "ied", -1, 2, "", methodObject ),

                    new Among ( "s", -1, 3, "", methodObject ),

                    new Among ( "ies", 1, 2, "", methodObject ),

                    new Among ( "sses", 1, 1, "", methodObject ),

                    new Among ( "ss", 1, -1, "", methodObject ),

                    new Among ( "us", 1, -1, "", methodObject )

                };



                private final static Among a_3[] = {

                    new Among ( "", -1, 3, "", methodObject ),

                    new Among ( "bb", 0, 2, "", methodObject ),

                    new Among ( "dd", 0, 2, "", methodObject ),

                    new Among ( "ff", 0, 2, "", methodObject ),

                    new Among ( "gg", 0, 2, "", methodObject ),

                    new Among ( "bl", 0, 1, "", methodObject ),

                    new Among ( "mm", 0, 2, "", methodObject ),

                    new Among ( "nn", 0, 2, "", methodObject ),

                    new Among ( "pp", 0, 2, "", methodObject ),

                    new Among ( "rr", 0, 2, "", methodObject ),

                    new Among ( "at", 0, 1, "", methodObject ),

                    new Among ( "tt", 0, 2, "", methodObject ),

                    new Among ( "iz", 0, 1, "", methodObject )

                };



                private final static Among a_4[] = {

                    new Among ( "ed", -1, 2, "", methodObject ),

                    new Among ( "eed", 0, 1, "", methodObject ),

                    new Among ( "ing", -1, 2, "", methodObject ),

                    new Among ( "edly", -1, 2, "", methodObject ),

                    new Among ( "eedly", 3, 1, "", methodObject ),

                    new Among ( "ingly", -1, 2, "", methodObject )

                };



                private final static Among a_5[] = {

                    new Among ( "anci", -1, 3, "", methodObject ),

                    new Among ( "enci", -1, 2, "", methodObject ),

                    new Among ( "ogi", -1, 13, "", methodObject ),

                    new Among ( "li", -1, 16, "", methodObject ),

                    new Among ( "bli", 3, 12, "", methodObject ),

                    new Among ( "abli", 4, 4, "", methodObject ),

                    new Among ( "alli", 3, 8, "", methodObject ),

                    new Among ( "fulli", 3, 14, "", methodObject ),

                    new Among ( "lessli", 3, 15, "", methodObject ),

                    new Among ( "ousli", 3, 10, "", methodObject ),

                    new Among ( "entli", 3, 5, "", methodObject ),

                    new Among ( "aliti", -1, 8, "", methodObject ),

                    new Among ( "biliti", -1, 12, "", methodObject ),

                    new Among ( "iviti", -1, 11, "", methodObject ),

                    new Among ( "tional", -1, 1, "", methodObject ),

                    new Among ( "ational", 14, 7, "", methodObject ),

                    new Among ( "alism", -1, 8, "", methodObject ),

                    new Among ( "ation", -1, 7, "", methodObject ),

                    new Among ( "ization", 17, 6, "", methodObject ),

                    new Among ( "izer", -1, 6, "", methodObject ),

                    new Among ( "ator", -1, 7, "", methodObject ),

                    new Among ( "iveness", -1, 11, "", methodObject ),

                    new Among ( "fulness", -1, 9, "", methodObject ),

                    new Among ( "ousness", -1, 10, "", methodObject )

                };



                private final static Among a_6[] = {

                    new Among ( "icate", -1, 4, "", methodObject ),

                    new Among ( "ative", -1, 6, "", methodObject ),

                    new Among ( "alize", -1, 3, "", methodObject ),

                    new Among ( "iciti", -1, 4, "", methodObject ),

                    new Among ( "ical", -1, 4, "", methodObject ),

                    new Among ( "tional", -1, 1, "", methodObject ),

                    new Among ( "ational", 5, 2, "", methodObject ),

                    new Among ( "ful", -1, 5, "", methodObject ),

                    new Among ( "ness", -1, 5, "", methodObject )

                };



                private final static Among a_7[] = {

                    new Among ( "ic", -1, 1, "", methodObject ),

                    new Among ( "ance", -1, 1, "", methodObject ),

                    new Among ( "ence", -1, 1, "", methodObject ),

                    new Among ( "able", -1, 1, "", methodObject ),

                    new Among ( "ible", -1, 1, "", methodObject ),

                    new Among ( "ate", -1, 1, "", methodObject ),

                    new Among ( "ive", -1, 1, "", methodObject ),

                    new Among ( "ize", -1, 1, "", methodObject ),

                    new Among ( "iti", -1, 1, "", methodObject ),

                    new Among ( "al", -1, 1, "", methodObject ),

                    new Among ( "ism", -1, 1, "", methodObject ),

                    new Among ( "ion", -1, 2, "", methodObject ),

                    new Among ( "er", -1, 1, "", methodObject ),

                    new Among ( "ous", -1, 1, "", methodObject ),

                    new Among ( "ant", -1, 1, "", methodObject ),

                    new Among ( "ent", -1, 1, "", methodObject ),

                    new Among ( "ment", 15, 1, "", methodObject ),

                    new Among ( "ement", 16, 1, "", methodObject )

                };



                private final static Among a_8[] = {

                    new Among ( "e", -1, 1, "", methodObject ),

                    new Among ( "l", -1, 2, "", methodObject )

                };



                private final static Among a_9[] = {

                    new Among ( "succeed", -1, -1, "", methodObject ),

                    new Among ( "proceed", -1, -1, "", methodObject ),

                    new Among ( "exceed", -1, -1, "", methodObject ),

                    new Among ( "canning", -1, -1, "", methodObject ),

                    new Among ( "inning", -1, -1, "", methodObject ),

                    new Among ( "earring", -1, -1, "", methodObject ),

                    new Among ( "herring", -1, -1, "", methodObject ),

                    new Among ( "outing", -1, -1, "", methodObject )

                };



                private final static Among a_10[] = {

                    new Among ( "andes", -1, -1, "", methodObject ),

                    new Among ( "atlas", -1, -1, "", methodObject ),

                    new Among ( "bias", -1, -1, "", methodObject ),

                    new Among ( "cosmos", -1, -1, "", methodObject ),

                    new Among ( "dying", -1, 3, "", methodObject ),

                    new Among ( "early", -1, 9, "", methodObject ),

                    new Among ( "gently", -1, 7, "", methodObject ),

                    new Among ( "howe", -1, -1, "", methodObject ),

                    new Among ( "idly", -1, 6, "", methodObject ),

                    new Among ( "lying", -1, 4, "", methodObject ),

                    new Among ( "news", -1, -1, "", methodObject ),

                    new Among ( "only", -1, 10, "", methodObject ),

                    new Among ( "singly", -1, 11, "", methodObject ),

                    new Among ( "skies", -1, 2, "", methodObject ),

                    new Among ( "skis", -1, 1, "", methodObject ),

                    new Among ( "sky", -1, -1, "", methodObject ),

                    new Among ( "tying", -1, 5, "", methodObject ),

                    new Among ( "ugly", -1, 8, "", methodObject )

                };



                private static final char g_v[] = {17, 65, 16, 1 };



                private static final char g_v_WXY[] = {1, 17, 65, 208, 1 };



                private static final char g_valid_LI[] = {55, 141, 2 };



        private boolean B_Y_found;

        private int I_p2;

        private int I_p1;



                private void copy_from(englishStemmer other) {

                    B_Y_found = other.B_Y_found;

                    I_p2 = other.I_p2;

                    I_p1 = other.I_p1;

                    super.copy_from(other);

                }



                private boolean r_prelude() {

            int v_1;

            int v_2;

            int v_3;

            int v_4;

            int v_5;

                    // (, line 25

                    // unset Y_found, line 26

                    B_Y_found = false;

                    // do, line 27

                    v_1 = cursor;

                    lab0: do {

                        // (, line 27

                        // [, line 27

                        bra = cursor;

                        // literal, line 27

                        if (!(eq_s(1, "'")))

                        {

                            break lab0;

                        }

                        // ], line 27

                        ket = cursor;

                        // delete, line 27

                        slice_del();

                    } while (false);

                    cursor = v_1;

                    // do, line 28

                    v_2 = cursor;

                    lab1: do {

                        // (, line 28

                        // [, line 28

                        bra = cursor;

                        // literal, line 28

                        if (!(eq_s(1, "y")))

                        {

                            break lab1;

                        }

                        // ], line 28

                        ket = cursor;

                        // <-, line 28

                        slice_from("Y");

                        // set Y_found, line 28

                        B_Y_found = true;

                    } while (false);

                    cursor = v_2;

                    // do, line 29

                    v_3 = cursor;

                    lab2: do {

                        // repeat, line 29

                        replab3: while(true)

                        {

                            v_4 = cursor;

                            lab4: do {

                                // (, line 29

                                // goto, line 29

                                golab5: while(true)

                                {

                                    v_5 = cursor;

                                    lab6: do {

                                        // (, line 29

                                        if (!(in_grouping(g_v, 97, 121)))

                                        {

                                            break lab6;

                                        }

                                        // [, line 29

                                        bra = cursor;

                                        // literal, line 29

                                        if (!(eq_s(1, "y")))

                                        {

                                            break lab6;

                                        }

                                        // ], line 29

                                        ket = cursor;

                                        cursor = v_5;

                                        break golab5;

                                    } while (false);

                                    cursor = v_5;

                                    if (cursor >= limit)

                                    {

                                        break lab4;

                                    }

                                    cursor++;

                                }

                                // <-, line 29

                                slice_from("Y");

                                // set Y_found, line 29

                                B_Y_found = true;

                                continue replab3;

                            } while (false);

                            cursor = v_4;

                            break replab3;

                        }

                    } while (false);

                    cursor = v_3;

                    return true;

                }



                private boolean r_mark_regions() {

            int v_1;

            int v_2;

                    // (, line 32

                    I_p1 = limit;

                    I_p2 = limit;

                    // do, line 35

                    v_1 = cursor;

                    lab0: do {

                        // (, line 35

                        // or, line 41

                        lab1: do {

                            v_2 = cursor;

                            lab2: do {

                                // among, line 36

                                if (find_among(a_0, 3) == 0)

                                {

                                    break lab2;

                                }

                                break lab1;

                            } while (false);

                            cursor = v_2;

                            // (, line 41

                            // gopast, line 41

                            golab3: while(true)

                            {

                                lab4: do {

                                    if (!(in_grouping(g_v, 97, 121)))

                                    {

                                        break lab4;

                                    }

                                    break golab3;

                                } while (false);

                                if (cursor >= limit)

                                {

                                    break lab0;

                                }

                                cursor++;

                            }

                            // gopast, line 41

                            golab5: while(true)

                            {

                                lab6: do {

                                    if (!(out_grouping(g_v, 97, 121)))

                                    {

                                        break lab6;

                                    }

                                    break golab5;

                                } while (false);

                                if (cursor >= limit)

                                {

                                    break lab0;

                                }

                                cursor++;

                            }

                        } while (false);

                        // setmark p1, line 42

                        I_p1 = cursor;

                        // gopast, line 43

                        golab7: while(true)

                        {

                            lab8: do {

                                if (!(in_grouping(g_v, 97, 121)))

                                {

                                    break lab8;

                                }

                                break golab7;

                            } while (false);

                            if (cursor >= limit)

                            {

                                break lab0;

                            }

                            cursor++;

                        }

                        // gopast, line 43

                        golab9: while(true)

                        {

                            lab10: do {

                                if (!(out_grouping(g_v, 97, 121)))

                                {

                                    break lab10;

                                }

                                break golab9;

                            } while (false);

                            if (cursor >= limit)

                            {

                                break lab0;

                            }

                            cursor++;

                        }

                        // setmark p2, line 43

                        I_p2 = cursor;

                    } while (false);

                    cursor = v_1;

                    return true;

                }



                private boolean r_shortv() {

            int v_1;

                    // (, line 49

                    // or, line 51

                    lab0: do {

                        v_1 = limit - cursor;

                        lab1: do {

                            // (, line 50

                            if (!(out_grouping_b(g_v_WXY, 89, 121)))

                            {

                                break lab1;

                            }

                            if (!(in_grouping_b(g_v, 97, 121)))

                            {

                                break lab1;

                            }

                            if (!(out_grouping_b(g_v, 97, 121)))

                            {

                                break lab1;

                            }

                            break lab0;

                        } while (false);

                        cursor = limit - v_1;

                        // (, line 52

                        if (!(out_grouping_b(g_v, 97, 121)))

                        {

                            return false;

                        }

                        if (!(in_grouping_b(g_v, 97, 121)))

                        {

                            return false;

                        }

                        // atlimit, line 52

                        if (cursor > limit_backward)

                        {

                            return false;

                        }

                    } while (false);

                    return true;

                }



                private boolean r_R1() {

                    if (!(I_p1 <= cursor))

                    {

                        return false;

                    }

                    return true;

                }



                private boolean r_R2() {

                    if (!(I_p2 <= cursor))

                    {

                        return false;

                    }

                    return true;

                }



                private boolean r_Step_1a() {

            int among_var;

            int v_1;

            int v_2;

                    // (, line 58

                    // try, line 59

                    v_1 = limit - cursor;

                    lab0: do {

                        // (, line 59

                        // [, line 60

                        ket = cursor;

                        // substring, line 60

                        among_var = find_among_b(a_1, 3);

                        if (among_var == 0)

                        {

                            cursor = limit - v_1;

                            break lab0;

                        }

                        // ], line 60

                        bra = cursor;

                        switch(among_var) {

                            case 0:

                                cursor = limit - v_1;

                                break lab0;

                            case 1:

                                // (, line 62

                                // delete, line 62

                                slice_del();

                                break;

                        }

                    } while (false);

                    // [, line 65

                    ket = cursor;

                    // substring, line 65

                    among_var = find_among_b(a_2, 6);

                    if (among_var == 0)

                    {

                        return false;

                    }

                    // ], line 65

                    bra = cursor;

                    switch(among_var) {

                        case 0:

                            return false;

                        case 1:

                            // (, line 66

                            // <-, line 66

                            slice_from("ss");

                            break;

                        case 2:

                            // (, line 68

                            // or, line 68

                            lab1: do {

                                v_2 = limit - cursor;

                                lab2: do {

                                    // (, line 68

                                    // hop, line 68

                                    {

                                        int c = cursor - 2;

                                        if (limit_backward > c || c > limit)

                                        {

                                            break lab2;

                                        }

                                        cursor = c;

                                    }

                                    // <-, line 68

                                    slice_from("i");

                                    break lab1;

                                } while (false);

                                cursor = limit - v_2;

                                // <-, line 68

                                slice_from("ie");

                            } while (false);

                            break;

                        case 3:

                            // (, line 69

                            // next, line 69

                            if (cursor <= limit_backward)

                            {

                                return false;

                            }

                            cursor--;

                            // gopast, line 69

                            golab3: while(true)

                            {

                                lab4: do {

                                    if (!(in_grouping_b(g_v, 97, 121)))

                                    {

                                        break lab4;

                                    }

                                    break golab3;

                                } while (false);

                                if (cursor <= limit_backward)

                                {

                                    return false;

                                }

                                cursor--;

                            }

                            // delete, line 69

                            slice_del();

                            break;

                    }

                    return true;

                }



                private boolean r_Step_1b() {

            int among_var;

            int v_1;

            int v_3;

            int v_4;

                    // (, line 74

                    // [, line 75

                    ket = cursor;

                    // substring, line 75

                    among_var = find_among_b(a_4, 6);

                    if (among_var == 0)

                    {

                        return false;

                    }

                    // ], line 75

                    bra = cursor;

                    switch(among_var) {

                        case 0:

                            return false;

                        case 1:

                            // (, line 77

                            // call R1, line 77

                            if (!r_R1())

                            {

                                return false;

                            }

                            // <-, line 77

                            slice_from("ee");

                            break;

                        case 2:

                            // (, line 79

                            // test, line 80

                            v_1 = limit - cursor;

                            // gopast, line 80

                            golab0: while(true)

                            {

                                lab1: do {

                                    if (!(in_grouping_b(g_v, 97, 121)))

                                    {

                                        break lab1;

                                    }

                                    break golab0;

                                } while (false);

                                if (cursor <= limit_backward)

                                {

                                    return false;

                                }

                                cursor--;

                            }

                            cursor = limit - v_1;

                            // delete, line 80

                            slice_del();

                            // test, line 81

                            v_3 = limit - cursor;

                            // substring, line 81

                            among_var = find_among_b(a_3, 13);

                            if (among_var == 0)

                            {

                                return false;

                            }

                            cursor = limit - v_3;

                            switch(among_var) {

                                case 0:

                                    return false;

                                case 1:

                                    // (, line 83

                                    // <+, line 83

                                    {

                                        int c = cursor;

                                        insert(cursor, cursor, "e");

                                        cursor = c;

                                    }

                                    break;

                                case 2:

                                    // (, line 86

                                    // [, line 86

                                    ket = cursor;

                                    // next, line 86

                                    if (cursor <= limit_backward)

                                    {

                                        return false;

                                    }

                                    cursor--;

                                    // ], line 86

                                    bra = cursor;

                                    // delete, line 86

                                    slice_del();

                                    break;

                                case 3:

                                    // (, line 87

                                    // atmark, line 87

                                    if (cursor != I_p1)

                                    {

                                        return false;

                                    }

                                    // test, line 87

                                    v_4 = limit - cursor;

                                    // call shortv, line 87

                                    if (!r_shortv())

                                    {

                                        return false;

                                    }

                                    cursor = limit - v_4;

                                    // <+, line 87

                                    {

                                        int c = cursor;

                                        insert(cursor, cursor, "e");

                                        cursor = c;

                                    }

                                    break;

                            }

                            break;

                    }

                    return true;

                }



                private boolean r_Step_1c() {

            int v_1;

            int v_2;

                    // (, line 93

                    // [, line 94

                    ket = cursor;

                    // or, line 94

                    lab0: do {

                        v_1 = limit - cursor;

                        lab1: do {

                            // literal, line 94

                            if (!(eq_s_b(1, "y")))

                            {

                                break lab1;

                            }

                            break lab0;

                        } while (false);

                        cursor = limit - v_1;

                        // literal, line 94

                        if (!(eq_s_b(1, "Y")))

                        {

                            return false;

                        }

                    } while (false);

                    // ], line 94

                    bra = cursor;

                    if (!(out_grouping_b(g_v, 97, 121)))

                    {

                        return false;

                    }

                    // not, line 95

                    {

                        v_2 = limit - cursor;

                        lab2: do {

                            // atlimit, line 95

                            if (cursor > limit_backward)

                            {

                                break lab2;

                            }

                            return false;

                        } while (false);

                        cursor = limit - v_2;

                    }

                    // <-, line 96

                    slice_from("i");

                    return true;

                }



                /**
                 * Step 2 of the stemmer: looks up the text before the cursor in table
                 * {@code a_5} and, if {@code r_R1()} accepts the position where the match
                 * begins, rewrites or deletes the matched slice according to the table entry.
                 *
                 * @return {@code false} when nothing in {@code a_5} matches, when
                 *         {@code r_R1()} fails, or when the additional context check of
                 *         entry 13 or entry 16 fails; {@code true} otherwise
                 */
                private boolean r_Step_2() {
                    // The slice to edit ends at the current cursor position ...
                    ket = cursor;
                    final int entry = find_among_b(a_5, 24);
                    if (entry == 0) {
                        return false;
                    }
                    // ... and begins wherever the table lookup left the cursor.
                    bra = cursor;
                    if (!r_R1()) {
                        return false;
                    }

                    // Most entries simply replace the slice; pick the replacement first and
                    // perform the single slice_from call after the switch. Entry 0 cannot
                    // reach this point because it was rejected above.
                    final String replacement;
                    switch (entry) {
                        case 1:
                            replacement = "tion";
                            break;
                        case 2:
                            replacement = "ence";
                            break;
                        case 3:
                            replacement = "ance";
                            break;
                        case 4:
                            replacement = "able";
                            break;
                        case 5:
                            replacement = "ent";
                            break;
                        case 6:
                            replacement = "ize";
                            break;
                        case 7:
                            replacement = "ate";
                            break;
                        case 8:
                            replacement = "al";
                            break;
                        case 9:
                            replacement = "ful";
                            break;
                        case 10:
                            replacement = "ous";
                            break;
                        case 11:
                            replacement = "ive";
                            break;
                        case 12:
                            replacement = "ble";
                            break;
                        case 13:
                            // Only applies when the slice is preceded by "l".
                            if (!eq_s_b(1, "l")) {
                                return false;
                            }
                            replacement = "og";
                            break;
                        case 14:
                            replacement = "ful";
                            break;
                        case 15:
                            replacement = "less";
                            break;
                        case 16:
                            // Only applies when the preceding character is in g_valid_LI.
                            if (!in_grouping_b(g_valid_LI, 99, 116)) {
                                return false;
                            }
                            slice_del();
                            return true;
                        default:
                            // Any other entry number leaves the buffer unchanged.
                            return true;
                    }
                    slice_from(replacement);
                    return true;
                }



                /**
                 * Step 3 of the stemmer: looks up the text before the cursor in table
                 * {@code a_6} and, if {@code r_R1()} accepts the match position, replaces or
                 * deletes the matched slice according to the table entry.
                 *
                 * @return {@code false} when nothing in {@code a_6} matches, when
                 *         {@code r_R1()} fails, or when entry 6 is matched and
                 *         {@code r_R2()} fails; {@code true} otherwise
                 */
                private boolean r_Step_3() {
                    ket = cursor;
                    final int entry = find_among_b(a_6, 9);
                    if (entry == 0) {
                        return false;
                    }
                    bra = cursor;
                    if (!r_R1()) {
                        return false;
                    }

                    // Entries 1-4 replace the slice, 5 and 6 delete it; entry 0 was
                    // already rejected above.
                    final String replacement;
                    switch (entry) {
                        case 1:
                            replacement = "tion";
                            break;
                        case 2:
                            replacement = "ate";
                            break;
                        case 3:
                            replacement = "al";
                            break;
                        case 4:
                            replacement = "ic";
                            break;
                        case 5:
                            slice_del();
                            return true;
                        case 6:
                            // Deletion here additionally requires r_R2() to succeed.
                            if (!r_R2()) {
                                return false;
                            }
                            slice_del();
                            return true;
                        default:
                            // Any other entry number leaves the buffer unchanged.
                            return true;
                    }
                    slice_from(replacement);
                    return true;
                }



                /**
                 * Step 4 of the stemmer: looks up the text before the cursor in table
                 * {@code a_7} and, if {@code r_R2()} accepts the match position, deletes the
                 * matched slice. Entry 2 is deleted only when it is preceded by "s" or "t".
                 *
                 * @return {@code false} when nothing in {@code a_7} matches, when
                 *         {@code r_R2()} fails, or when entry 2 is matched without a
                 *         preceding "s" or "t"; {@code true} otherwise
                 */
                private boolean r_Step_4() {
                    ket = cursor;
                    final int entry = find_among_b(a_7, 18);
                    if (entry == 0) {
                        return false;
                    }
                    bra = cursor;
                    if (!r_R2()) {
                        return false;
                    }

                    if (entry == 2) {
                        // Try "s" first; if that fails, rewind and insist on "t".
                        final int rewind = limit - cursor;
                        if (!eq_s_b(1, "s")) {
                            cursor = limit - rewind;
                            if (!eq_s_b(1, "t")) {
                                return false;
                            }
                        }
                    }
                    if (entry == 1 || entry == 2) {
                        slice_del();
                    }
                    return true;
                }



                /**
                 * Step 5 of the stemmer: looks up the text before the cursor in table
                 * {@code a_8} and deletes the matched slice when the entry's condition holds.
                 * <ul>
                 *   <li>Entry 1: {@code r_R2()} succeeds, or else {@code r_R1()} succeeds
                 *       and {@code r_shortv()} does not.</li>
                 *   <li>Entry 2: {@code r_R2()} succeeds and the slice is preceded by "l".</li>
                 * </ul>
                 *
                 * @return {@code false} when nothing in {@code a_8} matches or the matched
                 *         entry's condition fails; {@code true} otherwise
                 */
                private boolean r_Step_5() {
                    ket = cursor;
                    final int entry = find_among_b(a_8, 2);
                    if (entry == 0) {
                        return false;
                    }
                    bra = cursor;

                    if (entry == 1) {
                        final int rewind = limit - cursor;
                        if (!r_R2()) {
                            // Second alternative: R1 and not shortv.
                            cursor = limit - rewind;
                            if (!r_R1()) {
                                return false;
                            }
                            final int probe = limit - cursor;
                            if (r_shortv()) {
                                return false;
                            }
                            // The shortv probe was only a test; undo any cursor movement.
                            cursor = limit - probe;
                        }
                        slice_del();
                    } else if (entry == 2) {
                        if (!r_R2() || !eq_s_b(1, "l")) {
                            return false;
                        }
                        slice_del();
                    }
                    return true;
                }



                /**
                 * Checks whether the text before the cursor matches an entry of table
                 * {@code a_9} and that match reaches all the way back to
                 * {@code limit_backward}. Sets {@code ket} and, on a table match,
                 * {@code bra}; the buffer itself is not modified.
                 *
                 * @return {@code true} only if an entry matched and the cursor ended up
                 *         at (not beyond) {@code limit_backward}
                 */
                private boolean r_exception2() {
                    ket = cursor;
                    final boolean matched = find_among_b(a_9, 8) != 0;
                    if (!matched) {
                        return false;
                    }
                    bra = cursor;
                    // Succeeds only when nothing remains before the matched text.
                    return cursor <= limit_backward;
                }



                /**
                 * Handles whole-word exceptions: matches the text from the cursor forward
                 * against table {@code a_10}; when the match extends to {@code limit},
                 * entries 1-11 replace the matched slice with a fixed string and any other
                 * entry leaves it as is.
                 *
                 * @return {@code false} when nothing in {@code a_10} matches or the match
                 *         stops short of {@code limit}; {@code true} otherwise
                 */
                private boolean r_exception1() {
                    // Forward matching: the slice starts at the cursor ...
                    bra = cursor;
                    final int entry = find_among(a_10, 18);
                    if (entry == 0) {
                        return false;
                    }
                    // ... and ends where the lookup stopped.
                    ket = cursor;
                    if (cursor < limit) {
                        return false;
                    }

                    // Entry 0 was rejected above; entries outside 1-11 need no rewrite.
                    final String replacement;
                    switch (entry) {
                        case 1:
                            replacement = "ski";
                            break;
                        case 2:
                            replacement = "sky";
                            break;
                        case 3:
                            replacement = "die";
                            break;
                        case 4:
                            replacement = "lie";
                            break;
                        case 5:
                            replacement = "tie";
                            break;
                        case 6:
                            replacement = "idl";
                            break;
                        case 7:
                            replacement = "gentl";
                            break;
                        case 8:
                            replacement = "ugli";
                            break;
                        case 9:
                            replacement = "earli";
                            break;
                        case 10:
                            replacement = "onli";
                            break;
                        case 11:
                            replacement = "singl";
                            break;
                        default:
                            return true;
                    }
                    slice_from(replacement);
                    return true;
                }



                /**
                 * When {@code B_Y_found} is set, repeatedly searches forward from the cursor
                 * for an upper-case "Y" and replaces each one found with lower-case "y".
                 * When no further "Y" is found the cursor is put back to where the last
                 * search started.
                 *
                 * @return {@code false} if {@code B_Y_found} is not set; {@code true} once
                 *         no further "Y" can be found
                 */
                private boolean r_postlude() {
                    if (!B_Y_found) {
                        return false;
                    }

                    while (true) {
                        final int searchStart = cursor;

                        // Advance one character at a time until a "Y" sits at the cursor
                        // or the end of the buffer is reached.
                        boolean located = false;
                        while (true) {
                            final int here = cursor;
                            bra = cursor;
                            if (eq_s(1, "Y")) {
                                ket = cursor;
                                cursor = here;
                                located = true;
                                break;
                            }
                            cursor = here;
                            if (cursor >= limit) {
                                break;
                            }
                            cursor++;
                        }

                        if (!located) {
                            cursor = searchStart;
                            return true;
                        }
                        slice_from("y");
                    }
                }



                /**
                 * Runs the complete stemming pipeline on the current buffer.
                 * <ol>
                 *   <li>If {@code r_exception1()} succeeds, nothing else is done.</li>
                 *   <li>If a 3-position hop from the start would go past {@code limit},
                 *       nothing else is done.</li>
                 *   <li>Otherwise {@code r_prelude()} and {@code r_mark_regions()} run
                 *       forwards, then in backward mode {@code r_Step_1a()} runs, followed
                 *       (unless {@code r_exception2()} succeeds) by Steps 1b, 1c, 2, 3, 4
                 *       and 5, and finally {@code r_postlude()} runs forwards again.</li>
                 * </ol>
                 * The result of each individual step is ignored; the cursor is reset
                 * after every step.
                 *
                 * @return always {@code true}
                 */
                public boolean stem() {
                    final int origin = cursor;

                    if (r_exception1()) {
                        return true;
                    }
                    cursor = origin;

                    // "not hop 3": stop here when hopping 3 positions is impossible.
                    final int hopTarget = cursor + 3;
                    if (0 > hopTarget || hopTarget > limit) {
                        return true;
                    }

                    // Forward passes; each one starts from the same position.
                    r_prelude();
                    cursor = origin;
                    r_mark_regions();
                    cursor = origin;

                    // Switch to backward processing, starting from the end of the buffer.
                    limit_backward = cursor;
                    cursor = limit;

                    // Positions are remembered as a distance from limit because the
                    // steps may change limit when they edit the buffer.
                    final int fromEnd = limit - cursor;

                    r_Step_1a();
                    cursor = limit - fromEnd;

                    if (!r_exception2()) {
                        cursor = limit - fromEnd;

                        r_Step_1b();
                        cursor = limit - fromEnd;

                        r_Step_1c();
                        cursor = limit - fromEnd;

                        r_Step_2();
                        cursor = limit - fromEnd;

                        r_Step_3();
                        cursor = limit - fromEnd;

                        r_Step_4();
                        cursor = limit - fromEnd;

                        r_Step_5();
                        cursor = limit - fromEnd;
                    }

                    // Back to forward processing from the start of the buffer.
                    cursor = limit_backward;
                    final int head = cursor;
                    r_postlude();
                    cursor = head;

                    return true;
                }



        /**
         * Treats every {@code englishStemmer} as equal to every other one: the
         * comparison looks only at the runtime type of the argument, never at
         * instance state.
         *
         * @param o the object to compare with; may be {@code null}
         * @return {@code true} if {@code o} is an instance of {@code englishStemmer},
         *         {@code false} otherwise (including for {@code null})
         */
        @Override
        public boolean equals( Object o ) {
            return o instanceof englishStemmer;
        }



        /**
         * Returns a hash computed solely from the name of the {@code englishStemmer}
         * class, so every instance yields the same value.
         *
         * @return the hash code of the {@code englishStemmer} class name
         */
        @Override
        public int hashCode() {
            return englishStemmer.class.getName().hashCode();
        }







}
porter2 stemming algorithm

 然而,Porter stemming 仅仅是一种基于后缀的词干提取技术:它只定义了一些基本的后缀规则,能识别出 "books"->"book" 这类规则变化,但对于 "bought"->"buy"、"brought"->"bring" 等不规则形式并不能识别出来。

2. The dragon toolkit (http://dragon.ischool.drexel.edu/download.asp)

然后发现了上面这个 NLP 处理工具包,其中的 EngLemmatiser 类就是用于词干提取(stem)的类,能提取出单词的词干。

它首先定义一些基本的后缀规则(只有十几条),然后定义一些独立于这些规则的异常词库(master/slave 的形式),这样就能基本实现单词词干的正确提取,解决了 Porter stemming 存在的问题。

// Usage sketch for the Dragon Toolkit lemmatiser: look up the base form of one word.
// NOTE(review): "lemmatiser" is presumably the directory holding the toolkit's
// dictionary/rule files — confirm against the Dragon Toolkit distribution.
String dictionaryPath = "lemmatiser";

        // NOTE(review): the meaning of the two boolean arguments is not visible here;
        // check the EngLemmatiser constructor documentation before changing them.
        EngLemmatiser lemmatiser =  new EngLemmatiser(dictionaryPath, false, true);



        // An irregular past-tense form, used as the sample input.
        String a = "brought";

        String lemmatizedWord = lemmatiser.lemmatize(a);

        System.out.println(lemmatizedWord);
View Code

然而我还是觉得,在规则基础之上附加词典的技术过于死板,不够灵活。

3. Stanford CoreNLP

后来发现斯坦福大学的一个 NLP 工具,其中提取词干的技术是:针对大量语料库进行机器学习,利用有限自动机提炼并生成规则(不必附加词典)。它能很好地解决词干的提取问题,准确率很高。虽然它对地名、人名等专有名词识别不出来,但已能满足基本的需求。

    // Usage sketch: stem a single word with the Morphology class and print the result.
    // NOTE(review): Morphology is presumably the Stanford CoreNLP class
    // (edu.stanford.nlp.process.Morphology) — confirm the import.
    String word="magnificus";

        Morphology morph=new Morphology();

        System.out.println(morph.stem(word));
View Code

 

你可能感兴趣的:(english)