通配符匹配算法的java实现

看过一些关于通配符匹配的实现不甚满意,于是写了一种java实现,仅供参考。

    /**
     * 平均O(n+m)  最坏O(n*m)
     * 约定 * : ANY>=0    ,   ? : ANY=1
     *
     * @author Spance.Wong
     */
    static class WildCardMatcher {

        /**
         * 仅为了方便实验
         *
         * @param input
         * @param pattern
         * @return
         */
        static List<String> matches(String input, String pattern) {
            String[] pa = pattern.split("\\*+");    // 分割不是重点,故未做重点实现
            return matches(input, pa);
        }

        /**
         * 从input中查找通配符序列
         *
         * @param input
         * @param patterns
         * @return
         */
        static List<String> matches(CharSequence input, String[] patterns) {
            int n = input.length(), m = patterns.length;
            List<String> result = new ArrayList<String>();
            for (int i = 0; i < n; ) {
                int left = -1, right = -1;
                for (int j = 0; j < m; j++) {   // 以i为起点,执行m趟匹配,每趟i至少前进p[j].length长度
                    long region = lookBehind(input, i, patterns[j]);
                    if (j != 0 && region >= 0) {   // 模式序列的第二个开始使用贪婪匹配
                        long greedyRegion;
                        for (int k = (int) region + 1; ; k = (int) greedyRegion + 1) {
                            greedyRegion = lookBehind(input, k, patterns[j]);
                            if (greedyRegion > 0)    // 贪婪找到,继续贪婪尝试
                                region = greedyRegion;
                            else
                                break;
                        }
                    }
                    if (region < 0) {   // pattern[j]失败,则本趟失败
                        i = ((int) -region) + 1;
                        break;
                    } else {
                        i = (int) region + 1;
                        if (j == 0)     // 模式序列的第一个找到,记左边界,在高32位
                            left = (int) (region >> 32);
                        if (j == m - 1)     // 模式序列的最后一个找到,记右边界,在低32位
                            right = (int) region;
                    }
                }
                if (left >= 0 && right >= 0)
                    result.add(input.subSequence(left, right + 1).toString());
            }
            return result;
        }

        /**
         * 在input的i位置开始向后扫描非贪婪查找pattern,在pattern尾匹配时回溯确认
         *
         * @param in
         * @param i
         * @param pattern
         * @return
         */
        static long lookBehind(CharSequence in, int i, CharSequence pattern) {
            int len = in.length(), pLen = pattern.length(), _pMax = pLen - 1;
            char pEnd = pattern.charAt(_pMax);
            if (len - i >= pLen) {
                for (i = i + _pMax; i < len; i++) {  // 以 i + pLen - 1 起步
                    if (in.charAt(i) == pEnd || pEnd == '?') {     // 与pa末尾相同,i即右边界
                        if (pLen == 1)
                            return ((long) i) << 32 | i;
                        for (int j = i - 1; j >= i - _pMax; j--) {   // 则至多回溯pLen长找左边界
                            char p = pattern.charAt(_pMax - i + j);
                            if (in.charAt(j) == p || p == '?') {
                                if (j == i - _pMax)     // 找到左边界即j
                                    return ((long) j) << 32 | i;
                            } else
                                break;
                        }
                    }
                }
            }
            return -i;
        }
    }

    // 若干测试
    public static void main(String[] args) {
        assertAndPrint(WildCardMatcher.matches("assbsavb", "a*b"), "assbsavb");
        assertAndPrint(WildCardMatcher.matches("assbsavb", "a??b"), "assb");
        assertAndPrint(WildCardMatcher.matches("assbsavb", "a?b"), "avb");
        assertAndPrint(WildCardMatcher.matches("assbsavbz", "a??"), "ass", "avb");
        assertAndPrint(WildCardMatcher.matches("assbsavbz", "?"));
        assertAndPrint(WildCardMatcher.matches("assbsavbz", "??sb"), "assb");
        assertAndPrint(WildCardMatcher.matches("assbsavbz", "b*s"), "bs");
        assertAndPrint(WildCardMatcher.matches("assbsavbz", "?s"), "as", "bs");
        assertAndPrint(WildCardMatcher.matches("assbsavbz", "s?"), "ss", "sa");
        assertAndPrint(WildCardMatcher.matches("assbsavbz", "s?s"), "sbs");
        assertAndPrint(WildCardMatcher.matches("assbsavbz", "s"));
        assertAndPrint(WildCardMatcher.matches("assbsavbz", "z"));
        assertAndPrint(WildCardMatcher.matches("assbsavbz", "Z"));
        assertAndPrint(WildCardMatcher.matches("assbsavbz", "s?b?a"), "ssbsa");
        assertAndPrint(WildCardMatcher.matches("assbsavbcsb", "a*sb"), "assbsavbcsb");
        assertAndPrint(WildCardMatcher.matches("assbsavbcsb", "a*s*b"), "assbsavbcsb");
        assertAndPrint(WildCardMatcher.matches("assbsavbcsb", "a*s?"), "assbsavbcsb");
        assertAndPrint(WildCardMatcher.matches("assbsavbcsb", "a*a?b"), "assbsavb");

        String file = readExternal();
        int loop = 100;
        int count = 0;
        String[] pc = "</???>".split("\\*+");
        long t1 = System.currentTimeMillis();
        for (int i = 0; i < loop; i++) {
            count += WildCardMatcher.matches(file, pc).size();
        }
        long t2 = System.currentTimeMillis();
        System.out.printf("wildcard matcher time=%d, count=%d %n", t2 - t1, count);

        count = 0;
        Pattern pa = Pattern.compile("(</.{3}>)");
        t1 = System.currentTimeMillis();
        for (int i = 0; i < loop; i++) {
            java.util.regex.Matcher ma = pa.matcher(file);
            while (ma.find()) {
                count += ma.groupCount();
            }
        }
        t2 = System.currentTimeMillis();
        System.out.printf("regex matcher time=%d, count=%d %n", t2 - t1, count);
    }

同样也证明了,通配符(简单的模糊匹配)有着比正则更快的效率。

你可能感兴趣的:(通配符)