crawler_工具类_RegexUtils_正则帮助类

package com.cph.crawler.core.utils;



import java.util.ArrayList;

import java.util.List;

import java.util.regex.Matcher;

import java.util.regex.Pattern;



import org.apache.commons.logging.Log;

import org.apache.commons.logging.LogFactory;



/**
 * Static helpers for extracting values from text with regular expressions.
 *
 * <p>The single-value methods ({@code getString}, {@code getInt}) scan the whole input and
 * return the <em>last</em> match, so they are best suited to patterns expected to match once.
 * Use {@link #getStringList} / {@link #getIntList} to collect every match.
 *
 * <p>The class keeps no mutable static state: every call works on its own {@link Matcher}.
 *
 * @author cphmvp
 */
public final class RegexUtils {

    private static final Log logger = LogFactory.getLog(RegexUtils.class);

    /** Message logged when a lookup finds nothing ("no matching data found"). */
    private static final String NOT_MATCHER_DATA = "没有匹配到对应数据";

    /** Placeholder character that is turned into a space during clean-up. */
    private static final String SPLIT_MARK = "⊙";

    /** Matches an opening or closing markup tag such as {@code <b>} or {@code </span>}. */
    private static final Pattern TAG_PATTERN = Pattern.compile("</?[^>]+>");

    /** Matches a single whitespace character. */
    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s");

    /** Matches the non-breaking-space entity, with or without its trailing semicolon. */
    private static final Pattern NBSP_PATTERN = Pattern.compile("&nbsp;?");

    /** Six digits, the first one non-zero. */
    private static final Pattern POSTCODE_PATTERN = Pattern.compile("[1-9]\\d{5}");

    private RegexUtils() {
        // utility class, not instantiable
    }

    /**
     * Extracts the given group of the last match and strips markup from it.
     *
     * <p>Clean-up steps, in order: markup tags are removed, {@code &gt;} becomes {@code >},
     * all whitespace is removed, {@code &nbsp;} (semicolon optional) becomes a space, the
     * literal text {@code ^p} is removed, {@code ⊙} becomes a space, and the result is trimmed.
     *
     * @param input   text to search
     * @param pattern compiled pattern to apply
     * @param group   index of the capturing group to return (0 = whole match)
     * @return the cleaned-up group text, or an empty string when nothing matched
     * @throws IndexOutOfBoundsException if the pattern has no group with the given index
     */
    public static String getString(String input, Pattern pattern, int group) {
        String lastMatch = lastGroup(pattern.matcher(input), group);
        if (lastMatch == null) {
            return "";
        }
        return cleanMarkup(lastMatch);
    }

    /**
     * Extracts the given group of the last match, trimmed.
     *
     * @param input text to search
     * @param regex regular expression to apply
     * @param group index of the capturing group to return (0 = whole match)
     * @return the trimmed group text, or a single space {@code " "} when nothing matched
     *         (an error is logged when the result is blank)
     * @throws java.util.regex.PatternSyntaxException if {@code regex} is invalid
     * @throws IndexOutOfBoundsException if the pattern has no group with the given index
     */
    public static String getString(String input, String regex, int group) {
        String lastMatch = lastGroup(getMatcher(input, regex), group);
        String result = (lastMatch == null) ? " " : lastMatch;
        logIfBlank(result);
        return result;
    }

    /**
     * Creates a new matcher for {@code regex} over {@code input}.
     *
     * @param input text to search
     * @param regex regular expression to apply
     * @return a fresh matcher, not shared with any other caller
     * @throws java.util.regex.PatternSyntaxException if {@code regex} is invalid
     */
    public static Matcher getMatcher(String input, String regex) {
        return getPattern(regex).matcher(input);
    }

    /**
     * Compiles {@code regex} with default flags.
     *
     * @param regex regular expression to compile
     * @return the compiled pattern
     * @throws java.util.regex.PatternSyntaxException if {@code regex} is invalid
     */
    public static Pattern getPattern(String regex) {
        return Pattern.compile(regex);
    }

    /**
     * Collects the given group of every match, trimmed, in match order.
     *
     * <p>Matches in which the group did not participate are skipped.
     *
     * @param input text to search
     * @param regex regular expression to apply
     * @param group index of the capturing group to collect (0 = whole match)
     * @return the collected values; empty (and an error is logged) when nothing matched
     * @throws java.util.regex.PatternSyntaxException if {@code regex} is invalid
     * @throws IndexOutOfBoundsException if the pattern has no group with the given index
     */
    public static List<String> getStringList(String input, String regex,
            int group) {
        List<String> resultList = new ArrayList<String>();
        Matcher matcher = getMatcher(input, regex);
        while (matcher.find()) {
            // Honour the requested group; the whole match is only returned for group 0.
            String value = matcher.group(group);
            if (value != null) {
                resultList.add(value.trim());
            }
        }
        if (resultList.isEmpty()) {
            logger.error(NOT_MATCHER_DATA);
        }
        return resultList;
    }

    /**
     * Collects the given group of every match, parsed as a decimal integer, in match order.
     *
     * <p>Matches in which the group did not participate are skipped.
     *
     * @param input text to search
     * @param regex regular expression to apply
     * @param group index of the capturing group to collect (0 = whole match)
     * @return the parsed values; empty (and an error is logged) when nothing matched
     * @throws NumberFormatException if a matched value is not a valid {@code int}
     * @throws java.util.regex.PatternSyntaxException if {@code regex} is invalid
     * @throws IndexOutOfBoundsException if the pattern has no group with the given index
     */
    public static List<Integer> getIntList(String input, String regex, int group) {
        List<Integer> resultList = new ArrayList<Integer>();
        Matcher matcher = getMatcher(input, regex);
        while (matcher.find()) {
            // Honour the requested group; the whole match is only returned for group 0.
            String value = matcher.group(group);
            if (value != null) {
                resultList.add(Integer.parseInt(value.trim()));
            }
        }
        if (resultList.isEmpty()) {
            logger.error(NOT_MATCHER_DATA);
        }
        return resultList;
    }

    /**
     * Extracts the whole text of the last match, trimmed.
     *
     * @param input text to search
     * @param regex regular expression to apply
     * @return the trimmed match, or a single space {@code " "} when nothing matched
     *         (an error is logged when the result is blank)
     * @throws java.util.regex.PatternSyntaxException if {@code regex} is invalid
     */
    public static String getString(String input, String regex) {
        return getString(input, regex, 0);
    }

    /**
     * Extracts the given group of the last match as a decimal integer.
     *
     * <p>Every match is parsed while scanning, so a non-numeric value in any match raises
     * {@link NumberFormatException}, not only in the last one.
     *
     * @param input text to search
     * @param regex regular expression to apply
     * @param group index of the capturing group to parse (0 = whole match)
     * @return the parsed value, or {@code -1} when nothing matched (an error is logged
     *         whenever the result is {@code -1})
     * @throws NumberFormatException if a matched value is not a valid {@code int}
     * @throws java.util.regex.PatternSyntaxException if {@code regex} is invalid
     * @throws IndexOutOfBoundsException if the pattern has no group with the given index
     */
    public static int getInt(String input, String regex, int group) {
        int result = -1;
        Matcher matcher = getMatcher(input, regex);
        while (matcher.find()) {
            String value = matcher.group(group);
            if (value != null) {
                result = Integer.parseInt(value.trim());
            }
        }
        logIfMissing(result);
        return result;
    }

    /**
     * Extracts the whole text of the last match as a decimal integer.
     *
     * @param input text to search
     * @param regex regular expression to apply
     * @return the parsed value, or {@code -1} when nothing matched (an error is logged
     *         whenever the result is {@code -1})
     * @throws NumberFormatException if a matched value is not a valid {@code int}
     * @throws java.util.regex.PatternSyntaxException if {@code regex} is invalid
     */
    public static int getInt(String input, String regex) {
        return getInt(input, regex, 0);
    }

    /**
     * Checks whether the text is a Chinese postal code: exactly six digits, the first non-zero.
     *
     * @param postcode text to validate
     * @return {@code true} if the whole text matches, {@code false} otherwise
     */
    public static boolean checkPostcode(String postcode) {
        return POSTCODE_PATTERN.matcher(postcode).matches();
    }

    /**
     * Scans all matches and returns the trimmed group text of the last match in which the
     * group participated, or {@code null} if there is none.
     */
    private static String lastGroup(Matcher matcher, int group) {
        String last = null;
        while (matcher.find()) {
            String value = matcher.group(group);
            if (value != null) {
                last = value.trim();
            }
        }
        return last;
    }

    /** Applies the markup clean-up steps described on {@link #getString(String, Pattern, int)}. */
    private static String cleanMarkup(String text) {
        String cleaned = TAG_PATTERN.matcher(text).replaceAll("");
        cleaned = cleaned.replace("&gt;", ">");
        // Removing \s also covers \r, \n and \t, so no separate passes are needed for them.
        cleaned = WHITESPACE_PATTERN.matcher(cleaned).replaceAll("");
        // Consume the optional ';' so that "&nbsp;" does not leave a stray semicolon behind.
        cleaned = NBSP_PATTERN.matcher(cleaned).replaceAll(" ");
        cleaned = cleaned.replace("^p", "");
        cleaned = cleaned.replace(SPLIT_MARK, " ");
        return cleaned.trim();
    }

    /** Logs an error when the extracted text is blank. */
    private static void logIfBlank(String result) {
        if (result.trim().isEmpty()) {
            logger.error(NOT_MATCHER_DATA);
        }
    }

    /** Logs an error when the extracted number is the {@code -1} "no match" default. */
    private static void logIfMissing(int result) {
        if (result == -1) {
            logger.error(NOT_MATCHER_DATA);
        }
    }
}

 

你可能感兴趣的:(regex)