Java 正则表达式工具类：移除 HTML 标签以及提取完整的 img 标签

工具类记录于此,以供参考。 

package com.gccp.translate.biz.util;

import org.apache.commons.lang3.StringUtils;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Regex-based helpers for stripping HTML tags from a string and for
 * extracting complete {@code <img>} tags.
 *
 * <p>These helpers use simple regular expressions, not an HTML parser: a
 * {@code >} character inside an attribute value ends the match early.
 *
 * @author hilbert.xu
 * @date 2019/5/20
 */
public class HtmlUtil {

    /**
     * Matches any HTML tag: a {@code <}, any run of non-{@code >} characters, then {@code >}.
     */
    private final static String REG_HTML = "<([^>]*)>";
    /**
     * Matches a complete img tag, either {@code <img ...>} or {@code <img ... />}.
     * The word boundary keeps longer tag names such as {@code <imgx>} from matching.
     */
    private static final String REG_IMG = "<img\\b[^>]*>";

    /** Compiled once; {@link Pattern} instances are immutable and safe to share. */
    private static final Pattern HTML_PATTERN = Pattern.compile(REG_HTML);
    /** Case-insensitive so that {@code <IMG>} is matched as well as {@code <img>}. */
    private static final Pattern IMG_PATTERN = Pattern.compile(REG_IMG, Pattern.CASE_INSENSITIVE);

    /**
     * Removes every HTML tag from the given string.
     *
     * @param htmlStr the text to clean; may be {@code null}
     * @return the text with all tags removed, or {@code null} if {@code htmlStr} is {@code null}
     */
    public static String delHTMLTag(String htmlStr) {
        if (htmlStr == null) {
            return null;
        }
        return HTML_PATTERN.matcher(htmlStr).replaceAll("");
    }

    /**
     * Collects every complete img tag found in the given HTML, in document order.
     *
     * @param html the HTML to scan; may be {@code null}
     * @return the matched img tags, or {@code null} when {@code html} is {@code null}
     *         or contains no img tag
     */
    public static String[] getImgs(String html) {
        if (html == null) {
            return null;
        }
        // Collect into a list rather than joining/splitting on ",": img tags can
        // legitimately contain commas (srcset, alt text, data: URIs).
        List<String> images = new ArrayList<>();
        Matcher matcher = IMG_PATTERN.matcher(html);
        while (matcher.find()) {
            images.add(matcher.group());
        }
        // Keep the historical contract: null (not an empty array) signals "no images".
        return images.isEmpty() ? null : images.toArray(new String[0]);
    }

}

 

你可能感兴趣的:(Java,工具)