2013-正则表达式解析文本

      在项目中可能会出现这样的场景:需要从一段文本中解析出数据,

例如:需要从下文找出注红的数据

 FSI/*CX

S KA   909Y22MAR PEK1630 2020HKG0X    333   

S CX   806Y23MAR HKG1150 1315ORD0S    77W   

01 YOW2+YX2            17758 CNY                    INCL TAX

*SYSTEM DEFAULT-CHECK OPERATING CARRIER 

*INTERLINE AGREEMENT PRICING APPLIED

*ATTN PRICED ON 21JAN14*1307

 BJS

XHKG YOW2            NVB      NVA22MAR 2PC  

 CHI YX2             NVB      NVA22MAR 2PC  

FARE  CNY   16480   

TAX   CNY     90CN CNY     94HK CNY   1094XT

TOTAL CNY   17758   

22MAR14BJS KA X/HKG563.99CX CHI Q4.25 2140.91NUC2709.15END R

OE6.081590  

XT CNY 106US CNY 31XA CNY 43XY CNY 34YC CNY 880YR   

ENDOS 02 *T1

*AUTO BAGGAGE INFORMATION AVAILABLE - SEE FSB   

RFSONLN/1E /EFEP_13/FCC=T/ 

通过下面这个解析类,可以实现我们的功能,主要用到了正则表达式的()捕获功能

package cn.test;


import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Extracts values from a block of fare text using regular-expression capture groups.
 *
 * <p>Each public method parses only the text it is given and returns a freshly built
 * result; the class keeps no mutable instance state, so results from one call never
 * leak into the next. A {@code null} text is treated as text that contains no match.
 */
public class QTaxParser1 {
	/** A line that starts with a 1-2 digit number immediately followed by a month abbreviation. */
	private static final Pattern QTAX_PATTERN = Pattern.compile(
			"^[0-9]{1,2}(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC).*", Pattern.MULTILINE);
	/** A line that starts with {@code TAX}. */
	private static final Pattern TAX_PATTERN = Pattern.compile("^TAX.*", Pattern.MULTILINE);
	/** Digits (group 1) immediately followed by upper-case letters (group 2). */
	private static final Pattern NUM_PATTERN = Pattern.compile("([0-9]+)([A-Z]+) *", Pattern.MULTILINE);
	/** The letter {@code Q} followed by a decimal number (group 1). */
	private static final Pattern QNUM_PATTERN = Pattern.compile("Q([0-9]+\\.{0,1}[0-9]*)", Pattern.MULTILINE);
	/**
	 * The letters {@code R O E}, possibly separated by whitespace or line breaks, followed by a
	 * number (group 1) whose digits may themselves be separated by whitespace.
	 */
	private static final Pattern QROE_PATTERN = Pattern.compile(
			"\\s+R\\s*O\\s*E\\s*(([0-9]\\s*)+(\\.\\s*){0,1}([0-9]\\s*)*)\\s+", Pattern.MULTILINE);
	/** An equals sign followed by a decimal number (group 1). */
	private static final Pattern RATE_PATTERN = Pattern.compile("=([0-9]+\\.{0,1}[0-9]*)", Pattern.MULTILINE);
	/** One or more whitespace characters; used to strip blanks captured inside the ROE number. */
	private static final Pattern WHITESPACE = Pattern.compile("\\s+");

	private static final Logger log = LoggerFactory.getLogger(QTaxParser1.class);

	/**
	 * Parses the first {@code TAX} line of the text into code/amount pairs.
	 *
	 * @param txt the text to parse; may be {@code null}
	 * @return a new mutable map from the letter code to the number preceding it; empty when
	 *         the text has no {@code TAX} line or the line has no number/code pair
	 */
	public Map<String, Double> getTax(String txt) {
		Map<String, Double> result = new HashMap<String, Double>();

		String taxLine = firstMatch(TAX_PATTERN, txt);
		if (taxLine == null) {
			return result;
		}

		Matcher item = NUM_PATTERN.matcher(taxLine);
		while (item.find()) {
			// group 2 is the letter code, group 1 the amount printed in front of it
			result.put(item.group(2), Double.valueOf(item.group(1)));
		}
		return result;
	}

	/**
	 * Collects every {@code Q<number>} value found on the first date-prefixed line.
	 *
	 * <p>Nothing is extracted unless the text also contains a {@code TAX} line.
	 *
	 * @param txt the text to parse; may be {@code null}
	 * @return a new mutable list of the Q values in order of appearance; empty when the
	 *         text has no {@code TAX} line, no date-prefixed line, or no Q value
	 */
	public List<Double> getQTax(String txt) {
		List<Double> result = new ArrayList<Double>();

		if (firstMatch(TAX_PATTERN, txt) == null) {
			return result;
		}
		String qLine = firstMatch(QTAX_PATTERN, txt);
		if (qLine == null) {
			return result;
		}

		Matcher q = QNUM_PATTERN.matcher(qLine);
		while (q.find()) {
			result.add(Double.valueOf(q.group(1)));
		}
		return result;
	}

	/**
	 * Extracts the number that follows the first {@code ROE} marker in the text.
	 *
	 * @param txt the text to parse; may be {@code null}
	 * @return the ROE value, or {@code null} when the text contains no ROE marker
	 */
	public Double getROE(String txt) {
		if (txt == null) {
			return null;
		}
		Matcher m = QROE_PATTERN.matcher(txt);
		if (!m.find()) {
			return null;
		}
		// The pattern tolerates blanks between the digits, so remove them before parsing.
		return Double.valueOf(WHITESPACE.matcher(m.group(1)).replaceAll(""));
	}

	/**
	 * Tells whether the text contains a date-prefixed line.
	 *
	 * @param txt the text to inspect; may be {@code null}
	 * @return {@code true} when such a line exists (the line is logged at debug level)
	 */
	public boolean isTaxPage(String txt) {
		String match = firstMatch(QTAX_PATTERN, txt);
		if (match == null) {
			return false;
		}
		log.debug("TAX Match:{}", match);
		return true;
	}

	/**
	 * Extracts the first number that directly follows an equals sign.
	 *
	 * @param txt the text to parse; may be {@code null}
	 * @return the number as it appears in the text, or {@code null} when there is none
	 */
	public String getRateValue(String txt) {
		if (txt == null) {
			return null;
		}
		Matcher m = RATE_PATTERN.matcher(txt);
		return m.find() ? m.group(1) : null;
	}

	/**
	 * Returns the whole text of the first match of {@code pattern} in {@code txt}.
	 *
	 * @return the matched text, or {@code null} when {@code txt} is {@code null} or
	 *         contains no match
	 */
	private static String firstMatch(Pattern pattern, String txt) {
		if (txt == null) {
			return null;
		}
		Matcher m = pattern.matcher(txt);
		return m.find() ? m.group() : null;
	}

}
测试用例

package itour.cn.fare.gateway;

import cn.test.QTaxParser1;
import net.sf.json.JSONArray;
import net.sf.json.JSONObject;

/**
 * Small console driver: feeds one sample fare text to {@link QTaxParser1} and prints the
 * parsed TAX map, Q values and ROE value as JSON.
 */
public class QTaxParserTest {
	public static void main(String[] args) {
		// One entry per line of the sample; trailing blanks are part of the sample text.
		String[] sampleLines = {
			" FSICH/*CX  ",
			"S KA   909Y22MAR PEK1630 2020HKG0X    333  ",
			"S CX   806Y23MAR HKG1150 1315ORD0S    77W   ",
			"01 YOW2+YX2  CH        13464 CNY                    INCL TAX",
			"*SYSTEM DEFAULT-CHECK OPERATING CARRIER ",
			"*INTERLINE AGREEMENT PRICING APPLIED",
			"*ACCOMPANIED VALIDATION-ALL PAX MUST BE TKTD AT SAME TIME  ",
			"*VERIFY AGE REQUIREMENTS",
			"*ATTN PRICED ON 21JAN14*1158",
			"BJS",
			"XHKG YOW2     CH25   NVB      NVA22MAR 2PC ",
			" CHI YX2      CH25   NVB      NVA22MAR 2PC ",
			"FARE  CNY   12370   ",
			"TAX    EXEMPT CN   CNY    106US CNY    988XT",
			"TOTAL CNY   13464   ",
			"22MAR14BJS KA X/HKG422.99CX CHI Q4.25 1605.68NUC2032.92END R",
			"OE6.081590  ",
			"XT CNY 31XA CNY 43XY CNY 34YC CNY 880YR ",
			"ENDOS 02 *T1",
			"*AUTO BAGGAGE INFORMATION AVAILABLE - SEE FSB ",
			"RFSONLN/1E /EFEP_23/FCC=T/"
		};

		// Join the lines with a newline between them (none after the last line).
		StringBuilder sample = new StringBuilder();
		for (int lineNo = 0; lineNo < sampleLines.length; lineNo++) {
			if (lineNo > 0) {
				sample.append("\n");
			}
			sample.append(sampleLines[lineNo]);
		}
		String txt = sample.toString();

		QTaxParser1 parser = new QTaxParser1();

		// Parse and print one result at a time, in the order TAX, Q, ROE.
		JSONObject taxJson = JSONObject.fromObject(parser.getTax(txt));
		System.out.println(taxJson.toString());

		JSONArray qTaxJson = JSONArray.fromObject(parser.getQTax(txt));
		System.out.println(qTaxJson.toString());

		JSONArray roeJson = JSONArray.fromObject(parser.getROE(txt));
		System.out.println(roeJson.toString());
	}
}


你可能感兴趣的:(正则表达式,捕获)