Extracting text with special font styles (bold, italic, links) using HTMLParser

package extractor;

import java.io.File;
import java.io.IOException;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

import sap.FileInputReader;

/**
 * Prints the text enclosed by selected HTML elements (links, italic and bold
 * runs), using the HTMLParser library to walk the document.
 */
public class HTMLElementExtractor {
	/**
	 * Parses {@code content} and prints to standard output, one line per
	 * match, the text associated with every element whose name is
	 * {@code tag}.
	 * <p>
	 * Only the tag names {@code "a"}, {@code "i"} and {@code "b"} produce
	 * output; any other name is parsed and filtered but nothing is printed.
	 * Despite the method name, it is not limited to bold text.
	 * 
	 * @param content
	 *            the HTML handed to the {@link Parser}
	 * @param tag
	 *            the element name to look for, e.g. {@code "b"}; compared
	 *            without regard to case
	 * @throws ParserException
	 *             if the parser fails while reading {@code content}
	 */
	public void extractBoldFont(String content, String tag)
			throws ParserException {
		// A regular expression such as ">[A-Za-z ]+</i>" is deliberately not
		// used here: it misses nested markup, e.g. the text in
		// <i><span>blahblah...</span></i> would be ignored.

		// Compare tag names by value. The previous reference comparison
		// (tag == "a") only succeeded for interned literals, so a tag name
		// built at runtime silently matched nothing.
		boolean isAnchor = "a".equalsIgnoreCase(tag);
		// "em" is intentionally left out: judging from the content, <em> is
		// not what was intended.
		boolean isFontStyle = "i".equalsIgnoreCase(tag)
				|| "b".equalsIgnoreCase(tag);

		Parser parser = new Parser(content);
		NodeFilter filter = new TagNameFilter(tag);
		NodeList list = parser.extractAllNodesThatMatch(filter);
		NodeIterator iterator = list.elements();
		while (iterator.hasMoreNodes()) {
			TagNode node = (TagNode) iterator.nextNode();
			if (isAnchor && node.getFirstChild() != null) {
				// <a href="../../7d/a109d5efcc4644a9f2da2ab27e50dd/content.htm"
				// title="Go to specified document">Search Task Panel for BI
				// Data Services</a>
				System.out.println(node.getFirstChild().toPlainTextString());
			} else if (isFontStyle) {
				// For <i>/<b> the text is read from the node that follows the
				// opening tag rather than from the tag's own children.
				// NOTE(review): presumably the parser does not nest children
				// under these tags - confirm against the HTMLParser version
				// in use.
				Node nd = node.getNextSibling();
				if (nd instanceof TextNode) {
					// <i>blah</i>
					System.out.println(nd.toPlainTextString());
				} else if (nd instanceof TagNode
						&& nd.getFirstChild() != null) {
					// <i><span class="SAPXDPNavigationPath "
					// title="Navigation path">blah</span></i>
					// The null check skips a following tag that has no
					// children (e.g. the closing tag of an empty <b></b>),
					// which previously caused a NullPointerException.
					System.out.println(nd.getFirstChild().toPlainTextString());
				}
			}
		}

	}

	/**
	 * Demo entry point: loads {@code test/input/content.htm} and prints the
	 * text of its {@code <b>} elements. I/O and parser failures are reported
	 * as stack traces on standard error.
	 * 
	 * @param args
	 *            unused
	 */
	public static void main(String[] args) {
		HTMLElementExtractor extractor = new HTMLElementExtractor();
		String content;
		try {
			content = new FileInputReader(new File("test/input/content.htm"))
					.getStringContent();
			extractor.extractBoldFont(content, "b");
		} catch (IOException e) {
			e.printStackTrace();
		} catch (ParserException e) {
			e.printStackTrace();
		}

	}

}
 

你可能感兴趣的:(HtmlParser)