HTML 解析生成纯文本——使用 SAX 以及 HtmlCleaner

阅读更多
package testlucene;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.Logger;
import org.htmlcleaner.HtmlCleaner;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;

public class SAXxhtml extends DefaultHandler {
	/**
	 * Logger for this class
	 */
	private static final Logger logger = Logger.getLogger(SAXxhtml.class);

	// Text buffer whose contents endDocument() writes to the output file.
	public StringBuffer sb = new StringBuffer();
	// Set to false by startElement() when a "style" or "script" element opens.
	public boolean usable = true;
	// Path of the file that endDocument() writes to; empty by default.
	// NOTE(review): presumably assigned before parsing starts — confirm against the caller.
	private String sPath = "";

	/**
	 * Creates the handler and ensures log4j has at least a basic configuration.
	 * <p>
	 * {@code BasicConfigurator.configure()} attaches a new console appender to
	 * the root logger on every call, so invoking it unconditionally would
	 * duplicate each log line once per handler instance. It is therefore only
	 * called while the root logger has no appender attached.
	 */
	public SAXxhtml() {
		super();
		if (!Logger.getRootLogger().getAllAppenders().hasMoreElements()) {
			BasicConfigurator.configure();
		}
	}

	/**
	 * SAX callback invoked for each opening tag.
	 * <p>
	 * When the tag's qualified name is exactly {@code style} or {@code script},
	 * the {@code usable} flag is switched off; any other tag leaves the flag
	 * untouched.
	 *
	 * @param namespaceURI namespace URI of the element (not used here)
	 * @param localName    local name of the element (not used here)
	 * @param rawName      qualified name of the element; compared case-sensitively
	 * @param atts         attributes of the element (not used here)
	 * @throws SAXException declared by the SAX contract; not thrown by this body
	 */
	public void startElement(String namespaceURI, String localName,
			String rawName, Attributes atts) throws SAXException {
		final boolean nonTextElement = rawName.equals("style")
				|| rawName.equals("script");
		if (!nonTextElement) {
			return;
		}
		usable = false;
	}

	/**
	 * SAX callback invoked once parsing has finished: writes the text
	 * accumulated in {@code sb} to the file named by {@code sPath}.
	 * <p>
	 * The writer is always closed so the underlying file handle is released.
	 * Failures are reported through the class logger and are not propagated,
	 * so a write problem does not abort the parse.
	 *
	 * @throws SAXException declared by the SAX contract; not thrown by this body
	 */
	public void endDocument() throws SAXException {
		PrintWriter pw = null;
		try {
			pw = new PrintWriter(new FileOutputStream(sPath));
			pw.print(sb.toString());
			pw.flush();
			// PrintWriter swallows IOExceptions; checkError() is the only way to see them.
			if (pw.checkError()) {
				logger.error("Error while writing extracted text to: " + sPath);
			}
		} catch (FileNotFoundException e) {
			logger.error("Cannot open output file: " + sPath, e);
		} finally {
			if (pw != null) {
				pw.close();
			}
		}
	}

	public void characters(char[] ch, int start, int length) {
		String charEncontered = new String(ch, start, length);
		/*
		 * if (!charEncontered.startsWith(" 
 

 

 

具体思路是:先用 HtmlCleaner 把 HTML 转换成 XML,然后用 SAX 对该 XML 进行解析并提取纯文本。但是程序一直调试不通,有人能帮忙看一下吗?

你可能感兴趣的:(HTML,Java,log4j,XML,Apache)