NekoHtml版本:nekohtml-1.9.14
Jar文件:nekohtml.jar;xercesImpl.jar;xml-apis.jar
/**
 * Parses the HTML files found directly inside one directory and stores the extracted
 * page text as same-named {@code .txt} files in another directory.
 *
 * <p>Only regular files whose names end in {@code .htm} or {@code .html} are processed;
 * the actual extraction of each page is delegated to {@code ParsePage}.
 */
public class ParseDir {
    private File sourceDir;
    private File targetDir;
    private ParsePage pp;

    /**
     * Converts every {@code .htm}/{@code .html} file in {@code sourceDir} into a text file
     * in {@code targetDir}. The target directory is created when it does not exist.
     *
     * @param sourceDir path of the directory containing the HTML files
     * @param targetDir path of the directory receiving the {@code .txt} files
     * @param ElementId id of the element whose text is extracted (forwarded to {@code ParsePage})
     * @param addTitle  whether the page title is written first (forwarded to {@code ParsePage})
     * @param chartSet  name of the charset used to read the HTML files (forwarded to {@code ParsePage})
     * @throws Exception if the target directory cannot be created, the source directory
     *                   cannot be listed, or parsing/writing a page fails
     */
    public void execute(String sourceDir, String targetDir, String ElementId, Boolean addTitle, String chartSet) throws Exception {
        this.sourceDir = new File(sourceDir);
        this.targetDir = new File(targetDir);

        // Create the target directory if needed; the second isDirectory() check tolerates
        // another process creating it between the test and the mkdirs() call.
        if (!this.targetDir.isDirectory() && !this.targetDir.mkdirs() && !this.targetDir.isDirectory()) {
            throw new java.io.IOException("Cannot create target directory: " + this.targetDir);
        }

        // listFiles() returns null (not an empty array) when the path is not a directory
        // or cannot be read; fail with a clear message instead of an NPE in the loop.
        File[] files = this.sourceDir.listFiles();
        if (files == null) {
            throw new java.io.IOException("Cannot list source directory: " + this.sourceDir);
        }

        pp = new ParsePage();
        for (File file : files) {
            String filename = file.getName();
            // Skip sub-directories and anything that is not an htm/html page.
            if (file.isFile() && isHtmlFileName(filename)) {
                File target = new File(this.targetDir, toTextFileName(filename));
                pp.execute(file.getPath(), target.getPath(), ElementId, addTitle, chartSet);
            }
        }
    }

    /** Returns true when the name carries a {@code .htm} or {@code .html} extension. */
    private static boolean isHtmlFileName(String filename) {
        return filename.endsWith(".htm") || filename.endsWith(".html");
    }

    /**
     * Replaces the extension of an HTML file name with {@code .txt}, keeping the whole
     * base name (e.g. {@code news.html} becomes {@code news.txt}).
     */
    private static String toTextFileName(String filename) {
        // substring's end index is exclusive, so the position of the last dot is exactly
        // the end of the base name.
        return filename.substring(0, filename.lastIndexOf('.')) + ".txt";
    }

    /** Sample invocation with hard-coded paths and settings. */
    public static void main(String[] args) throws Exception {
        ParseDir p = new ParseDir();
        p.execute("E:/", "E:/5", "news_content", true, "utf-8");
    }
}
==============
/**
 * Parses a single HTML document with NekoHTML and writes the text found under the
 * element with a given id to a text file.
 */
public class ParsePage {
    // One parser per instance: a static field would be silently replaced by every
    // newly constructed ParsePage.
    private final DOMParser parser;

    /**
     * Creates the NekoHTML parser and sets its default encoding property to utf-8.
     *
     * @throws Exception if the parser rejects the property
     */
    public ParsePage() throws Exception {
        parser = new DOMParser();
        parser.setProperty(
                "http://cyberneko.org/html/properties/default-encoding",
                "utf-8");
    }

    /**
     * Parses {@code sourceFilename} and writes the extracted text to {@code targetFilename}.
     *
     * <p>When {@code addTitle} is true and the document has a {@code title} element, its
     * text is written first, followed by CRLF. The text of the element with id
     * {@code ElementId} follows; if no such element exists nothing is written for it.
     * The output file is written with the platform default charset (FileWriter).
     *
     * @param sourceFilename path of the HTML file to read
     * @param targetFilename path of the text file to write
     * @param ElementId      id of the element whose text is extracted
     * @param addTitle       whether to prepend the page title; {@code null} is treated as false
     * @param chartSet       name of the charset used to decode the HTML file
     * @throws Exception if reading, parsing or writing fails
     */
    public void execute(String sourceFilename, String targetFilename, String ElementId, Boolean addTitle, String chartSet) throws Exception {
        Document doc;
        FileInputStream in = new FileInputStream(sourceFilename);
        try {
            // Declare the stream encoding explicitly instead of letting the parser guess.
            Reader r = new InputStreamReader(in, chartSet);
            parser.parse(new InputSource(r));
            doc = parser.getDocument();
        } finally {
            // Closing the underlying stream releases the file even if decoding or parsing failed.
            in.close();
        }

        // Build the complete output before touching the target file so that a failure
        // during extraction does not leave an empty file behind.
        StringBuilder content = new StringBuilder();
        if (Boolean.TRUE.equals(addTitle)) {
            Node title = doc.getElementsByTagName("title").item(0);
            if (title != null) { // pages without a <title> simply get no title line
                content.append(title.getTextContent()).append("\r\n");
            }
        }
        appendText(doc.getElementById(ElementId), content);

        PrintWriter pw = new PrintWriter(new FileWriter(targetFilename));
        try {
            pw.append(content);
            // PrintWriter swallows IOExceptions; checkError() flushes and reports them.
            if (pw.checkError()) {
                throw new java.io.IOException("Failed to write " + targetFilename);
            }
        } finally {
            pw.close();
        }
    }

    /**
     * Returns the text below {@code root}: every text node is trimmed and terminated with
     * CRLF, the contents of STYLE and SCRIPT elements are dropped, and all other node
     * types contribute nothing.
     *
     * @param root node to extract text from; {@code null} yields an empty string
     * @return the concatenated text, never {@code null}
     */
    public static String TextExtractor(Node root) {
        StringBuilder text = new StringBuilder();
        appendText(root, text);
        return text.toString();
    }

    /** Appends the text below {@code node} to {@code text} using one shared buffer. */
    private static void appendText(Node node, StringBuilder text) {
        if (node == null) {
            return;
        }
        short type = node.getNodeType();
        if (type == Node.TEXT_NODE) {
            // Text nodes are emitted one per line.
            text.append(node.getTextContent().trim()).append("\r\n");
        } else if (type == Node.ELEMENT_NODE) {
            String tag = ((Element) node).getTagName();
            // Discard style sheets and scripts regardless of tag-name case.
            if (tag.equalsIgnoreCase("STYLE") || tag.equalsIgnoreCase("SCRIPT")) {
                return;
            }
            NodeList children = node.getChildNodes();
            for (int i = 0; i < children.getLength(); i++) {
                appendText(children.item(i), text);
            }
        }
        // Any other node type (comments, processing instructions, ...) is ignored.
    }
}