从第一次使用 htmlparser 到现在已经有 4 个月了。现在想把用法整理一下，留作备忘。
package epson;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.BodyTag;
import org.htmlparser.tags.HeadTag;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.TableTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;
import org.htmlparser.visitors.TextExtractingVisitor;

/**
 * Convenience wrapper around the HTML Parser library for pulling common
 * pieces out of one HTML document: meta tags, title, head, body, images and
 * the tables inside one specific {@code div}.
 *
 * <p>Usage: construct with the HTML source (or a file containing it), call
 * {@link #init()} once, then call any of the query methods. Every query
 * rewinds the shared parser first, so queries can be made in any order and
 * any number of times on the same instance.
 *
 * <p>The class keeps one mutable {@link Parser}; nothing here synchronizes
 * access to it.
 */
public class HtmlAnalysis {

    public static final String META_KEYWORDS = "keywords";
    public static final String META_AUTHOR = "author";
    public static final String META_DESCRIPTION = "description";
    public static final String META_HTTP_EQUIV = "http-equiv";

    /** Scratch file that {@link #getImg()} deletes and recreates on every call. */
    private static final String IMG_SCRATCH_FILE = "e:\\test\\jsp\\jsp\\test1.htm";

    private String metaDataString;
    private String title;
    private String charset;
    private String contentType;
    private String content;
    private String link;
    private String localPath;
    private Parser parser = null;
    private String htmlsource = null;

    /** Failure hit while loading the source file in the constructor; rethrown by {@link #init()}. */
    private IOException loadFailure = null;

    /**
     * Creates an analyzer for the given source string.
     *
     * @param htmlsource the string later handed to {@code new Parser(String)} by
     *                   {@link #init()}; NOTE(review): the library decides whether
     *                   this is markup or a URL/file location - confirm against the
     *                   Parser documentation
     */
    public HtmlAnalysis(String htmlsource) {
        this.htmlsource = htmlsource;
    }

    /**
     * Creates an analyzer for the HTML stored in a local file.
     *
     * <p>The constructor itself never throws. If the file cannot be read, the
     * failure is remembered and {@link #init()} throws it, instead of the
     * problem being silently dropped.
     *
     * @param htmlsource file whose text is used as the HTML source
     */
    public HtmlAnalysis(File htmlsource) {
        if (htmlsource == null) {
            this.loadFailure = new FileNotFoundException("No HTML source file given");
            return;
        }
        try {
            this.htmlsource = readFile(htmlsource);
        } catch (IOException e) {
            this.loadFailure = e;
        }
    }

    /**
     * Creates the underlying parser. Must be called before any query method.
     *
     * @throws Exception the I/O failure from the file constructor, or whatever
     *                   {@code new Parser(String)} throws for the source
     */
    public void init() throws Exception {
        if (loadFailure != null) {
            throw loadFailure;
        }
        parser = new Parser(this.htmlsource);
    }

    /**
     * Returns the {@code content} of the first meta tag named "keywords".
     *
     * @return the content attribute, {@code ""} when no such tag exists, or
     *         {@code null} when the tag has no content attribute
     */
    public String getMetaKeywords() {
        return getMetaInfo(META_KEYWORDS);
    }

    /**
     * Returns the document title.
     *
     * @return the text of the first title tag, or {@code ""} when there is none
     *         or parsing fails
     */
    public String getTitle() {
        String result = "";
        try {
            NodeList titles = parseFromStart(new NodeClassFilter(TitleTag.class));
            if (titles.size() > 0) {
                result = ((TitleTag) titles.elementAt(0)).getTitle();
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return result;
    }

    /**
     * Returns the inner HTML of the first body tag.
     *
     * @return the children HTML of the body, or {@code ""} when there is no body
     *         tag or parsing fails
     */
    public String getBody() {
        String body = "";
        try {
            BodyTag bodyTag = firstBodyTag();
            if (bodyTag != null) {
                body = bodyTag.getChildrenHTML();
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return body;
    }

    /**
     * Returns the {@code onload} attribute of the first body tag.
     *
     * @return the attribute value, {@code ""} when there is no body tag or
     *         parsing fails, or {@code null} when the body has no onload attribute
     */
    public String getBodyOnload() {
        String onload = "";
        try {
            BodyTag bodyTag = firstBodyTag();
            if (bodyTag != null) {
                onload = bodyTag.getAttribute("onload");
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return onload;
    }

    /**
     * Returns the HTML of everything inside the first head tag except its meta
     * and title children.
     *
     * <p>The parsed tree is left untouched: the result is assembled from the
     * children that are kept, rather than by removing nodes while iterating
     * (which skipped siblings because indexes shift after each removal).
     *
     * @return the remaining head HTML, or {@code ""} when there is no head tag or
     *         parsing fails
     */
    public String getHeadInfo() {
        StringBuilder head = new StringBuilder();
        try {
            NodeList heads = parseFromStart(new NodeClassFilter(HeadTag.class));
            if (heads.size() > 0) {
                HeadTag headTag = (HeadTag) heads.elementAt(0);
                SimpleNodeIterator children = headTag.children();
                while (children.hasMoreNodes()) {
                    Node child = children.nextNode();
                    boolean skipped = (child instanceof MetaTag) || (child instanceof TitleTag);
                    if (!skipped) {
                        head.append(child.toHtml());
                    }
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return head.toString();
    }

    /**
     * Looks up meta information by kind.
     *
     * <ul>
     * <li>{@link #META_KEYWORDS}, {@link #META_AUTHOR} or
     *     {@link #META_DESCRIPTION} (case-insensitive): content of the first meta
     *     tag whose {@code name} matches.</li>
     * <li>{@link #META_HTTP_EQUIV} (exact match): content of the first meta tag
     *     that carries an {@code http-equiv} attribute. The previous code
     *     compared the attribute value with the literal "http-equiv", so this
     *     branch practically never matched.</li>
     * <li>anything else: every other named meta tag, rendered as
     *     {@code <name>content</name>} and concatenated.</li>
     * </ul>
     *
     * @param keytype which kind of meta information to return
     * @return the matching content; {@code ""} when nothing matches or parsing
     *         fails; may be {@code null} when the matched tag has no content
     *         attribute
     */
    public String getMetaInfo(String keytype) {
        String metaInfo = "";
        try {
            NodeList metaTags = parseFromStart(new NodeClassFilter(MetaTag.class));
            if (isStandardMetaName(keytype)) {
                metaInfo = firstMetaContent(metaTags, "name", keytype);
            } else if (META_HTTP_EQUIV.equals(keytype)) {
                metaInfo = firstMetaContent(metaTags, META_HTTP_EQUIV, null);
            } else {
                metaInfo = renderOtherMeta(metaTags);
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return metaInfo;
    }

    /**
     * Reads a text file completely, using the platform default charset, and
     * normalizes every line ending to CRLF.
     *
     * @param path the file to read
     * @return the file content, each line followed by {@code "\r\n"}
     * @throws IOException if the file does not exist or cannot be read
     */
    public String getContentByLocalFile(File path) throws IOException {
        return readFile(path);
    }

    /**
     * Not implemented.
     *
     * @param url ignored
     * @return always {@code null}
     */
    public String getContentByUrl(String url) {
        return null;
    }

    /** Not implemented; intentionally does nothing. */
    public void getmetaDataByVistor() {
    }

    /**
     * Extracts the plain text of the resource at the given location and stores
     * it in the {@code content} field.
     *
     * <p>Previously the freshly created parser was immediately replaced by one
     * built from an empty string, so the given location was never read.
     *
     * @param Url location handed to {@code new Parser(String)}
     * @return the extracted text; when extraction fails, whatever
     *         {@code content} held before the call
     */
    public String getURLContent(String Url) {
        try {
            Parser urlParser = new Parser(Url);
            TextExtractingVisitor visitor = new TextExtractingVisitor();
            urlParser.visitAllNodesWith(visitor);
            content = visitor.getExtractedText();
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return content;
    }

    /**
     * Finds the {@code div} elements whose {@code id} attribute is "Cont_13".
     * The parser encoding is set to GB2312 before matching.
     *
     * @return the matching nodes, or {@code null} when anything goes wrong
     *         (including {@link #init()} not having been called)
     */
    public NodeList getDiv() {
        NodeList matches = null;
        try {
            Parser current = requireParser();
            current.reset();
            current.setEncoding("GB2312");
            NodeFilter divWithId = new AndFilter(new NodeFilter[] {
                new TagNameFilter("div"),
                new HasAttributeFilter("id", "Cont_13")
            });
            matches = current.extractAllNodesThatMatch(divWithId);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return matches;
    }

    /**
     * Collects the tables inside the {@link #getDiv()} result and prints their
     * cells to standard output: the first cell of every row as
     * {@code <title>..</title>}, every other cell as {@code <id>..</id>}.
     *
     * @return the table nodes found; an empty list when the div is missing
     * @throws ParserException if the div HTML cannot be parsed
     */
    public NodeList getTable() throws ParserException {
        NodeList divs = getDiv();
        String divHtml = (divs == null) ? "" : divs.toHtml();

        NodeList tables = new NodeList();
        // Only build a parser for real markup; an empty string is not HTML.
        if (divHtml.trim().length() > 0) {
            Parser divParser = new Parser(divHtml);
            tables = divParser.extractAllNodesThatMatch(new TagNameFilter("table"));
        }

        StringBuilder htmlresult = new StringBuilder();
        for (int i = 0; i < tables.size(); i++) {
            Node node = tables.elementAt(i);
            if (!(node instanceof TableTag)) {
                continue;
            }
            TableRow[] rows = ((TableTag) node).getRows();
            for (int j = 0; j < rows.length; j++) {
                TableColumn[] cells = rows[j].getColumns();
                for (int k = 0; k < cells.length; k++) {
                    String text = cells[k].toPlainTextString().trim().replace("\t", "");
                    if (k == 0) {
                        htmlresult.append("<title>").append(text).append("</title>");
                    } else {
                        htmlresult.append("<id>").append(text).append("</id>");
                    }
                }
            }
        }
        System.out.println(htmlresult);
        return tables;
    }

    /**
     * Prints the plain text of every table cell in the document, one cell per
     * line, each prefixed with {@code <td>}.
     *
     * <p>Handy for trying the table classes on a small document, for example
     * a body holding two tables of three rows by three cells.
     */
    public void testTable() {
        try {
            NodeList tables = parseFromStart(new NodeClassFilter(TableTag.class));
            for (int i = 0; i < tables.size(); i++) {
                Node node = tables.elementAt(i);
                if (!(node instanceof TableTag)) {
                    continue;
                }
                TableRow[] rows = ((TableTag) node).getRows();
                for (int j = 0; j < rows.length; j++) {
                    TableColumn[] cells = rows[j].getColumns();
                    for (int k = 0; k < cells.length; k++) {
                        System.out.println("<td>" + cells[k].toPlainTextString());
                    }
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints the URL of every image in the document to standard output.
     *
     * <p>NOTE(review): this method also deletes and recreates a scratch file at
     * a hard-coded Windows path. That is left over from a disabled feature that
     * rewrote relative image URLs to absolute paths and wrote the tags to that
     * file. The side effect is kept for compatibility; consider removing it.
     *
     * @return always {@code ""}, because the path rewriting that would have
     *         filled the result is disabled
     */
    public String getImg() {
        String imgRealPath = "";
        resetScratchFile(new File(IMG_SCRATCH_FILE));
        try {
            NodeList images = parseFromStart(new NodeClassFilter(ImageTag.class));
            for (int i = 0; i < images.size(); i++) {
                ImageTag image = (ImageTag) images.elementAt(i);
                System.out.println(image.getImageURL());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return imgRealPath;
    }

    /**
     * Demo entry point: loads {@code f:\test.html} and prints its table cells.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        HtmlAnalysis htmlAnalysis = new HtmlAnalysis(new File("f:\\test.html"));
        try {
            htmlAnalysis.init();
            htmlAnalysis.getTable();
        } catch (Exception e) {
            // Report the failure instead of exiting silently.
            e.printStackTrace();
        }
    }

    /**
     * Prints the tag name and {@code class} attribute of a tag that has one.
     *
     * @param tag the tag to inspect
     */
    public static void visitTag(Tag tag) {
        String cssClass = tag.getAttribute("class");
        if (cssClass != null) {
            System.out.println(" " + tag.getTagName() + cssClass);
        }
    }

    public String getCharset() {
        return charset;
    }

    public void setCharset(String charset) {
        this.charset = charset;
    }

    public String getContentType() {
        return contentType;
    }

    public void setContentType(String contentType) {
        this.contentType = contentType;
    }

    public String getMetaDataString() {
        return metaDataString;
    }

    public void setMetaDataString(String metaDataString) {
        this.metaDataString = metaDataString;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    /** Returns the shared parser, failing clearly when {@link #init()} was not called. */
    private Parser requireParser() {
        if (parser == null) {
            throw new IllegalStateException("init() must be called before querying the document");
        }
        return parser;
    }

    /** Rewinds the shared parser and returns all nodes accepted by the filter. */
    private NodeList parseFromStart(NodeFilter filter) throws ParserException {
        Parser current = requireParser();
        // Without the reset only the first query on an instance sees the document.
        current.reset();
        return current.parse(filter);
    }

    /** Returns the first body tag of the document, or null when there is none. */
    private BodyTag firstBodyTag() throws ParserException {
        NodeList bodies = parseFromStart(new NodeClassFilter(BodyTag.class));
        return (bodies.size() > 0) ? (BodyTag) bodies.elementAt(0) : null;
    }

    /** Tells whether the key is one of the three well-known meta names. */
    private static boolean isStandardMetaName(String key) {
        return META_KEYWORDS.equalsIgnoreCase(key)
                || META_AUTHOR.equalsIgnoreCase(key)
                || META_DESCRIPTION.equalsIgnoreCase(key);
    }

    /**
     * Returns the content of the first meta tag carrying the given attribute;
     * when {@code expected} is non-null the attribute value must also match it
     * case-insensitively. Returns "" when no tag qualifies.
     */
    private static String firstMetaContent(NodeList metaTags, String attribute, String expected) {
        for (int i = 0; i < metaTags.size(); i++) {
            MetaTag meta = (MetaTag) metaTags.elementAt(i);
            String value = meta.getAttribute(attribute);
            if (value != null && (expected == null || value.equalsIgnoreCase(expected))) {
                return meta.getAttribute("content");
            }
        }
        return "";
    }

    /** Renders every named meta tag other than the three well-known ones as name-tagged text. */
    private static String renderOtherMeta(NodeList metaTags) {
        StringBuilder rendered = new StringBuilder();
        for (int i = 0; i < metaTags.size(); i++) {
            MetaTag meta = (MetaTag) metaTags.elementAt(i);
            String name = meta.getAttribute("name");
            if (name == null || isStandardMetaName(name)) {
                continue;
            }
            rendered.append("<").append(name).append(">")
                    .append(meta.getAttribute("content"))
                    .append("</").append(name).append(">");
        }
        return rendered.toString();
    }

    /** Reads a whole text file, terminating each line with CRLF; always closes the reader. */
    private static String readFile(File path) throws IOException {
        StringBuilder text = new StringBuilder();
        BufferedReader reader = new BufferedReader(new FileReader(path));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                text.append(line).append("\r\n");
            }
        } finally {
            reader.close();
        }
        return text.toString();
    }

    /** Deletes the file if present and creates it again, reporting I/O problems on stderr. */
    private static void resetScratchFile(File file) {
        if (file.exists()) {
            file.delete();
        }
        try {
            file.createNewFile();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}