使用 HtmlParser 解析 HTML 文件

    第一次使用htmlparser到现在已经有4个月了。现在想整理一下,备忘。

package epson;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.BodyTag;
import org.htmlparser.tags.HeadTag;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.TableTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;
import org.htmlparser.visitors.TextExtractingVisitor;


/**
 * Small helper around the HTMLParser library ({@code org.htmlparser}) that
 * pulls individual pieces (meta tags, title, head, body, tables, images) out
 * of one HTML document.
 *
 * <p>Typical use: construct the object from a string or a local file, call
 * {@link #init()} once, then call any of the query methods. Every query
 * rewinds the shared parser first, so the queries can be called in any order
 * and any number of times (the original sample code had to call
 * {@code parser.reset()} by hand between two queries).
 *
 * <p>Errors inside the query methods are reported with
 * {@code printStackTrace()} and the method then returns its "nothing found"
 * value; this is the error-handling style used throughout this class.
 */
public class HtmlAnalysis {

    public static final String META_KEYWORDS = "keywords";
    public static final String META_AUTHOR = "author";
    public static final String META_DESCRIPTION = "description";
    public static final String META_HTTP_EQUIV = "http-equiv";

    /** Value of the {@code id} attribute that {@link #getDiv()} looks for. */
    private static final String DEFAULT_DIV_ID = "Cont_13";

    /** Encoding that {@link #getDiv()} forces on the parser. */
    private static final String DEFAULT_ENCODING = "GB2312";

    // Plain bean properties; nothing in this class computes them.
    private String metaDataString;
    private String title;
    private String charset;
    private String contentType;
    private String content;

    // Not read or written anywhere in this class; kept as they were.
    private String link;
    private String localPath;

    /** Shared parser, created by {@link #init()}. */
    private Parser parser = null;

    /** The string handed to the parser; stays {@code null} if loading a file failed. */
    private String htmlsource = null;

    /**
     * Creates an analyser for the given string.
     *
     * @param htmlsource the string passed to {@code new Parser(String)} by
     *        {@link #init()}; how that string is interpreted is decided by
     *        the parser library
     */
    public HtmlAnalysis(String htmlsource) {
        this.htmlsource = htmlsource;
    }

    /**
     * Creates an analyser from the text of a local file.
     *
     * <p>Loading is best effort: the constructor never throws. If the file
     * cannot be read the problem is printed and {@link #init()} will fail
     * later because there is no source to parse.
     *
     * @param htmlsource the file whose content should be analysed
     */
    public HtmlAnalysis(File htmlsource) {
        try {
            this.htmlsource = getContentByLocalFile(htmlsource);
        } catch (Exception e) {
            // Broad catch kept on purpose so the constructor stays non-throwing
            // (it used to swallow everything silently); at least report it.
            e.printStackTrace();
        }
    }

    /**
     * Creates the shared parser. Must be called before any query method.
     *
     * @throws Exception if no source is available or the parser cannot be
     *         created from it
     */
    public void init() throws Exception {
        if (htmlsource == null) {
            throw new IllegalStateException(
                    "No HTML source available; the input could not be loaded");
        }
        parser = new Parser(htmlsource);
    }

    /**
     * Returns the {@code content} attribute of the first meta tag whose
     * {@code name} is "keywords" (case-insensitive).
     *
     * @return the keywords, or an empty string when there is no such tag
     */
    public String getMetaKeywords() {
        return getMetaInfo(META_KEYWORDS);
    }

    /**
     * Returns the text of the first title tag.
     *
     * @return the title, or an empty string when none is found or parsing fails
     */
    public String getTitle() {
        try {
            Node first = firstMatch(new NodeClassFilter(TitleTag.class));
            if (first != null) {
                return ((TitleTag) first).getTitle();
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return "";
    }

    /**
     * Returns the HTML of everything inside the first body tag.
     *
     * @return the children HTML of the body, or an empty string when there is
     *         no body tag or parsing fails
     */
    public String getBody() {
        try {
            Node first = firstMatch(new NodeClassFilter(BodyTag.class));
            if (first != null) {
                return ((BodyTag) first).getChildrenHTML();
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return "";
    }

    /**
     * Returns the {@code onload} attribute of the first body tag.
     *
     * @return the attribute value as returned by the tag (whatever
     *         {@code getAttribute} yields when the attribute is absent), or
     *         an empty string when there is no body tag or parsing fails
     */
    public String getBodyOnload() {
        try {
            Node first = firstMatch(new NodeClassFilter(BodyTag.class));
            if (first != null) {
                return ((BodyTag) first).getAttribute("onload");
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return "";
    }

    /**
     * Returns the HTML inside the first head tag with all direct meta and
     * title children removed.
     *
     * @return the remaining head content, or an empty string when there is no
     *         head tag or parsing fails
     */
    public String getHeadInfo() {
        try {
            Node first = firstMatch(new NodeClassFilter(HeadTag.class));
            if (first == null) {
                // Previously this case ended in a NullPointerException.
                return "";
            }
            HeadTag headnode = (HeadTag) first;
            NodeList children = headnode.getChildren();
            if (children != null) {
                // Walk backwards: removing by index while walking forwards
                // shifts the remaining children and skips the one that moves
                // into the freed slot.
                for (int i = children.size() - 1; i >= 0; i--) {
                    Node child = children.elementAt(i);
                    if (child instanceof MetaTag || child instanceof TitleTag) {
                        headnode.removeChild(i);
                    }
                }
            }
            return headnode.getChildrenHTML();
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return "";
    }

    /**
     * Looks up meta tag information.
     *
     * <ul>
     * <li>{@code keytype} equal (ignoring case) to {@link #META_KEYWORDS},
     *     {@link #META_AUTHOR} or {@link #META_DESCRIPTION}: the
     *     {@code content} of the first meta tag with that {@code name}.</li>
     * <li>{@code keytype} equal to {@link #META_HTTP_EQUIV}: the
     *     {@code content} of the first meta tag that carries an
     *     {@code http-equiv} attribute.</li>
     * <li>anything else: every meta tag whose {@code name} is none of the
     *     three standard names, concatenated as
     *     {@code <name>content</name>}.</li>
     * </ul>
     *
     * @param keytype which kind of meta information to return
     * @return the information found, or an empty string when nothing matches
     *         or parsing fails
     */
    public String getMetaInfo(String keytype) {
        String metaInfo = "";
        try {
            NodeList metas = parseFromStart(new NodeClassFilter(MetaTag.class));

            if (isStandardMetaName(keytype)) {
                for (int i = 0; i < metas.size(); i++) {
                    MetaTag meta = (MetaTag) metas.elementAt(i);
                    String name = meta.getAttribute("name");
                    if (name != null && name.equalsIgnoreCase(keytype)) {
                        metaInfo = meta.getAttribute("content");
                        break;
                    }
                }
            } else if (META_HTTP_EQUIV.equals(keytype)) {
                for (int i = 0; i < metas.size(); i++) {
                    MetaTag meta = (MetaTag) metas.elementAt(i);
                    // The old test compared the attribute VALUE with the
                    // literal "http-equiv", which real pages never contain.
                    // NOTE(review): taking the first tag that has the
                    // attribute is the presumed intent - confirm.
                    if (meta.getAttribute("http-equiv") != null) {
                        metaInfo = meta.getAttribute("content");
                        break;
                    }
                }
            } else {
                StringBuilder others = new StringBuilder();
                for (int i = 0; i < metas.size(); i++) {
                    MetaTag meta = (MetaTag) metas.elementAt(i);
                    String name = meta.getAttribute("name");
                    if (name != null && !isStandardMetaName(name)) {
                        others.append("<").append(name).append(">")
                                .append(meta.getAttribute("content"))
                                .append("</").append(name).append(">");
                    }
                }
                metaInfo = others.toString();
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return metaInfo;
    }

    /**
     * Reads a whole text file using the platform default charset.
     *
     * @param path the file to read
     * @return the file content with every line terminated by {@code "\r\n"}
     * @throws IOException if the file does not exist or cannot be read
     */
    public String getContentByLocalFile(File path) throws IOException {
        StringBuilder text = new StringBuilder();
        // A missing file now surfaces as FileNotFoundException (an
        // IOException) instead of a NullPointerException a few lines later.
        BufferedReader reader = new BufferedReader(new FileReader(path));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                text.append(line);
                text.append("\r\n");
            }
        } finally {
            // Close even when readLine() fails so the file handle is not leaked.
            reader.close();
        }
        return text.toString();
    }

    /**
     * Not implemented.
     *
     * @param url ignored
     * @return always {@code null}
     */
    public String getContentByUrl(String url) {
        return null;
    }

    /** Not implemented; does nothing. */
    public void getmetaDataByVistor() {
    }

    /**
     * Extracts the text of the resource named by {@code Url} and stores it in
     * the {@code content} property.
     *
     * @param Url location handed to {@code new Parser(String)}
     * @return the extracted text; if parsing fails, the previous value of the
     *         {@code content} property (possibly {@code null})
     */
    public String getURLContent(String Url) {
        try {
            // A second "new Parser(\"\")" used to replace this parser before
            // it was used, so the URL was never actually read.
            Parser urlParser = new Parser(Url);
            TextExtractingVisitor visitor = new TextExtractingVisitor();
            urlParser.visitAllNodesWith(visitor);
            content = visitor.getExtractedText();
        } catch (ParserException e1) {
            e1.printStackTrace();
        }
        return content;
    }

    /**
     * Finds all {@code div} tags whose {@code id} is "Cont_13".
     *
     * @return the matching nodes, or {@code null} if the search failed
     */
    public NodeList getDiv() {
        return getDiv(DEFAULT_DIV_ID);
    }

    /**
     * Finds all {@code div} tags with the given {@code id}. The parser
     * encoding is set to GB2312 before searching.
     *
     * @param divId the required value of the {@code id} attribute
     * @return the matching nodes, or {@code null} if the search failed
     */
    public NodeList getDiv(String divId) {
        NodeList nodelist = null;
        try {
            requireParser();
            parser.reset();
            parser.setEncoding(DEFAULT_ENCODING);
            // Both conditions must hold: tag name is "div" AND id matches.
            NodeFilter[] predicates = new NodeFilter[] {
                new TagNameFilter("div"),
                new HasAttributeFilter("id", divId)
            };
            nodelist = parser.extractAllNodesThatMatch(new AndFilter(predicates));
        } catch (Exception e) {
            e.printStackTrace();
        }
        return nodelist;
    }

    /**
     * Collects the tables found inside the div returned by {@link #getDiv()}
     * and prints their cells to standard output: the first cell of every row
     * as {@code <title>text</title>}, every other cell as
     * {@code <id>text</id>} (cell text is trimmed and stripped of tabs).
     *
     * @return the table nodes; an empty list when the div is missing
     * @throws ParserException if the div markup cannot be re-parsed
     */
    public NodeList getTable() throws ParserException {
        NodeList divs = getDiv();
        String divHtml = (divs == null) ? "" : divs.toHtml();

        NodeList tables = new NodeList();
        if (divHtml.trim().length() > 0) {
            Parser divParser = new Parser(divHtml);
            tables = divParser.extractAllNodesThatMatch(new TagNameFilter("table"));
        }

        StringBuilder htmlresult = new StringBuilder();
        // "<" not "<=": the last valid index is size() - 1.
        for (int i = 0; i < tables.size(); i++) {
            Node node = tables.elementAt(i);
            if (!(node instanceof TableTag)) {
                continue;
            }
            TableRow[] rows = ((TableTag) node).getRows();
            for (int j = 0; j < rows.length; j++) {
                TableColumn[] td = rows[j].getColumns();
                for (int k = 0; k < td.length; k++) {
                    String result = td[k].toPlainTextString().trim().replace("\t", "");
                    if (k == 0) {
                        htmlresult.append("<title>").append(result).append("</title>");
                    } else {
                        htmlresult.append("<id>").append(result).append("</id>");
                    }
                }
            }
        }
        System.out.println(htmlresult.toString());
        return tables;
    }

    /**
     * Prints the plain text of every cell of every table in the document,
     * one line per cell, each prefixed with {@code <td>}.
     */
    public void testTable() {
        try {
            NodeList nodeList = parseFromStart(new NodeClassFilter(TableTag.class));
            // "<" not "<=": the last valid index is size() - 1.
            for (int i = 0; i < nodeList.size(); i++) {
                Node node = nodeList.elementAt(i);
                if (!(node instanceof TableTag)) {
                    continue;
                }
                TableRow[] rows = ((TableTag) node).getRows();
                for (int j = 0; j < rows.length; j++) {
                    TableColumn[] td = rows[j].getColumns();
                    for (int k = 0; k < td.length; k++) {
                        System.out.println("<td>" + td[k].toPlainTextString());
                    }
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints the URL of every image tag in the document to standard output.
     *
     * <p>The code that rewrote relative image paths and wrote the result to a
     * file was already disabled; the only thing left of it was deleting and
     * re-creating an empty file at a hard-coded location, which has been
     * removed because nothing was ever written to that file.
     *
     * @return always an empty string (the resolved path was never computed)
     */
    public String getImg() {
        try {
            NodeList images = parseFromStart(new NodeClassFilter(ImageTag.class));
            for (int i = 0; i < images.size(); i++) {
                ImageTag imgnode = (ImageTag) images.elementAt(i);
                System.out.println(imgnode.getImageURL());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return "";
    }

    /**
     * Demo entry point: loads {@code f:\test.html} and prints its tables.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        HtmlAnalysis htmlAnalysis = new HtmlAnalysis(new File("f:\\test.html"));
        try {
            htmlAnalysis.init();
            htmlAnalysis.getTable();
        } catch (Exception e) {
            // Was silently ignored, which made every failure look like "no output".
            e.printStackTrace();
        }
    }

    /**
     * Prints the tag name followed by its {@code class} attribute, if the tag
     * has one.
     *
     * @param tag the tag to inspect
     */
    public static void visitTag(Tag tag) {
        if (tag.getAttribute("class") != null) {
            System.out.println(" " + tag.getTagName() +
                tag.getAttribute("class"));
        }
    }

    public String getCharset() {
        return charset;
    }

    public void setCharset(String charset) {
        this.charset = charset;
    }

    public String getContentType() {
        return contentType;
    }

    public void setContentType(String contentType) {
        this.contentType = contentType;
    }

    public String getMetaDataString() {
        return metaDataString;
    }

    public void setMetaDataString(String metaDataString) {
        this.metaDataString = metaDataString;
    }

    /**
     * Stores a title in the bean property. Note that {@link #getTitle()} does
     * not read this property; it parses the document instead.
     *
     * @param title the value to store
     */
    public void setTitle(String title) {
        this.title = title;
    }

    public String getContent() {
        return content;
    }

    public void setContent(String content) {
        this.content = content;
    }

    /** Fails fast with a clear message when {@link #init()} was not called. */
    private void requireParser() {
        if (parser == null) {
            throw new IllegalStateException(
                    "init() must be called before querying the document");
        }
    }

    /** Rewinds the shared parser and returns every node accepted by the filter. */
    private NodeList parseFromStart(NodeFilter filter) throws ParserException {
        requireParser();
        parser.reset();
        return parser.parse(filter);
    }

    /** Returns the first node accepted by the filter, or null when there is none. */
    private Node firstMatch(NodeFilter filter) throws ParserException {
        NodeList matches = parseFromStart(filter);
        return matches.size() > 0 ? matches.elementAt(0) : null;
    }

    /** Tells whether the name is keywords, author or description (ignoring case). */
    private static boolean isStandardMetaName(String name) {
        return META_KEYWORDS.equalsIgnoreCase(name)
                || META_AUTHOR.equalsIgnoreCase(name)
                || META_DESCRIPTION.equalsIgnoreCase(name);
    }
}

 

你可能感兴趣的:(html,jsp,F#,J#)