htmlparser解析网页内容代码

/**
	 * Parses the page at {@code url} and hands the resulting top-level nodes to
	 * {@link #processNodeList(NodeList)}, which prints every leaf node and any
	 * {@code src} link found in it.
	 * <p>
	 * A {@link ParserException} is not propagated: it is caught here and only its
	 * stack trace is printed.
	 *
	 * @param url address of the page to parse
	 * @throws Exception if {@code url} is not a valid URL, or if the SSL setup
	 *                   performed for https pages fails
	 */
	public void htmlText(String url) throws Exception
	{
		try {
			URL target = new URL(url);
			boolean overHttps = "https".equalsIgnoreCase(target.getProtocol());
			if (overHttps) {
				// NOTE(review): presumably disables certificate/hostname checks so
				// that https pages can be fetched -- confirm against SslUtils.
				SslUtils.ignoreSsl();
			}

			// Build a parser with the page URL as its source.
			Parser parser = new Parser(url);
			String detectedEncoding = parser.getEncoding();
			if (detectedEncoding.equals("ISO-8859-1")) {
				// When the parser reports ISO-8859-1, read the page as UTF-8 instead.
				parser.setEncoding("UTF-8");
			}

			// A null argument means "no NodeFilter": every node is returned.
			NodeList topLevelNodes = parser.parse(null);

			// Walk down to the leaf nodes and print them.
			processNodeList(topLevelNodes);
		} catch (ParserException e) {
			e.printStackTrace();
		}
	}
	/**
	 * Recursively walks {@code list}. Every node without children (a leaf) has
	 * its HTML printed to standard output; if that HTML contains {@code src=},
	 * the attribute value is printed on the following line. Nodes that do have
	 * children are descended into instead of being printed.
	 *
	 * @param list nodes to walk; must not be {@code null}
	 */
	private void processNodeList(NodeList list) {
		SimpleNodeIterator iterator = list.elements();
		while (iterator.hasMoreNodes()) {
			Node node = iterator.nextNode();
			NodeList childList = node.getChildren();
			if (null != childList) {
				// Not a leaf yet: keep descending.
				processNodeList(childList);
				continue;
			}
			// No child list, so this is a leaf node: print its HTML.
			String txt = node.toHtml();
			System.out.println(txt);
			String src = extractSrc(txt);
			if (src != null) {
				System.out.println(src);
			}
		}
	}

	/**
	 * Extracts the value that follows the first {@code src=} in {@code html}.
	 * <p>
	 * Quoted values (single or double quotes) are returned without the quotes
	 * and may contain spaces; an unterminated quote yields the rest of the
	 * string. Unquoted values end at the first whitespace character or
	 * {@code '>'}. The match is a plain, case-sensitive substring search, so an
	 * attribute such as {@code data-src=} is matched as well.
	 *
	 * @param html HTML text of a single node
	 * @return the attribute value (possibly empty), or {@code null} when
	 *         {@code html} has no {@code src=} or nothing follows it
	 */
	private String extractSrc(String html) {
		final String marker = "src=";
		int markerAt = html.indexOf(marker);
		if (markerAt < 0) {
			return null;
		}
		int valueStart = markerAt + marker.length();
		if (valueStart >= html.length()) {
			return null;
		}
		char first = html.charAt(valueStart);
		if (first == '"' || first == '\'') {
			// Quoted value: read up to the matching closing quote rather than
			// the first space, so "a b.png" and src="x"/> are handled correctly.
			int closing = html.indexOf(first, valueStart + 1);
			return closing < 0
					? html.substring(valueStart + 1)
					: html.substring(valueStart + 1, closing);
		}
		// Unquoted value: runs until whitespace, '>' or the end of the text.
		int valueEnd = valueStart;
		while (valueEnd < html.length()
				&& html.charAt(valueEnd) != '>'
				&& !Character.isWhitespace(html.charAt(valueEnd))) {
			valueEnd++;
		}
		return html.substring(valueStart, valueEnd);
	}

/**
	 * Parses the page at {@code url} and prints a trace line for every event the
	 * parser reports to a {@code NodeVisitor}: start of parsing, each tag, text
	 * node, remark (comment) node and end tag, and the end of parsing.
	 * <p>
	 * In htmlparser, nodes fall into three kinds, all derived from AbstractNode:
	 * <ul>
	 *   <li>RemarkNode: a comment in the HTML</li>
	 *   <li>TagNode: a tag</li>
	 *   <li>TextNode: a piece of text</li>
	 * </ul>
	 * A {@link ParserException} is caught here and only its stack trace is printed.
	 *
	 * @param url address of the page to parse
	 */
	public void tagGet(String url){
		// Callbacks are listed in the order the parser is expected to fire them.
		NodeVisitor tracer = new NodeVisitor( true, true ) {
			public void beginParsing () {
				System.out.println("beginParsing");
			}
			public void visitTag(Tag tag) {
				String tagText = tag.getText();
				System.out.println("This is Tag:" + tagText);
			}
			public void visitStringNode (Text text) {
				System.out.println("This is Text:" + text);
			}
			public void visitRemarkNode (Remark remark) {
				String remarkText = remark.getText();
				System.out.println("This is Remark:" + remarkText);
			}
			public void visitEndTag (Tag tag) {
				String endTagText = tag.getText();
				System.out.println("visitEndTag:" + endTagText);
			}
			public void finishedParsing () {
				System.out.println("finishedParsing");
			}
		};

		try {
			Parser parser = new Parser(url);
			String detectedEncoding = parser.getEncoding();
			if (detectedEncoding.equals("ISO-8859-1")) {
				// When the parser reports ISO-8859-1, read the page as UTF-8 instead.
				parser.setEncoding("UTF-8");
			}
			parser.visitAllNodesWith(tracer);
		} catch (ParserException e) {
			e.printStackTrace();
		}
	}

/**
	 * Parses the page at {@code url} and prints only its input, form and frame
	 * tags. For every form, the inputs returned by {@code getFormInputs()} are
	 * printed as well.
	 *
	 * @param url address of the page to parse
	 * @throws ParserException if the page cannot be fetched or parsed
	 */
	public void formGet(String url) throws ParserException{
		Parser parser = new Parser(url);
		// Constant-first comparison so a missing encoding cannot cause an NPE.
		if ("ISO-8859-1".equals(parser.getEncoding())) {
			parser.setEncoding("UTF-8");
		}

		// Accept a node when it is a form, an input or a frame tag.
		NodeFilter inputFilter = new NodeClassFilter(InputTag.class);
		NodeFilter formFilter = new NodeClassFilter(FormTag.class);
		NodeFilter frameFilter = new NodeClassFilter(FrameTag.class);
		OrFilter lastFilter = new OrFilter();
		lastFilter.setPredicates(new NodeFilter[] { formFilter, inputFilter, frameFilter });

		NodeList nodeList = parser.parse(lastFilter);
		// Strictly less than size(): index size() is one past the last element.
		for (int i = 0; i < nodeList.size(); i++) {
			Node node = nodeList.elementAt(i);
			if (node instanceof InputTag) {
				InputTag tag = (InputTag) node;
				System.out.println("Input Info:" + tag.getTagName() + ":"
						+ tag.getText());
			}
			if (node instanceof FormTag) {
				FormTag tag = (FormTag) node;
				System.out.println("Form Info: " + tag.getFormName() + ":"
						+ tag.getText());
				NodeList inputnodes = tag.getFormInputs();
				// Same bound fix as above; the old "<=" read one slot past the
				// last input and then dereferenced it.
				for (int j = 0; j < inputnodes.size(); j++) {
					InputTag tag1 = (InputTag) inputnodes.elementAt(j);
					System.out.println("Input Info:" + tag1.getTagName() + ":"
							+ tag1.getText());
				}
			}
			if (node instanceof FrameTag) {
				FrameTag tag = (FrameTag) node;
				System.out.println("Frame Info: " + tag.getFrameName() + ":"
						+ tag.getText());
			}
		}
	}
	/**
	 * Parses the page at {@code url} with a {@code PrototypicalNodeFactory} that
	 * has {@code FormTag} registered, and prints the name and text of every form
	 * tag found.
	 * <p>
	 * org.htmlparser.PrototypicalNodeFactory is a node factory based on the
	 * prototype pattern: its prototypes are cloned as needed to form new Text,
	 * Remark and Tag nodes.
	 *
	 * @param url address of the page to parse
	 * @throws ParserException if the page cannot be fetched or parsed
	 */
	public void hyperGet(String url) throws ParserException{
		PrototypicalNodeFactory nodeFactory = new PrototypicalNodeFactory ();
		nodeFactory.registerTag(new FormTag());

		Parser parser = new Parser (url);
		parser.setNodeFactory (nodeFactory);

		NodeList forms = parser.parse(new NodeClassFilter(FormTag.class));
		int total = forms.size();
		for (int index = 0; index < total; index++) {
			Node candidate = forms.elementAt(index);
			if (!(candidate instanceof FormTag)) {
				continue;
			}
			FormTag form = (FormTag) candidate;
			System.out.println("Form Info: " + form.getFormName() + ":"
					+ form.getText());
		}
	}

你可能感兴趣的:(htmlparser解析网页内容代码)