htmlparser应用

package com.util.md5;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

import com.sun.corba.se.spi.orbutil.fsm.State;
/**
 * HTMLParser-based helpers that rewrite the {@code <img>} and {@code <a>} tags
 * found in an HTML fragment.
 *
 * @author Administrator
 */
public class Htmlparserutil {

	/** Host prefix of the image URLs that {@link #updateurl(String)} rewrites. */
	private static final String SOURCE_HOST = "http://e.huisou.com";

	/** Host prefix that replaces {@link #SOURCE_HOST}. */
	private static final String IMAGE_HOST = "http://img.e.huisou.com";

	/**
	 * Image extensions used to cut trailing text (query strings and the like)
	 * off an image URL before it is downloaded. Matching is case-sensitive, so
	 * both spellings are listed.
	 */
	private static final String[] IMAGE_EXTENSIONS = {
		".jpg", ".JPG", ".jpeg", ".JPEG", ".bmp", ".BMP", ".gif", ".GIF", ".png", ".PNG"
	};

	/**
	 * Points every {@code <img>} URL in the content at {@code dispPath} and
	 * downloads the referenced images into {@code pathString}.
	 * <p>
	 * Each image URL is replaced in the content by {@code dispPath} followed by
	 * the part of the URL starting at its last {@code '/'}. The image itself is
	 * saved as {@code pathString + fileName}; no separator is inserted, so
	 * {@code pathString} must already end with one.
	 * <p>
	 * Downloading is best effort: an image that cannot be fetched or stored is
	 * reported on {@code System.err} and skipped, and the rewritten content is
	 * returned regardless.
	 * <p>
	 * Example: {@code Htmlparserutil.parserto("<img src='http://www.baidu.com/logo.png'/>", "D:/test/", "/upload")}
	 *
	 * @param contentString HTML content whose img tags are processed
	 * @param pathString    directory prefix the downloaded images are written to
	 * @param dispPath      URL prefix that replaces the original image location
	 * @return the content with rewritten image URLs; the input itself when it is
	 *         {@code null} or cannot be parsed
	 */
	public static String parserto(String contentString, String pathString, String dispPath) {
		if (contentString == null) {
			return null;
		}
		NodeList imageTags;
		try {
			imageTags = extractTags(contentString, ImageTag.class);
		} catch (ParserException e) {
			report("cannot parse content", e);
			return contentString;
		}

		// Collect each distinct URL once so an image used twice is neither
		// rewritten twice nor downloaded twice.
		List<String> imageUrls = new ArrayList<String>();
		for (int i = 0; i < imageTags.size(); i++) {
			String imageUrl = ((ImageTag) imageTags.elementAt(i)).getImageURL();
			// An empty target would make String.replace insert text between every character.
			if (imageUrl == null || imageUrl.length() == 0 || imageUrls.contains(imageUrl)) {
				continue;
			}
			imageUrls.add(imageUrl);
			contentString = contentString.replace(imageUrl, dispPath + pictureName(imageUrl));
		}

		for (String imageUrl : imageUrls) {
			String downloadUrl = stripAfterExtension(imageUrl);
			String fileName = downloadUrl.substring(downloadUrl.lastIndexOf('/') + 1);
			// The name comes from the HTML being processed: refuse names that are
			// empty or could address another directory.
			if (fileName.length() == 0 || fileName.indexOf('\\') >= 0) {
				System.err.println("Htmlparserutil: skipping image with unusable file name: " + imageUrl);
				continue;
			}
			try {
				// NOTE(review): any URL scheme is accepted and no timeout is set — confirm
				// whether this should be limited to http/https.
				download(downloadUrl, pathString + fileName);
			} catch (IOException e) {
				report("cannot download " + downloadUrl, e);
			}
		}
		return contentString;
	}

	/**
	 * Replaces every link tag in the content with the link's text.
	 * <p>
	 * A tag is located by its HTML as rendered by the parser, so a link whose
	 * source text differs from that rendering is left untouched.
	 *
	 * @param contentString HTML content whose links are flattened to text
	 * @return the content with links replaced by their text; the input itself
	 *         when it is {@code null} or cannot be parsed
	 * @throws ParserException declared for backward compatibility only; parse
	 *                         failures are reported on {@code System.err} and
	 *                         the input is returned unchanged
	 */
	public static String parserto(String contentString) throws ParserException {
		if (contentString == null) {
			return null;
		}
		NodeList linkTags;
		try {
			linkTags = extractTags(contentString, LinkTag.class);
		} catch (ParserException e) {
			report("cannot parse content", e);
			return contentString;
		}
		for (int i = 0; i < linkTags.size(); i++) {
			LinkTag link = (LinkTag) linkTags.elementAt(i);
			String linkHtml = link.toHtml();
			String linkText = link.getLinkText();
			if (linkHtml == null || linkHtml.length() == 0 || linkText == null) {
				continue;
			}
			contentString = contentString.replace(linkHtml, linkText);
		}
		return contentString;
	}

	/**
	 * Moves every image URL that starts with {@code http://e.huisou.com} to
	 * {@code http://img.e.huisou.com}, keeping the rest of the URL.
	 *
	 * @param contentString HTML content whose image URLs are rewritten
	 * @return the content with rewritten image hosts; the input itself when it
	 *         is {@code null} or cannot be parsed
	 */
	public static String updateurl(String contentString) {
		if (contentString == null) {
			return null;
		}
		NodeList imageTags;
		try {
			imageTags = extractTags(contentString, ImageTag.class);
		} catch (ParserException e) {
			report("cannot parse content", e);
			return contentString;
		}
		for (int i = 0; i < imageTags.size(); i++) {
			String imageUrl = ((ImageTag) imageTags.elementAt(i)).getImageURL();
			if (imageUrl != null && imageUrl.startsWith(SOURCE_HOST)) {
				System.out.println("start");
				// Swap only the host prefix; the path after it is preserved.
				contentString = contentString.replace(imageUrl,
						IMAGE_HOST + imageUrl.substring(SOURCE_HOST.length()));
				System.out.println("end");
			} else {
				System.out.println("为找到外网的图片");
			}
		}
		return contentString;
	}

	/** Parses the HTML as UTF-8 and returns every node that is an instance of {@code tagType}. */
	private static NodeList extractTags(String html, final Class<?> tagType) throws ParserException {
		Parser parser = Parser.createParser(html, "UTF-8");
		return parser.extractAllNodesThatMatch(new NodeFilter() {
			public boolean accept(Node node) {
				return tagType.isInstance(node);
			}
		});
	}

	/** Returns the URL from its last {@code '/'} onwards, or {@code "/" + url} when it has none. */
	private static String pictureName(String imageUrl) {
		int slash = imageUrl.lastIndexOf('/');
		return slash < 0 ? "/" + imageUrl : imageUrl.substring(slash);
	}

	/** Cuts the URL right after the first occurrence of each known image extension it contains. */
	private static String stripAfterExtension(String imageUrl) {
		String result = imageUrl;
		for (int i = 0; i < IMAGE_EXTENSIONS.length; i++) {
			int at = result.indexOf(IMAGE_EXTENSIONS[i]);
			if (at >= 0) {
				result = result.substring(0, at + IMAGE_EXTENSIONS[i].length());
			}
		}
		return result;
	}

	/** Copies the resource behind {@code sourceUrl} into {@code targetFile}, closing both streams. */
	private static void download(String sourceUrl, String targetFile) throws IOException {
		URLConnection connection = new URL(sourceUrl).openConnection();
		InputStream in = connection.getInputStream();
		try {
			OutputStream out = new FileOutputStream(targetFile);
			try {
				byte[] buffer = new byte[1024];
				int read;
				// Write only the bytes actually read; read() returns -1 at end of stream.
				while ((read = in.read(buffer)) != -1) {
					out.write(buffer, 0, read);
				}
				out.flush();
			} finally {
				out.close();
			}
		} finally {
			in.close();
		}
	}

	/** Reports a handled failure on {@code System.err} so best-effort processing is not silent. */
	private static void report(String action, Exception e) {
		System.err.println("Htmlparserutil: " + action + ": " + e);
	}

}


 

你可能感兴趣的:(.net,应用服务器,sun)