ftp client的开源实现

HTML解析htmlparser

htmlparser
首页:http://sourceforge.net/projects/htmlparser/
下载:http://sourceforge.net/project/showfiles.php?group_id=24399
文件:HTMLParser-2.0-SNAPSHOT-bin.zip

cpdetector
首页:http://cpdetector.sourceforge.net/
下载:http://sourceforge.net/project/showfiles.php?group_id=114421
文件:cpdetector_eclipse_project_1.0.7.zip

解开压缩后,运行ANT打包命令,build.xml有些地方需要稍微根据具体情况调整一下
ant jar.htmlentitydecoder
得到JAR包
cpdetector_1.0.7.jar

HTML工具类函数一:自动探测URL的HTML内容的编码

/**
* 自动探测页面的编码
*
* @param url
* @return
* @throws MalformedURLException
*/
public static String autoDetectCharset(String url) {
   URL source = null;
   try {
    source = new URL(url);
   } catch (MalformedURLException e) {
    log.error(e);
   }
   CodepageDetectorProxy detector = CodepageDetectorProxy.getInstance();
   detector.add(new ParsingDetector(false));
   detector.add(JChardetFacade.getInstance());
   detector.add(ASCIIDetector.getInstance());
   detector.add(UnicodeDetector.getInstance());

   Charset charset = null;
   try {
    charset = detector.detectCodepage(source);
   } catch (IOException e) {
    log.error(e);
   }

   if (charset == null) {
    charset = Charset.defaultCharset();
   }
   return charset.name();
}

HTML工具类函数二:读取URL中的HTML文本
/**
* 读取文件HTML内容
*
* @param url
* @param charset
* @return
* @throws IOException
*/
public static String readURL(String url, String charset) {
   /* StringBuffer的缓冲区大小 */
   int TRANSFER_SIZE = 4096;

   /* 当前平台的行分隔符 */
   String lineSep = System.getProperty("line.separator");

   String content = "";
   URL source = null;
   try {
    source = new URL(url);
   } catch (MalformedURLException e) {
    log.error(e);
   }
   InputStream in = null;
   try {
    in = source.openStream();
   } catch (IOException e) {
    log.error(e);
   }
   BufferedReader reader = null;
   try {
    reader = new BufferedReader(new InputStreamReader(in, charset));
   } catch (UnsupportedEncodingException e) {
    log.error(e);
   }
   String line = new String();
   StringBuffer temp = new StringBuffer(TRANSFER_SIZE);
   try {
    while ((line = reader.readLine()) != null) {
     temp.append(line);
     temp.append(lineSep);
    }
    in.close();
    reader.close();
   } catch (IOException e) {
    log.error(e);
   }
   content = temp.toString();
   return content;
}

HTML工具类函数三:解析HTML,得到其中与表单相关的TAG(input、textarea以及自定义标签)
public static NodeList getFormNodeList(String url) {
   Parser parser = Parser.createParser(readURL(url),
     autoDetectCharset(url));
   PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
   factory.registerTag(new ScclSelectBizCodesTag());
   factory.registerTag(new InputTag());
   factory.registerTag(new TextareaTag());
   parser.setNodeFactory(factory);

   NodeFilter formFilter = new PostFormFilter();

   NodeList nodeList = null;
   try {
    nodeList = parser.extractAllNodesThatMatch(formFilter);
   } catch (ParserException e) {
    log.error(e);
   }
   return nodeList;
}

HTML工具类函数四:解析TAG中的属性,生成所有的PageField的POJO
public static List<PageField> getPageFields(String url) {
   List<PageField> list = null;
   NodeList nodeList = getFormNodeList(url);
   if (nodeList != null && nodeList.size() > 0) {
    // nodeList不为空,开始构建
    list = new ArrayList<PageField>(nodeList.size());
    for (int i = 0; i < nodeList.size(); i++) {
     TagNode node = (TagNode) nodeList.elementAt(i);
     if (node instanceof InputTag) {
      InputTag input = (InputTag) node;
      PageField t = new PageField(input.getAttribute("name"),
        PageField.TAG_TYPE_INPUT, input
          .getAttribute("type"));
      list.add(t);
     } else if (node instanceof ScclSelectBizCodesTag) {
      ScclSelectBizCodesTag scclSelectBizCodesTag = (ScclSelectBizCodesTag) node;
      PageField t = new PageField(scclSelectBizCodesTag
        .getAttribute("id"),
        PageField.TAG_TYPE_SELECT, null);
      list.add(t);
     } else if (node instanceof TextareaTag) {
      TextareaTag textArea = (TextareaTag) node;
      PageField t = new PageField(textArea.getAttribute("name"),PageField.TAG_TYPE_TEXTAREA,null);
      list.add(t);
     }
    }
   }
   return list;
}

扩展自定义标签<sccl:selectBizCodes>
public class ScclSelectBizCodesTag extends TagNode {
private static final long serialVersionUID = -6352090777443844707L;
private static final String[] ids = new String[] { "sccl:selectBizCodes" };
public String[] getIds() {
   return (ids);
}
public String[] getEnders() {
   return (ids);
}
public String getCategory(){
   return super.getAttribute("category");
}
public String getId(){
   return super.getAttribute("id");
}
public String getSelected(){
   return super.getAttribute("selected");
}
}

用FILTER方式过滤访问TAG
public class PostFormFilter implements NodeFilter {
private static final long serialVersionUID = 8162322553987269165L;
public boolean accept(Node node) {
   if (node instanceof InputTag) {
    return true;
   }
   if (node instanceof ScclSelectBizCodesTag) {
    return true;
   }
   if (node instanceof TextareaTag) {
    return true;
   }
   return false;
}
}

测试
public static void main(String[] args)
    throws org.htmlparser.util.ParserException, IOException {
   String url = "file:///E:\\work\\html\\editOrder.jsp";
   List<PageField> list = getPageFields(url);
   list.get(0);
}

以上代码可以解析<input>、<textarea>以及自定义标签,例如:
<sccl:selectBizCodes category="worksheet" id="worksheetCode" selected="cl" onChange="go();" html="style='test';"/>

问题一
把cpdetector_1.0.7.jar拷贝到项目中后,
还需要把ext目录下的chardet.jar一并拷贝到lib目录下,否则在调用
detector.add(JChardetFacade.getInstance());时会报错,提示找不到类
nsICharsetDetectionObserver

问题二
拷贝htmlparser相关包如下:
htmlparser.jar
htmllexer.jar

你可能感兴趣的:(html,PHP,.net,ant,ext)