Filter 用于解析 HTML 并返回树形结构的节点序列,同时支持逻辑嵌套,例如 AndFilter(filter, AndFilter(NotFilter(), OrFilter(.., ..)))。
其中有一个特别的 visitor:TextExtractingVisitor
//通过filter 获得NodeList NodeList gen_tr = this.getParse().parse(general_data_filter); //对已得到的NodeList采用TextExtractingVisitor方式visit //这样就实现了filter和visitor的结合使用 gen_tr.visitAllNodesWith(new TextExtractingVisitor());
package get_infor; import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.io.IOException; import org.htmlparser.Parser; public class BaseExtractor { protected static final String NEWLINE = "\r\n"; private String outputPath; private String inputPath; private Parser parse; /** * 对图片路径进行哈希的算法,这里采用MD5算法 */ protected static final String HASH_ALGORITHM = "md5"; /** * 用于存放被处理过后的产口的图片的目录 */ private String imageDir = ""; public int extractedCount = 0; String file_name = ""; public String getOutputPath() { return outputPath; } public void setOutputPath(String outputPath) { this.outputPath = outputPath; } public String getInputPath() { return inputPath; } public void setInputPath(String inputPath) { this.inputPath = inputPath; } public Parser getParse() { return parse; } public void setParse(Parser parse) { this.parse = parse; } public String getMirrorDir() { return mirrorDir; } public void setMirrorDir(String mirrorDir) { this.mirrorDir = mirrorDir; } public String getImageDir() { return imageDir; } public void setImageDir(String imageDir) { this.imageDir = imageDir; } public static String getNEWLINE() { return NEWLINE; } public static String getHASH_ALGORITHM() { return HASH_ALGORITHM; } protected void operator() { String ip = this.getInputPath(); visit(new File(ip)); } public void visit(File dir) { if (dir.isFile()) { extract(dir.getAbsolutePath()); } else { File[] fs = dir.listFiles(); for (int i = 0; i < fs.length; i++) { if (fs[i].isFile()) { // p(fs[i].getAbsolutePath()); extract(fs[i].getAbsolutePath()); } else { visit(fs[i]); } } } } public void extract(String url) { } protected void write_file(StringBuffer sb) { try { file_name = StringUtils.filenameProcess(file_name); BufferedWriter bw = new BufferedWriter(new FileWriter(this .getOutputPath() + file_name + ".txt")); bw.write(sb.toString()); bw.flush(); p("已经处理了:"+ extractedCount+++file_name); } catch (IOException e) { e.printStackTrace(); } } protected String process(String 
l6_td_str) { String l6_td_str1 = l6_td_str.replaceAll("\\ ", ""); String l6_td_str2 = l6_td_str1.replaceAll("\n", ""); int index2 = l6_td_str2.lastIndexOf(">"); int index1 = l6_td_str2.indexOf("<"); if (index1 != -1 && index2 != -1) { l6_td_str2 = l6_td_str2.substring(0, index1) + l6_td_str2.substring(index2 + 1); } return l6_td_str2; } protected void p(Object o) { System.out.println(o); } }
package get_infor; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import org.htmlparser.Node; import org.htmlparser.NodeFilter; import org.htmlparser.Parser; import org.htmlparser.filters.AndFilter; import org.htmlparser.filters.HasAttributeFilter; import org.htmlparser.filters.TagNameFilter; import org.htmlparser.tags.ImageTag; import org.htmlparser.tags.TableColumn; import org.htmlparser.tags.TableTag; import org.htmlparser.util.NodeList; import org.htmlparser.util.ParserException; import org.htmlparser.visitors.TextExtractingVisitor; public class CarExtractor extends BaseExtractor { // 析取网页内容方法 public void extract(String url) { try { String real_url = "http://"+url.substring(72,url.length()-10).replaceAll("\\\\", "/"); StringBuffer sb = new StringBuffer(); sb.append("url: "+real_url+NEWLINE); // 获得网页产品图片 String pic_src = get_pic_src(url); sb.append("pic: "+pic_src+NEWLINE); this.getParse().reset(); String general_data = this.get_general_data(url); sb.append(general_data); this.setParse(new Parser(url)); // 获得产品详细信息的过滤器 NodeFilter Attribute_filter = new AndFilter( new TagNameFilter("td"), new AndFilter( new HasAttributeFilter("class", "bor1_c1"), new HasAttributeFilter("style", "padding:5px;"))); // 设定分析器的编码方式为"gb2312" this.getParse().setEncoding("gb2312"); NodeList l1_td_list = this.getParse().parse(Attribute_filter); NodeList l2_table_list = l1_td_list.elementAt(0).getChildren(); // 获得产品的基本参数 for (int j = 1; j < l2_table_list.size(); j += 2) { TableTag l2_table = (TableTag) l2_table_list.elementAt(j); Node l4_txt = l2_table.getFirstChild().getNextSibling() .getFirstChild().getNextSibling().getFirstChild(); if (l4_txt.getClass().toString().equals( "class org.htmlparser.nodes.TextNode") && !l4_txt.getText().matches("\n")) { sb.append(process(l4_txt.getText()) + " "); } else { for (int m = 1; m < l2_table.getChildren().size(); m += 2) { NodeList l6_td_list = l2_table.getChildren().elementAt( 
m).getChildren().elementAt(1).getChildren() .elementAt(1).getChildren().elementAt(1) .getChildren(); for (int k = 1; k < l6_td_list.size(); k += 2) { TableColumn l6_td = (TableColumn) l6_td_list .elementAt(k); String l6_td_str = l6_td.getStringText(); l6_td_str = process(l6_td_str); if (l6_td.getAttribute("class").equals( "series_2_cs3_c1") || l6_td.getAttribute("class").equals( "series_2_cs3_c4") || l6_td.getAttribute("class").equals( "series_2_cs3_c7")) { sb.append(l6_td_str + " : "); } else if (l6_td.getAttribute("class").equals( "series_2_cs3_c2") || l6_td.getAttribute("class").equals( "series_2_cs3_c5") || l6_td.getAttribute("class").equals( "series_2_cs3_c8")) { sb.append(l6_td_str + " ; "); } } sb.append(NEWLINE); } } sb.append(NEWLINE); } // 获得产品的外设等高级参数 NodeList l2_table_list2 = l1_td_list.elementAt(1).getChildren(); for (int j = 1; j < l2_table_list2.size(); j += 2) { TableTag l2_table = (TableTag) l2_table_list2.elementAt(j); if (l2_table.getFirstChild().getNextSibling().getFirstChild().getNextSibling() != null) { Node l4_txt = l2_table.getFirstChild().getNextSibling() .getFirstChild().getNextSibling().getFirstChild(); if (l4_txt.getClass().toString().equals( "class org.htmlparser.nodes.TextNode") && !l4_txt.getText().matches("\n")) { sb.append(process(l4_txt.getText()) + " "); } else { for (int l = 1; l < l2_table.getChildren().size(); l += 2) { NodeList l4_td_list = l2_table.getChildren() .elementAt(l).getChildren(); for (int i = 1; i < l4_td_list.size(); i += 2) { TableColumn tc = (TableColumn) l4_td_list .elementAt(i).getChildren() .elementAt(1).getChildren() .elementAt(1).getChildren() .elementAt(1); sb.append(process(tc.getStringText()) + ","); } sb.append(NEWLINE); } } sb.append(NEWLINE); } } // System.out.println(sb.toString()); // 写入文件 write_file(sb); } catch (ParserException e) { e.printStackTrace(); } catch (Exception e) { e.printStackTrace(); } } // 获得产品图片的src的方法 private String get_pic_src(String url) { NodeFilter pic_filter = new 
AndFilter(new TagNameFilter("td"), new HasAttributeFilter("class", "series_sy_intro_pic")); String imgURL = ""; String new_image_file = ""; try { this.setParse(new Parser(url)); this.getParse().setEncoding("gb2312"); NodeList pic_nodes = this.getParse().parse(pic_filter); TableColumn tc = (TableColumn) pic_nodes.elementAt(0); ImageTag it = (ImageTag) (tc.childAt(1).getChildren().elementAt(0)); imgURL = it.getImageURL(); // String fileType = imgURL.substring(imgURL // .lastIndexOf(".") + 1); //生成新的图片的文件名 new_image_file = StringUtils.encodePassword( imgURL, HASH_ALGORITHM) + ".jpg"; // imgURL = StringUtils.replace(imgURL, "+", " "); //利用miorr目录下的图片生成的新的图片 copyImage(imgURL, new_image_file); } catch (ParserException e) { e.printStackTrace(); } catch (Exception e) { e.printStackTrace(); } return new_image_file; } protected boolean copyImage(String image_url, String new_image_file) { String dirs = image_url.substring(7); try { // instance the File as file_in and file_out File file_in = new File(new File("f:/"), dirs); if (file_in == null || !file_in.exists()) { file_in = new File("f:/noimage.jpg"); } File file_out = new File(new File("f:/img/"), new_image_file); FileInputStream in1 = new FileInputStream(file_in); FileOutputStream out1 = new FileOutputStream(file_out); byte[] bytes = new byte[1024]; int c; while ((c = in1.read(bytes)) != -1) out1.write(bytes, 0, c); // close in1.close(); out1.close(); return (true); // if success then return true } catch (Exception e) { e.printStackTrace(); return (false); // if fail then return false } } // 获取产品概要参数 private String get_general_data(String url){ StringBuffer general_data = new StringBuffer(); try { this.setParse(new Parser(url)); this.getParse().setEncoding("gb2312"); NodeFilter general_data_filter = new AndFilter(new TagNameFilter("table"),new HasAttributeFilter("class","series_sy_intro_txt")); NodeFilter price_filter = new AndFilter(new TagNameFilter("td"),new HasAttributeFilter("class","f18b")); NodeList 
general_data_list = this.getParse().parse(general_data_filter); NodeList gen_tr = general_data_list.elementAt(0).getChildren(); gen_tr.visitAllNodesWith(new TextExtractingVisitor()); String str = process(gen_tr.asString().replaceAll("\\s", "")); // 获得文件名字 file_name= get_title(str); this.getParse().reset(); NodeList price_td = this.getParse().parse(price_filter); TableColumn tc = (TableColumn)price_td.elementAt(0); String price = tc.getStringText(); str = "概要参数:"+ str + NEWLINE +"厂家指导价: "+price+NEWLINE; general_data.append(str); } catch (ParserException e) { e.printStackTrace(); } return general_data.toString(); } private String get_title(String str) { int index1,index2,index3,index4,index5; index1 = str.indexOf("生产厂商:")+5; index2 = str.indexOf("所属:"); index3 = str.indexOf("上市时间:"); index4 = str.indexOf("型 号:")+5; index5 = str.indexOf("车 型:"); String bland = str.substring(index1,index2); String type = str.substring(index2+3,index3); String name = str.substring(index4,index5); return bland+"_"+type+""+name; } public static void main(String args[]) { CarExtractor ex = new CarExtractor(); ex .setInputPath("F:/Workspaces/MyEclipse 7.1/heritrix/jobs/may2-20090501055518750/mirror/price.pcauto.com.cn"); ex.setOutputPath("F:/job/"); ex.get_general_data("F:/Workspaces/MyEclipse 7.1/heritrix/jobs/may2-20090501055518750/mirror/price.pcauto.com.cn/m157/index.html"); ex.operator(); // ex.extract("F:/Workspaces/MyEclipse 7.1/heritrix/jobs/may2-20090501055518750/mirror/price.pcauto.com.cn/m157/index.html"); } }
中间会用到 StringUtils 这个类,它的主要作用是对图片名称进行 MD5 编码,从而生成唯一的文件名。
package get_infor;

import java.nio.charset.Charset;
import java.security.MessageDigest;

/**
 * String helpers used by the extractors: trimming, file-name sanitising,
 * hex-encoded message digests and literal substring replacement.
 */
public class StringUtils {

    /** Charset used when hashing, so digests do not depend on the platform default. */
    private static final Charset UTF_8 = Charset.forName("UTF-8");

    /**
     * Removes leading and trailing whitespace.
     *
     * <p>{@link String#trim()} already strips every leading and trailing space,
     * so the extra space-stripping loops this method used to run never executed.
     *
     * @param line text to trim; must not be null
     * @return the trimmed text
     */
    public static String trim(String line) {
        return line.trim();
    }

    /**
     * Makes {@code name} usable as a file name: trims it, turns {@code '\'} and
     * {@code '/'} into {@code '_'} and {@code '*'} into a space.
     *
     * @param name raw name; must not be null
     * @return the sanitised name
     */
    public static String filenameProcess(String name) {
        // Plain character replacement; no regular expression is needed here.
        return name.trim()
                .replace('\\', '_')
                .replace('/', '_')
                .replace('*', ' ');
    }

    /**
     * Hashes {@code password} with {@code algorithm} and returns the digest as
     * lower-case hex, two digits per byte.
     *
     * <p>The text is encoded as UTF-8 before hashing. It used to be encoded with
     * the platform default charset, which gave different digests for the same
     * non-ASCII input on different machines.
     *
     * @param password  text to hash; must not be null
     * @param algorithm {@link MessageDigest} algorithm name, e.g. {@code "md5"}
     * @return the hex digest, or {@code password} unchanged when no digest can be
     *         created for {@code algorithm}
     */
    public static String encodePassword(String password, String algorithm) {
        byte[] unencodedPassword = password.getBytes(UTF_8);
        MessageDigest md;
        try {
            md = MessageDigest.getInstance(algorithm);
        } catch (Exception e) {
            // Deliberate best effort: fall back to the input when the algorithm is unusable.
            return password;
        }
        byte[] encodedPassword = md.digest(unencodedPassword);
        StringBuilder hex = new StringBuilder(encodedPassword.length * 2);
        for (int i = 0; i < encodedPassword.length; i++) {
            int value = encodedPassword[i] & 0xff;
            if (value < 0x10) {
                hex.append('0'); // keep two digits per byte
            }
            hex.append(Integer.toHexString(value));
        }
        return hex.toString();
    }

    /**
     * Replaces every occurrence of {@code oldString} in {@code line} with
     * {@code newString}, treating both as literal text.
     *
     * @param line      text to search; {@code null} yields {@code null}
     * @param oldString text to look for; must not be null
     * @param newString replacement text; must not be null when a match exists
     * @return the text with all occurrences replaced, or {@code line} itself when
     *         there is no match
     */
    public static final String replace(String line, String oldString, String newString) {
        if (line == null) {
            return null;
        }
        int match = line.indexOf(oldString);
        if (match < 0) {
            return line;
        }
        char[] replacement = newString.toCharArray();
        int oldLength = oldString.length();
        StringBuilder out = new StringBuilder(line.length());
        int copiedUpTo = 0;
        do {
            out.append(line, copiedUpTo, match).append(replacement);
            copiedUpTo = match + oldLength;
            match = line.indexOf(oldString, copiedUpTo);
            // "> 0" rather than ">= 0": with an empty oldString the search keeps
            // returning 0, and this bound is what ends the loop after one insertion.
        } while (match > 0);
        out.append(line, copiedUpTo, line.length());
        return out.toString();
    }
}
以上的3个类在设计过程中参考了《lucene+heritrix 开发自己的搜索引擎》