先说说htmlparser的初步学习
我觉得htmlparser也不是很困难,就是处理是麻烦些,htmlparser对html节点处理的数据结构为:
解析html有3种方法:
1:lexer
lexer解析html的方式更底层些,它返回的是node节点的线性序列,不能产生树形结构
2:filter
filter 解析html返回树形节点序列支持逻辑嵌套(andfilter(filter,andfilter(notfilter(),orfilter(..,..))))
比较常用的几个filter:
(1):AndFilter:相当于逻辑与,构造函数接受2个Filter(f1,f2),AndFilter(f1,f2)类似于f1&&f2
(2):HasAttributeFilter:用于提取含有指定属性的节点
(3):TagNameFilter:提取所有满足指定tag名的节点
3:visitor
其中有一个特别的 visitor:TextExtractingVisitor
此类用来提取网页中的所有文字,剔除所有标签。在有些时候比较好用。它还支持对已提取的节点的visit
用法:
//通过filter 获得NodeList
NodeList gen_tr = this.getParse().parse(general_data_filter);
//对已得到的NodeList采用TextExtractingVisitor方式visit
//这样就实现了filter和visitor的结合使用
gen_tr.visitAllNodesWith(new TextExtractingVisitor());
注:1:我认为析取筛选网页是做搜索引擎前期最关键也是最需要耐心的工作。
2:在解析的过程中你会发现经常抛空指针异常,原因:(1):你的filter根本没有析取出节点;(2):你析取出的是换行符"\n"对应的文本节点。这个确实比较烦,一般解析网页的时候,标签之间会出现很多"\n",例如:
<table>"\n"<tr>"\n"<td>"\n"text"\n"</td>"\n"</tr>"\n"</table>
下面是我解析:http://price.pcauto.com.cn/m11199/等相关的网页
这个网址中的汽车参数的代码,可能对初学者有所帮助:
提取网页内容的基类,它定义了一些提取网页内容的基本的通用的方法:
package get_infor;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;

import org.htmlparser.Parser;

/**
 * Base class for page extractors.
 *
 * <p>It holds the shared configuration (input/output paths, the current
 * {@link Parser}), walks the input directory tree and hands every regular
 * file to {@link #extract(String)}, and offers small helpers for cleaning
 * text and writing the result to a {@code .txt} file. Subclasses override
 * {@link #extract(String)} to do the actual work.
 */
public class BaseExtractor {

    /** Line separator used in the generated text files. */
    protected static final String NEWLINE = "\r\n";

    private String outputPath;

    private String inputPath;

    private Parser parse;

    /**
     * Backing field for {@link #getMirrorDir()} / {@link #setMirrorDir(String)}.
     * The accessors referenced this field but it was never declared, which
     * kept the class from compiling.
     */
    private String mirrorDir = "";

    /** Hash algorithm applied to image paths; MD5 is used here. */
    protected static final String HASH_ALGORITHM = "md5";

    /** Directory that stores the processed product images. */
    private String imageDir = "";

    /** Number of files written so far by {@link #write_file(StringBuffer)}. */
    public int extractedCount = 0;

    /** Base name (without extension) of the next output file. */
    String file_name = "";

    public String getOutputPath() {
        return outputPath;
    }

    public void setOutputPath(String outputPath) {
        this.outputPath = outputPath;
    }

    public String getInputPath() {
        return inputPath;
    }

    public void setInputPath(String inputPath) {
        this.inputPath = inputPath;
    }

    public Parser getParse() {
        return parse;
    }

    public void setParse(Parser parse) {
        this.parse = parse;
    }

    public String getMirrorDir() {
        return mirrorDir;
    }

    public void setMirrorDir(String mirrorDir) {
        this.mirrorDir = mirrorDir;
    }

    public String getImageDir() {
        return imageDir;
    }

    public void setImageDir(String imageDir) {
        this.imageDir = imageDir;
    }

    public static String getNEWLINE() {
        return NEWLINE;
    }

    public static String getHASH_ALGORITHM() {
        return HASH_ALGORITHM;
    }

    /** Starts the extraction at the configured input path. */
    protected void operator() {
        String ip = this.getInputPath();
        visit(new File(ip));
    }

    /**
     * Recursively walks {@code dir} and calls {@link #extract(String)} with
     * the absolute path of every regular file found.
     *
     * @param dir a file or a directory
     */
    public void visit(File dir) {
        if (dir.isFile()) {
            extract(dir.getAbsolutePath());
            return;
        }
        File[] children = dir.listFiles();
        if (children == null) {
            // listFiles() returns null when the path is not a directory or
            // cannot be read; there is nothing to visit in that case.
            return;
        }
        for (int i = 0; i < children.length; i++) {
            // visit() itself distinguishes files from directories.
            visit(children[i]);
        }
    }

    /**
     * Extracts the content of one page. The base implementation does
     * nothing; subclasses override it.
     *
     * @param url path of the page, as passed by {@link #visit(File)}
     */
    public void extract(String url) {
    }

    /**
     * Writes {@code sb} to {@code outputPath + file_name + ".txt"}, prints a
     * progress line and increments {@link #extractedCount}. I/O errors are
     * printed and otherwise ignored, so one bad file does not stop the run.
     *
     * @param sb the text to write
     */
    protected void write_file(StringBuffer sb) {
        BufferedWriter bw = null;
        try {
            file_name = StringUtils.filenameProcess(file_name);
            bw = new BufferedWriter(new FileWriter(this.getOutputPath()
                    + file_name + ".txt"));
            bw.write(sb.toString());
            bw.flush();
            // The progress line shows the count before it is incremented.
            p("已经处理了:" + extractedCount + file_name);
            extractedCount++;
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // The writer was never closed before, leaking one file handle
            // per processed page.
            if (bw != null) {
                try {
                    bw.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Cleans a text fragment: removes every space and newline character and
     * then drops everything from the first {@code <} to the last {@code >}.
     *
     * <p>The markup is only removed when the first {@code <} really precedes
     * the last {@code >}; otherwise the text is returned with only the
     * whitespace removed.
     *
     * @param l6_td_str the raw text
     * @return the cleaned text
     */
    protected String process(String l6_td_str) {
        String compact = l6_td_str.replaceAll("\\ ", "").replaceAll("\n", "");
        int open = compact.indexOf("<");
        int close = compact.lastIndexOf(">");
        if (open != -1 && close > open) {
            compact = compact.substring(0, open) + compact.substring(close + 1);
        }
        return compact;
    }

    /** Prints {@code o} to standard output. */
    protected void p(Object o) {
        System.out.println(o);
    }
}
下面的CarExtractor继承自BaseExtractor,重写(override)了extract方法,并添加了一些特定的方法
package get_infor;

import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.TextExtractingVisitor;

/**
 * Extracts car parameters from mirrored pages of price.pcauto.com.cn and
 * writes one text file per page.
 *
 * <p>The node navigation in this class (odd child indexes, fixed nesting
 * depth, CSS class names) mirrors the markup of the pages the author
 * crawled. NOTE(review): the odd indexes presumably skip the whitespace text
 * nodes that sit between tags; confirm against a sample page before reuse.
 */
public class CarExtractor extends BaseExtractor {

    /**
     * Extracts one page: URL, picture, summary data, then the basic and the
     * advanced parameter tables, and writes the result via
     * {@code write_file}. Any failure is printed and the page is skipped
     * without writing a file.
     *
     * @param url local path of the mirrored page
     */
    public void extract(String url) {
        try {
            // The fixed offsets (72 from the start, 10 from the end) cut the
            // local mirror prefix and the trailing file name off the path.
            // NOTE(review): they match the author's directory layout used
            // in main(); adjust them when the mirror lives elsewhere.
            String real_url = "http://"
                    + url.substring(72, url.length() - 10).replaceAll("\\\\", "/");
            StringBuffer sb = new StringBuffer();
            sb.append("url: " + real_url + NEWLINE);

            // Product picture.
            String pic_src = get_pic_src(url);
            sb.append("pic: " + pic_src + NEWLINE);

            this.getParse().reset();
            String general_data = this.get_general_data(url);
            sb.append(general_data);

            this.setParse(new Parser(url));
            // Filter for the cells that hold the detailed product data.
            NodeFilter Attribute_filter = new AndFilter(
                    new TagNameFilter("td"),
                    new AndFilter(
                            new HasAttributeFilter("class", "bor1_c1"),
                            new HasAttributeFilter("style", "padding:5px;")));
            // The pages are encoded as gb2312.
            this.getParse().setEncoding("gb2312");
            NodeList l1_td_list = this.getParse().parse(Attribute_filter);
            if (l1_td_list.size() < 2) {
                // Both the basic and the advanced block are needed below.
                // Without this guard the code failed later with a
                // NullPointerException and the page was skipped anyway.
                p("skipped (detail cells not found): " + url);
                return;
            }

            appendBasicParameters(l1_td_list.elementAt(0).getChildren(), sb);
            appendAdvancedParameters(l1_td_list.elementAt(1).getChildren(), sb);

            write_file(sb);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Tells whether {@code node} is exactly an
     * {@code org.htmlparser.nodes.TextNode} whose text is not a lone newline.
     */
    private boolean isMeaningfulText(Node node) {
        return node.getClass().toString().equals(
                "class org.htmlparser.nodes.TextNode")
                && !node.getText().matches("\n");
    }

    /** Appends the basic parameters ("name : value ; " pairs) to {@code sb}. */
    private void appendBasicParameters(NodeList l2_table_list, StringBuffer sb) {
        for (int j = 1; j < l2_table_list.size(); j += 2) {
            TableTag l2_table = (TableTag) l2_table_list.elementAt(j);
            Node l4_txt = l2_table.getFirstChild().getNextSibling()
                    .getFirstChild().getNextSibling().getFirstChild();
            if (isMeaningfulText(l4_txt)) {
                // A plain text cell is appended as it is.
                sb.append(process(l4_txt.getText()) + " ");
            } else {
                for (int m = 1; m < l2_table.getChildren().size(); m += 2) {
                    NodeList l6_td_list = l2_table.getChildren().elementAt(m)
                            .getChildren().elementAt(1)
                            .getChildren().elementAt(1)
                            .getChildren().elementAt(1)
                            .getChildren();
                    for (int k = 1; k < l6_td_list.size(); k += 2) {
                        TableColumn l6_td = (TableColumn) l6_td_list.elementAt(k);
                        String l6_td_str = process(l6_td.getStringText());
                        // Constant-first equals(): a cell without a class
                        // attribute is skipped instead of raising a
                        // NullPointerException that aborts the whole page.
                        String cssClass = l6_td.getAttribute("class");
                        if ("series_2_cs3_c1".equals(cssClass)
                                || "series_2_cs3_c4".equals(cssClass)
                                || "series_2_cs3_c7".equals(cssClass)) {
                            sb.append(l6_td_str + " : ");
                        } else if ("series_2_cs3_c2".equals(cssClass)
                                || "series_2_cs3_c5".equals(cssClass)
                                || "series_2_cs3_c8".equals(cssClass)) {
                            sb.append(l6_td_str + " ; ");
                        }
                    }
                    sb.append(NEWLINE);
                }
            }
            sb.append(NEWLINE);
        }
    }

    /** Appends the advanced parameters (comma separated) to {@code sb}. */
    private void appendAdvancedParameters(NodeList l2_table_list2, StringBuffer sb) {
        for (int j = 1; j < l2_table_list2.size(); j += 2) {
            TableTag l2_table = (TableTag) l2_table_list2.elementAt(j);
            Node secondCell = l2_table.getFirstChild().getNextSibling()
                    .getFirstChild().getNextSibling();
            if (secondCell == null) {
                // Tables without that cell contribute nothing.
                continue;
            }
            Node l4_txt = secondCell.getFirstChild();
            if (isMeaningfulText(l4_txt)) {
                sb.append(process(l4_txt.getText()) + " ");
            } else {
                for (int l = 1; l < l2_table.getChildren().size(); l += 2) {
                    NodeList l4_td_list = l2_table.getChildren().elementAt(l)
                            .getChildren();
                    for (int i = 1; i < l4_td_list.size(); i += 2) {
                        TableColumn tc = (TableColumn) l4_td_list.elementAt(i)
                                .getChildren().elementAt(1)
                                .getChildren().elementAt(1)
                                .getChildren().elementAt(1);
                        sb.append(process(tc.getStringText()) + ",");
                    }
                    sb.append(NEWLINE);
                }
            }
            sb.append(NEWLINE);
        }
    }

    /**
     * Finds the product picture of the page, copies it under a hashed file
     * name and returns that name.
     *
     * @param url local path of the mirrored page
     * @return the new image file name, or "" when the picture could not be
     *         determined (the error is printed)
     */
    private String get_pic_src(String url) {
        NodeFilter pic_filter = new AndFilter(new TagNameFilter("td"),
                new HasAttributeFilter("class", "series_sy_intro_pic"));
        String imgURL = "";
        String new_image_file = "";
        try {
            this.setParse(new Parser(url));
            this.getParse().setEncoding("gb2312");
            NodeList pic_nodes = this.getParse().parse(pic_filter);
            TableColumn tc = (TableColumn) pic_nodes.elementAt(0);
            ImageTag it = (ImageTag) (tc.childAt(1).getChildren().elementAt(0));
            imgURL = it.getImageURL();
            // The new file name is the hash of the image URL.
            new_image_file = StringUtils.encodePassword(imgURL, HASH_ALGORITHM)
                    + ".jpg";
            // Create the renamed picture from the copy in the mirror directory.
            copyImage(imgURL, new_image_file);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return new_image_file;
    }

    /**
     * Copies the mirrored image that belongs to {@code image_url} to
     * {@code f:/img/<new_image_file>}. The source is looked up below
     * {@code f:/} using the URL without its first seven characters
     * (presumably the "http://" prefix); when it does not exist,
     * {@code f:/noimage.jpg} is copied instead.
     *
     * @param image_url      URL of the image
     * @param new_image_file target file name
     * @return {@code true} on success, {@code false} on any failure (the
     *         error is printed)
     */
    protected boolean copyImage(String image_url, String new_image_file) {
        FileInputStream in1 = null;
        FileOutputStream out1 = null;
        try {
            String dirs = image_url.substring(7);
            File file_in = new File(new File("f:/"), dirs);
            if (!file_in.exists()) {
                file_in = new File("f:/noimage.jpg");
            }
            File file_out = new File(new File("f:/img/"), new_image_file);
            in1 = new FileInputStream(file_in);
            out1 = new FileOutputStream(file_out);
            byte[] bytes = new byte[1024];
            int c;
            while ((c = in1.read(bytes)) != -1) {
                out1.write(bytes, 0, c);
            }
            // Close the output here so that a failing close is reported as
            // a failed copy; the finally block covers the error paths.
            out1.close();
            return true;
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        } finally {
            // Both streams used to stay open whenever an exception occurred.
            closeQuietly(in1);
            closeQuietly(out1);
        }
    }

    /** Closes {@code resource} if it is not null; close errors are printed. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads the summary table and the list price of the page and, as a side
     * effect, sets {@code file_name} from the summary text.
     *
     * @param url local path of the mirrored page
     * @return the formatted summary, or "" when parsing failed with a
     *         {@link ParserException} (the error is printed)
     */
    private String get_general_data(String url) {
        StringBuffer general_data = new StringBuffer();
        try {
            this.setParse(new Parser(url));
            this.getParse().setEncoding("gb2312");
            NodeFilter general_data_filter = new AndFilter(
                    new TagNameFilter("table"),
                    new HasAttributeFilter("class", "series_sy_intro_txt"));
            NodeFilter price_filter = new AndFilter(
                    new TagNameFilter("td"),
                    new HasAttributeFilter("class", "f18b"));
            NodeList general_data_list = this.getParse().parse(general_data_filter);
            NodeList gen_tr = general_data_list.elementAt(0).getChildren();
            // NOTE(review): the visitor instance is discarded, so the text
            // it collects is never read; the text used below comes from
            // asString(). The call is kept to leave the behavior untouched.
            gen_tr.visitAllNodesWith(new TextExtractingVisitor());
            String str = process(gen_tr.asString().replaceAll("\\s", ""));
            // The output file name is derived from the summary text.
            file_name = get_title(str);
            this.getParse().reset();
            NodeList price_td = this.getParse().parse(price_filter);
            TableColumn tc = (TableColumn) price_td.elementAt(0);
            String price = tc.getStringText();
            str = "概要参数:" + str + NEWLINE + "厂家指导价: " + price + NEWLINE;
            general_data.append(str);
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return general_data.toString();
    }

    /**
     * Builds the output file name {@code <maker>_<series><model>} from the
     * labelled fields of the summary text.
     *
     * <p>NOTE(review): every label is assumed to be present and in this
     * order; a missing label makes {@code substring} throw. The offsets 5
     * and 3 skip the label text itself.
     */
    private String get_title(String str) {
        int makerStart = str.indexOf("生产厂商:") + 5;
        int seriesLabel = str.indexOf("所属:");
        int launchLabel = str.indexOf("上市时间:");
        int modelStart = str.indexOf("型 号:") + 5;
        int bodyLabel = str.indexOf("车 型:");
        String brand = str.substring(makerStart, seriesLabel);
        String type = str.substring(seriesLabel + 3, launchLabel);
        String name = str.substring(modelStart, bodyLabel);
        return brand + "_" + type + "" + name;
    }

    /**
     * Processes every page below the hard-coded mirror directory and writes
     * the results to {@code F:/job/}.
     */
    public static void main(String args[]) {
        CarExtractor ex = new CarExtractor();
        ex.setInputPath("F:/Workspaces/MyEclipse 7.1/heritrix/jobs/may2-20090501055518750/mirror/price.pcauto.com.cn");
        ex.setOutputPath("F:/job/");
        ex.get_general_data("F:/Workspaces/MyEclipse 7.1/heritrix/jobs/may2-20090501055518750/mirror/price.pcauto.com.cn/m157/index.html");
        ex.operator();
        // To process a single page, call ex.extract(<path to its index.html>).
    }
}
中间 会用到StringUtils这个类,它的主要作用是对图片名称的md5编码,形成独一无二的名字
package get_infor;

import java.io.UnsupportedEncodingException;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

/**
 * Small string helpers used by the extractors: trimming, making a string
 * usable as a file name, hex-encoded message digests and plain (non-regex)
 * substring replacement.
 */
public class StringUtils {

    /**
     * Returns {@code line} without leading and trailing whitespace.
     *
     * <p>{@link String#trim()} already removes every leading and trailing
     * space, so the extra loops that stripped spaces afterwards could never
     * run and were dropped.
     *
     * @param line the text to trim; must not be null
     * @return the trimmed text
     */
    public static String trim(String line) {
        return line.trim();
    }

    /**
     * Makes {@code name} usable as a file name: trims it, turns every
     * backslash and slash into an underscore and every asterisk into a
     * space.
     *
     * @param name the raw name; must not be null
     * @return the processed name
     */
    public static String filenameProcess(String name) {
        return name.trim()
                .replace('\\', '_')
                .replace('/', '_')
                .replace('*', ' ');
    }

    /**
     * Hashes {@code password} with the given digest algorithm and returns
     * the digest as a lower-case hex string.
     *
     * <p>The input is converted to bytes as UTF-8, so the result no longer
     * depends on the platform default charset; ASCII input hashes exactly as
     * before.
     *
     * @param password  the text to hash; must not be null
     * @param algorithm a {@link MessageDigest} algorithm name such as "md5"
     * @return the hex digest, or {@code password} itself when the algorithm
     *         is null or unknown
     */
    public static String encodePassword(String password, String algorithm) {
        byte[] unencodedPassword;
        try {
            unencodedPassword = password.getBytes("UTF-8");
        } catch (UnsupportedEncodingException e) {
            // Every Java platform is required to support UTF-8.
            throw new IllegalStateException("UTF-8 is not supported", e);
        }

        if (algorithm == null) {
            return password;
        }
        MessageDigest md;
        try {
            md = MessageDigest.getInstance(algorithm);
        } catch (NoSuchAlgorithmException e) {
            // Callers rely on getting the input back when hashing is not
            // possible.
            return password;
        }

        byte[] encodedPassword = md.digest(unencodedPassword);
        StringBuffer buf = new StringBuffer(encodedPassword.length * 2);
        for (int i = 0; i < encodedPassword.length; i++) {
            int unsigned = encodedPassword[i] & 0xff;
            if (unsigned < 0x10) {
                // Keep two hex digits per byte.
                buf.append('0');
            }
            buf.append(Integer.toHexString(unsigned));
        }
        return buf.toString();
    }

    /**
     * Replaces every occurrence of {@code oldString} in {@code line} with
     * {@code newString}; the arguments are plain text, not regular
     * expressions.
     *
     * @param line      the text to search, may be null
     * @param oldString the text to look for
     * @param newString the replacement
     * @return the text after replacement, {@code line} itself when nothing
     *         matched, or null when {@code line} is null
     */
    public static final String replace(String line, String oldString,
            String newString) {
        if (line == null) {
            return null;
        }
        int match = line.indexOf(oldString);
        if (match < 0) {
            return line;
        }
        int oldLength = oldString.length();
        StringBuffer buf = new StringBuffer(line.length());
        int copiedUpTo = 0;
        do {
            buf.append(line.substring(copiedUpTo, match)).append(newString);
            copiedUpTo = match + oldLength;
            match = line.indexOf(oldString, copiedUpTo);
            // "> 0" (not ">= 0") ends the loop for an empty oldString, which
            // would otherwise match at the same position forever.
        } while (match > 0);
        buf.append(line.substring(copiedUpTo));
        return buf.toString();
    }
}
运行CarExtractor的main方法,即可对inputPath目录下所有已下载的网页进行处理和析取
运行后会出现像下面这种格式的5000多个txt文件
毕竟我也是菜鸟,在设计过程中也遇到了很多困难,但是都一一克服,我说的有什么不对的地方,欢迎大家指正。谢谢
以上的3个类在设计过程中参考了《lucene+heritrix 开发自己的搜索引擎》