具体的方案参见博客
基于概率的网页正文页抽取方案
代码实现如下:
maven依赖:
<dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> <version>4.4</version> <scope>test</scope> </dependency> <dependency> <groupId>commons-lang</groupId> <artifactId>commons-lang</artifactId> <version>2.6</version> <scope>compile</scope> </dependency> <dependency> <groupId>commons-codec</groupId> <artifactId>commons-codec</artifactId> <version>1.2</version> </dependency> <dependency> <groupId>net.sourceforge.nekohtml</groupId> <artifactId>nekohtml</artifactId> <version>1.9.10</version> </dependency> <dependency> <groupId>dom4j</groupId> <artifactId>dom4j</artifactId> <version>1.6.1</version> </dependency> <dependency> <groupId>jaxen</groupId> <artifactId>jaxen</artifactId> <version>1.1.4</version> </dependency>
公共类:
import java.util.HashSet;
import java.util.Set;

import org.apache.commons.codec.digest.DigestUtils;

/**
 * Value object that accumulates training statistics for one node path.
 *
 * <p>For every text sample recorded it stores the MD5 hex digest of the text in
 * a set, and counts in {@code sameNum} how many samples had a digest that was
 * already present in that set.
 *
 * @date 2013-10-21
 */
public class ItemTrainVo {

    /** The xpath most recently recorded for this entry. */
    private String xpath;

    /** MD5 hex digests of the distinct texts recorded so far. */
    private Set<String> md5Texts = new HashSet<String>();

    /** Number of recorded texts whose digest was already in {@link #md5Texts}. */
    private long sameNum = 0L;

    /** Instances are created through {@link #getInstance(String, String)} only. */
    private ItemTrainVo() {
        super();
    }

    /**
     * Records one more text sample on an existing instance: overwrites its xpath,
     * adds the MD5 digest of {@code text}, and increments the duplicate counter
     * when that digest had been recorded before.
     *
     * @param insance the instance to update
     * @param xpath   the xpath to store on the instance
     * @param text    the text value of the parsed DOM node
     * @date 2013-10-21
     */
    public static void updateInstance(ItemTrainVo insance, String xpath, String text) {
        insance.setXpath(xpath);
        String md5Text = DigestUtils.md5Hex(text);
        // Set.add returns false when the digest is already present, i.e. a repeated text.
        if (!insance.getMd5Texts().add(md5Text)) {
            insance.setSameNum(insance.getSameNum() + 1L);
        }
    }

    /**
     * Creates a new instance that already holds its first text sample.
     *
     * <p>A fresh instance has no digests yet, so the first sample can never be a
     * duplicate; the shared update routine therefore only adds the digest here.
     *
     * @param xpath the xpath to store on the instance
     * @param text  the text value of the parsed DOM node
     * @return the newly created instance
     * @date 2013-10-21
     */
    public static ItemTrainVo getInstance(String xpath, String text) {
        ItemTrainVo created = new ItemTrainVo();
        updateInstance(created, xpath, text);
        return created;
    }

    public String getXpath() {
        return xpath;
    }

    public void setXpath(String xpath) {
        this.xpath = xpath;
    }

    public Set<String> getMd5Texts() {
        return md5Texts;
    }

    public void setMd5Texts(Set<String> md5Texts) {
        this.md5Texts = md5Texts;
    }

    public long getSameNum() {
        return sameNum;
    }

    public void setSameNum(long sameNum) {
        this.sameNum = sameNum;
    }

    @Override
    public String toString() {
        return xpath + "=>" + md5Texts;
    }
}
抽象训练器。如果以后需要实现基于 CSS path 的训练器,可以继承此类:
import java.math.BigDecimal;
import java.math.RoundingMode;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.Map.Entry;

import com.panguso.techrd.parser.trainer.vo.ItemTrainVo;

/**
 * Base class for probability-based template trainers. Subclasses decide how the
 * training set is parsed and which kind of node path is used.
 */
public abstract class BaseTrainer {

    /**
     * Filters the intermediate training data by a probability threshold.
     *
     * <p>For each entry the ratio {@code distinct / (distinct + sameNum)} is
     * computed, rounded to three decimals (half-down). Entries whose ratio is
     * below {@code q} are removed from {@code map}; for the remaining entries the
     * stored xpath is collected into the returned set.
     *
     * @param map intermediate training data; entries below the threshold are
     *            removed from it in place
     * @param q   probability threshold
     * @return the xpaths of all entries that met the threshold
     * @throws Exception declared for subclasses; not thrown by this implementation
     *                   for well-formed entries
     * @date 2013-10-24
     */
    protected Set<String> cleanResult(Map<String, ItemTrainVo> map, double q) throws Exception {
        Set<String> xpaths = new HashSet<String>();
        Iterator<Entry<String, ItemTrainVo>> iterator = map.entrySet().iterator();
        while (iterator.hasNext()) {
            ItemTrainVo itv = iterator.next().getValue();
            long distinct = itv.getMd5Texts().size();
            long total = distinct + itv.getSameNum();
            if (total == 0L) {
                // No samples at all: the ratio is undefined (dividing would throw
                // ArithmeticException), so the entry cannot qualify.
                iterator.remove();
                continue;
            }
            BigDecimal ratio = new BigDecimal(distinct)
                    .divide(new BigDecimal(total), 3, RoundingMode.HALF_DOWN);
            if (ratio.doubleValue() < q) {
                iterator.remove();
            } else {
                xpaths.add(itv.getXpath());
            }
        }
        return xpaths;
    }

    /**
     * Walks the training set and fills {@code map} with one entry per node path
     * (css path or xpath, depending on the implementation). Each value holds the
     * relative path, the digests of all texts seen, and the count of repeated
     * texts, which together form the basis of the probability computation.
     *
     * @param path root directory of the training set
     * @param map  an empty map to be filled
     * @throws Exception if parsing fails
     * @date 2013-10-24
     */
    protected abstract void parseInfo(String path, Map<String, ItemTrainVo> map) throws Exception;

    /**
     * Public entry point: produces the final training result, keyed by node path
     * with a sample of the matching markup as value.
     *
     * @param fileDirPath root directory of the training set
     * @return the training result
     * @date 2013-10-24
     */
    public abstract Map<String, String> train(String fileDirPath);

    /**
     * Returns the tags that should be extracted in addition to text-only nodes,
     * e.g. {@code <img>}.
     *
     * @return the tags to extract
     * @date 2013-10-24
     */
    protected abstract Set<String> getUsefullTags();

    /**
     * Returns the tags to be skipped, e.g. {@code <head>}; when the trainer meets
     * one of them it does not descend into that subtree.
     *
     * @return the tags to skip
     * @date 2013-10-24
     */
    protected abstract Set<String> getUnUsefullTags();
}
基于xpath训练器的实现:
import java.io.File; import java.io.FileInputStream; import java.math.BigDecimal; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.Set; import org.apache.commons.lang.StringUtils; import org.cyberneko.html.parsers.DOMParser; import org.dom4j.Attribute; import org.dom4j.Document; import org.dom4j.Element; import org.dom4j.io.DOMReader; import org.xml.sax.InputSource; import com.panguso.techrd.parser.trainer.vo.ItemTrainVo; /** * 基于概率的xpath训练器实现 * * @date 2013-10-24 */ public class XPathTrainer extends BaseTrainer { private Set<String> usefullTags = Collections.emptySet(); private Set<String> unUsefullTags = Collections.emptySet(); public XPathTrainer(Set<String> usefullTags, Set<String> unUsefullTags) { super(); this.unUsefullTags = unUsefullTags; this.usefullTags = usefullTags; } @Override protected Set<String> getUnUsefullTags() { return unUsefullTags; } @Override protected Set<String> getUsefullTags() { return usefullTags; } @Override public Map<String, String> train(String fileDirPath) { Map<String, ItemTrainVo> map = new HashMap<String, ItemTrainVo>(); Map<String, String> result = new HashMap<String, String>(); try { parseInfo(fileDirPath, map); Set<String> xpaths = cleanResult(map, computQ(fileDirPath, true)); getResult(fileDirPath, result, xpaths); } catch (Exception e) { e.printStackTrace(); } return result; } private void getResult(String path, Map<String, String> result, Set<String> xpaths) throws Exception { File file = new File(path); if (!file.isDirectory()) { return; } File[] files = file.listFiles(); for (File file2 : files) { if (file2.isFile()) { DOMParser parser = new DOMParser(); parser.parse(new InputSource(new FileInputStream(file2))); DOMReader domReader = new DOMReader(); Document document = domReader.read(parser.getDocument()); Element root = document.getRootElement(); Iterator<String> xpathIterator = xpaths.iterator(); while (xpathIterator.hasNext()) { String xpath = 
xpathIterator.next(); Element node = (Element) root.selectSingleNode(xpath); result.put(xpath, node.asXML()); } return; } } } private double computQ(String path, boolean defaultVal) { if (defaultVal) { return 1.0d; } File file = new File(path); int num = 0; File[] files = file.listFiles(); for (File file2 : files) { if (file2.isFile()) { num++; } } BigDecimal dif = new BigDecimal(num - 1); BigDecimal all = new BigDecimal(num); BigDecimal result = dif.divide(all, 3, BigDecimal.ROUND_HALF_DOWN); return result.doubleValue(); } @Override protected void parseInfo(String path, Map<String, ItemTrainVo> map) throws Exception { File file = new File(path); if (!file.isDirectory()) { return; } File[] files = file.listFiles(); for (File file2 : files) { if (file2.isDirectory()) { continue; } DOMParser parser = new DOMParser(); parser.parse(new InputSource(new FileInputStream(file2))); DOMReader domReader = new DOMReader(); Document document = domReader.read(parser.getDocument()); Element root = document.getRootElement(); dom2PathMap(root, map); } } @SuppressWarnings("unchecked") private void dom2PathMap(Element root, Map<String, ItemTrainVo> map) { if (this.getUnUsefullTags().contains(root.getName())) { return; } if (root == null || root.isTextOnly() || this.getUsefullTags().contains(root.getName())) { String text = root.getText(); String uniqXpath = root.getUniquePath(); String xpath = root.getPath(); if (StringUtils.isEmpty(text)) { Iterator<Attribute> iterator = root.attributeIterator(); while (iterator.hasNext()) { Attribute attr = iterator.next(); text = attr.getName() + "." + attr.getValue() + text; } } if (map.containsKey(uniqXpath)) { ItemTrainVo.updateInstance(map.get(uniqXpath), xpath, text); } else { map.put(uniqXpath, ItemTrainVo.getInstance(xpath, text)); } return; } Iterator<Element> iterator = root.elementIterator(); while (iterator.hasNext()) { Element el = iterator.next(); dom2PathMap(el, map); } } }
测试代码:
import java.util.HashSet;
import java.util.Set;

import org.junit.Test;

import com.panguso.techrd.parser.trainer.service.XPathTrainer;

/**
 * Smoke test for {@link XPathTrainer}: trains on a sample directory and prints
 * the resulting xpath-to-markup map.
 */
public class XPathTrainerTest {

    /** Runs the trainer against the sample training set and prints the result. */
    @Test
    public final void testTrain() {
        // Tags whose subtrees the trainer skips.
        Set<String> skipped = new HashSet<String>();
        skipped.add("HEAD");
        skipped.add("SCRIPT");

        // Tags extracted in addition to text-only nodes.
        Set<String> extracted = new HashSet<String>();
        extracted.add("IMG");

        XPathTrainer trainer = new XPathTrainer(extracted, skipped);
        String trainingDir = "/dom/163/shehui";
        System.out.println(trainer.train(trainingDir));
    }
}
测试集如附件: