最近在开发http://www.sokers.com的时候需要抽取网页正文,在网上也看了很多算法,但效果感觉都不好,有的根本打不开无法看到效果,于是自己就试着写了一个,效果还不错,支持图片和Flash,不仅仅能抽取文字。
方法就是使用打分机制,把正文文字和标签的比例、标点符号、换行等因素累加起来,打分最高的就是正文,当然肯定有识别不出来的,这个是任何算法都无法避免的。
感兴趣的同学可以看看,代码很短:
/**
 * Extract Content
 *
 * Namespace object for the page-content extractor.
 *   iframe - hidden iframe element created by extractor.init()
 *   iDoc   - document of that iframe, used as a scratch DOM for parsing
 */
var extractor = {iframe: null, iDoc: null};

// Punctuation marks (ASCII and CJK) counted when scoring a candidate block.
// The full-width comma, exclamation mark, question mark and colon replace
// duplicated ASCII entries in the original class; every character that was
// matched before is still matched.
extractor.markExp = /[,.?:;\-'!"()\[\]{}，。‘！“”？：、]/g;

// Footer markers: a copyright sign (literal character or the &copy entity),
// common boilerplate phrases, or a Chinese ICP registration number.
// The copyright alternative must not be optional: an optional one matches the
// empty string and therefore classifies every string as a footer.
// No "g" flag: the pattern is only used for yes/no checks, and a global
// regex would carry lastIndex over from one checked string to the next.
extractor.footExp = /(©|&copy;?)|(All Rights Reserved)|(Powered By)|(备[0-9]{5,}号)/i;
/**
 * Extracts the main content fragment from the markup of a whole page.
 *
 * Only the <body>...</body> part is used. Iframes, <link> tags, comments,
 * styles and scripts are stripped; <embed>, <img> and <object> markup is
 * masked with the $1$ / $2$ placeholders (which extractor.process() turns
 * back into angle brackets) so that it survives as plain text while the
 * rest is parsed in the scratch iframe document.
 *
 * @param {string} html Markup of the complete page.
 * @returns {string} Result of extractor.process() on the scratch body, or ""
 *     when the extractor is not initialised, html is not a string, or no
 *     <body> element is found.
 */
extractor.extract = function(html)
{
    if(!extractor.iDoc || !extractor.iDoc.body || typeof html != "string")
    {
        return "";
    }
    // Line breaks are removed first because "." in the patterns below does
    // not match them.
    var arr = html.replace(/\r|\n/g, "").match(/<body(.*)<\/body>/i);
    if(!arr || !arr.length)
    {
        return "";
    }
    // Masks every angle bracket of a matched span. Masking only the first
    // pair would leave nested tags and the closing </object> of an <object>
    // block to the HTML parser, which drops the stray end tag.
    var mask = function(data)
    {
        return data.split("<").join("$1$").split(">").join("$2$");
    };
    html = arr[0].replace(/<iframe.*?<\/iframe>/ig, "");
    html = html.replace(/<link.*?\/?>/ig, "");
    html = html.replace(/<!--.*?-->/g, "");
    html = html.replace(/<style.*?<\/style>/ig, "");
    html = html.replace(/<script.*?<\/script>/ig, "");
    html = html.replace(/<embed.*?\/?>/ig, mask);
    html = html.replace(/<img.*?\/?>/ig, mask);
    html = html.replace(/<object.*?<\/object>/ig, mask);
    // NOTE(review): the remaining markup is parsed into a live (hidden)
    // document; inline event-handler attributes are not removed here.
    // Confirm callers only pass trusted pages, or sanitise beforehand.
    extractor.iDoc.body.innerHTML = html;
    return extractor.process(extractor.iDoc.body);
};
/**
 * Measures how text-heavy a markup fragment is.
 *
 * @param {string} itemStr Markup of a candidate block.
 * @returns {Array} Two numbers: [ratio, count].
 *     ratio - characters of non-link text per tag, capped at 6.
 *     count - number of non-whitespace characters in that text.
 */
extractor.getWordRatio = function(itemStr)
{
    var tags = itemStr.match(/<[^>]+>/g);
    var tagsCount = tags ? tags.length : 0;
    // Link text does not count as content. "\b" keeps the pattern from
    // also swallowing <abbr>, <article>, <address> and similar tags.
    var words = itemStr.replace(/<a\b.*?<\/a>/ig, "");
    words = words.replace(/<[^>]+>/g, "");
    // A tag-free fragment is scored as if it had one tag. Dividing by zero
    // gave NaN for empty text and the maximum score for 1-6 characters.
    var ratio = Math.min(6, words.length / Math.max(tagsCount, 1));
    var visible = words.match(/\S/g);
    return [ratio, (visible ? visible.length : 0)];
};
/**
 * Punctuation score of a markup fragment.
 *
 * @param {string} itemStr Markup of a candidate block.
 * @returns {number} 0 when no character matches extractor.markExp,
 *     1.5 for one to five matches, 3 for more than five.
 */
extractor.getMarkRatio = function(itemStr)
{
    var marks = itemStr.match(extractor.markExp);
    if(!marks || marks.length == 0)
    {
        return 0;
    }
    if(marks.length > 5)
    {
        return 3;
    }
    return 1.5;
};
/**
 * Line-break score of a markup fragment.
 *
 * @param {string} itemStr Markup of a candidate block.
 * @returns {number} 0 when the fragment has no <br> tag, 1 for one to five
 *     of them, 2 for more than five.
 */
extractor.getLineRatio = function(itemStr)
{
    // match() is required here: test() only yields a boolean, which has no
    // length, so the "more than five" tier could never be reached.
    var items = itemStr.match(/<br ?\/?>/ig);
    var length = items ? items.length : 0;
    if(length == 0)
    {
        return 0;
    }
    return length > 5 ? 2 : 1;
};
/**
 * Tells whether a markup fragment contains a footer marker.
 *
 * @param {string} itemStr Markup of a candidate block.
 * @returns {boolean} true when extractor.footExp matches anywhere in itemStr.
 */
extractor.isFooter = function(itemStr)
{
    // search() always scans from the start of the string and ignores the
    // regex's lastIndex, so the result does not depend on the previous
    // string that was checked (unlike test() on a global regex).
    return itemStr.search(extractor.footExp) != -1;
};
/**
 * Scores the candidate containers under `body` and returns the markup of
 * the best one.
 *
 * Candidates are all <div> elements; when there are none, all <table>
 * elements; when there are none either, all <p> elements. A candidate is
 * skipped when its innerHTML is shorter than 16 characters, when it holds
 * more than 8 <div> or more than 2 <input> descendants, or when it looks
 * like a footer. The score is the sum of the word, line-break and
 * punctuation ratios; a candidate wins only if its score is at least the
 * best so far and its text is longer than the best so far.
 *
 * @param {Element} body Root element to search.
 * @returns {string} innerHTML of the winning candidate with the $1$ / $2$
 *     placeholders turned back into angle brackets, or "" when nothing
 *     qualifies.
 */
extractor.process = function(body)
{
    // getElementsByTagName always returns a collection, even an empty one,
    // so the fallback has to look at its length rather than its truthiness.
    var candidateTags = ["div", "table", "p"];
    var items = null;
    for(var t = 0; t < candidateTags.length; t++)
    {
        var found = body.getElementsByTagName(candidateTags[t]);
        if(found && found.length > 0)
        {
            items = found;
            break;
        }
    }
    if(!items)
    {
        return "";
    }
    var mostItemStr = null, mostRatio = 0, mostLength = 0;
    for(var i = 0; i < items.length; i++)
    {
        var item = items[i];
        var itemStr = item.innerHTML;
        if(itemStr.length < 16)
        {
            continue;
        }
        var div = item.getElementsByTagName("div");
        if(div && div.length > 8)
        {
            continue;
        }
        var input = item.getElementsByTagName("input");
        if(input && input.length > 2)
        {
            continue;
        }
        // Checked before scoring so that footers cost no scoring work.
        if(extractor.isFooter(itemStr))
        {
            continue;
        }
        var wordRatio = extractor.getWordRatio(itemStr);
        var lineRatio = extractor.getLineRatio(itemStr);
        var markRatio = extractor.getMarkRatio(itemStr);
        var ratio = wordRatio[0] + lineRatio + markRatio;
        var length = wordRatio[1];
        if(ratio >= mostRatio && length > mostLength)
        {
            mostRatio = ratio;
            mostLength = length;
            mostItemStr = itemStr;
        }
    }
    if(mostItemStr)
    {
        return mostItemStr.replace(/\$1\$/g, "<").replace(/\$2\$/g, ">");
    }
    return "";
};
/**
 * Creates the hidden scratch iframe once and stores it, together with its
 * document, on the extractor object. Later calls do nothing.
 */
extractor.init = function()
{
    if(!extractor.iframe)
    {
        var frame = document.createElement("iframe");
        extractor.iframe = frame;
        frame.src = "about:blank";
        frame.style.display = "none";
        document.body.appendChild(frame);
        // Prefer the element's own "document" property and fall back to
        // contentDocument where that property is missing.
        extractor.iDoc = frame.document || frame.contentDocument;
    }
};
// Initialise right away, but only when the page body already exists.
(function(){
    if(document.body)
    {
        extractor.init();
    }
})();