本人愚钝,只看了 PHP 版本,感觉很不错。
PHP 版只有两个文件:一个类库文件,一个 DEMO 文件。
<?php
# Script - class.textExtract.php
/**
 * textExtract - text extraction class
 * Created on 2010-08-10
 * author: Wenfeng Xuan
 * Email: [email protected]
 * Blog: http://hi.baidu.com/xwf_like
 *
 * Extracts the main text of a web page using a line-block heuristic: after
 * all markup is stripped, the page is cut into lines, every run of
 * $blkSize consecutive lines forms a "block", and the longest continuous
 * stretch of non-empty blocks is taken to be the page's main text.
 */
class textExtract
{
    ///////////////////////////////////
    // MEMBERS
    ///////////////////////////////////

    /**
     * record the web page's url
     * @var string
     */
    public $url = '';

    /**
     * record the web page's source code
     * @var string
     */
    public $rawPageCode = '';

    /**
     * record the text after preprocessing (one whitespace-free string per line)
     * @var array
     */
    public $textLines = array();

    /**
     * record the length (in bytes) of each block
     * @var array
     */
    public $blksLen = array();

    /**
     * record the final extracted text
     * @var string
     */
    public $text = '';

    /**
     * set the size of each block ( regards how many single lines as a block )
     * it is the only parameter of this method
     * @var int
     */
    public $blkSize;

    /**
     * record whether the web page's encoding is 'gb*'
     * (stays unset until procEncoding() has run)
     * @var bool
     */
    public $isGB;

    ///////////////////////////////////
    // METHODS
    ///////////////////////////////////

    /**
     * Set the value of relevant members
     * @param string $_url
     * @param int $_blkSize number of lines per block; values below 1 are raised to 1
     * @return void
     */
    public function __construct( $_url, $_blkSize = 3 )
    {
        $this->url = $_url;
        // A block must contain at least one line, otherwise the sliding
        // window in calBlocksLen() is meaningless.
        $this->blkSize = max( 1, (int) $_blkSize );
    }

    /**
     * Get the web page's source code.
     * On failure (empty url or unreadable resource) $rawPageCode is set to
     * an empty string instead of false, so the later steps always work on a
     * string.
     * @return void
     */
    public function getPageCode()
    {
        if( !is_string( $this->url ) || $this->url === '' )
        {
            $this->rawPageCode = '';
            return;
        }

        $code = file_get_contents( $this->url );
        $this->rawPageCode = ( $code === false ) ? '' : $code;
    }

    /**
     * Detect the declared encoding of the web page's source code and set the
     * value of member $isGB for correct display.
     *
     * Both the classic `content="text/html; charset=gb2312"` form and the
     * quoted `charset="gbk"` form are recognised. A page without any charset
     * declaration is treated as not being GB encoded.
     *
     * For backward compatibility, when the page is not GB encoded the charset
     * declaration inside $rawPageCode is still rewritten to `charset=GBK"`
     * exactly as before.
     * @return void
     */
    public function procEncoding()
    {
        $this->rawPageCode = (string) $this->rawPageCode;

        $charset = '';
        if( preg_match( '/charset\s*=\s*["\']?\s*([a-z0-9_\-]+)/i', $this->rawPageCode, $matches ) )
        {
            $charset = $matches[1];
        }

        $this->isGB = ( strtoupper( substr( $charset, 0, 2 ) ) === 'GB' );

        if( !$this->isGB )
        {
            // Legacy rewrite, kept unchanged so $rawPageCode looks the same to existing callers.
            $pattern = '/charset(\s*?)=(\s*?)(.*?)"/i';
            $replacement = 'charset=GBK"';
            $rewritten = preg_replace( $pattern, $replacement, $this->rawPageCode );
            if( $rewritten !== null )
            {
                $this->rawPageCode = $rewritten;
            }
        }
    }

    /**
     * Preprocess the web page's source code: strip everything that is not
     * visible text, in this order.
     * @return string the remaining text, line breaks preserved
     */
    public function preProcess()
    {
        $patterns = array(
            '/<!DOCTYPE.*?>/si',              // 1. DTD information
            '/<!--.*?-->/s',                  // 2. HTML comment
            '/<script.*?>.*?<\/script>/si',   // 3. Java Script
            '/<style.*?>.*?<\/style>/si',     // 4. CSS
            '/<.*?>/s',                       // 5. HTML TAGs
            '/&.{1,5};|&#.{1,5};/',           // 6. some special characters
        );

        $content = (string) $this->rawPageCode;
        foreach( $patterns as $pattern )
        {
            $stripped = preg_replace( $pattern, '', $content );
            // preg_replace() returns null on a PCRE error (e.g. backtrack
            // limit on a huge page); keep what we have instead of losing
            // the whole document.
            if( $stripped !== null )
            {
                $content = $stripped;
            }
        }

        return $content;
    }

    /**
     * Split the preprocessed text into lines on "\r\n", "\r" or "\n",
     * remove all whitespace inside each line and append the results to
     * $textLines.
     * @param string $rawText
     * @return void
     */
    public function getTextLines( $rawText )
    {
        $rawText = (string) $rawText;

        // Split on real line breaks only; a literal backslash followed by
        // "n" inside the text must not be treated as a separator.
        $lines = preg_split( '/\r\n|\r|\n/', $rawText );
        if( $lines === false )
        {
            $lines = array( $rawText );
        }

        foreach( $lines as $line )
        {
            // remove the blanks in each line
            $this->textLines[] = (string) preg_replace( '/\s+/s', '', $line );
        }
    }

    /**
     * Calculate the blocks' length.
     *
     * Block i is the total byte length of lines i .. i + blkSize - 1.
     * $blksLen is rebuilt from scratch on every call. When there are fewer
     * lines than the block size, a single block covering all available
     * lines is produced.
     * @return void
     */
    public function calBlocksLen()
    {
        $this->blksLen = array();
        $size = max( 1, (int) $this->blkSize );

        $lengths = array();
        foreach( $this->textLines as $line )
        {
            $lengths[] = strlen( (string) $line );
        }
        $textLineNum = count( $lengths );

        // calculate the first block's length
        $blkLen = array_sum( array_slice( $lengths, 0, $size ) );
        $this->blksLen[] = $blkLen;

        // calculate the other blocks' length with a sliding window:
        // add the line entering the window, drop the line leaving it.
        // "<=" so that the final window (ending on the last line) is included.
        for( $i = 1; $i <= ( $textLineNum - $size ); $i++ )
        {
            $blkLen += $lengths[$i - 1 + $size] - $lengths[$i - 1];
            $this->blksLen[] = $blkLen;
        }
    }

    /**
     * Extract the text from the web page's source code
     * according to the simple idea:
     * [the text should be the longest continuous content
     * in the web page]
     *
     * Runs the whole pipeline (download, encoding detection, preprocessing,
     * line splitting, block lengths) and returns the longest run of
     * consecutive non-empty blocks. Each non-empty line of the result is
     * followed by '<br />'. State from a previous call is discarded first.
     * @return string
     */
    public function getPlainText()
    {
        // start from a clean state so the method can be called repeatedly
        $this->textLines = array();
        $this->blksLen = array();
        $this->text = '';

        $this->getPageCode();
        $this->procEncoding();
        $this->getTextLines( $this->preProcess() );
        $this->calBlocksLen();

        $blkNum = count( $this->blksLen );
        $maxTextLen = 0;
        $i = 0;

        while( $i < $blkNum )
        {
            // skip the empty blocks in front of the next candidate
            while( ( $i < $blkNum ) && ( $this->blksLen[$i] == 0 ) )
            {
                $i++;
            }
            if( $i >= $blkNum )
            {
                break;
            }

            // collect the lines of one continuous run of non-empty blocks
            $curTextLen = 0;
            $portion = '';
            while( ( $i < $blkNum ) && ( $this->blksLen[$i] != 0 ) )
            {
                $line = isset( $this->textLines[$i] ) ? $this->textLines[$i] : '';
                if( $line !== '' )
                {
                    $portion .= $line . '<br />';
                    $curTextLen += strlen( $line );
                }
                $i++;
            }

            if( $curTextLen > $maxTextLen )
            {
                $this->text = $portion;
                $maxTextLen = $curTextLen;
            }
        }

        return $this->text;
    }
}
?>
demo.php。注意:示例代码最后都会把提取后的文本用 iconv 转为 UTF-8 输出,并且类库会自行在每个段落后添加 <br />,我们可以根据自己的需要来设置、修改。提取正文时,记得用 explode 分割并删掉最后一个元素;后面的内容再做减噪处理。
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<title>网页正文提取演示系统</title>
<style type="text/css">
#allcontent {
	font-family: Georgia, "Times New Roman", Times, serif;
	width: 800px;
}
#title {
	font-size: 30px;
}
.des {
	font-size: 20px;
	font-weight: bold;
	color: blue;
}
#text, #lf {
	text-align: left;
}
#gap {
	border-style: outset;
	border-top-width: 10px;
	border-top-color: gray;
}
#copyright {
	margin-top: 60px;
	clear: both;
	font-size: 12px;
}
</style>
</head>

<body>
<center>
<div id="allcontent">
	<div id="title"><p>网页正文提取演示</p></div>
	<?php
		// Demo front end: shows a URL form and, after submission, the main text
		// extracted from that page by the textExtract class.

		// The same form is rendered before and after an extraction.
		$formHtml = '<form method="post" action="demo.php">
					<span class="des">网址:</span><input type="text" name="url" size="60" />
					<input type="submit" name="submit" value="提取" />
				  </form>';

		if(isset($_POST['url']))
		{
			// The URL is untrusted input that ends up in file_get_contents():
			// accept only http/https so local paths and other stream wrappers
			// (file://, php://, ...) cannot be read through this page.
			$url = is_string( $_POST['url'] ) ? trim( $_POST['url'] ) : '';
			$scheme = strtolower( (string) parse_url( $url, PHP_URL_SCHEME ) );

			if( $scheme === 'http' || $scheme === 'https' )
			{
				set_time_limit( 60 * 10 );
				require_once( 'class.textExtract.php' );
				$iTextExtractor = new textExtract( $url );
				$text = $iTextExtractor->getPlainText();
				// The page is served as UTF-8, so GB-encoded text must be converted.
				if( $iTextExtractor->isGB )
					$text = iconv( 'GBK', 'UTF-8//IGNORE', $text );

				// The extracted text comes from a remote page: escape it before
				// output, then restore the '<br />' separators added by the class.
				$text = htmlspecialchars( (string) $text, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8' );
				$text = str_replace( '&lt;br /&gt;', '<br />', $text );
			}
			else
			{
				$text = '请输入以 http:// 或 https:// 开头的网址。';
			}

			echo $formHtml;
			echo '<p id="gap"></p>';
			echo '<p id="lf" class="des">正文:</p>';
			echo '<p id="text">' . $text . '</p>';
		}
		else
		{
			echo $formHtml;
		}
	?>
</div>
</center>
</body>
</html>
着实敬佩这些做算法的家伙!自己琢磨了几天,搜索了一下,原来早在 2010 年就有人做出来了(LAST CHANGE: 18:48:11, 2010年8月11日)。