等标签替换成特殊占位符[p][br]等,因为最终的正文需要保留段落和回车换行等格式。这一步用正则实现。
6、把最后剩下的文本块的html标签去掉,我用正则过滤的。
7、把[p]替换成回车换行加两个空格,把[br]替换成回车换行,这一步也用正则实现。至此,正文提取完毕。
主要代码:
public class GetMainContentHelper
{
// Matches one character in the range U+4E00..U+9FA5. Built once and reused because
// the comparer below is called many times when it drives a sort.
private static readonly Regex ChineseCharRegex = new Regex("[\u4e00-\u9fa5]");

/// <summary>
/// Compares two text fragments by the proportion of Chinese characters they contain.
/// </summary>
/// <param name="x">First fragment; may be null.</param>
/// <param name="y">Second fragment; may be null.</param>
/// <returns>
/// A negative value when <paramref name="x"/> orders before <paramref name="y"/>, zero when they
/// are equivalent, and a positive value otherwise. Null orders before any non-null string, a lower
/// Chinese-character ratio orders before a higher one, and equal ratios fall back to
/// <see cref="string.CompareTo(string)"/>.
/// </returns>
public static int CompareDinosByChineseLength(string x, string y)
{
    if (x == null)
    {
        return y == null ? 0 : -1;
    }

    if (y == null)
    {
        return 1;
    }

    int retval = ChineseRatio(x).CompareTo(ChineseRatio(y));
    if (retval != 0)
    {
        return retval;
    }

    // Equal ratios: fall back to the plain string ordering so the result is deterministic.
    return x.CompareTo(y);
}

// Fraction of characters in the text that match ChineseCharRegex.
private static float ChineseRatio(string text)
{
    if (text.Length == 0)
    {
        // Avoid 0/0 (NaN): an empty fragment simply contains no Chinese text.
        return 0f;
    }

    return (float)ChineseCharRegex.Matches(text).Count / (float)text.Length;
}
/// <summary>
/// Collects every element with the given tag name from an HTML source string, including
/// nested ones. Typically used for container elements such as div or td.
/// </summary>
/// <param name="input">HTML source to scan. Null or empty yields an empty list.</param>
/// <param name="tag">Tag name without angle brackets, for example "div".</param>
/// <returns>
/// The text of each matched element, from its opening tag through its closing tag. Elements are
/// added when their closing tag is reached, so an inner element appears before the element
/// that contains it.
/// </returns>
/// <remarks>
/// Matching is a case-sensitive prefix test on the tag text, so "div" also matches a tag whose
/// name merely begins with "div". Closing tags with no pending opening tag are ignored.
/// </remarks>
public static List<string> GetTags(string input, string tag)
{
    List<string> taglist = new List<string>();
    if (string.IsNullOrEmpty(input))
    {
        return taglist;
    }

    string openPrefix = "<" + tag;
    // The closing prefix must include the slash; without it the test could never match
    // text that begins with '<' and no element would ever be returned.
    string closePrefix = "</" + tag;

    // Start offsets of opening tags still waiting for their closing tag.
    Stack<int> openTagStarts = new Stack<int>();
    // Number of '<' characters seen that have not yet been paired with a '>'.
    int pendingLessThan = 0;
    // Offset of the most recent '<'.
    int lastLessThanPos = 0;

    for (int i = 0; i < input.Length; i++)
    {
        char current = input[i];
        if (current == '>' && pendingLessThan > 0)
        {
            pendingLessThan--;
            string markup = input.Substring(lastLessThanPos, i - lastLessThanPos + 1);
            if (markup.StartsWith(openPrefix, StringComparison.Ordinal))
            {
                openTagStarts.Push(lastLessThanPos);
            }
            else if (markup.StartsWith(closePrefix, StringComparison.Ordinal) && openTagStarts.Count > 0)
            {
                int elementStart = openTagStarts.Pop();
                taglist.Add(input.Substring(elementStart, i - elementStart + 1));
            }
        }
        else if (current == '<')
        {
            pendingLessThan++;
            lastLessThanPos = i;
        }
    }

    return taglist;
}
// Captures the charset named inside a meta tag, covering both the
// http-equiv form (content="text/html; charset=gb2312") and the short form (charset="utf-8").
private static readonly Regex MetaCharsetRegex = new Regex(
    @"<meta[^>]*?charset\s*=\s*[""']?\s*([\w-]+)",
    RegexOptions.Multiline | RegexOptions.IgnoreCase);

/// <summary>
/// Downloads the page at the given URL and returns its source as text, choosing the text
/// encoding from the HTTP response and, when it disagrees, from the page's own meta charset.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>
/// The decoded page source. An empty string is returned when the response status is not OK or
/// when an exception occurs while requesting or reading the response; such exceptions are
/// written to <see cref="Trace"/> rather than propagated.
/// </returns>
public static string getDataFromUrl(string url)
{
    string str = string.Empty;
    HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(url);
    // HTTP request settings.
    request.AllowAutoRedirect = true;
    request.AllowWriteStreamBuffering = true;
    request.Referer = "";
    request.Timeout = 10 * 1000;
    request.UserAgent = "";
    HttpWebResponse response = null;
    try
    {
        response = (HttpWebResponse)request.GetResponse();
        if (response.StatusCode != HttpStatusCode.OK)
        {
            return str;
        }

        // First guess: the charset announced by the HTTP response.
        string characterSet = response.CharacterSet;
        Encoding encode;
        if (!string.IsNullOrEmpty(characterSet))
        {
            if (characterSet == "ISO-8859-1")
            {
                // NOTE(review): presumably ISO-8859-1 is what the response reports when the server
                // sent no charset, and the target pages are Chinese - confirm before reusing elsewhere.
                characterSet = "gb2312";
            }
            encode = Encoding.GetEncoding(characterSet);
        }
        else
        {
            characterSet = string.Empty;
            encode = Encoding.Default;
        }

        // Keep the raw bytes so the page can be decoded a second time if needed.
        byte[] raw = ReadResponseBytes(response);
        str = DecodeResponseBytes(raw, encode);

        // If the page declares a different charset than the one used above,
        // the page's declaration wins and the bytes are decoded again.
        Match declared = MetaCharsetRegex.Match(str);
        if (declared.Success)
        {
            string tempCharSet = declared.Groups[1].Value;
            if (!string.Equals(tempCharSet, characterSet, StringComparison.OrdinalIgnoreCase))
            {
                try
                {
                    str = DecodeResponseBytes(raw, Encoding.GetEncoding(tempCharSet));
                }
                catch (ArgumentException ex)
                {
                    // The page named an encoding this platform does not know:
                    // keep the text produced by the first decode.
                    Trace.TraceError(ex.ToString());
                }
            }
        }
    }
    catch (Exception ex)
    {
        // Best effort by design: log the failure and return whatever was decoded so far.
        Trace.TraceError(ex.ToString());
    }
    finally
    {
        if (response != null)
        {
            response.Close();
        }
    }
    return str;
}

// Reads the whole response body into memory.
private static byte[] ReadResponseBytes(HttpWebResponse response)
{
    using (Stream receiveStream = response.GetResponseStream())
    using (MemoryStream buffered = new MemoryStream())
    {
        byte[] chunk = new byte[4096];
        int count = receiveStream.Read(chunk, 0, chunk.Length);
        while (count > 0)
        {
            buffered.Write(chunk, 0, count);
            count = receiveStream.Read(chunk, 0, chunk.Length);
        }
        return buffered.ToArray();
    }
}

// Decodes a byte buffer to text with the given encoding, via a StreamReader.
private static string DecodeResponseBytes(byte[] raw, Encoding encoding)
{
    using (StreamReader reader = new StreamReader(new MemoryStream(raw), encoding))
    {
        return reader.ReadToEnd();
    }
}
///
/// 从一段网页源码中获取正文
///
///
///
public static string GetMainContent(string input)
{
string reg1 = @"<(p|br)[^<]*>";
string reg2 =
@"(\[([^=]*)(=[^\]]*)?\][\s\S]*?\[/\1\])|(?(?=[^\u4E00-\u9FA5\uFE30-\uFFA0,."");])]*>[^<]{2,}(?=[^\u4E00-\u9FA5\uFE30-\uFFA0,."");]))|(?