/// <summary>
/// Extracts the text of the first <c>&lt;title&gt;</c> element and rewrites a
/// fixed set of punctuation characters in it.
/// </summary>
/// <remarks>
/// Substitutions applied to the captured title: <c>:</c> becomes <c>-</c>,
/// <c>*</c> becomes <c>#</c>, <c>&lt;</c> becomes <c>(</c>, <c>&gt;</c> becomes
/// <c>)</c>, <c>|</c> becomes <c>_</c>; slash, backslash and double quote are
/// removed.
/// NOTE(review): the character set suggests the result is used as a file name;
/// confirm against the callers.
/// </remarks>
/// <param name="strHtml">HTML source to search; may be null or empty.</param>
/// <returns>
/// The rewritten title, or an empty string when no <c>&lt;title&gt;</c> element
/// is found.
/// </returns>
private string GetTitle(string strHtml)
{
    if (string.IsNullOrEmpty(strHtml))
    {
        return "";
    }

    // The static Regex.Match overload uses the framework's pattern cache, so the
    // pattern is not rebuilt on every call.
    Match m = Regex.Match(strHtml, @"<title>([^<]*)</title>", RegexOptions.IgnoreCase);
    if (!m.Success)
    {
        // Reading Captures[0] of a failed match throws; report "no title" instead.
        return "";
    }

    return m.Groups[1].Value
        .Replace(":", "-")
        .Replace(@"\", "")   // previously a duplicated "/" replace; backslash was presumably intended
        .Replace("/", "")
        .Replace("*", "#")
        .Replace("\"", "")
        .Replace("<", "(")
        .Replace(">", ")")
        .Replace("|", "_");
}
/// <summary>
/// Gets the title of an HTML document.
/// </summary>
/// <param name="html">HTML source to search; may be null or empty.</param>
/// <returns>
/// The text between the first <c>&lt;title&gt;</c> and <c>&lt;/title&gt;</c>
/// tags (matched case-insensitively), or the placeholder string
/// "此文挡标题未知" when the document has no such element.
/// </returns>
public static string GetTitle(string html)
{
    const string UnknownTitle = "此文挡标题未知";

    if (string.IsNullOrEmpty(html))
    {
        return UnknownTitle;
    }

    Match m = Regex.Match(html, "<title>([^<]*)</title>", RegexOptions.IgnoreCase);

    // Groups.Count reflects the number of groups in the pattern, not whether the
    // match succeeded, so Success is the only reliable "found" signal.
    return m.Success ? m.Groups[1].Value : UnknownTitle;
}
/// <summary>
/// Strips all HTML markup from the given text and converts non-breaking
/// spaces to ordinary spaces.
/// </summary>
/// <remarks>
/// Script elements are removed together with their content; every other tag
/// is removed but its content is kept.
/// </remarks>
/// <param name="html">HTML source to clean.</param>
/// <returns>The text with tags removed and non-breaking spaces replaced by spaces.</returns>
private string ParseHtml(string html)
{
    // Singleline lets '.' cross line breaks so multi-line script bodies are
    // removed too; IgnoreCase covers <SCRIPT> spellings.
    string text = Regex.Replace(
        html,
        @"<script[^>]*?>.*?</script>",
        "",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);

    text = Regex.Replace(text, "<[^>]*>", "");

    // Both the entity and the raw U+00A0 character are handled: the original
    // literal was ambiguous after the source was rendered as HTML.
    return text.Replace("&nbsp;", " ").Replace("\u00A0", " ");
}
/// <summary>
/// Extracts the condensed text content of the <c>&lt;body&gt;</c> element of
/// an HTML document.
/// </summary>
/// <remarks>
/// Processing steps, in order:
/// 1. take everything from the first <c>&lt;body</c> to the last <c>&lt;/body&gt;</c>;
/// 2. remove script elements (self-closing tags, or open tag through close tag);
/// 3. remove all remaining tags;
/// 4. remove ALL whitespace characters;
/// 5. remove runs of two or more dots;
/// 6. remove every <c>&amp;</c> and then the leftover entity names
///    <c>nbsp;</c>, <c>quot;</c> and <c>gt;</c>.
/// </remarks>
/// <param name="html">HTML source to process; may be null or empty.</param>
/// <returns>
/// The condensed body text, or the string "No Body Text." when the input
/// contains no body element.
/// </returns>
public static string GetBodyText(string html)
{
    const string NoBodyText = "No Body Text.";

    if (string.IsNullOrEmpty(html))
    {
        return NoBodyText;
    }

    // Greedy on purpose: spans from the first <body to the LAST </body>.
    Match bodyMatch = Regex.Match(html, @"<body\b[\s\S]*</\s*body\s*>", RegexOptions.IgnoreCase);
    if (!bodyMatch.Success)
    {
        return NoBodyText;
    }

    // The self-closing alternative is tried first so that "<script ... />" does
    // not swallow the text up to some later "</script>". An unterminated
    // "<script ...>" matches neither branch; only its tag is stripped below.
    string text = Regex.Replace(
        bodyMatch.Value,
        @"<script\b[^>]*/>|<script\b[^>]*>[\s\S]*?</\s*script\s*>",
        "",
        RegexOptions.IgnoreCase);

    // Tags become a space first so the substitution order of the original is
    // kept; the whitespace pass that follows removes those spaces again.
    text = Regex.Replace(text, @"<[^>]*>", " ");
    text = Regex.Replace(text, @"\s", "");
    text = Regex.Replace(text, @"\.{2,}", "");

    // Ampersands go first, then the entity names they leave behind.
    return text
        .Replace("&", "")
        .Replace("nbsp;", "")
        .Replace("quot;", "")
        .Replace("gt;", "");
}