// C# web page content extraction program. Debugged successfully under VS2010, with no garbled-character (mojibake) problems.
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using System.Threading;
using System.IO;
using System.Net;
using Winista.Text.HtmlParser;
using Winista.Text.HtmlParser.Lex;
using Winista.Text.HtmlParser.Nodes;
using Winista.Text.HtmlParser.Util;
using Winista.Text.HtmlParser.Visitors;
using Winista.Text.HtmlParser.Filters;
using Winista.Text.HtmlParser.Tags;
using Winista.Text.HtmlParser.Http;
using System.Diagnostics;
using System.Text.RegularExpressions;
namespace testhtml
{
public partial class Form1 : Form
{
/// <summary>
/// Creates the form. All setup is delegated to InitializeComponent, which is
/// defined in another part of this partial class (presumably the designer file).
/// </summary>
public Form1()
{
    this.InitializeComponent();
}
/// <summary>
/// Runs the extraction pipeline on the URL typed into textBox1 and shows the
/// result in textBox2: download, strip script/style elements, extract the text,
/// then normalise whitespace.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    // Each stage's output is stored in the htmlText field, so the field always
    // holds the result of the last stage that completed.
    htmlText = getData(textBox1.Text);
    htmlText = delJsStyle(htmlText);

    // Text extraction feeds straight into whitespace normalisation.
    htmlText = delspace(toText(htmlText));

    textBox2.Text = htmlText;
}
// Text of the most recently fetched/processed page. Written by getData and by
// button1_Click.
string htmlText = "";

/// <summary>
/// Downloads the page at <paramref name="WebUrl"/> and returns its HTML as text.
/// The page is first decoded with the system default encoding to look for a
/// "charset" declaration; if one is found the page is downloaded again with the
/// matching encoding (gb2312 for gbk/gb2312, UTF-8 for anything else).
/// </summary>
/// <param name="WebUrl">Address of the page to download.</param>
/// <returns>
/// The page text, or the fixed failure message when nothing could be downloaded
/// or the page is blank. Errors are reported through a message box rather than
/// thrown. The same value is also stored in the htmlText field.
/// </returns>
private string getData(string WebUrl)
{
    // Work on a local so a failed download can never return text that was left
    // in the htmlText field by an earlier call.
    string pageText = "";
    try
    {
        // WebClient is disposable; release it as soon as the downloads finish.
        using (WebClient myWebClient = new WebClient())
        {
            // First pass: only needed to read the charset declaration.
            myWebClient.Encoding = System.Text.Encoding.Default;
            pageText = myWebClient.DownloadString(WebUrl);

            // NOTE: the page is fetched a second time so that WebClient performs
            // the decoding with the declared encoding. When no declaration is
            // found the first-pass text is kept as-is.
            System.Text.Encoding declared = pickDeclaredEncoding(pageText);
            if (declared != null)
            {
                myWebClient.Encoding = declared;
                pageText = myWebClient.DownloadString(WebUrl);
            }
        }
    }
    catch (Exception ex)
    {
        // Best effort: report the problem and fall through with whatever text
        // (possibly none) was obtained before the failure.
        MessageBox.Show(ex.Message + "ee");
    }

    if (pageText.Trim().Length == 0)
    {
        pageText = "获取页面失败!";
    }

    htmlText = pageText;
    return pageText;
}

/// <summary>
/// Chooses an encoding from the first "charset" declaration found in
/// <paramref name="html"/>.
/// </summary>
/// <param name="html">Page text decoded well enough to read ASCII markup.</param>
/// <returns>
/// gb2312 when the declaration mentions gbk or gb2312 (any letter case), UTF-8
/// for any other declaration, or null when the text contains no "charset".
/// </returns>
private static System.Text.Encoding pickDeclaredEncoding(string html)
{
    // Number of characters inspected, counted from the start of "charset".
    const int window = 15;

    int index = html.IndexOf("charset", StringComparison.OrdinalIgnoreCase);
    if (index < 0)
    {
        return null;
    }

    // Clamp the window so a declaration near the end of the text cannot make
    // Substring throw.
    string tempcode = html.Substring(index, Math.Min(window, html.Length - index));

    bool isGb = tempcode.IndexOf("gbk", StringComparison.OrdinalIgnoreCase) >= 0
        || tempcode.IndexOf("gb2312", StringComparison.OrdinalIgnoreCase) >= 0;

    return isGb
        ? System.Text.Encoding.GetEncoding("gb2312")
        : System.Text.Encoding.UTF8;
}
/// <summary>
/// Extracts text from an HTML string using the Winista HtmlParser library: the
/// markup is parsed, the nodes accepted by a "BODY" tag-name filter are
/// collected, and a TextExtractingVisitor is run over them.
/// </summary>
/// <param name="str">HTML markup to process.</param>
/// <returns>The text gathered by the visitor.</returns>
private string toText(string str)
{
    Parser htmlParser = new Parser(new Lexer(str));

    // Restrict the parse result to what the BODY tag-name filter accepts.
    NodeFilter bodyFilter = new TagNameFilter("BODY");
    NodeList bodyNodes = htmlParser.Parse(bodyFilter);

    TextExtractingVisitor textCollector = new TextExtractingVisitor();
    bodyNodes.VisitAllNodesWith(textCollector);

    return textCollector.ExtractedText.ToString();
}
/// <summary>
/// Pre-fills the URL box (textBox1) with a default address.
/// Presumably wired to the form's Load event in the designer file - confirm there.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void Form1_Load_1(object sender, EventArgs e)
{
    this.textBox1.Text = "http://www.ybzy.cn";
}
// Handler for button2; it currently does nothing.
// NOTE(review): presumably still referenced from the designer file, so removing
// it may break the build - confirm before deleting.
private void button2_Click(object sender, EventArgs e)
{
}
/// <summary>
/// Removes every &lt;script&gt;...&lt;/script&gt; and &lt;style&gt;...&lt;/style&gt;
/// element, including its content, from an HTML string. Tag names are matched
/// case-insensitively.
/// </summary>
/// <param name="str">HTML markup to clean.</param>
/// <returns>The markup with script and style elements removed.</returns>
/// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
public static string delJsStyle(string str)
{
    // [\s\S]*? lazily matches any characters, newlines included, up to the
    // nearest closing tag. The previous pattern used "/w|/W" (a literal slash
    // plus a letter) instead of "\w|\W", so elements with real content were
    // never matched. \b stops "<script" from matching longer tag names.
    const RegexOptions options = RegexOptions.IgnoreCase;

    string withoutScripts = Regex.Replace(
        str, @"<script\b[^>]*>[\s\S]*?</script[^>]*>", "", options);

    return Regex.Replace(
        withoutScripts, @"<style\b[^>]*>[\s\S]*?</style[^>]*>", "", options);
}
/// <summary>
/// Collapses every run of whitespace (spaces, tabs, line breaks) into a single
/// space character.
/// </summary>
/// <param name="str">Text to normalise.</param>
/// <returns>The text with each whitespace run replaced by one space.</returns>
/// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
public static string delspace(string str)
{
    // The previous pattern "/s+" matched a literal slash followed by 's'
    // characters, so no whitespace was ever collapsed; "\s+" is the whitespace
    // character class.
    return Regex.Replace(str, @"\s+", " ");
}
}
}