/// <summary>
/// Downloads the document at <paramref name="url"/> and returns its body decoded as text.
/// </summary>
/// <param name="url">Absolute URL of the page to fetch.</param>
/// <returns>The response body, decoded with the encoding returned by <see cref="GetEncoding"/>.</returns>
/// <exception cref="System.Net.WebException">The request failed or timed out.</exception>
private string GetHtmlCode(string url)
{
    HttpWebRequest webRequest = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(url);
    webRequest.Timeout = 30000; // milliseconds
    webRequest.Method = "GET";
    webRequest.UserAgent = "Mozilla/4.0";

    // Let the framework advertise Accept-Encoding and transparently unwrap the body.
    // The previous hand-rolled path only decompressed gzip, so a server that answered
    // with "deflate" (which the request also advertised) produced garbage text.
    webRequest.AutomaticDecompression =
        System.Net.DecompressionMethods.GZip | System.Net.DecompressionMethods.Deflate;

    // Disposing the response releases the underlying connection back to the pool.
    using (HttpWebResponse webResponse = (System.Net.HttpWebResponse)webRequest.GetResponse())
    using (System.IO.Stream streamReceive = webResponse.GetResponseStream())
    {
        // GetEncoding performs its own request against the same URL to detect the charset.
        Encoding enc = GetEncoding(url);
        using (System.IO.StreamReader sr = new System.IO.StreamReader(streamReceive, enc))
        {
            return sr.ReadToEnd();
        }
    }
}
/// <summary>
/// Determines the text encoding of the page at <paramref name="strurl"/> by requesting it
/// and inspecting first the response's declared character set and then any
/// <c>charset=...</c> declaration found in the body.
/// </summary>
/// <param name="strurl">URL of the page to inspect. May be null or empty.</param>
/// <returns>
/// <see cref="Encoding.Default"/> when <paramref name="strurl"/> is null or empty.
/// Otherwise the encoding named in the body when it is a recognised, non-UTF-8 charset;
/// failing that, the encoding from the response's character set; failing that, UTF-8.
/// </returns>
/// <exception cref="System.Net.WebException">The request failed.</exception>
public Encoding GetEncoding(string strurl)
{
    if (string.IsNullOrEmpty(strurl))
    {
        return Encoding.Default;
    }

    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(strurl);
    req.Method = "GET";
    req.Accept = "*/*";
    req.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
    req.ContentType = "text/xml";

    // Disposing the response releases the underlying connection.
    using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
    {
        Encoding enc = Encoding.UTF8;

        // "ISO-8859-1" is treated as "not specified" and mapped to UTF-8.
        string headerCharset = resp.CharacterSet;
        if (!string.IsNullOrEmpty(headerCharset)
            && !string.Equals(headerCharset, "ISO-8859-1", StringComparison.Ordinal))
        {
            try
            {
                enc = Encoding.GetEncoding(headerCharset);
            }
            catch (ArgumentException)
            {
                // Unknown charset name in the header: fall back to UTF-8.
                enc = Encoding.UTF8;
            }
        }

        using (StreamReader read = new StreamReader(resp.GetResponseStream(), enc))
        {
            string sHTML = read.ReadToEnd();

            // The capture group must be named "code" because it is read by name below.
            // The earlier pattern "(?[...]" was not a valid regex and threw at runtime.
            // Optional quotes cover <meta charset="gbk">; '_' covers names like Shift_JIS.
            Match charSetMatch = Regex.Match(
                sHTML,
                "charset\\s*=\\s*[\"']?(?<code>[a-zA-Z0-9\\-_]+)",
                RegexOptions.IgnoreCase);
            string sChartSet = charSetMatch.Groups["code"].Value;

            // Only override when the page declares something other than UTF-8.
            if (!string.IsNullOrEmpty(sChartSet)
                && !sChartSet.Equals("utf-8", StringComparison.OrdinalIgnoreCase))
            {
                try
                {
                    enc = Encoding.GetEncoding(sChartSet);
                }
                catch (ArgumentException)
                {
                    // Unrecognised charset label in the page: keep the encoding chosen above.
                }
            }
        }

        return enc;
    }
}
使用 C# 抓取网页时遇到乱码问题，尝试了各种办法都没能妥善解决，后来发现原因在于页面使用了 gzip 压缩。于是，在参考 CSDN 上两位达人的帖子之后，我对代码进行了修正，基本解决了页面乱码问题。欢迎大家使用上面的代码进行尝试。
以下为参考帖：
http://blog.csdn.net/wsc449/article/details/7280646
http://bbs.csdn.net/topics/320213776