A few days ago I reposted an article titled ".net HtmlParser初步使用研究" on cnblogs, and it was only from the comments on that article that I learned there is now a library called HtmlAgilityPack that makes analyzing and extracting HTML content much more convenient. So today I spent some time studying and experimenting with HtmlAgilityPack and XPath, and took these notes.
HtmlAgilityPack is an open-source HTML parser written in C#. However, some parts of its design may be less than perfect, or its support for Chinese may not be very good; for example, fetching Chinese pages can produce garbled text.
My code is as follows:
HtmlWeb hw = new HtmlWeb();
HtmlDocument doc = hw.Load("http://www.xinlg.com");
//HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes("//a");
Response.Write(doc.DocumentNode.OuterHtml);   // write the downloaded HTML back out (writing doc directly would only print the type name)
When the page is written out, it is completely garbled, yet some pages come out fine: cnblogs is fine, while Sina, QQ, Xinhuanet and others are all garbled. This most likely has to do with encoding.
I didn't dig into the library's source code; my solution was simply to write my own method for fetching the HTML.
The code is as follows:
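The symptom matches bytes being decoded with the wrong encoding. As a rough illustration (the string and the two encodings here are just assumed for the demo, and the System and System.Text namespaces are assumed to be imported), decoding GB2312 bytes as UTF-8 produces exactly this kind of garbled output:

byte[] gbBytes = Encoding.GetEncoding("gb2312").GetBytes("新浪网");   // bytes as a GB2312 page would send them
string wrong = Encoding.UTF8.GetString(gbBytes);                      // decoded with the wrong encoding: mojibake
string right = Encoding.GetEncoding("gb2312").GetString(gbBytes);     // decoded with the right encoding: 新浪网
Console.WriteLine(wrong);
Console.WriteLine(right);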
/// <summary>
/// Get the HTML source of the specified URL
/// </summary>
/// <param name="url"></param>
/// <param name="encoding">If null, the encoding is detected automatically</param>
/// <returns></returns>
public static string GetWebHtml(string url, Encoding encoding)
{
    try
    {
        HttpWebRequest hwr = (HttpWebRequest)HttpWebRequest.Create(url);
        HttpWebResponse res;
        try
        {
            res = (HttpWebResponse)hwr.GetResponse();
        }
        catch
        {
            return string.Empty;
        }
        if (res.StatusCode == HttpStatusCode.OK)
        {
            using (Stream mystream = res.GetResponseStream())
            {
                // No encoding specified: detect it from the response
                if (encoding == null)
                {
                    return DecodeData(mystream, res);
                }
                // An encoding was specified by the caller
                else
                {
                    using (StreamReader reader = new StreamReader(mystream, encoding))
                    {
                        return reader.ReadToEnd();
                    }
                }
            }
        }
        return null;
    }
    catch
    {
        return null;
    }
}
private static string DecodeData(Stream responseStream, HttpWebResponse response)
{
    string name = null;

    // First try the charset declared in the Content-Type response header
    string text2 = response.Headers["content-type"];
    if (text2 != null)
    {
        int index = text2.IndexOf("charset=");
        if (index != -1)
        {
            name = text2.Substring(index + 8);
        }
    }

    // Buffer the whole response in memory so it can be re-read once the encoding is known
    MemoryStream stream = new MemoryStream();
    byte[] buffer = new byte[0x400];
    for (int i = responseStream.Read(buffer, 0, buffer.Length); i > 0; i = responseStream.Read(buffer, 0, buffer.Length))
    {
        stream.Write(buffer, 0, i);
    }
    responseStream.Close();

    // No charset in the header: sniff it from the <meta> tag in the page itself
    if (name == null)
    {
        MemoryStream stream3 = stream;
        stream3.Seek((long)0, SeekOrigin.Begin);
        string text3 = new StreamReader(stream3, Encoding.ASCII).ReadToEnd();
        if (text3 != null)
        {
            int startIndex = text3.IndexOf("charset=");
            int num4 = -1;
            if (startIndex != -1)
            {
                num4 = text3.IndexOf("\"", startIndex);
                if (num4 != -1)
                {
                    int num5 = startIndex + 8;
                    name = text3.Substring(num5, (num4 - num5) + 1).TrimEnd(new char[] { '>', '"' });
                }
            }
        }
    }

    // Resolve the charset name to an Encoding, falling back to gb2312
    Encoding aSCII = null;
    if (name == null)
    {
        aSCII = Encoding.GetEncoding("gb2312");
    }
    else
    {
        try
        {
            if (name == "GBK")
            {
                name = "GB2312";
            }
            aSCII = Encoding.GetEncoding(name);
        }
        catch
        {
            aSCII = Encoding.GetEncoding("gb2312");
        }
    }

    // Re-read the buffered bytes with the detected encoding
    stream.Seek((long)0, SeekOrigin.Begin);
    StreamReader reader2 = new StreamReader(stream, aSCII);
    return reader2.ReadToEnd();
}
I have tested it, and the large sites all come out fine.
Finally, the calling code looks like this:
string Html = XINLG.Labs.Utils.NetUtil.GetWebHtml("http://www.cnblogs.com/pick/", null);
HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(Html);
In the code above, if you don't know the encoding of the given URL, just pass null and it will be detected automatically.
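Once the document is loaded this way, XPath queries work on it as usual. A small sketch continuing from the doc built above (the XPath expression and the output format are just examples):

// Select every link that has an href attribute and print its text and target
HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes("//a[@href]");
if (nodes != null)   // SelectNodes returns null when nothing matches
{
    foreach (HtmlNode node in nodes)
    {
        string href = node.GetAttributeValue("href", string.Empty);
        Response.Write(node.InnerText.Trim() + " -> " + href + "<br/>");
    }
}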
In addition, another approach described online is to download the source code from the HtmlAgilityPack site, open the project, and modify it yourself. I won't provide that code here; you can search for it yourself.
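For reference, newer releases of HtmlAgilityPack also let you force the encoding on HtmlWeb directly, which avoids both the manual fetch and modifying the source. As far as I know the property is called OverrideEncoding, but it comes from a later version of the library than the one discussed here, so treat this sketch as an assumption:

HtmlWeb hw = new HtmlWeb();
hw.OverrideEncoding = Encoding.GetEncoding("gb2312");   // assumed property: force GB2312 instead of auto-detection
HtmlDocument doc = hw.Load("http://www.xinlg.com");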