using
System;
using
System.Collections.Generic;
using
System.Linq;
using
System.Web;
using
System.Text;
using
System.Net;
using
System.IO;
using
System.Text.RegularExpressions;
using
System.Collections;
using
System.IO.Compression;
/// <summary>
/// Name: web page crawling (scraping) utility class
/// Author: loafinweb
/// Date: 2011-09-12
/// </summary>
public
class
webCrawl
{
/// <summary>
/// Creates a <see cref="webCrawl"/> instance. The constructor body is empty.
/// </summary>
public webCrawl()
{
}
/// <summary>
/// Downloads the page at <paramref name="url"/> and returns its text, decoded with the
/// character set reported by <see cref="getEncoding"/>.
/// </summary>
/// <param name="url">Address of the page to fetch.</param>
/// <returns>
/// The page text. An empty string is returned when the response status is not 200 OK, when the
/// declared Content-Length is 1 MiB (1024 * 1024 bytes) or more, or when any exception occurs;
/// failures are deliberately swallowed so callers always receive a string.
/// </returns>
public static string getHtml(string url)
{
    try
    {
        // getEncoding issues its own HTTP request, so each call here costs two round trips.
        Encoding en = Encoding.GetEncoding(getEncoding(url));

        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Headers.Set("Pragma", "no-cache");
        request.Timeout = 30000;

        // The response is disposed on every path (including the early return below),
        // so the underlying connection is always released.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= 1024 * 1024)
            {
                return string.Empty;
            }

            using (Stream strM = response.GetResponseStream())
            using (StreamReader sr = new StreamReader(strM, en))
            {
                return sr.ReadToEnd();
            }
        }
    }
    catch
    {
        // Best effort: any network, encoding or URL-format failure yields an empty page.
        return String.Empty;
    }
}
/// <summary>
/// Detects the character set name of the page at <paramref name="url"/>.
/// </summary>
/// <remarks>
/// The page is requested without following redirects. When the response is 200 OK and its
/// declared Content-Length is below 1 MiB, the body is scanned for a <c>charset=</c>
/// declaration; if none is found, the character set from the response header is used.
/// In every other case <c>Encoding.Default.BodyName</c> is returned.
/// </remarks>
/// <param name="url">Address of the page to inspect.</param>
/// <returns>A character set name such as <c>utf-8</c>; the name is not validated here.</returns>
/// <exception cref="Exception">
/// Any failure is rethrown as a plain <see cref="Exception"/> with the original message;
/// the original exception is available as <c>InnerException</c>.
/// </exception>
public static string getEncoding(string url)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.Timeout = 30000;
        request.AllowAutoRedirect = false;

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            if (response.StatusCode != HttpStatusCode.OK || response.ContentLength >= 1024 * 1024)
            {
                return Encoding.Default.BodyName;
            }

            bool gzipped = response.ContentEncoding != null
                && response.ContentEncoding.Equals("gzip", StringComparison.OrdinalIgnoreCase);

            Stream body = response.GetResponseStream();
            using (StreamReader reader = gzipped
                ? new StreamReader(new GZipStream(body, CompressionMode.Decompress))
                : new StreamReader(body, Encoding.ASCII))
            {
                string html = reader.ReadToEnd();

                // Accepts charset=utf-8, charset="utf-8" and charset='utf-8'. Only characters
                // that can appear in a charset name are captured, so a quoted value no longer
                // yields an empty name and a single-quoted one no longer runs on.
                Match declared = Regex.Match(
                    html,
                    @"charset\b\s*=\s*[""']?\s*(?<charset>[\w.:-]+)",
                    RegexOptions.IgnoreCase);
                if (declared.Success)
                {
                    return declared.Groups["charset"].Value;
                }

                // The header charset may be null as well as empty.
                if (!string.IsNullOrEmpty(response.CharacterSet))
                {
                    return response.CharacterSet;
                }

                return Encoding.Default.BodyName;
            }
        }
    }
    catch (Exception ex)
    {
        // Same exception type and message as before, but the cause is preserved.
        throw new Exception(ex.Message, ex);
    }
}
/// <summary>
/// Fetches the page at <paramref name="url"/> through <see cref="getHtml"/> and returns the
/// text of its first <c>&lt;title&gt;</c> element.
/// </summary>
/// <param name="url">Address of the page to read.</param>
/// <returns>
/// The title with every non-word character (regex <c>\W</c>, which covers whitespace and
/// punctuation) removed; an empty string when no title element is matched.
/// </returns>
public static string getTitle(string url)
{
    string pageHtml = getHtml(url);
    Match found = Regex.Match(
        pageHtml,
        "<title>([^<]*)</title>",
        RegexOptions.IgnoreCase | RegexOptions.Multiline);

    // Groups[1] is an empty string when nothing matched, so no success check is needed.
    string rawTitle = found.Groups[1].Value;
    return Regex.Replace(rawTitle, @"\W", "");
}
/// <summary>
/// Fetches the page at <paramref name="url"/> through <see cref="getHtml"/> and returns the
/// content of its <c>&lt;meta name="Description" content="..."&gt;</c> tag.
/// </summary>
/// <param name="url">Address of the page to read.</param>
/// <returns>
/// The description with every non-word character (regex <c>\W</c>) removed; an empty string
/// when no matching meta tag is found.
/// </returns>
public static string getDescription(string url)
{
    string htmlStr = getHtml(url);

    // The capture stops at the closing quote of the content attribute, so attributes that
    // follow it (id, class, a trailing "/") are no longer merged into the description.
    // The closing quote stays optional, as in the previous pattern.
    Match desc = Regex.Match(
        htmlStr,
        "<meta name=\"Description\" content=\"([^\"<]*)\"?[^<>]*>",
        RegexOptions.IgnoreCase | RegexOptions.Multiline);

    return Regex.Replace(desc.Groups[1].Value, @"\W", "");
}
/// <summary>
/// Extracts the distinct absolute links found in a piece of HTML.
/// </summary>
/// <param name="htmlStr">HTML text to scan; may be null or empty.</param>
/// <returns>
/// The matched <c>http://</c> and <c>https://</c> URLs in order of first appearance, without
/// duplicates (case-sensitive comparison). An empty list when the input is null or empty or
/// contains no links.
/// </returns>
public static List<string> getLink(string htmlStr)
{
    List<string> list = new List<string>();
    if (string.IsNullOrEmpty(htmlStr))
    {
        return list;
    }

    // Link pattern: scheme, dotted host name, optional path/query characters.
    Regex regex = new Regex(@"https?://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?", RegexOptions.IgnoreCase);

    // The set gives constant-time duplicate checks; the list preserves discovery order.
    HashSet<string> seen = new HashSet<string>();
    foreach (Match m in regex.Matches(htmlStr))
    {
        if (seen.Add(m.Value))
        {
            list.Add(m.Value);
        }
    }

    return list;
}
/// <summary>
/// Fetches the page at <paramref name="url"/> through <see cref="getHtml"/> and returns the
/// content of its <c>&lt;body&gt;</c> element after it has been passed through
/// <see cref="parseHtml"/>.
/// </summary>
/// <param name="url">Address of the page to read.</param>
/// <returns>The filtered body text, or an empty string when no body element is matched.</returns>
public static string getBody(string url)
{
    Match bodyMatch = Regex.Match(getHtml(url), @"(?is)<body[^>]*>(?:(?!</?body\b).)*</body>");
    return bodyMatch.Success ? parseHtml(bodyMatch.Value) : string.Empty;
}
/// <summary>
/// Fetches the page at <paramref name="url"/> through <see cref="getHtml"/> and collects
/// every image tag it contains.
/// </summary>
/// <param name="url">Address of the page to read.</param>
/// <returns>
/// The full text of each tag matched by <c>&lt;(IMG|img)[^&gt;]+&gt;</c>, in document order.
/// </returns>
public static List<string> getImg(string url)
{
    string pageHtml = getHtml(url);
    List<string> tags = new List<string>();

    // Extract every image tag.
    foreach (Match tag in Regex.Matches(pageHtml, @"<(IMG|img)[^>]+>"))
    {
        tags.Add(tag.Value);
    }

    return tags;
}
/// <summary>
/// Fetches the page at <paramref name="url"/> through <see cref="getHtml"/> and returns the
/// source address of every image on it, converting relative addresses to absolute ones.
/// </summary>
/// <param name="url">Address of the page to read; also the base for relative image paths.</param>
/// <returns>
/// Absolute image URLs in document order. Only sources ending in gif, jpg, bmp or png
/// (any letter case) are included.
/// </returns>
public static List<string> getImgPath(string url)
{
    List<string> list = new List<string>();
    string htmlStr = getHtml(url);
    string pat = @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>";
    MatchCollection matches = Regex.Matches(htmlStr, pat, RegexOptions.IgnoreCase | RegexOptions.Multiline);

    foreach (Match m in matches)
    {
        string imgPath = m.Groups["imgUrl"].Value.Trim();

        // Second match: keep only sources that point at an image file, not at a page.
        if (!Regex.IsMatch(imgPath, @"\w+\.(gif|jpg|bmp|png)$", RegexOptions.IgnoreCase))
        {
            continue;
        }

        list.Add(toAbsoluteImgUrl(url, imgPath));
    }

    return list;
}

/// <summary>
/// Resolves an image path against the address of the page it appeared on.
/// </summary>
private static string toAbsoluteImgUrl(string pageUrl, string imgPath)
{
    // Already absolute: test the scheme prefix rather than searching for "http" anywhere,
    // so a relative path such as "img/http_logo.png" is still resolved.
    if (imgPath.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
        || imgPath.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
    {
        return imgPath;
    }

    // Standard URI resolution handles "/root/a.png", "../a.png", "//host/a.png" and page
    // addresses that have no path component.
    Uri baseUri;
    Uri resolved;
    if (Uri.TryCreate(pageUrl, UriKind.Absolute, out baseUri)
        && Uri.TryCreate(baseUri, imgPath, out resolved))
    {
        return resolved.AbsoluteUri;
    }

    // The page address is not a well-formed absolute URI: fall back to plain concatenation.
    return getUrl(pageUrl) + imgPath;
}
/// <summary>
/// Downloads an image and saves it under the directory returned by
/// <c>HttpContext.Current.Server.MapPath("")</c>, using a timestamp-based file name.
/// </summary>
/// <param name="fileurl">
/// Absolute address of the image, for example <c>http://xxx.com/img/logo.jpg</c>.
/// Nothing is downloaded when the value is null, empty or contains no '.' character.
/// </param>
/// <remarks>
/// Reads <c>System.Web.HttpContext.Current</c> without a null check.
/// Download and file-system errors propagate to the caller.
/// </remarks>
public void DownloadImg(string fileurl)
{
    if (string.IsNullOrEmpty(fileurl) || fileurl.IndexOf('.') < 0)
    {
        return;
    }

    // Look for the extension in the path only: ignore the query string and fragment.
    string pathPart = fileurl;
    int tail = pathPart.IndexOfAny(new char[] { '?', '#' });
    if (tail >= 0)
    {
        pathPart = pathPart.Substring(0, tail);
    }

    // A dot followed by '/' belongs to the host or a directory, not to the file name;
    // in that case the file is saved without an extension.
    int dot = pathPart.LastIndexOf('.');
    string extension = (dot >= 0 && pathPart.IndexOf('/', dot) < 0)
        ? pathPart.Substring(dot)
        : string.Empty;

    // Generate the image file name.
    string imgName = DateTime.Now.ToString("yyyyMMddHHmmssffff") + extension;
    string filepath = System.Web.HttpContext.Current.Server.MapPath("") + "/" + imgName;

    using (WebClient mywebclient = new WebClient())
    {
        mywebclient.DownloadFile(fileurl, filepath);
    }
}
/// <summary>
/// Filters markup out of an HTML fragment.
/// </summary>
/// <param name="html">HTML text to filter.</param>
/// <returns>
/// The input with every <c>&lt;...&gt;</c> tag removed, then any remaining '&lt;' or '&gt;'
/// characters removed, then all whitespace removed.
/// </returns>
public static string parseHtml(string html)
{
    string withoutTags = Regex.Replace(html, "<[^>]*>", string.Empty);

    // Unbalanced angle brackets survive the tag pass; drop them individually.
    string withoutBrackets = withoutTags
        .Replace("<", string.Empty)
        .Replace(">", string.Empty);

    return Regex.Replace(withoutBrackets, @"\s+", "");
}
/// <summary>
/// Returns the base address of a URL: everything up to and including the last '/' of its
/// path, so that a relative file name can be appended to the result.
/// </summary>
/// <example>
/// <c>http://www.xxx.com</c> gives <c>http://www.xxx.com/</c>;
/// <c>http://www.xxx.com/art.aspx</c> gives <c>http://www.xxx.com/</c>.
/// </example>
/// <param name="url">The URL to reduce; a query string or fragment is ignored.</param>
/// <returns>The base address ending in '/', or an empty string for null or empty input.</returns>
public static string getUrl(string url)
{
    if (string.IsNullOrEmpty(url))
    {
        return string.Empty;
    }

    // A '/' inside the query string or fragment is not a path separator.
    int tail = url.IndexOfAny(new char[] { '?', '#' });
    string address = tail >= 0 ? url.Substring(0, tail) : url;

    // Skip the "//" of the scheme separator: for "http://www.xxx.com" the last '/' belongs
    // to "://", and cutting there would leave only "http://".
    int schemeEnd = address.IndexOf("://", StringComparison.Ordinal);
    int pathStart = schemeEnd >= 0 ? schemeEnd + 3 : 0;

    int lastSlash = address.LastIndexOf('/');
    if (lastSlash < pathStart)
    {
        // No path at all: the whole address is the base.
        return address + "/";
    }

    return address.Substring(0, lastSlash + 1);
}
}