HTML解析器有很多种,最常用的是HtmlAgilityPack和SgmlReader(http://sourceforge.net/projects/dekiwiki/files/SgmlReader/)。
一段例子代码
// Downloads one news article page and extracts its title, publication time and body.
//
// newsUrl: address of the article page; it is also stored in News.From.
// Returns: a News object built from the extracted values, or null when anything inside
// the try block throws.
//
// NOTE(review): the declared return type is the non-generic Task, yet the body returns a
// News instance / null. Presumably the signature was Task<News> and the generic argument
// was lost when the article was published - confirm against the original source.
// NOTE(review): several regex pattern literals below are empty and one is split across two
// lines. They look like patterns containing HTML tags that were stripped during
// publication; as written they do not do what the surrounding comments describe.
// Restore them from the original source before using this method.
public async Task NewsGathering(string newsUrl)
{
// Fetch the whole page, decoded as UTF-8.
var strContent = await HttpHelper.GetContentAsync(newsUrl, Encoding.UTF8);
var title = "";
var content = "";
// Default publication time; kept when the extracted time string is 18 characters or shorter.
var pubTime = DateTime.Now;
try
{
// Extract the title and the publication time.
// NOTE(review): GetStrByXPath is defined elsewhere; presumably it returns the content of the
// node matching the XPath and the third argument is a fallback value - verify.
title = StrHelperUtil.GetStrByXPath(strContent, "//h1[@class='art_title_h1']", "");
var strTime = StrHelperUtil.GetStrByXPath(strContent, "//time", "");
// Drop real line feeds and literal backslash-n sequences from the time text.
strTime = StrHelperUtil.FormatHTML(strTime).Replace("\n","").Replace("\\n","");
if (strTime.Length > 18)
{
// Only the first 17 characters are parsed. If TryParse fails, pubTime is overwritten with
// DateTime.MinValue (the return value of TryParse is ignored).
strTime = strTime.Substring(0, 17);
DateTime.TryParse(strTime, out pubTime);
}
// Extract the article body area.
content = StrHelperUtil.GetStrByXPath(strContent, "//section[@data-sudaclick='articleContent']", "");
// Use regular expressions to remove unwanted parts of the body.
// NOTE(review): the pattern literals in the next statements appear truncated (see header note).
content = Regex.Replace(content, "[\\s\\S]*?
", "");
content = Regex.Replace(content, "", "");
content = Regex.Replace(content, "", "");
content = Regex.Replace(content, "", "");
content = Regex.Replace(content, "[\\s\\S]*? ", "");
content = Regex.Replace(content, "", "");
var news = new News
{
Content = content,
Title = title,
PubDate = pubTime,
From = newsUrl
};
return news;
}
catch (Exception ex)
{
// Any failure during extraction is swallowed and reported as null; ex is not used.
return null;
}
// Unreachable: both the try block and the catch block return.
return null;
}
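解释:
1 content = Regex.Replace(content, "<tag[\\s\\S]*?</tag>", ""); 这类语句是用正则把以 <tag 开头、以 </tag> 结尾的字符串(含头尾 tag)替换为空。(原文中的具体标签名在发布时被 HTML 过滤掉了,这里用 tag 作为占位示例。)
[\\s\\S]*? 是非贪婪匹配,例如对于
<tag>afkjl</tag>dkfk<tag>ljkl</tag>
只会分别匹配 <tag>afkjl</tag> 和 <tag>ljkl</tag>。
如果写成 [\\s\\S]*(不要后面的 ?)则是贪婪匹配,会一次匹配整段 <tag>afkjl</tag>dkfk<tag>ljkl</tag>。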
附上获取网页内容的类
里面有些方法是多余的
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using System.Web;
namespace Net.Tools
{
public class HttpHelper
{
/// <summary>
/// Downloads the page at <paramref name="strUrl"/> using a desktop Chrome User-Agent and
/// decodes the response body with <paramref name="encoder"/>.
/// </summary>
/// <param name="strUrl">Absolute http/https address of the page.</param>
/// <param name="encoder">Encoding used to decode the response body.</param>
/// <returns>
/// The page text with NUL characters removed, or an empty string when the request fails
/// for any reason. This method is best-effort: no exception escapes it.
/// </returns>
public static string GetContent(string strUrl, Encoding encoder)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strUrl);

        // HTTP method tokens are case-sensitive, so use the canonical spelling.
        request.Method = "GET";
        // A fresh container per call: cookies set during redirects are honoured but not kept.
        request.CookieContainer = new CookieContainer();
        request.KeepAlive = true;
        request.ContentType = "text/html";
        // Pretend to be desktop Chrome.
        request.UserAgent =
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36";
        request.Headers.Add("x-requested-with:com.android.browser");
        request.Headers.Add(HttpRequestHeader.AcceptLanguage, "zh-CN,zh;q=0.8,en;q=0.6,nl;q=0.4,zh-TW;q=0.2");
        request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
        // Let the framework transparently inflate gzip/deflate bodies.
        request.AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip;
        request.Headers.Add("Accept-Encoding", "gzip, deflate");

        // The using blocks release the connection even when reading the body throws.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (StreamReader reader = new StreamReader(response.GetResponseStream(), encoder))
        {
            // Some pages contain stray NUL characters that truncate the text downstream.
            return reader.ReadToEnd().Replace("\0", "");
        }
    }
    catch (Exception ex)
    {
        // Deliberate best-effort: callers rely on "" instead of an exception, but leave a trace.
        System.Diagnostics.Trace.TraceWarning("HttpHelper.GetContent failed for '{0}': {1}", strUrl, ex.Message);
        return string.Empty;
    }
}
/// <summary>
/// Asynchronously downloads the page at <paramref name="strUrl"/> using a desktop Chrome
/// User-Agent and decodes the response body with <paramref name="encoder"/>.
/// </summary>
/// <param name="strUrl">Absolute http/https address of the page.</param>
/// <param name="encoder">Encoding used to decode the response body.</param>
/// <returns>
/// A task producing the page text with NUL characters removed, or an empty string when the
/// request fails for any reason. This method is best-effort: the task never faults.
/// </returns>
/// <remarks>
/// The return type is <c>Task&lt;string&gt;</c>: the method returns a string and its callers
/// await the result. The request uses the asynchronous WebRequest/StreamReader APIs instead
/// of parking a thread-pool thread on blocking I/O.
/// </remarks>
public static async Task<string> GetContentAsync(string strUrl, Encoding encoder)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strUrl);

        // HTTP method tokens are case-sensitive, so use the canonical spelling.
        request.Method = "GET";
        // A fresh container per call: cookies set during redirects are honoured but not kept.
        request.CookieContainer = new CookieContainer();
        request.KeepAlive = true;
        request.ContentType = "text/html";
        // Pretend to be desktop Chrome.
        request.UserAgent =
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36";
        request.Headers.Add("x-requested-with:com.android.browser");
        request.Headers.Add(HttpRequestHeader.AcceptLanguage, "zh-CN,zh;q=0.8,en;q=0.6,nl;q=0.4,zh-TW;q=0.2");
        request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
        // Let the framework transparently inflate gzip/deflate bodies.
        request.AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip;
        request.Headers.Add("Accept-Encoding", "gzip, deflate");

        // The using blocks release the connection even when reading the body throws.
        using (WebResponse response = await request.GetResponseAsync().ConfigureAwait(false))
        using (StreamReader reader = new StreamReader(response.GetResponseStream(), encoder))
        {
            string strcontent = await reader.ReadToEndAsync().ConfigureAwait(false);
            // Some pages contain stray NUL characters that truncate the text downstream.
            return strcontent.Replace("\0", "");
        }
    }
    catch (Exception ex)
    {
        // Deliberate best-effort: callers rely on "" instead of an exception, but leave a trace.
        System.Diagnostics.Trace.TraceWarning("HttpHelper.GetContentAsync failed for '{0}': {1}", strUrl, ex.Message);
        return "";
    }
}
/// <summary>
/// Asynchronously downloads the page at <paramref name="strUrl"/> while identifying as a
/// mobile (CoolPad / Android) browser, and decodes the body with <paramref name="encoder"/>.
/// </summary>
/// <param name="strUrl">Absolute http/https address of the page.</param>
/// <param name="encoder">Encoding used to decode the response body.</param>
/// <returns>
/// A task producing the page text with NUL characters removed, or an empty string when the
/// request fails for any reason. This method is best-effort: the task never faults.
/// </returns>
/// <remarks>
/// The return type is <c>Task&lt;string&gt;</c> because the method returns a string. The
/// request uses the asynchronous WebRequest/StreamReader APIs instead of parking a
/// thread-pool thread on blocking I/O.
/// </remarks>
public static async Task<string> GetContentByMobileAgentAsync(string strUrl, Encoding encoder)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strUrl);

        // HTTP method tokens are case-sensitive, so use the canonical spelling.
        request.Method = "GET";
        // A fresh container per call: cookies set during redirects are honoured but not kept.
        request.CookieContainer = new CookieContainer();
        request.KeepAlive = true;
        request.ContentType = "text/html";
        // Pretend to be a mobile handset so that sites serve their mobile layout.
        request.UserAgent =
            "CoolPad8750_CMCC_TD/1.0 Linux/3.4.5 Android/4.2.1 Release/06.31.2013 Browser/1.0 Profile/MIDP-1.0 Configuration/CLDC-1.0";
        request.Headers.Add("x-requested-with:com.android.browser");
        request.Headers.Add(HttpRequestHeader.AcceptLanguage, "zh-CN,zh;q=0.8,en;q=0.6,nl;q=0.4,zh-TW;q=0.2");
        request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
        // Let the framework transparently inflate gzip/deflate bodies.
        request.AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip;
        request.Headers.Add("Accept-Encoding", "gzip, deflate");

        // The using blocks release the connection even when reading the body throws.
        using (WebResponse response = await request.GetResponseAsync().ConfigureAwait(false))
        using (StreamReader reader = new StreamReader(response.GetResponseStream(), encoder))
        {
            string strcontent = await reader.ReadToEndAsync().ConfigureAwait(false);
            // Some pages contain stray NUL characters that truncate the text downstream.
            return strcontent.Replace("\0", "");
        }
    }
    catch (Exception ex)
    {
        // Deliberate best-effort: callers rely on "" instead of an exception, but leave a trace.
        System.Diagnostics.Trace.TraceWarning("HttpHelper.GetContentByMobileAgentAsync failed for '{0}': {1}", strUrl, ex.Message);
        return "";
    }
}
/// <summary>
/// Downloads the page at <paramref name="strUrl"/> using a desktop Chrome User-Agent and the
/// caller's cookie container, and decodes the body with <paramref name="encoder"/>.
/// </summary>
/// <param name="strUrl">Absolute http/https address of the page.</param>
/// <param name="encoder">Encoding used to decode the response body.</param>
/// <param name="cc">
/// Cookie container attached to the request, e.g. one filled by a previous login.
/// Passing null sends the request without cookie handling.
/// </param>
/// <returns>
/// The page text with NUL characters removed, or an empty string when the request fails
/// for any reason. This method is best-effort: no exception escapes it.
/// </returns>
public static string GetContent(string strUrl, Encoding encoder, CookieContainer cc)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strUrl);

        // HTTP method tokens are case-sensitive, so use the canonical spelling.
        request.Method = "GET";
        request.CookieContainer = cc;
        request.KeepAlive = true;
        request.ContentType = "text/html";
        // Pretend to be desktop Chrome.
        request.UserAgent =
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36";
        request.Headers.Add("x-requested-with:XMLHttpRequest");
        request.Headers.Add(HttpRequestHeader.AcceptLanguage, "zh-CN,zh;q=0.8,en;q=0.6,nl;q=0.4,zh-TW;q=0.2");
        request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
        // Let the framework transparently inflate gzip/deflate bodies.
        request.AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip;
        request.Headers.Add("Accept-Encoding", "gzip, deflate");

        // The using blocks release the connection even when reading the body throws.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (StreamReader reader = new StreamReader(response.GetResponseStream(), encoder))
        {
            // Some pages contain stray NUL characters that truncate the text downstream.
            return reader.ReadToEnd().Replace("\0", "");
        }
    }
    catch (Exception ex)
    {
        // Deliberate best-effort: callers rely on "" instead of an exception, but leave a trace.
        System.Diagnostics.Trace.TraceWarning("HttpHelper.GetContent failed for '{0}': {1}", strUrl, ex.Message);
        return string.Empty;
    }
}
/// <summary>
/// Downloads the page at <paramref name="strUrl"/>, following redirects, and decodes it
/// either with the given encoding or with one detected from the page itself.
/// </summary>
/// <param name="strUrl">Absolute http/https address of the page; also sent as the Referer.</param>
/// <param name="encoder">
/// Encoding used to decode the body. Pass null to detect it automatically: first from a
/// meta charset declaration in the markup, then from the Content-Type response header, and
/// finally falling back to UTF-8.
/// </param>
/// <returns>The page text with NUL characters removed.</returns>
/// <remarks>
/// Unlike the GetContent overloads this method does not swallow failures: exceptions from
/// creating or executing the request propagate to the caller.
/// </remarks>
public static string GetContent2(string strUrl, Encoding encoder)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strUrl);

    // HTTP method tokens are case-sensitive, so use the canonical spelling.
    request.Method = "GET";
    request.CookieContainer = new CookieContainer();
    request.KeepAlive = true;
    request.ContentType = "text/html";
    // Pretend to be desktop Chrome.
    request.UserAgent =
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36";
    request.Referer = strUrl;
    request.Headers.Add("x-requested-with:XMLHttpRequest");
    request.Headers.Add(HttpRequestHeader.AcceptLanguage, "zh-CN,zh;q=0.8,en;q=0.6,nl;q=0.4,zh-TW;q=0.2");
    request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8";
    // Let the framework transparently inflate gzip/deflate bodies.
    request.AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip;
    // Follow redirects; the returned text is that of the final page.
    request.AllowAutoRedirect = true;
    request.Headers.Add("Accept-Encoding", "gzip, deflate");

    string strMsg;
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream body = response.GetResponseStream())
    {
        if (encoder == null)
        {
            // Buffer the raw bytes first: the encoding is only known after peeking at the markup.
            byte[] rawResponse;
            using (MemoryStream buffer = new MemoryStream())
            {
                body.CopyTo(buffer, 10240);
                rawResponse = buffer.ToArray();
            }

            encoder = DetectPageEncoding(rawResponse, response.CharacterSet);
            strMsg = encoder.GetString(rawResponse);
        }
        else
        {
            using (StreamReader reader = new StreamReader(body, encoder))
            {
                strMsg = reader.ReadToEnd();
            }
        }
    }

    // Some pages contain stray NUL characters that truncate the text downstream.
    return strMsg.Replace("\0", "");
}

// Picks the encoding of a downloaded page: meta charset first, then the charset from the
// response header, then UTF-8. Never throws for unknown or malformed charset names.
private static Encoding DetectPageEncoding(byte[] rawResponse, string headerCharset)
{
    // The charset declaration is plain ASCII, so any ASCII-compatible decoding works for probing.
    string probe = Encoding.Default.GetString(rawResponse, 0, rawResponse.Length);
    Match meta = Regex.Match(probe, "<meta([^<]*)charset=([^<]*)[\"']", RegexOptions.IgnoreCase | RegexOptions.Multiline);
    string charter = (meta.Groups.Count > 2) ? meta.Groups[2].Value : string.Empty;
    charter = charter.Replace("\"", string.Empty).Replace("'", string.Empty).Replace(";", string.Empty).Trim();

    // The pattern is greedy and may capture trailing attributes (e.g. "utf-8 name=x");
    // only the first whitespace-delimited token is the charset name.
    int firstGap = charter.IndexOfAny(new[] { ' ', '\t', '\r', '\n' });
    if (firstGap > 0)
    {
        charter = charter.Substring(0, firstGap);
    }

    return TryGetPageEncoding(charter) ?? TryGetPageEncoding(headerCharset) ?? Encoding.UTF8;
}

// Resolves a charset name to an Encoding, returning null when the name is empty or unsupported.
private static Encoding TryGetPageEncoding(string name)
{
    if (string.IsNullOrWhiteSpace(name))
    {
        return null;
    }

    try
    {
        return Encoding.GetEncoding(name.Trim());
    }
    catch (ArgumentException)
    {
        return null;
    }
}
/// <summary>
/// Sends <paramref name="postData"/> as a form-urlencoded POST to
/// <paramref name="requestUrlString"/> and collects the cookies the server sets.
/// </summary>
/// <param name="postData">Already url-encoded form body, e.g. "user=a&amp;pwd=b".</param>
/// <param name="requestUrlString">Absolute http/https address of the login endpoint.</param>
/// <param name="cookie">
/// Container that receives the cookies returned by the server. When null is passed, a new
/// container is created and handed back through the ref parameter.
/// </param>
/// <returns>The response body decoded as UTF-8.</returns>
/// <remarks>
/// Failures are not swallowed: exceptions from creating or executing the request propagate.
/// The request itself is sent with an empty cookie container; existing cookies in
/// <paramref name="cookie"/> are not transmitted.
/// </remarks>
public static string PostLogin(string postData, string requestUrlString, ref CookieContainer cookie)
{
    // UTF-8 is identical to ASCII for ASCII input but does not turn other characters into '?'.
    byte[] data = Encoding.UTF8.GetBytes(postData);

    HttpWebRequest myRequest = (HttpWebRequest)WebRequest.Create(requestUrlString);
    myRequest.Method = "POST";
    myRequest.ContentType = "application/x-www-form-urlencoded";
    myRequest.ContentLength = data.Length;
    // A container must be attached, otherwise the response's Cookies collection stays empty.
    myRequest.CookieContainer = new CookieContainer();

    using (Stream requestStream = myRequest.GetRequestStream())
    {
        requestStream.Write(data, 0, data.Length);
    }

    // The using blocks release the connection even when reading the body throws.
    using (HttpWebResponse myResponse = (HttpWebResponse)myRequest.GetResponse())
    {
        if (cookie == null)
        {
            cookie = new CookieContainer();
        }
        cookie.Add(myResponse.Cookies);

        using (StreamReader reader = new StreamReader(myResponse.GetResponseStream(), Encoding.UTF8))
        {
            return reader.ReadToEnd();
        }
    }
}
}
}