C# 网页采集类

自己写的一个用来采集网页内容的 C# 类，可以像火车头采集器一样，指定目标字符串前面的字符串和后面的字符串，自动截取出中间的目标字符串，测试效果不错：

using System.IO;
using System.Net;
using System.Text;
using System;
using System.Text.RegularExpressions;
using System.Collections.Generic;


namespace testtaobao {
    /// <summary>
    /// Small web-scraping helper: downloads a page, normalises its source text and
    /// extracts every substring that sits between a given prefix and suffix.
    /// </summary>
    public class caiji
    {
        #region Fetch page content
        /// <summary>
        /// Requests <paramref name="url"/> and returns the response body decoded with the
        /// encoding named by <paramref name="code"/>.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <param name="code">Name of the page's text encoding, for example GB2312.</param>
        /// <returns>The page source as a string.</returns>
        /// <remarks>
        /// Exceptions raised while creating the request, sending it, or resolving the encoding
        /// are not caught here; they propagate to the caller with their original stack trace.
        /// </remarks>
        public string gethtml(string url, string code)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            // Timeout for GetResponse, in milliseconds.
            request.Timeout = 30000;
            request.Headers.Set("Pragma", "no-cache");

            // Dispose the response, its stream and the reader even when reading fails,
            // so the underlying connection is always released.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream streamReceive = response.GetResponseStream())
            using (StreamReader streamReader = new StreamReader(streamReceive, Encoding.GetEncoding(code)))
            {
                return streamReader.ReadToEnd();
            }
        }
        #endregion

        #region Strip line breaks
        /// <summary>
        /// Removes line breaks (CR, LF and CRLF) and double-quote characters from page source
        /// so that later pattern matching does not have to deal with them.
        /// </summary>
        /// <param name="HtmlCode">HTML source; may be null or empty.</param>
        /// <returns>
        /// The input without any '\r', '\n' or '"' characters, or an empty string when the
        /// input is null or empty.
        /// </returns>
        public string ReplaceEnter(string HtmlCode)
        {
            if (string.IsNullOrEmpty(HtmlCode))
            {
                return "";
            }

            // Removing '\r' and '\n' individually also covers "\r\n" pairs, and handles
            // pages that use bare '\n' (or bare '\r') line endings.
            return HtmlCode.Replace("\"", "").Replace("\r", "").Replace("\n", "");
        }
        #endregion

        #region Run a regular expression
        /// <summary>
        /// Runs a regular expression over a string and returns all matches.
        /// </summary>
        /// <param name="RegexString">Regular expression pattern.</param>
        /// <param name="RemoteStr">Text to search, typically HTML source.</param>
        /// <returns>Every match of the pattern in <paramref name="RemoteStr"/>.</returns>
        /// <remarks>
        /// The pattern is evaluated with <see cref="RegexOptions.Multiline"/>, which makes
        /// '^' and '$' match at line boundaries. It does not make '.' match '\n'.
        /// </remarks>
        public MatchCollection GetRegValue(string RegexString, string RemoteStr)
        {
            Regex r = new Regex(RegexString, RegexOptions.Multiline);
            return r.Matches(RemoteStr);
        }
        #endregion


        #region Extract target strings
        /// <summary>
        /// Finds every substring of <paramref name="scstr"/> that lies between
        /// <paramref name="fstr"/> and the nearest following <paramref name="estr"/>.
        /// </summary>
        /// <param name="fstr">Literal text immediately before the target; null is treated as empty.</param>
        /// <param name="estr">Literal text immediately after the target; null is treated as empty.</param>
        /// <param name="scstr">Source text to search.</param>
        /// <returns>
        /// The text found between each prefix/suffix pair, in order of appearance, without the
        /// delimiters. Empty when nothing matches or when <paramref name="scstr"/> is null or empty.
        /// </returns>
        public List<string> getstr(string fstr, string estr, string scstr)
        {
            List<string> rlist = new List<string>();
            if (string.IsNullOrEmpty(scstr))
            {
                return rlist;
            }

            // The delimiters are plain text, so escape them; otherwise characters such as
            // '?', '(', '[' or '.' inside them would be interpreted as regex syntax.
            // The capture group isolates the target, so the delimiters never have to be
            // stripped afterwards (stripping would also delete copies inside the target).
            string regstr = Regex.Escape(fstr ?? "") + "(.*?)" + Regex.Escape(estr ?? "");

            foreach (Match m in GetRegValue(regstr, scstr))
            {
                rlist.Add(m.Groups[1].Value);
            }
            return rlist;
        }
        #endregion
    }
}

你可能感兴趣的:(c#正则表达式用法,网页采集,类似火车头的采集代码)