网页中对图像的采集

有时我们需要把网页上的一些信息采集到自己的数据库或本地磁盘,常用的工具有 WebClient、WebRequest 等。本文主要介绍:给定一个 URI 地址,如何采集该页面上所有的图像资源。下面是源代码,供大家参考、学习。

   /// <summary>
    /// Downloads a web page and collects the HTML of every &lt;img&gt; tag on it,
    /// rewriting each relative <c>src</c> attribute to an absolute URL.
    /// </summary>
    public class WebPageImage
    {
        /// <summary>
        /// Matches a single &lt;img ... src=... &gt; tag. Matching is case-insensitive so
        /// that upper-case markup such as &lt;IMG SRC=...&gt; is found as well.
        /// </summary>
        private static readonly Regex ImgTagRegex = new Regex(
            @"<img\b[^<>]*?\bsrc[\s\t\r\n]*=[\s\t\r\n]*[""']?[\s\t\r\n]*(?<imgUrl>[^\s\t\r\n""'<>]*)[^<>]*?/?[\s\t\r\n]*>",
            RegexOptions.IgnoreCase);

        /// <summary>
        /// Matches the charset declared inside a &lt;meta&gt; tag. Only the encoding name
        /// itself is captured, so attributes that follow it are never included.
        /// </summary>
        private static readonly Regex MetaCharsetRegex = new Regex(
            @"<meta\b[^<>]*?\bcharset\s*=\s*[""']?\s*(?<charset>[\w\-.:]+)",
            RegexOptions.IgnoreCase);

        /// <summary>
        /// Gets the HTML of all images on a web page.
        /// </summary>
        /// <param name="url">Address of the web page.</param>
        /// <param name="charSet">Encoding name of the page; when null or empty it is detected from the page's meta tag.</param>
        /// <returns>The HTML of every image tag, each followed by "&lt;br /&gt;" and a line break.</returns>
        public string getImages(string url, string charSet)
        {
            string html = getHtml(url, charSet);
            return getPictures(html, url);
        }

        /// <summary>
        /// Gets the HTML of all images on a web page, detecting the page encoding automatically.
        /// </summary>
        /// <param name="url">Address of the web page.</param>
        /// <returns>The HTML of every image tag, each followed by "&lt;br /&gt;" and a line break.</returns>
        public string getImages(string url)
        {
            return getImages(url, "");
        }

        /// <summary>
        /// Returns the host component of <paramref name="url"/>.
        /// </summary>
        string doman(string url)
        {
            Uri u = new Uri(url);
            return u.Host;
        }

        /// <summary>
        /// Downloads the page at <paramref name="url"/> and decodes it to text.
        /// </summary>
        /// <param name="url">Address of the web page.</param>
        /// <param name="charSet">Encoding name of the target page; when null or empty the encoding is detected from the page content.</param>
        /// <returns>The decoded page content.</returns>
        /// <remarks>
        /// Some pages cannot be downloaded this way (for example when they require a cookie).
        /// Such cases need extra handling, e.g. adding a "Cookie" header to the client.
        /// </remarks>
        string getHtml(string url, string charSet)
        {
            byte[] pageBytes;

            // WebClient is IDisposable; release it as soon as the download finishes.
            using (WebClient client = new WebClient())
            {
                // Use the default credentials of the current security context.
                // If the server requires a user name and password, assign a
                // NetworkCredential instance instead.
                client.Credentials = CredentialCache.DefaultCredentials;
                pageBytes = client.DownloadData(url);
            }

            return DecodeHtml(pageBytes, charSet);
        }

        /// <summary>
        /// Decodes downloaded page bytes to text.
        /// </summary>
        /// <param name="data">Raw bytes of the page.</param>
        /// <param name="charSet">Encoding name requested by the caller; null or empty means "detect from the meta tag".</param>
        /// <returns>The decoded text.</returns>
        /// <exception cref="ArgumentException">The caller-supplied <paramref name="charSet"/> is not a valid encoding name.</exception>
        private static string DecodeHtml(byte[] data, string charSet)
        {
            if (!string.IsNullOrEmpty(charSet))
            {
                // An explicit encoding was requested: an invalid name is the caller's
                // error and is reported by GetEncoding, exactly as before.
                return Encoding.GetEncoding(charSet).GetString(data);
            }

            // No encoding given: decode with the system default first so the
            // meta tag (which is plain ASCII) can be searched for a charset.
            string defaultText = Encoding.Default.GetString(data);
            Match declared = MetaCharsetRegex.Match(defaultText);
            if (!declared.Success)
            {
                return defaultText;
            }

            Encoding detected;
            try
            {
                detected = Encoding.GetEncoding(declared.Groups["charset"].Value);
            }
            catch (ArgumentException)
            {
                // The page declares an encoding this runtime does not know;
                // keep the default decoding rather than failing the whole download.
                return defaultText;
            }
            catch (NotSupportedException)
            {
                return defaultText;
            }

            return detected.GetString(data);
        }

        /// <summary>
        /// Extracts every image tag from <paramref name="data"/> and returns their HTML
        /// with relative sources resolved against <paramref name="url"/>.
        /// </summary>
        /// <param name="data">HTML text of the page.</param>
        /// <param name="url">Address the page was loaded from; used as the base for relative image paths.</param>
        /// <returns>The image tags, each followed by "&lt;br /&gt;" and a line break.</returns>
        string getPictures(string data, string url)
        {
            MatchCollection tags = ImgTagRegex.Matches(data);
            StringBuilder result = new StringBuilder();
            foreach (Match tag in tags)
            {
                pictures p = new pictures(tag.Value, url);
                result.Append(p.GetHtml).Append("<br />").Append(Environment.NewLine);
            }
            return result.ToString();
        }

        /// <summary>
        /// Represents one image tag and gives access to its attributes.
        /// </summary>
        public class pictures
        {
            /// <summary>Matches a leading URI scheme such as "http:" or "data:".</summary>
            private static readonly Regex SchemeRegex = new Regex(
                @"^[a-z][a-z0-9+.\-]*:",
                RegexOptions.IgnoreCase);

            private string _html = string.Empty;

            // Full address of the page the tag came from. Keeping the complete URL
            // (including port and directory) lets relative paths resolve correctly.
            private readonly Uri _pageUri;

            /// <summary>
            /// Creates the image entity and rewrites a relative <c>src</c> to an absolute URL.
            /// </summary>
            /// <param name="strHtml">HTML of the image tag.</param>
            /// <param name="baseUrl">Absolute address of the page containing the tag.</param>
            public pictures(string strHtml, string baseUrl)
            {
                _html = strHtml ?? string.Empty;
                _pageUri = new Uri(baseUrl);
                setSrc();
            }

            /// <summary>Gets the HTML of the tag (with the source already made absolute).</summary>
            public string GetHtml
            {
                get { return _html; }
            }

            /// <summary>Gets the value of the alt attribute, or an empty string when absent.</summary>
            public string Alt
            {
                get { return GetAttribute("alt")[0]; }
            }

            /// <summary>Gets the value of the src attribute, or an empty string when absent.</summary>
            public string Src
            {
                get { return GetAttribute("src")[0]; }
            }

            /// <summary>
            /// Converts a relative path to an absolute URL using a base URL.
            /// </summary>
            /// <param name="baseUrl">Base URL.</param>
            /// <param name="u">Relative path to convert.</param>
            /// <returns>The absolute URL.</returns>
            public string absUrl(string baseUrl, string u)
            {
                Uri ub = new Uri(baseUrl);
                Uri ua = new Uri(ub, u);
                return ua.AbsoluteUri;
            }

            /// <summary>
            /// Replaces a relative src attribute in the tag with its absolute form.
            /// Sources that already carry a scheme (http:, https:, data:, ...) are left untouched.
            /// </summary>
            private void setSrc()
            {
                Match attribute = Regex.Match(_html, BuildAttributePattern("src"), RegexOptions.IgnoreCase);
                if (!attribute.Success)
                {
                    return;
                }

                // The original casing is kept: URL paths may be case-sensitive.
                string src = attribute.Groups["value"].Value.Trim();
                if (src.Length == 0 || SchemeRegex.IsMatch(src))
                {
                    return;
                }

                Uri absolute;
                if (!Uri.TryCreate(_pageUri, src, out absolute))
                {
                    return;
                }

                // Splice by position instead of Regex.Replace so that only this one
                // attribute is touched and '$' in the URL is not treated as a substitution.
                string replacement = "src=\"" + absolute.AbsoluteUri + "\"";
                _html = _html.Substring(0, attribute.Index)
                    + replacement
                    + _html.Substring(attribute.Index + attribute.Length);
            }

            /// <summary>
            /// Builds a pattern matching one attribute and capturing its value in the
            /// "value" group. Double-quoted, single-quoted and unquoted values are supported.
            /// </summary>
            /// <param name="name">Attribute name.</param>
            /// <returns>The regular expression pattern.</returns>
            private static string BuildAttributePattern(string name)
            {
                // The look-behind keeps e.g. "data-src" from matching a request for "src".
                return @"(?<![\w\-:])" + Regex.Escape(name)
                    + @"\s*=\s*(?:""(?<value>[^""]*)""|'(?<value>[^']*)'|(?<value>[^\s""'<>]+))";
            }

            /// <summary>
            /// Gets the values of an attribute of the tag.
            /// </summary>
            /// <param name="strAttributeName">Attribute name.</param>
            /// <returns>All values found; a single empty string when the attribute is absent.</returns>
            private string[] GetAttribute(string strAttributeName)
            {
                List<string> lstAttribute = new List<string>();
                MatchCollection matchs = Regex.Matches(
                    _html,
                    BuildAttributePattern(strAttributeName),
                    RegexOptions.IgnoreCase);
                foreach (Match m in matchs)
                {
                    lstAttribute.Add(m.Groups["value"].Value);
                }
                if (lstAttribute.Count == 0)
                {
                    lstAttribute.Add("");
                }
                return lstAttribute.ToArray();
            }
        }
    }

调用:

new WebPageImage().getImages("http://www.sina.com")

结果:

你可能感兴趣的:(网页)