C# 解析html —— 将html转为XHTML,然后利用Xml解析

呵呵,由于正则不熟,所以另谋出路——利用XML去解析html。

要想将抓取到的数据(直接抓取到的是byte[])  转为XML文档(即XMLDocument对象),有两个要点:

 

一、判断编码(http头 charset 在某些网站上是不准确的)

我利用的是一个第三方开源项目(即下文代码中用到的 UniversalDetector 类)来判断编码,效果还不错:链接(原文此处为超链接)。

二、将html转为XHTML

我利用的是 : SgmlReaderDll.dll ,微软提供的,虽然不是100%的准确,但是足以满足 轻量级的商业需求 。

 

核心代码如下:

    /// <summary>
    /// Turns raw HTML bytes into XHTML text and, from there, into an <see cref="XmlDocument"/>.
    /// The pipeline is: detect the character encoding, decode to a string, run the markup
    /// through <c>HtmlReader</c>/<c>HtmlWriter</c>, keep only the <c>&lt;body&gt;</c> element,
    /// and load the result with an internal DTD that declares common HTML entities.
    /// </summary>
    public class XHtmlTools
    {
        private const string RegBody = @"<body[\s\S]*?>(?<body>[\s\S]*)</body>";

        // Number of bytes handed to the charset detector per call.
        private const int DetectChunkSize = 4096;

        // Parsed once instead of on every Convert() call.
        private static readonly Regex BodyRegex = new Regex(RegBody, RegexOptions.IgnoreCase);

        // XML declaration plus an internal DTD subset; it never changes, so it is built once.
        private static readonly string XmlHead = BuildXmlHead();

        /// <summary>
        /// Builds the XML declaration and the internal DTD subset that declares the HTML
        /// character entities, so that entity references left in the XHTML can be resolved
        /// by <see cref="XmlDocument.LoadXml(string)"/>.
        /// </summary>
        private static string BuildXmlHead()
        {
            StringBuilder head = new StringBuilder();
            head.Append("<?xml version=\"1.0\" encoding=\"utf-8\" ?>");
            head.Append("<!DOCTYPE ARTICLE[");
            head.Append("<!ENTITY nbsp \" \"><!ENTITY iexcl \"¡\"><!ENTITY cent \"¢\"><!ENTITY pound \"£\"><!ENTITY curren \"¤\"><!ENTITY yen \"¥\">");
            head.Append("<!ENTITY brvbar \"¦\"><!ENTITY sect \"§\"><!ENTITY uml \"¨\"><!ENTITY copy \"©\"><!ENTITY ordf \"ª\"><!ENTITY laquo \"«\">");
            head.Append("<!ENTITY not \"¬\"><!ENTITY shy \"-\"><!ENTITY reg \"®\"><!ENTITY macr \"¯\"><!ENTITY deg \"°\"><!ENTITY plusmn \"±\">");
            head.Append("<!ENTITY sup2 \"²\"><!ENTITY sup3 \"³\"><!ENTITY acute \"´\"><!ENTITY micro \"µ\"><!ENTITY para \"¶\"><!ENTITY middot \"·\">");
            head.Append("<!ENTITY cedil \"¸\"><!ENTITY sup1 \"¹\"><!ENTITY ordm \"º\"><!ENTITY raquo \"»\"><!ENTITY frac14 \"¼\"><!ENTITY frac12 \"½\">");
            head.Append("<!ENTITY frac34 \"¾\"><!ENTITY iquest \"¿\"><!ENTITY times \"×\"><!ENTITY divide \"÷\"><!ENTITY Agrave \"À\"><!ENTITY Aacute \"Á\">");
            head.Append("<!ENTITY Acirc \"Â\"><!ENTITY Atilde \"Ã\"><!ENTITY Auml \"Ä\"><!ENTITY Aring \"Å\"><!ENTITY AElig \"Æ\"><!ENTITY Ccedil \"Ç\">");
            head.Append("<!ENTITY Egrave \"È\"><!ENTITY Eacute \"É\"><!ENTITY Ecirc \"Ê\"><!ENTITY Euml \"Ë\"><!ENTITY Igrave \"Ì\"><!ENTITY Iacute \"Í\">");
            head.Append("<!ENTITY Icirc \"Î\"><!ENTITY Iuml \"Ï\"><!ENTITY ETH \"Ð\"><!ENTITY Ntilde \"Ñ\"><!ENTITY Ograve \"Ò\"><!ENTITY Oacute \"Ó\">");
            head.Append("<!ENTITY Ocirc \"Ô\"><!ENTITY Otilde \"Õ\"><!ENTITY Ouml \"Ö\"><!ENTITY Oslash \"Ø\"><!ENTITY Ugrave \"Ù\"><!ENTITY Uacute \"Ú\">");
            head.Append("<!ENTITY Ucirc \"Û\"><!ENTITY Uuml \"Ü\"><!ENTITY Yacute \"Ý\"><!ENTITY THORN \"Þ\"><!ENTITY szlig \"ß\"><!ENTITY agrave \"à\">");
            head.Append("<!ENTITY aacute \"á\"><!ENTITY acirc \"â\"><!ENTITY atilde \"ã\"><!ENTITY auml \"ä\"><!ENTITY aring \"å\"><!ENTITY aelig \"æ\">");
            head.Append("<!ENTITY ccedil \"ç\"><!ENTITY egrave \"è\"><!ENTITY eacute \"é\"><!ENTITY ecirc \"ê\"><!ENTITY euml \"ë\"><!ENTITY igrave \"ì\">");
            head.Append("<!ENTITY iacute \"í\"><!ENTITY icirc \"î\"><!ENTITY iuml \"ï\"><!ENTITY eth \"ð\"><!ENTITY ntilde \"ñ\"><!ENTITY ograve \"ò\">");
            head.Append("<!ENTITY oacute \"ó\"><!ENTITY ocirc \"ô\"><!ENTITY otilde \"õ\"><!ENTITY ouml \"ö\"><!ENTITY oslash \"ø\"><!ENTITY ugrave \"ù\">");
            head.Append("<!ENTITY uacute \"ú\"><!ENTITY ucirc \"û\"><!ENTITY uuml \"ü\"><!ENTITY yacute \"ý\"><!ENTITY thorn \"þ\"><!ENTITY yuml \"ÿ\">");
            head.Append("<!ENTITY lsquo \"‘\"><!ENTITY rsquo \"’\"><!ENTITY ldquo \"“\"><!ENTITY rdquo \"”\"><!ENTITY sbquo \"'\"><!ENTITY mdash \"—\">");
            head.Append("<!ENTITY Prime \"′\"><!ENTITY hellip \"…\">");
            head.Append("]>");
            return head.ToString();
        }

        /// <summary>
        /// Converts raw HTML bytes into an XML document whose content is the page's body.
        /// </summary>
        /// <param name="html">The raw bytes of the downloaded page; may be null.</param>
        /// <returns>
        /// The loaded document, or null when <paramref name="html"/> is null, the conversion
        /// yields no text, or the converted text is not well-formed XML.
        /// </returns>
        public XmlDocument GetXmlDocument(byte[] html)
        {
            // Check the input before doing any work.
            if (html == null)
            {
                return null;
            }

            string xml = Convert(html);
            if (string.IsNullOrEmpty(xml))
            {
                return null;
            }

            try
            {
                XmlDocument xmlDoc = new XmlDocument();
                // No resolver: the DTD is purely internal and nothing external should be fetched.
                xmlDoc.XmlResolver = null;
                xmlDoc.LoadXml(XmlHead + xml);
                return xmlDoc;
            }
            catch (XmlException)
            {
                // The converted markup was still not well-formed; the caller sees null.
                return null;
            }
        }

        /// <summary>
        /// Converts raw HTML bytes into XHTML text.
        /// </summary>
        /// <param name="html">The raw bytes of the page.</param>
        /// <returns>
        /// The <c>&lt;body&gt;...&lt;/body&gt;</c> portion of the converted markup when one is
        /// found, otherwise the whole converted markup; <c>&lt;body&gt;&lt;/body&gt;</c> when
        /// the conversion fails or produces nothing. Never null or empty.
        /// </returns>
        public string Convert(byte[] html)
        {
            string xml = string.Empty;

            try
            {
                using (HtmlReader reader = new HtmlReader(GetString(html)))
                {
                    StringBuilder sb = new StringBuilder();

                    using (HtmlWriter writer = new HtmlWriter(sb))
                    {
                        // NOTE(review): this relies on the reader reaching EOF; HtmlReader.Read
                        // swallows parse errors, so confirm it cannot stall before EOF.
                        while (!reader.EOF)
                        {
                            writer.WriteNode(reader, true);
                        }
                    }

                    xml = sb.ToString();
                }
            }
            catch (Exception ex)
            {
                // Best effort by design: a failed conversion falls through to the empty-body
                // result below, but the cause is at least traced instead of vanishing.
                System.Diagnostics.Trace.WriteLine(ex);
            }

            Match match = BodyRegex.Match(xml);
            if (match.Success)
            {
                // Whole match, i.e. including the <body> start and end tags.
                xml = match.Value;
            }

            if (string.IsNullOrEmpty(xml))
            {
                xml = "<body></body>";
            }

            return xml;
        }

        /// <summary>
        /// Detects the character encoding of <paramref name="buffer"/> and decodes it.
        /// </summary>
        /// <param name="buffer">The raw bytes to decode; may be null.</param>
        /// <returns>
        /// The decoded text. An empty string for null or empty input. When no charset is
        /// detected, or the detected name is unknown to .NET, the bytes are decoded as UTF-8
        /// rather than being discarded.
        /// </returns>
        public string GetString(byte[] buffer)
        {
            if (buffer == null || buffer.Length == 0)
            {
                return string.Empty;
            }

            UniversalDetector det = new UniversalDetector(null);

            using (MemoryStream source = new MemoryStream(buffer))
            {
                byte[] chunk = new byte[DetectChunkSize];
                int read;

                while ((read = source.Read(chunk, 0, chunk.Length)) > 0 && !det.IsDone())
                {
                    // Feed only the bytes actually read; the tail of a short final chunk
                    // still holds bytes from the previous read.
                    det.HandleData(chunk, 0, read);
                }
            }

            det.DataEnd();

            string charset = det.GetDetectedCharset();
            if (charset != null)
            {
                try
                {
                    return System.Text.Encoding.GetEncoding(charset).GetString(buffer);
                }
                catch (ArgumentException ex)
                {
                    // The detector reported a charset name that .NET does not know.
                    System.Diagnostics.Trace.WriteLine(ex.Message);
                }
            }

            // NOTE(review): presumably the detector reports nothing for input without
            // distinguishing bytes (e.g. plain ASCII); UTF-8 decodes such input correctly.
            return System.Text.Encoding.UTF8.GetString(buffer);
        }
    }



    /// <summary>
    /// An SGML reader preconfigured for HTML that leaves out <c>head</c> and <c>script</c>
    /// elements (including their content) while reading.
    /// </summary>
    public class HtmlReader : Sgml.SgmlReader
    {
        /// <summary>
        /// Creates a reader over an existing text source.
        /// </summary>
        /// <param name="reader">The source of the HTML markup.</param>
        public HtmlReader(TextReader reader)
            : base()
        {
            base.InputStream = reader;
            base.DocType = "HTML";
        }

        /// <summary>
        /// Creates a reader over HTML text held in a string.
        /// </summary>
        /// <param name="content">The HTML markup.</param>
        public HtmlReader(string content)
            : base()
        {
            // NOTE(review): the markup is HTML-decoded before it is parsed, so escaped text
            // such as "&lt;b&gt;" reaches the parser as real tags. Kept as is because the
            // downstream output depends on it; confirm this is intended.
            base.InputStream = new StringReader(System.Web.HttpUtility.HtmlDecode(content));
            base.DocType = "HTML";
        }

        /// <summary>
        /// Advances to the next node, stepping over any <c>head</c> or <c>script</c> element.
        /// </summary>
        /// <returns>
        /// True when the reader is positioned on a node; false at the end of the input or
        /// when the underlying reader threw (the error message is written to the console).
        /// </returns>
        public override bool Read()
        {
            bool status = false;

            try
            {
                status = base.Read();

                if (status
                    && base.NodeType == XmlNodeType.Element
                    && IsSkippedElement(base.Name))
                {
                    base.Skip();

                    // Skipping may have consumed the rest of the input; in that case there
                    // is no current node to report.
                    status = !base.EOF;
                }
            }
            catch (Exception ex)
            {
                // Parse errors are reported and treated as "nothing more to read".
                Console.WriteLine(ex.Message);
            }

            return status;
        }

        /// <summary>
        /// Tells whether an element with the given name is dropped from the output.
        /// Uses an ordinal comparison so the result does not depend on the current culture
        /// (a culture-sensitive ignore-case compare fails to match "SCRIPT" under Turkish rules).
        /// </summary>
        private static bool IsSkippedElement(string name)
        {
            return string.Equals(name, "head", StringComparison.OrdinalIgnoreCase)
                || string.Equals(name, "script", StringComparison.OrdinalIgnoreCase);
        }
    }



    /// <summary>
    /// An XML text writer that cleans up markup coming from an HTML reader: it drops CDATA
    /// sections and comments, discards most whitespace nodes, strips prefixes and unwanted
    /// characters from element and attribute names, and lower-cases those names.
    /// </summary>
    public class HtmlWriter : XmlTextWriter
    {
        // Characters removed from element and attribute names.
        // NOTE(review): the published listing had an empty character literal (which does not
        // compile) and two visually identical blank entries here; presumably these were
        // whitespace characters lost in publishing. They are spelled out below as space,
        // no-break space and ideographic space — confirm against the original source.
        private static readonly char[] chArrFilter = new char[]
        {
            '\'', '=', '?', '\"', '.', ';', ')', '(', ' ', '\u00A0', '\u3000'
        };

        /// <summary>Creates a writer that writes to <paramref name="writer"/>.</summary>
        public HtmlWriter(TextWriter writer)
            : base(writer)
        {
        }

        /// <summary>Creates a writer that appends its output to <paramref name="builder"/>.</summary>
        public HtmlWriter(StringBuilder builder)
            : base(new StringWriter(builder))
        {
        }

        /// <summary>Creates a writer that writes to <paramref name="stream"/> using <paramref name="enc"/>.</summary>
        public HtmlWriter(Stream stream, Encoding enc)
            : base(stream, enc)
        {
        }

        /// <summary>CDATA sections are intentionally not written.</summary>
        public override void WriteCData(string text)
        {
        }

        /// <summary>Comments are intentionally not written.</summary>
        public override void WriteComment(string text)
        {
        }

        /// <summary>
        /// Writes a whitespace node unless it is null, a single space, or contains a CR-LF
        /// pair or a tab.
        /// </summary>
        /// <param name="ws">The whitespace text.</param>
        public override void WriteWhitespace(string ws)
        {
            if (ws == null)
            {
                return;
            }

            // Ordinal search: line breaks and tabs must match byte-for-byte, independent of culture.
            if (ws.IndexOf("\r\n", StringComparison.Ordinal) > -1 || ws.IndexOf('\t') > -1)
            {
                return;
            }

            if (ws != " ")
            {
                base.WriteWhitespace(ws);
            }
        }

        /// <summary>
        /// Writes a start tag using a cleaned-up, lower-cased, prefix-free name and no namespace.
        /// Nothing is written when <paramref name="localName"/> is null or empty.
        /// </summary>
        /// <param name="prefix">Ignored; the element is always written without a prefix.</param>
        /// <param name="localName">The element name as reported by the reader.</param>
        /// <param name="ns">Ignored; the element is always written without a namespace.</param>
        public override void WriteStartElement(string prefix, string localName, string ns)
        {
            if (string.IsNullOrEmpty(localName))
            {
                return;
            }

            // Keep only the part after the last colon so no prefix survives.
            int colon = localName.LastIndexOf(':');
            if (colon > -1)
            {
                localName = localName.Substring(colon + 1);
            }

            // Invariant lower-casing: tag names must not change with the current culture.
            base.WriteStartElement("", StripFilteredChars(localName).ToLowerInvariant(), "");
        }

        /// <summary>
        /// Copies attributes from <paramref name="reader"/>, lower-casing and cleaning their
        /// names. Attributes whose name contains a colon or starts with "xml" are skipped
        /// (except the literal name "xml:space"), as are attributes whose cleaned name is empty.
        /// </summary>
        /// <param name="reader">The reader positioned on an element, XML declaration or attribute.</param>
        /// <param name="defattr">Passed through on the recursive call; otherwise unused here.</param>
        public override void WriteAttributes(XmlReader reader, bool defattr)
        {
            if (reader.NodeType == XmlNodeType.Element || reader.NodeType == XmlNodeType.XmlDeclaration)
            {
                if (reader.MoveToFirstAttribute())
                {
                    // Re-enter on the first attribute, then restore the reader position.
                    this.WriteAttributes(reader, defattr);
                    reader.MoveToElement();
                }

                return;
            }

            if (reader.NodeType != XmlNodeType.Attribute)
            {
                return;
            }

            do
            {
                string name = reader.LocalName.ToLowerInvariant();

                bool prefixedOrReserved = name.LastIndexOf(':') > -1
                    || name.StartsWith("xml", StringComparison.Ordinal);

                if (name != "xml:space" && prefixedOrReserved)
                {
                    // Prefixed and xml-reserved attributes are dropped.
                    continue;
                }

                name = StripFilteredChars(name);
                if (name == "")
                {
                    continue;
                }

                this.WriteStartAttribute("", name, "");

                while (reader.ReadAttributeValue())
                {
                    string value = reader.Value;
                    if (value != "")
                    {
                        this.WriteString(value);
                    }
                }

                this.WriteEndAttribute();
            }
            while (reader.MoveToNextAttribute());
        }

        /// <summary>Removes every character listed in the name filter from <paramref name="name"/>.</summary>
        private static string StripFilteredChars(string name)
        {
            return string.Join("", name.Split(chArrFilter));
        }
    }


 

上述源码及DLL : http://files.cnblogs.com/08shiyan/XHtmlTools.zip 

 

下面再说一下解析XML,我利用的是 XPath:

XPath 和 jQuery所支持的选择器有一定的相似之处,借助jQuery所支持的选择器去理解XPath会更容易一些。

XmlNode.SelectSingleNode

XmlNode.SelectNodes

http://www.cnblogs.com/08shiyan/archive/2013/05/02/3055078.html

 

 

续:

imfunny  分享的 HtmlAgilityPack,开源的力量很强大!

HtmlAgilityPack 里的部分类 的元属性截图
[图片:C# 解析html —— 将html转为XHTML,然后利用Xml解析]

支持多个 .NET 版本
[图片:C# 解析html —— 将html转为XHTML,然后利用Xml解析]

HtmlAgilityPack地址:http://htmlagilitypack.codeplex.com/ 

 

你可能感兴趣的:(XHTML)