功能:
将HTML源码解析成IHTMLDocument2对象,然后将IHTMLDocument2转换成IHTMLDocument3,使用DOMNode将HTML显示成一棵树。此解析不执行任何脚本,不从网上下载任何资料,是一个纯文本的解析。
(方法 Parse(string s) 是一个轻量级的 Parsing 实现。这段代码不会从网上下载任何资料,也不会执行任何脚本,纯属 Parsing。
Parsing 是通过 MSHTML 的 Markup Service 实现的。要正确使用这段代码,需要添加 MSHTML 引用。)
要正确编译如下代码,还需要修改unsafe(启用不安全模式)编译器选项,将其开启。
方法:在“项目”->“<应用程序名称>属性”对话框中打开“配置属性”,选中“生成”项,修改“允许不安全代码块”的内容为true.
[C#]
using
System;
using System.Drawing;
using System.Collections;
using System.ComponentModel;
using System.Windows.Forms;
using System.Data;
using mshtml;
using System.Runtime.InteropServices;
using System.IO;
namespace WindowsApplication1
{
/// <summary>
/// Managed declaration of the COM <c>IPersistStreamInit</c> interface, imported
/// by IID so that an MSHTML document object can be cast to it and initialised
/// via <see cref="InitNew"/>.
/// </summary>
/// <remarks>
/// This is a <c>ComImport</c> / <c>InterfaceIsIUnknown</c> interface, so the
/// declaration order of the members defines the call slots. Do not reorder,
/// insert or remove members.
/// The IID string is written without surrounding whitespace (the canonical
/// "D" GUID format).
/// </remarks>
[ComVisible(true), ComImport(), Guid("7FD52380-4E07-101B-AE2D-08002B2EC713"), InterfaceTypeAttribute(ComInterfaceType.InterfaceIsIUnknown)]
public interface IPersistStreamInit
{
    /// <summary>Retrieves the class identifier of the object.</summary>
    void GetClassID([In, Out] ref Guid pClassID);

    /// <summary>
    /// Returns the raw result code instead of throwing (PreserveSig).
    /// </summary>
    [return: MarshalAs(UnmanagedType.I4)]
    [PreserveSig]
    int IsDirty();

    /// <summary>Initialises the object from the given stream.</summary>
    void Load([In, MarshalAs(UnmanagedType.Interface)] UCOMIStream pstm);

    /// <summary>Saves the object to the given stream.</summary>
    void Save([In, MarshalAs(UnmanagedType.Interface)] UCOMIStream pstm,
        [In, MarshalAs(UnmanagedType.I4)] int fClearDirty);

    // NOTE(review): the parameter is a by-value long marked [Out]/LPArray, so the
    // caller cannot observe a result through it. Nothing in this file calls this
    // member; confirm the marshalling before relying on it.
    void GetSizeMax([Out, MarshalAs(UnmanagedType.LPArray)] long pcbSize);

    /// <summary>Initialises the object to a default, empty state.</summary>
    void InitNew();
}
/// <summary>
/// Demo form: clicking the button reads an HTML file from disk, parses it with
/// the MSHTML markup services and displays the resulting DOM in a TreeView.
/// </summary>
public class Form1 : System.Windows.Forms.Form
{
    private System.Windows.Forms.Button button1;
    private System.Windows.Forms.TreeView treeView1;

    /// <summary>
    /// Required designer variable.
    /// </summary>
    private System.ComponentModel.Container components = null;

    /// <summary>
    /// Creates the form and builds its controls.
    /// </summary>
    public Form1()
    {
        // Required for Windows Forms designer support.
        InitializeComponent();
    }

    /// <summary>
    /// Cleans up any resources being used.
    /// </summary>
    /// <param name="disposing">
    /// True when called from Dispose(); the component container is only
    /// disposed in that case.
    /// </param>
    protected override void Dispose(bool disposing)
    {
        if (disposing && components != null)
        {
            components.Dispose();
        }
        base.Dispose(disposing);
    }

    #region Windows Form Designer generated code
    /// <summary>
    /// Required method for designer support - do not modify the contents of
    /// this method with the code editor.
    /// </summary>
    private void InitializeComponent()
    {
        this.button1 = new System.Windows.Forms.Button();
        this.treeView1 = new System.Windows.Forms.TreeView();
        this.SuspendLayout();
        //
        // button1
        //
        this.button1.Location = new System.Drawing.Point(24, 16);
        this.button1.Name = " button1 ";
        this.button1.Size = new System.Drawing.Size(88, 24);
        this.button1.TabIndex = 0;
        this.button1.Text = " button1 ";
        this.button1.Click += new System.EventHandler(this.button1_Click);
        //
        // treeView1
        //
        this.treeView1.ImageIndex = -1;
        this.treeView1.Location = new System.Drawing.Point(280, 96);
        this.treeView1.Name = " treeView1 ";
        this.treeView1.SelectedImageIndex = -1;
        this.treeView1.Size = new System.Drawing.Size(288, 224);
        this.treeView1.TabIndex = 1;
        //
        // Form1
        //
        this.AutoScaleBaseSize = new System.Drawing.Size(6, 14);
        this.ClientSize = new System.Drawing.Size(664, 333);
        this.Controls.Add(this.treeView1);
        this.Controls.Add(this.button1);
        this.Name = " Form1 ";
        this.Text = " Form1 ";
        this.ResumeLayout(false);
    }
    #endregion

    /// <summary>
    /// The main entry point for the application.
    /// </summary>
    [STAThread]
    static void Main()
    {
        Application.Run(new Form1());
    }

    /// <summary>
    /// Parses an HTML string into a document through the MSHTML markup
    /// services (IMarkupServices.ParseString).
    /// </summary>
    /// <param name="s">HTML source text to parse.</param>
    /// <returns>
    /// The parsed markup container viewed as an IHTMLDocument2, or null when
    /// <paramref name="s"/> is null, when the document object does not expose
    /// the required interfaces, or when no container is produced.
    /// </returns>
    unsafe IHTMLDocument2 Parse(string s)
    {
        // A null string would marshal to a null buffer, which must not be
        // dereferenced below.
        if (s == null)
        {
            return null;
        }

        IHTMLDocument2 pDocument = new HTMLDocumentClass();

        // The document has to be initialised before its markup services are used.
        IPersistStreamInit pPersist = pDocument as IPersistStreamInit;
        if (pPersist == null)
        {
            return null;
        }
        pPersist.InitNew();

        IMarkupServices ms = pDocument as IMarkupServices;
        if (ms == null)
        {
            return null;
        }

        IMarkupContainer pMC = null;
        IMarkupPointer pStart, pEnd;
        ms.CreateMarkupPointer(out pStart);
        ms.CreateMarkupPointer(out pEnd);

        // Copy the text into unmanaged memory as a null-terminated UTF-16
        // string so a reference to its first character can be handed over.
        IntPtr pSource = Marshal.StringToHGlobalUni(s);
        try
        {
            ms.ParseString(ref *(ushort*)pSource.ToPointer(), 0, out pMC, pStart, pEnd);
        }
        finally
        {
            // The buffer came from StringToHGlobalUni, so it must be returned
            // with FreeHGlobal. Marshal.Release is only valid for COM
            // interface pointers.
            Marshal.FreeHGlobal(pSource);
        }

        return pMC as IHTMLDocument2;
    }

    /// <summary>
    /// Loads the sample HTML file, parses it and fills the tree view with its
    /// DOM structure. Writes a message to the console and returns when the
    /// file does not exist.
    /// </summary>
    private void button1_Click(object sender, System.EventArgs e)
    {
        // The path must not carry leading or trailing spaces, otherwise the
        // existence check cannot match the intended file.
        string filename = "D:\\NetC#Program\\html\\163.htm";
        if (!File.Exists(filename))
        {
            Console.WriteLine(" 文件不存在 ");
            return;
        }

        // Read the whole file in one call; the using block closes the reader
        // and the underlying file stream even if reading throws.
        string html;
        using (StreamReader reader = new StreamReader(
            File.OpenRead(filename), System.Text.Encoding.Default))
        {
            html = reader.ReadToEnd();
        }

        IHTMLDocument2 doc2 = Parse(html);
        if (doc2 == null)
        {
            // Parse returns null when the document could not be produced.
            return;
        }
        Console.WriteLine(doc2.styleSheets.length);

        IHTMLDocument3 doc3 = (IHTMLDocument3)doc2;
        IHTMLDOMNode rootDomNode = (IHTMLDOMNode)doc3.documentElement;
        if (rootDomNode == null)
        {
            return;
        }

        // Suspend repainting while the nodes are being added.
        treeView1.BeginUpdate();
        try
        {
            TreeNode root = treeView1.Nodes.Add(" HTML ");
            InsertDOMNodes(rootDomNode, root);
        }
        finally
        {
            treeView1.EndUpdate();
        }
    }

    /// <summary>
    /// Recursively mirrors the children of a DOM node under a tree node.
    /// Text nodes are added by their trimmed value (skipped when empty);
    /// every other node is added by its node name and then descended into.
    /// </summary>
    /// <param name="parentnode">DOM node whose children are inserted.</param>
    /// <param name="tree_node">Tree node that receives the children.</param>
    private void InsertDOMNodes(IHTMLDOMNode parentnode, TreeNode tree_node)
    {
        if (parentnode == null || !parentnode.hasChildNodes())
        {
            return;
        }

        IHTMLDOMChildrenCollection allchild = (IHTMLDOMChildrenCollection)parentnode.childNodes;
        int length = allchild.length;
        for (int i = 0; i < length; i++)
        {
            IHTMLDOMNode child_node = (IHTMLDOMNode)allchild.item(i);
            string nodeName = child_node.nodeName;

            // The comparison literal must be exactly "#text"; with padding
            // it would never equal a text node's name.
            if (nodeName.Equals("#text"))
            {
                object nodeValue = child_node.nodeValue;
                string text = (nodeValue == null) ? "" : nodeValue.ToString().Trim();
                if (text.Length > 0)
                {
                    tree_node.Nodes.Add(text);
                }
            }
            else
            {
                TreeNode childTreeNode = tree_node.Nodes.Add(nodeName);
                InsertDOMNodes(child_node, childTreeNode);
            }
        }
    }
}
}
using System.Drawing;
using System.Collections;
using System.ComponentModel;
using System.Windows.Forms;
using System.Data;
using mshtml;
using System.Runtime.InteropServices;
using System.IO;
namespace WindowsApplication1
{
/// <summary>
/// Managed declaration of the COM <c>IPersistStreamInit</c> interface, imported
/// by IID so that an MSHTML document object can be cast to it and initialised
/// via <see cref="InitNew"/>.
/// </summary>
/// <remarks>
/// This is a <c>ComImport</c> / <c>InterfaceIsIUnknown</c> interface, so the
/// declaration order of the members defines the call slots. Do not reorder,
/// insert or remove members.
/// The IID string is written without surrounding whitespace (the canonical
/// "D" GUID format).
/// </remarks>
[ComVisible(true), ComImport(), Guid("7FD52380-4E07-101B-AE2D-08002B2EC713"), InterfaceTypeAttribute(ComInterfaceType.InterfaceIsIUnknown)]
public interface IPersistStreamInit
{
    /// <summary>Retrieves the class identifier of the object.</summary>
    void GetClassID([In, Out] ref Guid pClassID);

    /// <summary>
    /// Returns the raw result code instead of throwing (PreserveSig).
    /// </summary>
    [return: MarshalAs(UnmanagedType.I4)]
    [PreserveSig]
    int IsDirty();

    /// <summary>Initialises the object from the given stream.</summary>
    void Load([In, MarshalAs(UnmanagedType.Interface)] UCOMIStream pstm);

    /// <summary>Saves the object to the given stream.</summary>
    void Save([In, MarshalAs(UnmanagedType.Interface)] UCOMIStream pstm,
        [In, MarshalAs(UnmanagedType.I4)] int fClearDirty);

    // NOTE(review): the parameter is a by-value long marked [Out]/LPArray, so the
    // caller cannot observe a result through it. Nothing in this file calls this
    // member; confirm the marshalling before relying on it.
    void GetSizeMax([Out, MarshalAs(UnmanagedType.LPArray)] long pcbSize);

    /// <summary>Initialises the object to a default, empty state.</summary>
    void InitNew();
}
/// <summary>
/// Demo form: clicking the button reads an HTML file from disk, parses it with
/// the MSHTML markup services and displays the resulting DOM in a TreeView.
/// </summary>
public class Form1 : System.Windows.Forms.Form
{
    private System.Windows.Forms.Button button1;
    private System.Windows.Forms.TreeView treeView1;

    /// <summary>
    /// Required designer variable.
    /// </summary>
    private System.ComponentModel.Container components = null;

    /// <summary>
    /// Creates the form and builds its controls.
    /// </summary>
    public Form1()
    {
        // Required for Windows Forms designer support.
        InitializeComponent();
    }

    /// <summary>
    /// Cleans up any resources being used.
    /// </summary>
    /// <param name="disposing">
    /// True when called from Dispose(); the component container is only
    /// disposed in that case.
    /// </param>
    protected override void Dispose(bool disposing)
    {
        if (disposing && components != null)
        {
            components.Dispose();
        }
        base.Dispose(disposing);
    }

    #region Windows Form Designer generated code
    /// <summary>
    /// Required method for designer support - do not modify the contents of
    /// this method with the code editor.
    /// </summary>
    private void InitializeComponent()
    {
        this.button1 = new System.Windows.Forms.Button();
        this.treeView1 = new System.Windows.Forms.TreeView();
        this.SuspendLayout();
        //
        // button1
        //
        this.button1.Location = new System.Drawing.Point(24, 16);
        this.button1.Name = " button1 ";
        this.button1.Size = new System.Drawing.Size(88, 24);
        this.button1.TabIndex = 0;
        this.button1.Text = " button1 ";
        this.button1.Click += new System.EventHandler(this.button1_Click);
        //
        // treeView1
        //
        this.treeView1.ImageIndex = -1;
        this.treeView1.Location = new System.Drawing.Point(280, 96);
        this.treeView1.Name = " treeView1 ";
        this.treeView1.SelectedImageIndex = -1;
        this.treeView1.Size = new System.Drawing.Size(288, 224);
        this.treeView1.TabIndex = 1;
        //
        // Form1
        //
        this.AutoScaleBaseSize = new System.Drawing.Size(6, 14);
        this.ClientSize = new System.Drawing.Size(664, 333);
        this.Controls.Add(this.treeView1);
        this.Controls.Add(this.button1);
        this.Name = " Form1 ";
        this.Text = " Form1 ";
        this.ResumeLayout(false);
    }
    #endregion

    /// <summary>
    /// The main entry point for the application.
    /// </summary>
    [STAThread]
    static void Main()
    {
        Application.Run(new Form1());
    }

    /// <summary>
    /// Parses an HTML string into a document through the MSHTML markup
    /// services (IMarkupServices.ParseString).
    /// </summary>
    /// <param name="s">HTML source text to parse.</param>
    /// <returns>
    /// The parsed markup container viewed as an IHTMLDocument2, or null when
    /// <paramref name="s"/> is null, when the document object does not expose
    /// the required interfaces, or when no container is produced.
    /// </returns>
    unsafe IHTMLDocument2 Parse(string s)
    {
        // A null string would marshal to a null buffer, which must not be
        // dereferenced below.
        if (s == null)
        {
            return null;
        }

        IHTMLDocument2 pDocument = new HTMLDocumentClass();

        // The document has to be initialised before its markup services are used.
        IPersistStreamInit pPersist = pDocument as IPersistStreamInit;
        if (pPersist == null)
        {
            return null;
        }
        pPersist.InitNew();

        IMarkupServices ms = pDocument as IMarkupServices;
        if (ms == null)
        {
            return null;
        }

        IMarkupContainer pMC = null;
        IMarkupPointer pStart, pEnd;
        ms.CreateMarkupPointer(out pStart);
        ms.CreateMarkupPointer(out pEnd);

        // Copy the text into unmanaged memory as a null-terminated UTF-16
        // string so a reference to its first character can be handed over.
        IntPtr pSource = Marshal.StringToHGlobalUni(s);
        try
        {
            ms.ParseString(ref *(ushort*)pSource.ToPointer(), 0, out pMC, pStart, pEnd);
        }
        finally
        {
            // The buffer came from StringToHGlobalUni, so it must be returned
            // with FreeHGlobal. Marshal.Release is only valid for COM
            // interface pointers.
            Marshal.FreeHGlobal(pSource);
        }

        return pMC as IHTMLDocument2;
    }

    /// <summary>
    /// Loads the sample HTML file, parses it and fills the tree view with its
    /// DOM structure. Writes a message to the console and returns when the
    /// file does not exist.
    /// </summary>
    private void button1_Click(object sender, System.EventArgs e)
    {
        // The path must not carry leading or trailing spaces, otherwise the
        // existence check cannot match the intended file.
        string filename = "D:\\NetC#Program\\html\\163.htm";
        if (!File.Exists(filename))
        {
            Console.WriteLine(" 文件不存在 ");
            return;
        }

        // Read the whole file in one call; the using block closes the reader
        // and the underlying file stream even if reading throws.
        string html;
        using (StreamReader reader = new StreamReader(
            File.OpenRead(filename), System.Text.Encoding.Default))
        {
            html = reader.ReadToEnd();
        }

        IHTMLDocument2 doc2 = Parse(html);
        if (doc2 == null)
        {
            // Parse returns null when the document could not be produced.
            return;
        }
        Console.WriteLine(doc2.styleSheets.length);

        IHTMLDocument3 doc3 = (IHTMLDocument3)doc2;
        IHTMLDOMNode rootDomNode = (IHTMLDOMNode)doc3.documentElement;
        if (rootDomNode == null)
        {
            return;
        }

        // Suspend repainting while the nodes are being added.
        treeView1.BeginUpdate();
        try
        {
            TreeNode root = treeView1.Nodes.Add(" HTML ");
            InsertDOMNodes(rootDomNode, root);
        }
        finally
        {
            treeView1.EndUpdate();
        }
    }

    /// <summary>
    /// Recursively mirrors the children of a DOM node under a tree node.
    /// Text nodes are added by their trimmed value (skipped when empty);
    /// every other node is added by its node name and then descended into.
    /// </summary>
    /// <param name="parentnode">DOM node whose children are inserted.</param>
    /// <param name="tree_node">Tree node that receives the children.</param>
    private void InsertDOMNodes(IHTMLDOMNode parentnode, TreeNode tree_node)
    {
        if (parentnode == null || !parentnode.hasChildNodes())
        {
            return;
        }

        IHTMLDOMChildrenCollection allchild = (IHTMLDOMChildrenCollection)parentnode.childNodes;
        int length = allchild.length;
        for (int i = 0; i < length; i++)
        {
            IHTMLDOMNode child_node = (IHTMLDOMNode)allchild.item(i);
            string nodeName = child_node.nodeName;

            // The comparison literal must be exactly "#text"; with padding
            // it would never equal a text node's name.
            if (nodeName.Equals("#text"))
            {
                object nodeValue = child_node.nodeValue;
                string text = (nodeValue == null) ? "" : nodeValue.ToString().Trim();
                if (text.Length > 0)
                {
                    tree_node.Nodes.Add(text);
                }
            }
            else
            {
                TreeNode childTreeNode = tree_node.Nodes.Add(nodeName);
                InsertDOMNodes(child_node, childTreeNode);
            }
        }
    }
}
}