使用MSHTML解析HTML代码

从国外站点转过来的:

C++版本:

I have a lot of experience in programming low-level MSHTML and I always see questions on how one can use MSHTML to parse HTML and then access elements via the DOM.

 

Well, here it is. I use IMarkupServices provided by MSHTML. There is no need for an IOleClientSite or any sort of embedding. I think this is just about as light as anyone can get.

In future articles, I will be concentrating on the reuse of MSHTML in other aspects of programming, such as using MSHTML as an editor.

This code makes use of simple COM calls and nothing more. It can be easily adapted for ATL, MFC and VB, among other languages. Please don't ask me to provide samples in other languages. In order to build this, you need the IE SDK.

/******************************************************************
 * ParseHTML.cpp
 *
 * ParseHTML: Lightweight UI-less HTML parser using MSHTML
 *
 * Note: This is for accessing the DOM only. No image download,
 *       script execution, etc...
 *
 * 8 June 2001 - Asher Kobin ([email protected])
 *
 * THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY
 * OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT
 * LIMITED TO THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR
 * FITNESS FOR A PARTICULAR PURPOSE.
 *
 *******************************************************************/

#include <windows.h>
#include <mshtml.h>

/* The markup that gets parsed into a DOM. */
OLECHAR szHTML[] = OLESTR("<HTML><BODY>Hello World!</BODY></HTML>");

/*
 * Entry point: creates an MSHTML document object, parses szHTML into a
 * markup container through IMarkupServices, and reads the body's
 * innerText from the resulting document.
 *
 * Every COM pointer is initialised to NULL and tested before use, so a
 * failing call simply skips the dependent steps. The process exit code is
 * TRUE regardless of whether parsing succeeded.
 */
int __stdcall WinMain(HINSTANCE hInst, HINSTANCE hPrev, LPSTR lpCmdLine, int nShowCmd)
{
    IHTMLDocument2 *pDoc = NULL;

    /* Remember the result so CoUninitialize is only paired with a
       successful CoInitialize. */
    HRESULT hrInit = CoInitialize(NULL);

    CoCreateInstance(CLSID_HTMLDocument, NULL, CLSCTX_INPROC_SERVER,
                     IID_IHTMLDocument2, (LPVOID *) &pDoc);

    if (pDoc)
    {
        IPersistStreamInit *pPersist = NULL;

        pDoc->QueryInterface(IID_IPersistStreamInit, (LPVOID *) &pPersist);

        if (pPersist)
        {
            IMarkupServices *pMS = NULL;

            /* Initialise the document to an empty state before parsing. */
            pPersist->InitNew();
            pPersist->Release();

            pDoc->QueryInterface(IID_IMarkupServices, (LPVOID *) &pMS);

            if (pMS)
            {
                IMarkupContainer *pMC = NULL;
                IMarkupPointer *pMkStart = NULL;
                IMarkupPointer *pMkFinish = NULL;

                pMS->CreateMarkupPointer(&pMkStart);
                pMS->CreateMarkupPointer(&pMkFinish);

                /* Only parse when both markup pointers were created. */
                if (pMkStart && pMkFinish)
                    pMS->ParseString(szHTML, 0, &pMC, pMkStart, pMkFinish);

                if (pMC)
                {
                    IHTMLDocument2 *pNewDoc = NULL;

                    /* Request the interface that matches the pointer type. */
                    pMC->QueryInterface(IID_IHTMLDocument2, (LPVOID *) &pNewDoc);

                    if (pNewDoc)
                    {
                        /* do anything with pNewDoc, in this case
                           get the body innerText. */
                        IHTMLElement *pBody = NULL;

                        pNewDoc->get_body(&pBody);

                        if (pBody)
                        {
                            BSTR strText = NULL;

                            pBody->get_innerText(&strText);
                            pBody->Release();

                            /* SysFreeString accepts NULL, so this is safe
                               even if get_innerText produced nothing. */
                            SysFreeString(strText);
                        }

                        pNewDoc->Release();
                    }

                    pMC->Release();
                }

                if (pMkStart)
                    pMkStart->Release();

                if (pMkFinish)
                    pMkFinish->Release();

                pMS->Release();
            }
        }

        pDoc->Release();
    }

    if (SUCCEEDED(hrInit))
        CoUninitialize();

    return TRUE;
}

 

Delphi版本1:

// Add to the uses clause: MSHTML, ActiveX, ComObj

const
  IID_IPersistStreamInit : TGUID = '{7FD52380-4E07-101B-AE2D-08002B2EC713}';

// Parses a fixed HTML string with MSHTML's IMarkupServices (no UI, no
// embedding) and shows the body's innerText in the control "m".
//
// Interface variables are reference-counted by the compiler, so no manual
// _Release calls are made: calling _Release by hand and then letting the
// compiler release the same variable again would over-release the object.
procedure TFormMain.FormCreate(Sender: TObject);
var
  pDoc      : IHTMLDocument2;
  pNewDoc   : IHTMLDocument2;
  pPersist  : IPersistStreamInit;
  pMS       : IMarkupServices;
  pMC       : IMarkupContainer;
  pMkStart  : IMarkupPointer;
  pMkFinish : IMarkupPointer;
  pBody     : IHTMLElement;
  strText   : string;
  szHTML    : widestring;
  didInit   : boolean;
begin
  didInit := Succeeded(CoInitialize(nil));
  try
    szHTML := '<HTML><BODY>Hello World!</BODY></HTML>';

    CoCreateInstance(CLASS_HTMLDocument, nil, CLSCTX_INPROC_SERVER, IID_IHTMLDocument2, pDoc);
    if pDoc <> nil then
    begin
      pDoc.QueryInterface(IID_IPersistStreamInit, pPersist);
      if pPersist <> nil then
      begin
        // Initialise the document to an empty state before parsing.
        pPersist.InitNew;

        pDoc.QueryInterface(IID_IMarkupServices, pMS);
        if pMS <> nil then
        begin
          pMS.CreateMarkupPointer(pMkStart);
          pMS.CreateMarkupPointer(pMkFinish);

          // ParseString receives the first character by reference, i.e. a
          // pointer to the start of the wide string buffer.
          pMS.ParseString(word(szHTML[1]), 0, pMC, pMkStart, pMkFinish);
          if pMC <> nil then
          begin
            // Request the interface that matches the variable's type.
            pMC.QueryInterface(IID_IHTMLDocument2, pNewDoc);
            if pNewDoc <> nil then
            begin
              // do anything with pNewDoc, in this case
              // get the body innerText.
              pBody := pNewDoc.Get_body;
              if pBody <> nil then
              begin
                strText := pBody.Get_innerText;
                m.Text := strText;
              end;
            end;
          end;
        end;
      end;
    end;
  finally
    // Drop every COM reference before COM is shut down; otherwise the
    // compiler would release them at the end of the procedure, after
    // CoUninitialize has already run.
    pBody := nil;
    pNewDoc := nil;
    pMC := nil;
    pMkStart := nil;
    pMkFinish := nil;
    pMS := nil;
    pPersist := nil;
    pDoc := nil;

    if didInit then
      CoUninitialize;
  end;
end;

 

Delphi版本2:

// Add to the uses clause: MSHTML, ActiveX, ComObj

const
  IID_IPersistStreamInit : TGUID = '{7FD52380-4E07-101B-AE2D-08002B2EC713}';

// Loads a fixed HTML string into an MSHTML document by assigning it to the
// body's innerHTML, then shows the body's innerText in the control "m".
//
// Interface variables are reference-counted by the compiler, so no manual
// _Release call is made on pDoc.
procedure TFormMain.FormCreate(Sender: TObject);
var
  pDoc    : IHTMLDocument2;
  pBody   : IHTMLElement;
  strText : string;
  szHTML  : widestring;
  didInit : boolean;
begin
  didInit := Succeeded(CoInitialize(nil));
  try
    szHTML := '<HTML><BODY>Hello World!</BODY></HTML>';

    CoCreateInstance(CLASS_HTMLDocument, nil, CLSCTX_INPROC_SERVER, IID_IHTMLDocument2, pDoc);
    if pDoc <> nil then
    begin
      pDoc.Set_designMode('On'); // no script execution

      // Pump messages until the document reports that it is ready.
      while not (pDoc.readyState = 'complete') do
        Application.ProcessMessages;

      // Fetch the body once and guard against nil before writing to it.
      pBody := pDoc.Get_body;
      if pBody <> nil then
      begin
        pBody.innerHTML := szHTML;
        strText := pBody.Get_innerText;
      end
      else
        strText := '';

      m.Text := strText;
    end;
  finally
    // Drop the COM references before COM is shut down; otherwise the
    // compiler would release them after CoUninitialize has already run.
    pBody := nil;
    pDoc := nil;

    if didInit then
      CoUninitialize;
  end;
end;

// ============== Other Useful Routines ===============

// Returns the HTML source of Document by saving it through
// IPersistStreamInit into an in-memory stream and reading the bytes back.
// Returns '' when Document is nil, does not support IPersistStreamInit,
// or when saving/reading fails.
//
// NOTE(review): the saved bytes are treated as single-byte (ANSI) text;
// confirm this matches the encoding MSHTML writes for your documents.
function GetHTMLSource(Document: IDispatch) : string;
var
  pStream   : IStream;
  pPersist  : IPersistStreamInit;
  li, lo    : int64;
  stat      : STATSTG;
  str       : AnsiString;
  BytesRead : longint;
begin
  Result := '';
  if Document = nil then
    Exit;

  if SUCCEEDED(CreateStreamOnHGlobal(0, TRUE, pStream)) then
  begin
    if SUCCEEDED(Document.QueryInterface(IID_IPersistStreamInit, pPersist)) then
    begin
      if not SUCCEEDED(pPersist.Save(pStream, FALSE)) then
        Exit;

      // Rewind so the read below starts at the first saved byte.
      li := 0;
      pStream.Seek(li, STREAM_SEEK_SET, lo);

      // STATFLAG_NONAME: do not allocate a name string that would then
      // have to be freed by the caller.
      if not SUCCEEDED(pStream.Stat(stat, STATFLAG_NONAME)) then
        Exit;

      // Nothing to read; also avoids taking @str[1] on an empty string.
      if stat.cbSize <= 0 then
        Exit;

      SetLength(str, stat.cbSize);
      BytesRead := 0;
      if SUCCEEDED(pStream.Read(@str[1], stat.cbSize, @BytesRead)) then
      begin
        // Keep only the bytes that were actually read.
        SetLength(str, BytesRead);
        Result := string(str);
      end;
    end;
  end;
end;

// Replaces the content of Document with the HTML in value by loading it
// through IPersistStreamInit from an in-memory stream. Does nothing when
// Document is nil or does not support IPersistStreamInit.
//
// NOTE(review): value is written as single-byte (ANSI) text; confirm this
// is the encoding you want MSHTML to load.
procedure SetHTMLSource(Document: IDispatch; value: string);
var
  stm   : TMemoryStream;
  psi   : IPersistStreamInit;
  sa    : IStream;
  bytes : AnsiString;
begin
  if Document = nil then
    Exit;

  // Check for the interface first so nothing is allocated on failure.
  if not SUCCEEDED(Document.QueryInterface(IID_IPersistStreamInit, psi)) then
    Exit;

  bytes := AnsiString(value);

  stm := TMemoryStream.Create;
  // soOwned: the adapter frees stm when its last reference goes away.
  // Holding the adapter in an interface variable lets the compiler manage
  // that lifetime.
  sa := TStreamAdapter.Create(stm, soOwned);

  // Guard the empty case: bytes[1] is invalid on an empty string.
  if Length(bytes) > 0 then
    stm.Write(bytes[1], Length(bytes));
  stm.Position := 0;

  psi.Load(sa);
end;

你可能感兴趣的:(html,Stream,server,null,download,Delphi)