内存中html源码用IHTMLDocument2进行DOM方式解析

准备开发一款站群系统，不过不打算使用 WebBrowser 控件，因为 WebBrowser 会下载整个网页甚至图片，而这并非我所需要的——我只需要页面的 HTML 代码即可。而 WinINet 的稳定性又不高，所以最终选择了 WinINet 的升级版 WinHTTP。

不过用 WinHTTP 读取到源码之后，解析是个大问题。本来打算用正则表达式，但我并不擅长，而且需要处理的问题非常多。网上倒是有几个开源的 HTML 解析库，诸如 htmlcxx，但我最终还是放弃了，选择用 IHTMLDocument2 进行解析——毕竟之前用过，更熟悉一些，兼容性方面也不存在什么问题。不过之前使用 IHTMLDocument2 都是在开发 BHO 或带有 WebBrowser 控件的 MFC 程序时，它们都有现成的 document 载体；在 console（控制台）程序中还没有操作过。经过一番 Google，终于写出了可用的代码：

// html.cpp : 定义控制台应用程序的入口点。 // #include "stdafx.h" #include "stdafx.h" #include <WINDOWS.H> #include <winhttp.h> #include <string> #import <mshtml.tlb> #include <mshtml.h> #include <atlbase.h> #include <oleacc.h> #include <oleauto.h> #pragma comment (lib,"oleaut32.lib") #pragma comment (lib,"Winhttp.lib") using namespace std; string g_HtmlBuf; void microsoft() { // First, split up the URL DWORD dwSize = 0; DWORD dwDownloaded = 0; LPSTR pszOutBuffer; BOOL bResults = FALSE; HINTERNET hSession = NULL, hConnect = NULL, hRequest = NULL; // Use WinHttpOpen to obtain a session handle. hSession = WinHttpOpen( L"WinHTTP Example/1.0", WINHTTP_ACCESS_TYPE_DEFAULT_PROXY, WINHTTP_NO_PROXY_NAME, WINHTTP_NO_PROXY_BYPASS, 0 ); // Specify an HTTP server. if( hSession ) hConnect = WinHttpConnect( hSession, L"www.microsoft.com", INTERNET_DEFAULT_HTTPS_PORT, 0 ); // Create an HTTP request handle. if( hConnect ) hRequest = WinHttpOpenRequest( hConnect, L"GET", NULL, NULL, WINHTTP_NO_REFERER, WINHTTP_DEFAULT_ACCEPT_TYPES, WINHTTP_FLAG_SECURE ); // Send a request. if( hRequest ) bResults = WinHttpSendRequest( hRequest, WINHTTP_NO_ADDITIONAL_HEADERS, 0, WINHTTP_NO_REQUEST_DATA, 0, 0, 0 ); // End the request. if( bResults ) bResults = WinHttpReceiveResponse( hRequest, NULL ); // Keep checking for data until there is nothing left. if( bResults ) { do { // Check for available data. dwSize = 0; if( !WinHttpQueryDataAvailable( hRequest, &dwSize ) ) printf( "Error %u in WinHttpQueryDataAvailable./n", GetLastError( ) ); // Allocate space for the buffer. pszOutBuffer = new char[dwSize+1]; if( !pszOutBuffer ) { printf( "Out of memory/n" ); dwSize=0; } else { // Read the data. ZeroMemory( pszOutBuffer, dwSize+1 ); if( !WinHttpReadData( hRequest, (LPVOID)pszOutBuffer, dwSize, &dwDownloaded ) ) printf( "Error %u in WinHttpReadData./n", GetLastError( ) ); else { g_HtmlBuf.append(pszOutBuffer); printf( "%s", pszOutBuffer ); } // Free the memory allocated to the buffer. 
delete [] pszOutBuffer; } } while( dwSize > 0 ); } // Report any errors. if( !bResults ) printf( "Error %d has occurred./n", GetLastError( ) ); // Close any open handles. if( hRequest ) WinHttpCloseHandle( hRequest ); if( hConnect ) WinHttpCloseHandle( hConnect ); if( hSession ) WinHttpCloseHandle( hSession ); } //对内存中的html源码借用IHTMLDocument2进行DOM方式解析 void Analysis_DOM_Html() { // IHTMLDocument2 *document; // Declared earlier in the code MSHTML::IHTMLDocument2Ptr pDoc2; MSHTML::IHTMLDocument3Ptr pDoc3; MSHTML::IHTMLElementCollectionPtr pCollection; MSHTML::IHTMLElementPtr pElement; HRESULT hr =CoInitialize(NULL); hr = CoCreateInstance(CLSID_HTMLDocument, NULL, CLSCTX_INPROC_SERVER, IID_IHTMLDocument2, (void**)&pDoc2); HRESULT hresult = S_OK; VARIANT *param; SAFEARRAY *sfArray; BSTR bstr = _com_util::ConvertStringToBSTR(g_HtmlBuf.c_str()); // Creates a new one-dimensional array sfArray = SafeArrayCreateVector(VT_VARIANT, 0, 1); if (sfArray == NULL || pDoc2 == NULL) { MessageBox(NULL,"失败",NULL,NULL); goto cleanup; } hresult = SafeArrayAccessData(sfArray,(LPVOID*) & param); param->vt = VT_BSTR; param->bstrVal = bstr; hresult = SafeArrayUnaccessData(sfArray); hresult = pDoc2->write(sfArray); //Start Analysis //这个部分自由发挥 hr = pDoc2->get_all(&pCollection); VARIANT name; IDispatch * pDispatch=NULL; if(pCollection!=NULL) { name.vt = VT_I4; //AfxMessageBox( "gethere1 "); for(int i=0;i <100;i++)//遍历所有元素 { name.lVal = i; /////////////////////////////////////// //IDispatch* pDisp; //IHTMLElement *pElem; pDispatch = pCollection-> item(name,name);//获取元素对象指针 pDispatch-> QueryInterface(IID_IHTMLElement,(void**)&pElement); //pAllElem-> item(name,name,(IDispatch**)&pElem);//获取元素对象指针 //ASSERT(pElem);///////////////////////运行出错!!!!!!!!!!!!! 
BSTR tag,outerText,title,innerText,className; pElement-> get_tagName(&tag);//获取tagName pElement-> get_outerText(&outerText); pElement-> get_title(&title); pElement-> get_innerText(&innerText); pElement-> get_className(&className); ::SysFreeString(tag); ::SysFreeString(outerText); ::SysFreeString(title); ::SysFreeString(innerText); ::SysFreeString(className); } } cleanup: SysFreeString(bstr); if (sfArray != NULL) { // SafeArrayDestroy(sfArray); } CoUninitialize(); } int main() { microsoft(); Analysis_DOM_Html(); return (0); }  

你可能感兴趣的:(html,正则表达式,Microsoft,null,WebBrowser,stdstring)