//CparseHTML.h
#include <afx.h>
#include <iostream>
#include <comdef.h>
#include <CString>
#include <mshtml.h>
#include <String>
#pragma warning (disable: 4786)
#include <vector>
#pragma warning(disable : 4146) //see Q231931 for explanation
#import <mshtml.tlb> no_auto_exclude
using namespace std;
// Finds RSS feed <link> elements in an HTML page with the MSHTML parser and
// exposes their URLs and titles as string lists.
class CparseHTML
{
public:
    CparseHTML();

    // Produces the MSHTML document object that the two getters below scan.
    IHTMLDocument2Ptr anylyseHTML(char *strHTML);

    // Returns the href values of the <link> elements whose type attribute
    // begins with "application/rss+xml"; strHTML is used as the prefix for
    // hrefs that are not already absolute.
    vector<string> getRSSURL(char *strHTML);

    // Returns the title values of the same <link> elements.
    vector<string> getRSSTitle(char *strHTML);

    // int getErrorCode();

private:
    vector<string> V_RSSURL;    // feed URLs gathered by getRSSURL()
    vector<string> V_RSSTitle;  // feed titles gathered by getRSSTitle()
};
//CparseHTML.cpp
#include "CparseHTML.h"
using namespace std;
// File-scope MSHTML smart pointers (external linkage; released only during
// static destruction at process exit).
// pDoc3 receives the document returned by anylyseHTML() in getRSSURL() and
// getRSSTitle().
MSHTML::IHTMLDocument3Ptr pDoc3;
// Working pointers for a <link> element collection and a single element.
// NOTE(review): nothing in this file requires these to be globals — consider
// making them function locals.
MSHTML::IHTMLElementCollectionPtr pCollection;
MSHTML::IHTMLElementPtr pElement;
// Default constructor: there is nothing to set up, both result vectors are
// default-constructed (empty).
CparseHTML::CparseHTML() {}
vector<string> CparseHTML::getRSSURL(char * strHTML)
{
char *s1="application/rss+xml";
char *s="http://";
string forstr=strHTML;
pDoc3=anylyseHTML(strHTML);
MSHTML::IHTMLElementCollectionPtr pCollection;
MSHTML::IHTMLElementPtr pElement;
pCollection = pDoc3->getElementsByTagName("link");
if(pCollection==NULL){
pCollection = pDoc3->getElementsByTagName("LINK");
}
CString l_temp;
CString l_tp;
CString l_title;
BSTR bsText;
for(long i=0; i<pCollection->length; i++)
{
pElement = pCollection->item(i, (long)0);
if(pElement != NULL)
{
l_tp=(LPSTR)(LPCTSTR)bstr_t(pElement->getAttribute("type",2));
char *tp=(LPSTR)(LPCTSTR)l_tp;
if(tp==NULL)
cout<<"没有找到RSS源"<<endl;
else{
int m=memcmp(tp,s1,19);
if(m==0)
{
l_temp =(LPCTSTR)bstr_t(pElement->getAttribute("href",2));
HRESULT hr=pElement->get_innerText(&bsText);
char *RSSURL = (LPSTR)(LPCTSTR)l_temp;
int n=memcmp(RSSURL,s,7);
string str1 = RSSURL;
if(n!=0)
{
str1=forstr+str1;
}
V_RSSURL.push_back(str1);
}
}
}
}
return V_RSSURL;
}
vector<string> CparseHTML::getRSSTitle(char * strHTML)
{
char *s1="application/rss+xml";
BSTR bsText;
pDoc3=anylyseHTML(strHTML);
pCollection = pDoc3->getElementsByTagName("link");
if(pCollection==NULL){
pCollection = pDoc3->getElementsByTagName("LINK");
}
CString l_tp;
CString l_title;
for(long i=0; i<pCollection->length; i++)
{
pElement = pCollection->item(i, (long)0);
if(pElement != NULL)
{
l_tp=(LPSTR)(LPCTSTR)bstr_t(pElement->getAttribute("type",2));
char *tp=(LPSTR)(LPCTSTR)l_tp;
if(tp==NULL)
cout<<"没有找到RSS源"<<endl;
else
{
int m=memcmp(tp,s1,19);
if(m==0)
{
l_title= (LPCTSTR)bstr_t(pElement->getAttribute("title",2));
char *RSSTitle=(LPSTR)(LPCTSTR)l_title;
HRESULT hr=pElement->get_innerText(&bsText);
string RSSTit=RSSTitle;
V_RSSTitle.push_back(RSSTit);
}
}
}
}
return V_RSSTitle;
}
// Builds an in-memory MSHTML document from HTML markup and returns it.
//
// strHTML: currently unused — the markup is always read from the fixed file
//          D:\test1.html. NOTE(review): presumably a placeholder until the
//          page named by strHTML is actually fetched — confirm.
// Returns: the document the markup was written to, or a NULL smart pointer
//          when the file cannot be opened or any COM step fails.
IHTMLDocument2Ptr CparseHTML::anylyseHTML(char *strHTML )
{
    (void)strHTML;

    CFile f;
    CString m_csFilename="D:\\test1.html";
    // Open read-only; shareDenyNone lets other processes keep reading and
    // writing the file while it is open here.
    if (!f.Open(m_csFilename, CFile::modeRead|CFile::shareDenyNone))
        return IHTMLDocument2Ptr();

    // Read the whole file, keeping only the bytes that were actually read.
    const UINT fileLength = static_cast<UINT>(f.GetLength());
    string html(fileLength, '\0');
    UINT bytesRead = 0;
    if (fileLength > 0)
        bytesRead = f.Read(&html[0], fileLength);
    f.Close();
    html.resize(bytesRead);

    // COM is initialised but deliberately not uninitialised here: the
    // returned document is used by the caller after this function returns,
    // and CoUninitialize() would tear COM down underneath that live object.
    // RPC_E_CHANGED_MODE means COM is already up on this thread in another
    // mode, which is still usable.
    const HRESULT hrInit = CoInitialize(NULL);
    if (FAILED(hrInit) && hrInit != RPC_E_CHANGED_MODE)
        return IHTMLDocument2Ptr();

    IHTMLDocument2Ptr pDoc;
    HRESULT hr = CoCreateInstance(CLSID_HTMLDocument, NULL, CLSCTX_INPROC_SERVER,
                                  IID_IHTMLDocument2, reinterpret_cast<void**>(&pDoc));
    if (FAILED(hr) || pDoc == NULL)
        return IHTMLDocument2Ptr();

    // IHTMLDocument2::write takes a SAFEARRAY of VARIANTs; one BSTR element
    // carries the whole markup.
    SAFEARRAY* psa = SafeArrayCreateVector(VT_VARIANT, 0, 1);
    if (psa == NULL)
        return IHTMLDocument2Ptr();

    VARIANT *param = NULL;
    hr = SafeArrayAccessData(psa, reinterpret_cast<LPVOID*>(&param));
    if (SUCCEEDED(hr))
    {
        param->vt = VT_BSTR;
        // The array gets its own BSTR copy: SafeArrayDestroy() clears the
        // element, so sharing the _bstr_t's buffer would free it twice.
        param->bstrVal = _bstr_t(html.c_str()).copy();
        // Release the lock taken by SafeArrayAccessData; a locked array
        // cannot be destroyed.
        hr = SafeArrayUnaccessData(psa);
    }
    if (SUCCEEDED(hr))
        hr = pDoc->write(psa);  // feed the markup to the document
    if (SUCCEEDED(hr))
        hr = pDoc->close();     // close the output stream opened by write
    SafeArrayDestroy(psa);

    if (FAILED(hr))
        return IHTMLDocument2Ptr();
    return pDoc;
}
/*int CparseHTML::getErrorCode()
{
}*/
//main.cpp (main function)
#include "CparseHTML.h"
#include <iostream>
using namespace std;
int main()
{
CparseHTML pHTML;
cout<<"please input HTMLURL"<<endl;
char HTMLURL[100];
cin>>HTMLURL;
vector <string> V_RSSURL;
vector <string> V_RSSTitle;
V_RSSURL=pHTML.getRSSURL(HTMLURL);
V_RSSTitle=pHTML.getRSSTitle(HTMLURL);
for(int i=0;i<V_RSSURL.size();i++){
cout<<V_RSSTitle[i]<<endl;
cout<<V_RSSURL[i]<<endl;
}
return 0;
}