如何解析网页源代码

/// Scans an HTML element collection and forwards every element that exposes
/// IHTMLTable to DoTable().
///
/// @param p_imgColl  Collection to scan; elements that do not implement
///                   IHTMLTable are skipped.
/// @return 0 once the whole collection has been walked; otherwise the source
///         line number (__LINE__) of the check that failed.
int CSurfSafeParser::Parse(IHTMLElementCollection *p_imgColl)
{
 if (!p_imgColl)
  return __LINE__;

 // retrieve the count of elements in the collection
 long cElems=0;
 HRESULT hr = p_imgColl->get_length( &cElems );
 if (FAILED(hr))
  return __LINE__;

 for ( long i=0; i<cElems; i++ )
 {
  _variant_t vIndex(i,VT_I4);
  _variant_t var2(0L,VT_I4);

  // CComPtr releases the dispatch pointer on every path out of the iteration.
  CComPtr<IDispatch> pDisp;
  hr = p_imgColl->item( vIndex, var2, &pDisp );
  if (FAILED(hr) || !pDisp)
   continue;

  // Query straight into the smart pointer: constructing a CComPtr from the
  // raw QueryInterface result would AddRef a second time and leak a reference.
  CComPtr<IHTMLTable> ptable;
  hr = pDisp->QueryInterface( IID_IHTMLTable, reinterpret_cast<void**>(&ptable) );
  if (SUCCEEDED(hr) && ptable)
   DoTable(ptable);
 } // for
 return 0;
}

int CSurfSafeParser::DoTable(IHTMLTable *pElement)
{
 HRESULT hr=S_OK;
// long cols=0;
// pElement->get_cols(&cols);
// if(cols!=2)
// {
//  CString msg;
//  msg.Format("%s:%d table.get_cols() return %d",__FILE__,__LINE__,cols);
//  OutputDebugString(msg);
//  return __LINE__;
// }
  IHTMLElementCollection* prowCol=NULL;
  pElement->get_rows(&prowCol);
  if(prowCol)
  {
   CComPtr<IHTMLElementCollection> rowcolptr(prowCol);
   long rowcount=0;
   rowcolptr->get_length(&rowcount);
   if(rowcount>20)
   {
    for(int rowi=1;rowi<rowcount;rowi++)
    {
     VARIANT vIndexRow;
     vIndexRow.vt = VT_UINT;
     vIndexRow.lVal = rowi;
     VARIANT var0 = { 0 };
     LPDISPATCH pDisp;
     if (SUCCEEDED(hr = rowcolptr->item( vIndexRow, var0, &pDisp )))
     {
      IHTMLElement* pverboselement=NULL;
      if (SUCCEEDED(hr = pDisp->QueryInterface( IID_IHTMLElement, (LPVOID*)&pverboselement )))
      {
       BSTR bstag;
       CComPtr<IHTMLElement> prowptr(pverboselement);
       pverboselement->get_tagName(&bstag);
       _bstr_t bstrtag(bstag);

      }
      IHTMLTableRow* pElement = NULL;
      if (SUCCEEDED(hr = pDisp->QueryInterface( IID_IHTMLTableRow, (LPVOID*)&pElement )))
      {
       CComPtr<IHTMLTableRow> prowptr(pElement);
       DoRow(prowptr);
      }
     }
       
     
    }
   }
   else
   {
    CString msg;
    msg.Format("table row count:%d",rowcount);
    OutputDebugString(msg);
   }
  }
  return 0;
}

int CSurfSafeParser::DoRow(IHTMLTableRow *prow)
{
 if(!prow)
  return __LINE__;
 IHTMLElementCollection* pcelcol=NULL;
 HRESULT hr=S_OK;
 prow->get_cells(&pcelcol);
 if(pcelcol)
 {
  long count =0;
  pcelcol->get_length(&count);
  if(count!=2)
   return __LINE__;

  VARIANT var0 = { 0 };
  VARIANT vIndexRow;
  std::string proxyaddr;
  vIndexRow.vt = VT_UINT;
  {
   vIndexRow.lVal = 0;
   LPDISPATCH pDisp;
   if (SUCCEEDED(hr = pcelcol->item( vIndexRow, var0, &pDisp )))
   {
    IHTMLElement* pElement = NULL;
    if (SUCCEEDED(hr = pDisp->QueryInterface( IID_IHTMLElement, (LPVOID*)&pElement )))
    {
     CComPtr<IHTMLElement> prowptr(pElement);
     std::string strip = GetCellText(prowptr);
     if(strip.length()<4)
      return __LINE__;
     proxyaddr = strip+":";
    }
    pDisp->Release();
   }
  }
  {
   vIndexRow.lVal = 1;
   LPDISPATCH pDisp;
   if (SUCCEEDED(hr = pcelcol->item( vIndexRow, var0, &pDisp )))
   {
    IHTMLElement* pElement = NULL;
    if (SUCCEEDED(hr = pDisp->QueryInterface( IID_IHTMLElement, (LPVOID*)&pElement )))
    {
     CComPtr<IHTMLElement> prowptr(pElement);
     std::string strip = GetCellText(prowptr);
     if(strip.length()<4)
      return __LINE__;
     proxyaddr += strip;
    }
    pDisp->Release();
   }
  }
  CString msg;
  msg.Format("%s:%d address %s",__FILE__,__LINE__,proxyaddr.c_str());
  OutputDebugString(msg);
//  g_ProxyList.push_back(proxyaddr);
  g_ProxyList.AddProxy(proxyaddr.c_str());
 }
}

/// Returns the inner text of an HTML element as a narrow (ANSI) string.
///
/// @param pelement  Element to read; may be null.
/// @return The element's innerText converted to a narrow string, or an empty
///         string when pelement is null, the call fails, or there is no text.
std::string CSurfSafeParser::GetCellText(IHTMLElement *pelement)
{
 if(!pelement)
  return "";

 BSTR bstext = NULL;
 const HRESULT hr = pelement->get_innerText(&bstext);

 // Attach (fCopy=false) so the wrapper owns the BSTR and frees it; the
 // copying constructor would leave the original allocation leaked.
 _bstr_t bstrtext(bstext, false);
 if (FAILED(hr))
  return "";

 // The narrow conversion yields NULL for a null BSTR, and constructing a
 // std::string from NULL is undefined behaviour, so guard it.
 const char* narrow = bstrtext;
 return narrow ? std::string(narrow) : std::string();
}

你可能感兴趣的:(String,File,table,null)