在windows下的C++通过Http协议实现对网页的内容抓取:
首先介绍两个重要的库(它们一般是Linux下的开源库,在Windows下则通过其动态链接库dll来调用):curl和pthreads。其中curl可以理解为命令行浏览器,通过调用其提供的curl_easy_setopt等函数即可获取指定网页的内容(要正确编译并导入curl链接库,还需要另外一个库C-ares)。pthreads是多线程控制库,其中包含互斥量的加锁和解锁、线程的创建与分配等函数。
下载地址:点击打开链接。要正确导入外部动态链接库,需要以下步骤:1,项目->属性->配置属性->C/C++->常规->附加包含目录(添加include的路径);2,项目->属性->配置属性->链接器->常规->附加库目录(添加lib所在的路径);3,在链接器->输入->附加依赖项中添加libcurld.lib;pthreadVC2.lib;ws2_32.lib;winmm.lib;wldap32.lib;areslib.lib;4,在C/C++->预处理器->预处理器定义中添加_CONSOLE;BUILDING_LIBCURL;HTTP_ONLY。
具体实现过程介绍:
1:自定义hashTable结构,用以存储获取的string字符。以hashTable类的形式实现,包含hash表set类型,以及add、find和几种常见的string哈希方式函数
Code:
///HashTable.h
#ifndef HashTable_H
#define HashTable_H
#include <set>
#include <string>
#include <vector>
/// Set of string hash values plus a collection of classic string hash
/// functions.
///
/// ForceAdd() and Find() store and look up only the ELF hash of a string, not
/// the string itself, so two different strings with the same ELF hash cannot
/// be told apart by this class.
class HashTable
{
public:
    HashTable(void);
    ~HashTable(void);

    /// Inserts the ELF hash of @p str into the set and returns that hash.
    unsigned int ForceAdd(const std::string& str);

    /// Returns the ELF hash of @p str if it is in the set, otherwise
    /// static_cast<unsigned int>(-1).
    unsigned int Find(const std::string& str);

    /* Common string hash functions */
    unsigned int RSHash  (const std::string& str);
    unsigned int JSHash  (const std::string& str);
    unsigned int PJWHash (const std::string& str);
    unsigned int ELFHash (const std::string& str);
    unsigned int BKDRHash(const std::string& str);
    unsigned int SDBMHash(const std::string& str);
    unsigned int DJBHash (const std::string& str);
    unsigned int DEKHash (const std::string& str);
    unsigned int BPHash  (const std::string& str);
    unsigned int FNVHash (const std::string& str);
    unsigned int APHash  (const std::string& str);

private:
    /// Hash values recorded by ForceAdd().
    std::set<unsigned int> HashFunctionResultSet;

    /// NOTE(review): never filled by the visible code; the element type is
    /// assumed to be unsigned int - TODO confirm.
    std::vector<unsigned int> hhh;
};
#endif
/////HashTable.cpp
#include "HashTable.h"
// Default constructor: the containers start out empty, nothing else to set up.
HashTable::HashTable()
{
}
// Destructor: the member containers release their own storage.
HashTable::~HashTable()
{
}
// Records the ELF hash of `str` in the set and hands that hash back to the
// caller. Inserting a value that is already present leaves the set unchanged.
unsigned int HashTable::ForceAdd(const std::string& str)
{
    const unsigned int hashValue = ELFHash(str);
    HashFunctionResultSet.insert(hashValue);
    return hashValue;
}
// Looks up the ELF hash of `str` in the set.
//
// Returns the hash value when it is present, otherwise
// static_cast<unsigned int>(-1). ELFHash() clears the top four bits of its
// result on every step, so a real hash can never equal that sentinel.
unsigned int HashTable::Find(const std::string& str)
{
    const unsigned int hashValue = ELFHash(str);

    // find() on an empty set simply returns end(), so no separate size check
    // is needed.
    if (HashFunctionResultSet.find(hashValue) == HashFunctionResultSet.end())
        return static_cast<unsigned int>(-1);

    return hashValue;
}
/*几种常见的字符串hash方式实现函数*/
// AP hash: characters at even and odd positions are mixed into the running
// value with two different formulas.
unsigned int HashTable::APHash(const std::string& str)
{
    unsigned int hash = 0xAAAAAAAA;
    for (std::size_t i = 0; i < str.length(); i++)
    {
        // Parentheses spell out the grouping the unparenthesised expressions
        // already had through operator precedence ('*' and '+' bind tighter
        // than '^').
        hash ^= ((i & 1) == 0) ? ((hash << 7) ^ (str[i] * (hash >> 3)))
                               : (~(((hash << 11) + str[i]) ^ (hash >> 5)));
    }
    return hash;
}
// BKDR hash: multiply-and-add with a fixed seed.
unsigned int HashTable::BKDRHash(const std::string& str)
{
    unsigned int seed = 131; // 31 131 1313 13131 131313 etc
    unsigned int hash = 0;
    for (std::size_t i = 0; i < str.length(); i++)
    {
        hash = (hash * seed) + str[i];
    }
    return hash;
}

// BP hash: shift the running value left by 7 and XOR in the next character.
// NOTE(review): declared in class HashTable; this body follows the commonly
// published BP hash - confirm against the original implementation.
unsigned int HashTable::BPHash(const std::string& str)
{
    unsigned int hash = 0;
    for (std::size_t i = 0; i < str.length(); i++)
    {
        hash = (hash << 7) ^ str[i];
    }
    return hash;
}

// DEK hash: starts from the string length, then rotates the running value
// left by 5 bits and XORs in each character.
unsigned int HashTable::DEKHash(const std::string& str)
{
    unsigned int hash = static_cast<unsigned int>(str.length());
    for (std::size_t i = 0; i < str.length(); i++)
    {
        hash = ((hash << 5) ^ (hash >> 27)) ^ str[i];
    }
    return hash;
}
// DJB hash: starting from 5381, each step computes
// (digest << 5) + digest + character.
unsigned int HashTable::DJBHash(const std::string& str)
{
    unsigned int digest = 5381;
    for (std::string::const_iterator ch = str.begin(); ch != str.end(); ++ch)
    {
        digest += (digest << 5);
        digest += *ch;
    }
    return digest;
}
// ELF hash: shifts in four bits per character and folds the top nibble back
// into the low bits. The top nibble is cleared on every step, so the result
// always fits in 28 bits.
unsigned int HashTable::ELFHash(const std::string& str)
{
    unsigned int digest = 0;
    for (std::string::const_iterator ch = str.begin(); ch != str.end(); ++ch)
    {
        digest = (digest << 4) + *ch;

        const unsigned int topNibble = digest & 0xF0000000u;
        if (topNibble != 0)
        {
            digest ^= (topNibble >> 24);
        }
        // No-op when topNibble is zero, otherwise clears the top four bits.
        digest &= ~topNibble;
    }
    return digest;
}
// FNV-style hash: multiply the running value by a fixed constant, then XOR in
// the next character. The running value starts at zero.
unsigned int HashTable::FNVHash(const std::string& str)
{
    const unsigned int multiplier = 0x811C9DC5;
    unsigned int digest = 0;
    for (std::string::const_iterator ch = str.begin(); ch != str.end(); ++ch)
    {
        digest = digest * multiplier;
        digest = digest ^ *ch;
    }
    return digest;
}
// JS hash: XORs the running value with a mix of itself shifted left by 5,
// itself shifted right by 2, and the current character.
unsigned int HashTable::JSHash(const std::string& str)
{
    unsigned int digest = 1315423911;
    for (std::string::const_iterator ch = str.begin(); ch != str.end(); ++ch)
    {
        const unsigned int mixed = (digest << 5) + *ch + (digest >> 2);
        digest ^= mixed;
    }
    return digest;
}
// PJW hash: shifts in one eighth of the word width per character and folds
// the high bits back down whenever any of them become set.
unsigned int HashTable::PJWHash(const std::string& str)
{
    // All widths are derived from the size of unsigned int.
    const unsigned int wordBits     = static_cast<unsigned int>(sizeof(unsigned int) * 8);
    const unsigned int foldShift    = static_cast<unsigned int>((wordBits * 3) / 4);
    const unsigned int stepShift    = static_cast<unsigned int>(wordBits / 8);
    const unsigned int highBitsMask = static_cast<unsigned int>(0xFFFFFFFF) << (wordBits - stepShift);

    unsigned int digest = 0;
    for (std::string::const_iterator ch = str.begin(); ch != str.end(); ++ch)
    {
        digest = (digest << stepShift) + *ch;

        const unsigned int overflow = digest & highBitsMask;
        if (overflow != 0)
        {
            digest = (digest ^ (overflow >> foldShift)) & (~highBitsMask);
        }
    }
    return digest;
}
// RS hash: multiply-and-add where the multiplier itself is rescaled by a
// fixed factor after every character.
unsigned int HashTable::RSHash(const std::string& str)
{
    const unsigned int factor = 378551;
    unsigned int multiplier = 63689;
    unsigned int digest = 0;
    for (std::string::const_iterator ch = str.begin(); ch != str.end(); ++ch)
    {
        digest = digest * multiplier + *ch;
        multiplier *= factor;
    }
    return digest;
}
// SDBM hash: character + (previous << 6) + (previous << 16) - previous.
unsigned int HashTable::SDBMHash(const std::string& str)
{
    unsigned int digest = 0;
    for (std::string::const_iterator ch = str.begin(); ch != str.end(); ++ch)
    {
        const unsigned int previous = digest;
        digest = *ch + (previous << 6) + (previous << 16) - previous;
    }
    return digest;
}
2:实现线程间的互斥处理函数(另外提供进行当前操作的进程ID,以便实现加锁机制)。以SingleTone类实现。该类只能由静态函数Instance建立一个唯一的类对象,并以互斥的方式实现对hashTable的基本操作,其中变量的加锁和解锁由mutex类来实现,具体参见代码:
////mutex.h
#ifndef mutex_H
#define mutex_H
#pragma once
#include "pthread.h"
/// Scoped lock for a pthread mutex: the constructor locks the mutex passed in
/// and the destructor unlocks it.
///
/// The guard only keeps a reference, so the pthread_mutex_t must outlive it.
/// Copying is disabled because a copy would unlock the same mutex a second
/// time when it is destroyed.
class mutex
{
    pthread_mutex_t& m_mutex;

    // Declared but intentionally not defined: guards are non-copyable.
    mutex(const mutex&);
    mutex& operator=(const mutex&);

public:
    /// Locks @p m; blocks until the lock is acquired.
    explicit mutex(pthread_mutex_t& m) : m_mutex(m)
    {
        pthread_mutex_lock(&m_mutex);
    }

    /// Unlocks the mutex locked by the constructor.
    ~mutex(void)
    {
        pthread_mutex_unlock(&m_mutex);
    }
};
#endif
////SingleTone.h
#ifndef SingleTone_H
#define SingleTone_H
#include
#include
#include
#include "SingleTone.h"
#include "mutex.h"
// Pointer to the single SingleTone object; stays NULL until Instance()
// creates the object on first use.
SingleTone* SingleTone::m_pSingleTone=NULL;
// Creates the shared curl easy handle and initialises the mutex used by the
// locking member functions.
SingleTone::SingleTone()
{
    m_pcurl = curl_easy_init();
    pthread_mutex_init(&m_singleton_mutex, NULL);
}
// Destroys the mutex created by the constructor. The curl handle is not
// released here.
SingleTone::~SingleTone()
{
    pthread_mutex_destroy( &m_singleton_mutex );
}
// Returns the single SingleTone object, creating it on the first call.
// NOTE(review): the NULL check and the creation are not synchronised, so the
// first call should happen before worker threads are started.
SingleTone* SingleTone::Instance()
{
    if (m_pSingleTone != NULL)
    {
        return m_pSingleTone;
    }

    m_pSingleTone = new SingleTone();
    return m_pSingleTone;
}
// Appends `s` to the end of the link list while holding the singleton mutex.
void SingleTone::push_back(std::string s)
{
    mutex guard(m_singleton_mutex);
    m_LinkStack.push_back(s);
}
// Removes the last element of the link list while holding the singleton
// mutex. Does nothing when the list is empty, because pop_back() on an empty
// std::list is undefined behaviour.
void SingleTone::pop_back()
{
    mutex m(m_singleton_mutex);
    if (!m_LinkStack.empty())
    {
        m_LinkStack.pop_back();
    }
}
int SingleTone::size()
{
return m_LinkStack.size();
}
std::list::iterator SingleTone::begin()
{
return m_LinkStack.begin();
}
std::list::reference SingleTone::back()
{
mutex m(m_singleton_mutex);
return m_LinkStack.back();
}
std::list::iterator SingleTone::end()
{
return m_LinkStack.end();
}
// Inserts `s` at the front of the link list while holding the singleton mutex.
void SingleTone::push_front(std::string s)
{
    mutex guard(m_singleton_mutex);
    m_LinkStack.push_front(s);
}
bool SingleTone::empty()
{
return m_LinkStack.empty();
}
unsigned int SingleTone::Get_m_UniqueMap_ForceAdd(const std::string& key,const std::string& url)
{
mutex m(m_singleton_mutex);
return m_UniqueMap[key].ForceAdd(url);
}
unsigned int SingleTone::Get_m_UniqueMap_Find(const std::string& key,const std::string& url)
{
HashTable hss = m_UniqueMap[key];
unsigned int uiRet =hss.Find(url);
//unsigned int uiRet = m_UniqueMap[key]->Find(url);
return uiRet;
}
// Returns a copy of the hash table stored under `key`.
//
// Runs under the singleton mutex: operator[] inserts an empty table for an
// unknown key, and the copy must not be taken while another thread is adding
// to the same table.
HashTable SingleTone::Get_m_UniqueMap(const std::string& key)
{
    mutex m(m_singleton_mutex);
    return m_UniqueMap[key];
}
void SingleTone::Set_m_UniqueMap(const std::string& key,HashTable& hash)
{
m_UniqueMap[key] = hash;
}
// Hands out the curl easy handle created in the constructor. Every caller
// receives the same handle.
CURL* SingleTone::GetpCurl()
{
    return this->m_pcurl;
}
3:实现HTTP对网页内容的获取:功能包含初始网页内容的获取,和URL设置等函数。这个过程要求是互斥的,所以引入SingleTone类的内容。
Code:
/////Http.h
#ifndef Http_H
#define Http_H
#include "curl/curl.h"
#include "pthread.h"
#include
using namespace std;
/// Fetches the content of a URL through the curl easy handle owned by
/// SingleTone and collects the response body in a string.
class Http
{
public:
    Http(void);
    ~Http(void);

    /// Placeholder overload; always returns false.
    bool InitCurl(void);

    /// Downloads @p url and stores the response body in @p szbuffer.
    /// Returns true only when the transfer finished with CURLE_OK.
    bool InitCurl(const std::string& url, std::string& szbuffer);

    /// Releases the curl handle and libcurl's global state.
    bool DeInitCurl();

    /// Stores @p url as the current URL.
    void setUrl(const std::string& url);

    /// Returns the current URL (a getter, despite the name).
    std::string setUrl();

    /// Returns a copy of the data collected so far.
    const std::string getBuffer();

private:
    /// libcurl write callback; @p f is the Http object that receives the data.
    /// Returns the number of bytes consumed, as libcurl requires.
    static size_t writer(void* buffer, size_t size, size_t nmemb, void* f);

    /// Appends size * nmemb bytes from @p buffer to m_szbuffer and returns
    /// the number of bytes appended.
    size_t setBuffer(char* buffer, size_t size, size_t nmemb);

    CURL *m_pcurl;                          ///< Handle obtained from SingleTone.
    char m_errorBuffer[CURL_ERROR_SIZE];    ///< Receives libcurl error text.
    std::string m_szbuffer;                 ///< Response body being collected.
    std::string m_szUrl;                    ///< URL of the current request.
    pthread_mutex_t m_http_mutex;
};
#endif
#include "Http.h"
#include "SingleTone.h"
#include "mutex.h"

// Picks up the curl handle owned by the SingleTone object and makes sure the
// error buffer holds a valid empty string before libcurl ever writes to it.
Http::Http(void)
{
    m_pcurl = SingleTone::Instance()->GetpCurl();
    m_errorBuffer[0] = '\0';
}

// Nothing to release here; the curl handle is cleaned up by DeInitCurl().
Http::~Http(void)
{
}

// Placeholder overload; always reports failure.
bool Http::InitCurl(void)
{
    return false;
}

// Appends one chunk of received data to m_szbuffer.
// Returns the number of bytes appended, or 0 when `buffer` is NULL.
size_t Http::setBuffer(char *buffer, size_t size, size_t nmemb)
{
    if (buffer == NULL)
    {
        return 0;
    }

    const size_t byteCount = size * nmemb;
    m_szbuffer.append(buffer, byteCount);
    return byteCount;
}

// Write callback registered with CURLOPT_WRITEFUNCTION. libcurl treats a
// return value different from size * nmemb as a write error and aborts the
// transfer, so the byte count from setBuffer() must be passed back.
size_t Http::writer(void *buffer, size_t size, size_t nmemb, void* f)
{
    return static_cast<Http*>(f)->setBuffer(static_cast<char*>(buffer), size, nmemb);
}
bool Http::InitCurl(const std::string& url, std::string& szbuffer)
{
pthread_mutex_init(&m_http_mutex,NULL);
Http::m_szUrl=url;
CURLcode result;
if(m_pcurl)
{
curl_easy_setopt(m_pcurl, CURLOPT_ERRORBUFFER, Http::m_errorBuffer);
curl_easy_setopt(m_pcurl, CURLOPT_URL,m_szUrl.c_str());
curl_easy_setopt(m_pcurl, CURLOPT_HEADER, 0);
curl_easy_setopt(m_pcurl, CURLOPT_FOLLOWLOCATION, 1);
curl_easy_setopt(m_pcurl, CURLOPT_WRITEFUNCTION,Http::writer);
curl_easy_setopt(m_pcurl, CURLOPT_WRITEDATA,this);
result = curl_easy_perform(m_pcurl);
}
if(result!=CURLE_OK)
return false;
szbuffer=m_szbuffer;
m_szbuffer.clear();
m_szUrl.clear();
pthread_mutex_destroy(&m_http_mutex);
return true;
}
bool Http::DeInitCurl()
{
curl_easy_cleanup(m_pcurl);
curl_global_cleanup();
m_pcurl = NULL;
return true;
}
const string Http::getBuffer()
{
return m_szbuffer;
}
string Http::setUrl()
{
return Http::m_szUrl;
}
void Http::setUrl(const std::string& url)
{
Http::m_szUrl = url;
}
其中m_szbuffer存放网页的内容;抓取到的初始网页内容通过InitCurl函数的形参szbuffer返回。