// Linux C++ crawler: fetch a web page over HTTP.
//
// Easy to use: pass in a URL and get back the content of the corresponding page.

// Reconstructed include list (the original header names were lost when the
// page was extracted from HTML).
#include <netdb.h>          // gethostbyname, struct hostent
#include <netinet/in.h>     // struct sockaddr_in, htons, in_addr
#include <sys/socket.h>     // socket, connect, send, recv
#include <unistd.h>         // close
#include <cstring>          // memset
#include <iostream>
#include <string>
using namespace std;

// Splits a URL into its host component and its page path.
// e.g. "http://example.com/a/b" -> hostUrl = "example.com", pagePath = "/a/b".
// The "http://" / "https://" scheme prefix is stripped if present; pagePath
// defaults to "/" when the URL carries no path component.
//
// Fixes vs. the original: `find` returns string::size_type, and comparing it
// against -1 through an `int` relies on implementation-defined narrowing of
// string::npos — compare against string::npos directly instead. The URL is
// now taken by const reference (backward-compatible, avoids a copy).
void parseHostAndPagePath(const std::string &url, std::string &hostUrl, std::string &pagePath){
    hostUrl = url;
    pagePath = "/";
    // Strip the scheme prefix, if any.
    std::string::size_type pos = hostUrl.find("http://");
    if(pos != std::string::npos)
        hostUrl.erase(pos, 7);
    pos = hostUrl.find("https://");
    if(pos != std::string::npos)
        hostUrl.erase(pos, 8);
    // Everything from the first '/' onward is the page path.
    pos = hostUrl.find('/');
    if(pos != std::string::npos){
        pagePath = hostUrl.substr(pos);
        hostUrl = hostUrl.substr(0, pos);
    }
}

// Fetches the body of the web page at `url` over plain HTTP (port 80).
// Resolves the host, sends a minimal GET request, skips the response headers
// and returns the response body; returns "" (after logging) on any error.
//
// NOTE(review): the original code between the gethostbyname check and the
// header-skip loop was garbled by HTML extraction (everything between '<' and
// '>' was stripped); this body reconstructs it from the surviving fragments.
// Additional fixes: the socket is now closed on every path (the original
// leaked it), and the body is accumulated with append(buffer, len) so a NUL
// byte in the payload no longer truncates the result.
string getPageContent(const string &url){
    string hostUrl, pagePath;
    parseHostAndPagePath(url, hostUrl, pagePath);

    // Resolve the host name to an IPv4 address.
    struct hostent *host = gethostbyname(hostUrl.c_str());
    if(host == nullptr){
        cout << "gethostbyname error" << endl;
        return "";
    }

    struct sockaddr_in pin;
    memset(&pin, 0, sizeof(pin));
    pin.sin_family = AF_INET;
    pin.sin_port = htons(80);   // plain HTTP only — https URLs are fetched on port 80 too
    pin.sin_addr.s_addr = ((struct in_addr *)(host->h_addr))->s_addr;

    int isock = socket(AF_INET, SOCK_STREAM, 0);
    if(isock == -1){
        cout << "open socket error" << endl;
        return "";
    }
    if(connect(isock, (struct sockaddr *)&pin, sizeof(pin)) == -1){
        cout << "connect error" << endl;
        close(isock);
        return "";
    }

    // Minimal GET request; "Connection: close" makes the server close the
    // stream when the body is complete, so the recv loop below sees EOF.
    string request = "GET " + pagePath + " HTTP/1.1\r\nHost: " + hostUrl +
                     "\r\nConnection: close\r\n\r\n";
    if(send(isock, request.c_str(), request.size(), 0) == -1){
        cout << "send error" << endl;
        close(isock);
        return "";
    }

    // Skip the response headers: read one byte at a time until an empty line
    // (an '\n' with no preceding non-'\r' character on the same line).
    char c;
    bool flag = false;   // true while the current header line is non-empty
    while(recv(isock, &c, 1, 0) > 0){
        if('\r' == c){
            continue;
        }else if('\n' == c){
            if(false == flag)
                break;       // blank line -> headers finished
            flag = false;    // start of a new header line
        }else{
            flag = true;
        }
    }

    // Read the body in chunks until the peer closes the connection.
    const int BUFFER_SIZE = 512;
    char buffer[BUFFER_SIZE];
    string pageContent = "";
    int len;
    while((len = recv(isock, buffer, BUFFER_SIZE, 0)) > 0){
        pageContent.append(buffer, len);   // length-aware: NUL bytes preserved
    }

    close(isock);   // original leaked the socket descriptor
    return pageContent;
}

int main()
{
    cout<

// (blog footer, not part of the program) 你可能感兴趣的:(C++)