以下的分析基于libvirt 3.0版本。

   libvirt是一套免费,开源的支持linux下的主流虚拟化管理工具,目前有大量的应用程序构建在libvirt之上,很多虚拟化产品的开发都是灵活调用libvirt的API接口去实现的。对于应用程序,libvirt提供一套非阻塞调用的框架。

   涉及相关的API:

   virInitialize:初始化libvirt库,主要针对多线程编程

   virEventRegisterDefaultImpl:基于poll系统调用注册默认事件实现,这是一个通用实现。

   virEventRunDefaultImpl:运行一次事件循环迭代,需要在单独的线程中循环调用该函数

   virConnectOpen:连接libvirt服务端,即libvirtd

   virConnectSetKeepAlive:设置保活周期,此函数控制客户端发送keepalive消息。

相关的实现代码如下:

/* Initialize the libvirt library; on failure report it and return 1 to the caller. */
if (virInitialize() < 0) {
    /* Diagnostics belong on stderr so they are not mixed into regular output. */
    fprintf(stderr, "call virInitialize initialize libvirt fail\n");
    return 1;
}

   

/*
 * Register libvirt's default event implementation (required).
 * Without a registered implementation virEventAddTimeout() has no
 * addTimeoutImpl to call and returns -1, so the keepalive timer can
 * never be created.
 */
if (virEventRegisterDefaultImpl() < 0) {
    /* Diagnostics belong on stderr so they are not mixed into regular output. */
    fprintf(stderr, "failed to register default event implementation\n");
    return 2;
}

 

/*
 * Thread entry point that dispatches libvirt events (required).
 *
 * Keeps calling virEventRunDefaultImpl() until the `isexit` flag becomes
 * non-zero, so that timers registered with the default event
 * implementation (such as the keepalive timer) actually fire.
 *
 * @data: unused.
 *
 * Returns a pointer to a static status string, which becomes the thread's
 * exit value when this function is used as a pthread start routine.
 *
 * NOTE(review): `isexit` is defined outside this snippet; if it is written
 * from another thread it should be an atomic flag -- confirm.
 */
void *libvirt_thread_cb(void *data)
{
    (void)data;

    while (!isexit) {
        /* A failed iteration must not go unnoticed: report it. */
        if (virEventRunDefaultImpl() < 0) {
            fprintf(stderr, "failed to run libvirt event loop iteration\n");
        }
    }

    /*
     * Returning from a thread start routine is equivalent to calling
     * pthread_exit() with the same value, and avoids the unreachable
     * `return` that used to follow the pthread_exit() call.
     */
    return (void *)"libvirt pthreadwill exit!!!";
}

 

//连接libvirt服务端,并且设计保活机制

contor =virConnectOpen(NULL);//参数为空,表示连接本地的libvirtd服务端

if (contor) {

    //第二个参数,心跳发送周期;  第三个参数,心跳参数,当超过该次数时,连接断开

    if (virConnectSetKeepAlive(contor, 10, 6)< 0) {

       printf("failed to set connkeep alive config\n");

        virConnectClose(contor);

        contor = NULL;

    }

}


通过以上的实现,后续就可以非阻塞调用libvirt其它的API接口,当libvirtd阻塞时,能够超时返回。

对于libvirtd端相关的心跳周期保存在libvirtd.conf文件中,可以修改参数,然后再重启libvirtd即可生效:

keepalive_interval = 10 # default is 5 (seconds)

keepalive_count = 6 # default is 5 (a count, not seconds)


相关的源码分析:

virEventRegisterDefaultImpl函数分析,源码如下:

int virEventRegisterDefaultImpl(void)
{
    VIR_DEBUG("registering default event implementation");

    virResetLastError();

    if (virEventPollInit() < 0) {
        virDispatchError(NULL);
        return -1;
    }

    virEventRegisterImpl(
        virEventPollAddHandle,
        virEventPollUpdateHandle,
        virEventPollRemoveHandle,
        virEventPollAddTimeout,
        virEventPollUpdateTimeout,
        virEventPollRemoveTimeout
        );

    return 0;
}

通过源码分析,调用virEventRegisterImpl函数对全局函数赋值,为什么会这样做,下一步再分析


virConnectSetKeepAlive函数分析:

通过源码分析,最终会调用virKeepAliveStart函数去启用定时器,发送keepalive消息。

/**
 * virKeepAliveStart:
 * @ka: keepalive object; it is locked for the duration of the call
 * @interval: requested keepalive interval; only applied when > 0
 *            (presumably seconds -- the timer timeout derived from it is
 *            multiplied by 1000 before being passed to virEventAddTimeout)
 * @count: stored as both ka->count and ka->countToDeath when @interval > 0
 *
 * Starts the keepalive timer for @ka.
 *
 * Returns 0 when the timer was created, when a timer already exists, or
 * when keepalive is disabled (stored interval <= 0); returns -1 when the
 * interval was already set, when @interval would overflow, or when the
 * timer could not be created.
 */
int
virKeepAliveStart(virKeepAlivePtr ka,
                  int interval,
                  unsigned int count)
{
    int ret = -1;
    time_t delay;
    int timeout;
    time_t now;

    virObjectLock(ka);

    /* A timer already exists: nothing to do. */
    if (ka->timer >= 0) {
        VIR_DEBUG("Keepalive messages already enabled");
        ret = 0;
        goto cleanup;
    }

    if (interval > 0) {
        /* The interval has already been configured: refuse to set it again. */
        if (ka->interval > 0) {
            virReportError(VIR_ERR_INTERNAL_ERROR, "%s",
                           _("keepalive interval already set"));
            goto cleanup;
        }
        /* Guard against overflow */
        if (interval > INT_MAX / 1000) {
            virReportError(VIR_ERR_INTERNAL_ERROR,
                           _("keepalive interval %d too large"), interval);
            goto cleanup;
        }
        ka->interval = interval;
        ka->count = count;
        ka->countToDeath = count;
    }

    /* Stored interval is zero or negative: keepalive is disabled. */
    if (ka->interval <= 0) {
        VIR_DEBUG("Keepalive messages disabled by configuration");
        ret = 0;
        goto cleanup;
    }

    PROBE(RPC_KEEPALIVE_START,
          "ka=%p client=%p interval=%d count=%u",
          ka, ka->client, interval, count);

    /*
     * Fire immediately if more than one interval has elapsed since the
     * last received packet, otherwise wait for the remainder of it.
     */
    now = time(NULL);
    delay = now - ka->lastPacketReceived;
    if (delay > ka->interval)
        timeout = 0;
    else
        timeout = ka->interval - delay;
    /* Back-date the start so that intervalStart + interval == now + timeout. */
    ka->intervalStart = now - (ka->interval - timeout);
    /* Create the keepalive timer. */
    ka->timer = virEventAddTimeout(timeout * 1000, virKeepAliveTimer,
                                   ka, virObjectFreeCallback);
    if (ka->timer < 0)
        goto cleanup;

    /* the timer now has another reference to this object */
    virObjectRef(ka);
    ret = 0;

 cleanup:
    virObjectUnlock(ka);
    return ret;
}

继续分析virEventAddTimeout函数,源码如下:

/*
 * Forward a timeout registration to the registered event implementation.
 *
 * Passes @timeout, @cb, @opaque and @ff unchanged to addTimeoutImpl and
 * returns its result, or returns -1 when no addTimeoutImpl has been
 * registered.
 */
int
virEventAddTimeout(int timeout,
                   virEventTimeoutCallback cb,
                   void *opaque,
                   virFreeCallback ff)
{
    return addTimeoutImpl ? addTimeoutImpl(timeout, cb, opaque, ff) : -1;
}

该函数实现很简单,调用全局的函数去设置定时器,addTimeoutImpl是全局的函数接口,这个函数的赋值是应用程式调用virEventRegisterDefaultImpl函数去设置的,这是为什么需要调用virEventRegisterDefaultImpl函数的原因。


如果没有进行以上的设置,调用为什么会阻塞,又阻塞在哪个地方?通过gdb调试,发现阻塞在virNetClientIOEventLoop函数中的poll系统调用上。堆栈如下:

(gdb) bt
#0  virNetClientIOEventLoop (client=0x2026d30, thiscall=0x1dc57a0) at ../../../src/rpc/virnetclient.c:1595
#1  0x00007fd5dc8368a3 in virNetClientIO (client=0x2026d30, thiscall=0x1dc57a0) at ../../../src/rpc/virnetclient.c:1950
#2  0x00007fd5dc8370b7 in virNetClientSendInternal (client=0x2026d30, msg=0x2026c60, expectReply=true, nonBlock=false) at ../../../src/rpc/virnetclient.c:2122
#3  0x00007fd5dc837141 in virNetClientSendWithReply (client=0x2026d30, msg=0x2026c60) at ../../../src/rpc/virnetclient.c:2150
#4  0x00007fd5dc838048 in virNetClientProgramCall (prog=0x2027150, client=0x2026d30, serial=8, proc=212, noutfds=0, outfds=0x0, ninfds=0x0, infds=0x0, args_filter=0x7fd5dc82ce9f ,
    args=0x7fffd92ffba0, ret_filter=0x7fd5dc82cf19 , ret=0x7fffd92ffb80) at ../../../src/rpc/virnetclientprogram.c:329
#5  0x00007fd5dc819442 in callFull (conn=0x1a24380, priv=0x1a77660, flags=0, fdin=0x0, fdinlen=0, fdout=0x0, fdoutlen=0x0, proc_nr=212, args_filter=0x7fd5dc82ce9f ,
    args=0x7fffd92ffba0 "0T\002\002", ret_filter=0x7fd5dc82cf19 , ret=0x7fffd92ffb80 "") at ../../../src/remote/remote_driver.c:6637
#6  0x00007fd5dc819515 in call (conn=0x1a24380, priv=0x1a77660, flags=0, proc_nr=212, args_filter=0x7fd5dc82ce9f , args=0x7fffd92ffba0 "0T\002\002",
    ret_filter=0x7fd5dc82cf19 , ret=0x7fffd92ffb80 "") at ../../../src/remote/remote_driver.c:6659
#7  0x00007fd5dc7fd77b in remoteDomainGetState (domain=0x1dc8f60, state=0x7fffd92ffcdc, reason=0x0, flags=0) at ../../../src/remote/remote_driver.c:2458
#8  0x00007fd5dc7b5b2f in virDomainGetState (domain=0x1dc8f60, state=0x7fffd92ffcdc, reason=0x0, flags=0) at ../../../src/libvirt-domain.c:2495


virNetClientIOEventLoop函数分析:

/*
 * Excerpt of the client I/O loop: polls the connection socket and the
 * wakeup pipe until an event arrives.
 *
 * NOTE(review): this excerpt is abridged -- `timeout` is used but its
 * declaration is not shown, the handling after poll() is omitted, and the
 * function's closing brace is missing.
 */
static int virNetClientIOEventLoop(virNetClientPtr client,
                                   virNetClientCallPtr thiscall)
{
    struct pollfd fds[2];
    int ret;

    /* fds[0] is the connection socket, fds[1] the wakeup read descriptor. */
    fds[0].fd = virNetSocketGetFD(client->sock);
    fds[1].fd = client->wakeupReadFD;

    for (;;) {
        /* If we are non-blocking, then we don't want to sleep in poll() */
        if (thiscall->nonBlock)
            timeout = 0;

        /* Limit timeout so that we can send keepalive request in time */
        if (timeout == -1)
            timeout = virKeepAliveTimeout(client->keepalive);// returns -1 when keepalive is not configured, so poll() blocks

        fds[0].events = fds[0].revents = 0;
        fds[1].events = fds[1].revents = 0;

        fds[1].events = POLLIN;

        /* Calculate poll events for calls */
        virNetClientCallMatchPredicate(client->waitDispatch,
                                       virNetClientIOEventLoopPollEvents,
                                       &fds[0]);

        if (client->nstreams)
            fds[0].events |= POLLIN;

    repoll:
        /* A timeout of -1 makes poll() wait until an event arrives. */
        ret = poll(fds, ARRAY_CARDINALITY(fds), timeout);
        /* Retry when poll() was merely interrupted. */
        if (ret < 0 && (errno == EAGAIN || errno == EINTR))
            goto repoll;

}

        通过源码可以分析,由于应用程序没有调用virConnectSetKeepAlive函数设置心跳保活机制,导致client->keepalive为NULL,分析virKeepAliveTimeout函数可知,当client->keepalive为NULL时,直接返回为-1;导致poll系统调用一直阻塞,直到有事件响应。

       总结:通过以上设置,调用virConnectOpen函数连接libvirtd之后,就可以实现非阻塞调用libvirt其它的API,当libvirtd阻塞时,调用会超时返回,不会导致调用者一直阻塞。

       当libvirt主线程阻塞时,上述的设置并不能解决virConnectOpen阻塞的问题,需要修改libvirt相关的代码。至于为什么,自己去思考,怎么解决这个问题?