我的设备有两个网卡，需要开两路socket：一路UDP，一路TCP，lwIP的版本是1.4.1。实际运行发现，UDP运行一段时间以后通信就挂了，但线程仍在继续运行。调试发现，select函数返回后read_fds.fd_bits始终等于0，导致UDP发回来的数据不能被读取，UDP线程像死了一样。
FD_SET(sockfd,&read_fds);
select(sockfd+1,&read_fds,NULL,NULL,&tv_out);
if(FD_ISSET(sockfd, &read_fds)) //read_fds异常,
{
//读取udp数据
}
后来分析了一下select函数的源码，发现每次调用时它都会更新所有socket的描述符（从0遍历到maxfdp1-1）。我的TCP线程使用的socket标号是1，UDP线程使用的标号是0，也就是说，TCP线程更新socket描述符的时候顺便也更新了UDP线程的描述符，而UDP线程只会更新自己的——标号越大，管得越宽。TCP线程抢占之后，同时更新了两个线程的描述符，但它只能处理自己的描述符，不能处理UDP的描述符；等到TCP处理完毕、UDP去读取描述符的时候，该描述符已经被更新过，早被lwIP内核释放、清零了。因为单片机是单核的，所有线程只能分时复用，所以UDP线程来不及处理自己的描述符，表现为像死了一样。
/**
 * Socket-API select(): report which of the requested descriptors are ready.
 *
 * The function first scans the requested descriptors with lwip_selscan().
 * If nothing is ready and the timeout is not zero, it registers a
 * stack-allocated lwip_select_cb on select_cb_list, bumps select_waiting on
 * every requested socket, rescans, and then blocks on the semaphore in
 * select_cb until it is signalled or the timeout expires.
 *
 * @param maxfdp1   highest descriptor to examine plus one; descriptors
 *                  0 .. maxfdp1-1 are walked
 * @param readset   in: descriptors to test for readability (may be NULL);
 *                  out: overwritten with the ready subset
 * @param writeset  same as readset, for writability
 * @param exceptset same as readset, for error conditions
 * @param timeout   NULL waits forever; {0,0} polls and returns immediately;
 *                  otherwise rounded to milliseconds (at least 1 ms)
 * @return the ready count computed by lwip_selscan() (0 on timeout), or -1
 *         with errno set to ENOMEM if the wait semaphore cannot be created.
 *         On the -1 path the caller's sets are NOT modified.
 */
int lwip_select(int maxfdp1, fd_set *readset, fd_set *writeset, fd_set *exceptset,struct timeval *timeout)
{
u32_t waitres = 0;
int nready;
fd_set lreadset, lwriteset, lexceptset;
u32_t msectimeout;
struct lwip_select_cb select_cb;
err_t err;
int i;
SYS_ARCH_DECL_PROTECT(lev);
LWIP_DEBUGF(SOCKETS_DEBUG, ("lwip_select(%d, %p, %p, %p, tvsec=%"S32_F" tvusec=%"S32_F")\n",
maxfdp1, (void *)readset, (void *) writeset, (void *) exceptset,
timeout ? (s32_t)timeout->tv_sec : (s32_t)-1,
timeout ? (s32_t)timeout->tv_usec : (s32_t)-1));
/* Go through each socket in each list to count number of sockets which
currently match */
nready = lwip_selscan(maxfdp1, readset, writeset, exceptset, &lreadset, &lwriteset, &lexceptset); // fill the local result sets from each socket's current event state
/* If we don't have any current events, then suspend if we are supposed to */
if (!nready)
{
if (timeout && timeout->tv_sec == 0 && timeout->tv_usec == 0)
{
LWIP_DEBUGF(SOCKETS_DEBUG, ("lwip_select: no timeout, returning 0\n"));
/* This is OK as the local fdsets are empty and nready is zero,
or we would have returned earlier. */
goto return_copy_fdsets;
}
/* None ready: add our semaphore to list:
We don't actually need any dynamic memory. Our entry on the
list is only valid while we are in this function, so it's ok
to use local variables. */
select_cb.next = NULL;
select_cb.prev = NULL;
select_cb.readset = readset;
select_cb.writeset = writeset;
select_cb.exceptset = exceptset;
select_cb.sem_signalled = 0;
err = sys_sem_new(&select_cb.sem, 0);
if (err != ERR_OK)
{
/* failed to create semaphore; the caller's fd sets are left untouched */
set_errno(ENOMEM);
return -1;
}
/* Protect the select_cb_list */
SYS_ARCH_PROTECT(lev);
/* Put this select_cb on top of list */
select_cb.next = select_cb_list;
if (select_cb_list != NULL)
{
select_cb_list->prev = &select_cb;
}
select_cb_list = &select_cb;
/* Increasing this counter tells event_callback that the list has changed. */
select_cb_ctr++;
/* Now we can safely unprotect */
SYS_ARCH_UNPROTECT(lev);
/* Increase select_waiting for each socket we are interested in */
for(i = 0; i < maxfdp1; i++) // walk every descriptor below maxfdp1
{
if ((readset && FD_ISSET(i, readset)) ||
(writeset && FD_ISSET(i, writeset)) ||
(exceptset && FD_ISSET(i, exceptset)))
{
struct lwip_sock *sock = tryget_socket(i);
/* NOTE(review): sock is dereferenced right after this assert. If
LWIP_ASSERT is configured as a no-op and the descriptor is not an open
socket, this is a NULL dereference -- confirm the assert configuration. */
LWIP_ASSERT("sock != NULL", sock != NULL);
SYS_ARCH_PROTECT(lev);
sock->select_waiting++;
LWIP_ASSERT("sock->select_waiting > 0", sock->select_waiting > 0);
SYS_ARCH_UNPROTECT(lev);
}
}
/* Call lwip_selscan again: there could have been events between
the last scan (without us on the list) and putting us on the list! */
nready = lwip_selscan(maxfdp1, readset, writeset, exceptset, &lreadset, &lwriteset, &lexceptset);
if (!nready)
{
/* Still none ready, just wait to be woken */
if (timeout == 0)
{
/* Wait forever */
msectimeout = 0;
}
else
{
/* Convert to milliseconds, rounding the microsecond part to nearest */
msectimeout = ((timeout->tv_sec * 1000) + ((timeout->tv_usec + 500)/1000));
if (msectimeout == 0)
{
/* Wait 1ms at least (0 means wait forever) */
msectimeout = 1;
}
}
waitres = sys_arch_sem_wait(&select_cb.sem, msectimeout);
}
/* Decrease select_waiting again for each socket we were interested in.
The caller's sets are still the unmodified input sets at this point; they
are only overwritten at return_copy_fdsets. */
for(i = 0; i < maxfdp1; i++)
{
if ((readset && FD_ISSET(i, readset)) ||
(writeset && FD_ISSET(i, writeset)) ||
(exceptset && FD_ISSET(i, exceptset))) {
struct lwip_sock *sock = tryget_socket(i);
/* NOTE(review): same NULL-dereference concern as in the increment loop. */
LWIP_ASSERT("sock != NULL", sock != NULL);
SYS_ARCH_PROTECT(lev);
sock->select_waiting--;
LWIP_ASSERT("sock->select_waiting >= 0", sock->select_waiting >= 0);
SYS_ARCH_UNPROTECT(lev);
}
}
/* Take us off the list */
SYS_ARCH_PROTECT(lev);
if (select_cb.next != NULL)
{
select_cb.next->prev = select_cb.prev;
}
if (select_cb_list == &select_cb)
{
LWIP_ASSERT("select_cb.prev == NULL\n", select_cb.prev == NULL);
select_cb_list = select_cb.next;
}
else
{
LWIP_ASSERT("select_cb.prev != NULL\n", select_cb.prev != NULL);
select_cb.prev->next = select_cb.next;
}
/* Increasing this counter tells event_callback that the list has changed. */
select_cb_ctr++;
SYS_ARCH_UNPROTECT(lev);
sys_sem_free(&select_cb.sem);
if (waitres == SYS_ARCH_TIMEOUT)
{
/* Timeout */
LWIP_DEBUGF(SOCKETS_DEBUG, ("lwip_select: timeout expired\n"));
/* This is OK as the local fdsets are empty and nready is zero,
or we would have returned earlier. */
goto return_copy_fdsets;
}
/* See what's set */
nready = lwip_selscan(maxfdp1, readset, writeset, exceptset, &lreadset, &lwriteset, &lexceptset);
}
LWIP_DEBUGF(SOCKETS_DEBUG, ("lwip_select: nready=%d\n", nready));
return_copy_fdsets:
/* Publish the local result sets to the caller; only sets the caller passed
in are written. */
set_errno(0);
if (readset)
{
*readset = lreadset;
}
if (writeset)
{
*writeset = lwriteset;
}
if (exceptset)
{
*exceptset = lexceptset;
}
return nready;
}
实际上，select函数会调用lwip_selscan函数来更新各种描述符集合；lwip_selscan函数则调用tryget_socket函数取得每个socket当前发生的事件，然后根据这些事件来更新描述符集合。
/**
 * Scan descriptors 0 .. maxfdp1-1 and report which requested ones are ready
 * right now. This function only reads socket state; it does not modify any
 * socket.
 *
 * @param maxfdp1       highest descriptor to examine plus one
 * @param readset_in    descriptors to test for readability (may be NULL)
 * @param writeset_in   descriptors to test for writability (may be NULL)
 * @param exceptset_in  descriptors to test for errors (may be NULL)
 * @param readset_out   receives the readable subset of readset_in
 * @param writeset_out  receives the writable subset of writeset_in
 * @param exceptset_out receives the errored subset of exceptset_in
 * @return number of (descriptor, condition) pairs found ready
 */
static int lwip_selscan(int maxfdp1, fd_set *readset_in, fd_set *writeset_in, fd_set *exceptset_in,
                        fd_set *readset_out, fd_set *writeset_out, fd_set *exceptset_out)
{
  int i, nready = 0;
  fd_set lreadset, lwriteset, lexceptset;
  struct lwip_sock *sock;
  SYS_ARCH_DECL_PROTECT(lev);

  FD_ZERO(&lreadset);
  FD_ZERO(&lwriteset);
  FD_ZERO(&lexceptset);

  /* Go through each socket in each list to count number of sockets which
     currently match */
  for (i = 0; i < maxfdp1; i++) {
    void *lastdata = NULL;
    s16_t rcvevent = 0;
    u16_t sendevent = 0;
    u16_t errevent = 0;

    /* Descriptors the caller did not ask about can never end up in an
       output set, so do not touch them at all. */
    if (!(readset_in && FD_ISSET(i, readset_in)) &&
        !(writeset_in && FD_ISSET(i, writeset_in)) &&
        !(exceptset_in && FD_ISSET(i, exceptset_in))) {
      continue;
    }

    /* First take a snapshot of the socket's status (protected)... */
    SYS_ARCH_PROTECT(lev);
    sock = tryget_socket(i);
    if (sock != NULL) {
      lastdata = sock->lastdata;   /* data already pulled from the netconn */
      rcvevent = sock->rcvevent;   /* pending receive events */
      sendevent = sock->sendevent; /* non-zero when sending is possible */
      errevent = sock->errevent;   /* non-zero after an error event */
    }
    SYS_ARCH_UNPROTECT(lev);

    if (sock == NULL) {
      /* Report outside the protected region: printf may block and must not
         run while the protection is held. */
      printf("socket 为空\n");
      continue;
    }

    /* ... then examine it: */
    /* Readable when data is already buffered or a receive event is pending */
    if (readset_in && FD_ISSET(i, readset_in) && ((lastdata != NULL) || (rcvevent > 0))) {
      FD_SET(i, &lreadset);
      LWIP_DEBUGF(SOCKETS_DEBUG, ("lwip_selscan: fd=%d ready for reading\n", i));
      nready++;
    }
    /* See if netconn of this socket is ready for write */
    if (writeset_in && FD_ISSET(i, writeset_in) && (sendevent != 0)) {
      FD_SET(i, &lwriteset);
      LWIP_DEBUGF(SOCKETS_DEBUG, ("lwip_selscan: fd=%d ready for writing\n", i));
      nready++;
    }
    /* See if netconn of this socket had an error */
    if (exceptset_in && FD_ISSET(i, exceptset_in) && (errevent != 0)) {
      FD_SET(i, &lexceptset);
      LWIP_DEBUGF(SOCKETS_DEBUG, ("lwip_selscan: fd=%d ready for exception\n", i));
      nready++;
    }
  }

  /* copy local sets to the ones provided as arguments */
  *readset_out = lreadset;
  *writeset_out = lwriteset;
  *exceptset_out = lexceptset;

  LWIP_ASSERT("nready >= 0", nready >= 0);
  return nready;
}
下面是tryget_socket函数，它通过socket索引拿到全局数组sockets[s]中记录的各种事件。
static struct lwip_sock * tryget_socket(int s)
{
if ((s < 0) || (s >= NUM_SOCKETS))
{
return NULL;
}
if (!sockets[s].conn)
{
return NULL;
}
return &sockets[s];
}
而sockets[s]在event_callback内更新：只要发生相应的事件，就会由lwIP内核更新。
/**
 * Netconn event callback: records a netconn event on the socket that owns
 * the connection. It is passed as the callback argument when lwip_socket()
 * creates a netconn.
 *
 * @param conn connection the event belongs to; the call is ignored when NULL
 * @param evt  event kind; adjusts rcvevent, sendevent or errevent
 * @param len  unused
 *
 * NOTE(review): this listing is an excerpt. The SYS_ARCH_PROTECT taken just
 * before the switch has no matching SYS_ARCH_UNPROTECT in the visible part,
 * and scb / last_select_cb_ctr are not used here; presumably the omitted
 * remainder handles both -- confirm against the full source.
 */
static void event_callback(struct netconn *conn, enum netconn_evt evt, u16_t len)
{
int s;
struct lwip_sock *sock;
struct lwip_select_cb *scb;
int last_select_cb_ctr;
SYS_ARCH_DECL_PROTECT(lev);
LWIP_UNUSED_ARG(len);
/* Get socket */
if (conn)
{
s = conn->socket;
if (s < 0)
{
/* Data comes in right away after an accept, even though
* the server task might not have created a new socket yet.
* Just count down (or up) if that's the case and we
* will use the data later. Note that only receive events
* can happen before the new socket is set up. */
SYS_ARCH_PROTECT(lev);
/* Re-check under protection: the first read of conn->socket was unprotected */
if (conn->socket < 0)
{
if (evt == NETCONN_EVT_RCVPLUS)
{
conn->socket--;
}
SYS_ARCH_UNPROTECT(lev);
return;
}
s = conn->socket;
SYS_ARCH_UNPROTECT(lev);
}
sock = get_socket(s);
if (!sock)
{
return;
}
}
else
{
return;
}
SYS_ARCH_PROTECT(lev);
/* Set event as required */
switch (evt)
{
case NETCONN_EVT_RCVPLUS:
sock->rcvevent++; // one more pending receive event
break;
case NETCONN_EVT_RCVMINUS:
sock->rcvevent--; // one pending receive event fewer
break;
case NETCONN_EVT_SENDPLUS:
sock->sendevent = 1;
break;
case NETCONN_EVT_SENDMINUS:
sock->sendevent = 0;
break;
case NETCONN_EVT_ERROR:
sock->errevent = 1;
break;
default:
LWIP_ASSERT("unknown event", 0);
break;
}
/*** only part of the function is shown; the remainder is omitted here ***/
}
event_callback是个回调函数,在创建socket通信时就会注册该回调函数。
/**
 * Create a socket of the requested type and return its descriptor.
 *
 * A netconn is created for the type with event_callback registered as its
 * callback, then a slot is allocated for it with alloc_socket().
 *
 * @param domain   only used in debug output
 * @param type     SOCK_RAW, SOCK_DGRAM or SOCK_STREAM
 * @param protocol passed to the raw netconn; for SOCK_DGRAM, IPPROTO_UDPLITE
 *                 selects UDP-Lite instead of UDP
 * @return the new descriptor, or -1 with errno set to EINVAL (unknown type),
 *         ENOBUFS (netconn creation failed) or ENFILE (no free socket slot)
 */
int lwip_socket(int domain, int type, int protocol)
{
  struct netconn *conn;
  int fd;

  LWIP_UNUSED_ARG(domain);

  /* create a netconn matching the requested socket type */
  if (type == SOCK_RAW) {
    conn = netconn_new_with_proto_and_callback(NETCONN_RAW, (u8_t)protocol, event_callback);
    LWIP_DEBUGF(SOCKETS_DEBUG, ("lwip_socket(%s, SOCK_RAW, %d) = ",
                                domain == PF_INET ? "PF_INET" : "UNKNOWN", protocol));
  } else if (type == SOCK_DGRAM) {
    /* event_callback is registered here */
    conn = netconn_new_with_callback( (protocol == IPPROTO_UDPLITE) ?
                                      NETCONN_UDPLITE : NETCONN_UDP, event_callback);
    LWIP_DEBUGF(SOCKETS_DEBUG, ("lwip_socket(%s, SOCK_DGRAM, %d) = ",
                                domain == PF_INET ? "PF_INET" : "UNKNOWN", protocol));
  } else if (type == SOCK_STREAM) {
    /* event_callback is registered here */
    conn = netconn_new_with_callback(NETCONN_TCP, event_callback);
    LWIP_DEBUGF(SOCKETS_DEBUG, ("lwip_socket(%s, SOCK_STREAM, %d) = ",
                                domain == PF_INET ? "PF_INET" : "UNKNOWN", protocol));
    if (conn != NULL) {
      /* Prevent automatic window updates, we do this on our own! */
      netconn_set_noautorecved(conn, 1);
    }
  } else {
    LWIP_DEBUGF(SOCKETS_DEBUG, ("lwip_socket(%d, %d/UNKNOWN, %d) = -1\n",
                                domain, type, protocol));
    set_errno(EINVAL);
    return -1;
  }

  if (conn == NULL) {
    LWIP_DEBUGF(SOCKETS_DEBUG, ("-1 / ENOBUFS (could not create netconn)\n"));
    set_errno(ENOBUFS);
    printf("conn is null\n");
    return -1;
  }

  fd = alloc_socket(conn, 0);
  if (fd == -1) {
    /* no free slot: release the netconn that was just created */
    netconn_delete(conn);
    set_errno(ENFILE);
    printf("alloc socket failed\n");
    return -1;
  }

  conn->socket = fd;
  LWIP_DEBUGF(SOCKETS_DEBUG, ("%d\n", fd));
  set_errno(0);
  return fd;
}
/*******************************************************************************************************************************************/
/*******************************************************************************************************************************************/
最后谈谈优化吧
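其实优化很简单：将lwip_select函数和lwip_selscan函数内所有的 for(i = 0; i < maxfdp1; i++) 改为 for(i = maxfdp1-1; i < maxfdp1; i++) 就好了。这样每个线程只会更新自己的描述符，不会发生标号高的更新了标号低的描述符、导致标号低的描述符读取错误而通信失败的情况。这样改了之后，我的两个socket通信稳定了许多，UDP掉线了也会很快恢复。需要注意：这样修改后，循环只执行一次，select只会检查标号为maxfdp1-1的那一个socket，因此只适用于每个线程只select一个socket、且maxfdp1传入sockfd+1的用法；如果在同一次select中监听多个socket，其余的socket将不会被检查。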
/**************************************************************************************************************************************/
ps:其实以上我分析的原因只是猜的,具体原因我也不太明白,反正这么干了就正常了。