linux0.99网络模块-传输层(TCP接收)

上一篇文章《linux0.99网络模块-网络层(接收)》中我们提到过,注册到IP层的协议有ICMP,TCP,UDP。本文就来分析TCP处理数据报的过程。
我们记得上一篇中网络层通过调用下面的函数来把数据报传递给TCP。
775        ipprot->handler (skb2, dev, &opt, iph->daddr,
776             net16(iph->tot_len) - iph->ihl*4,
777             iph->saddr, 0, ipprot);


net/tcp/protocols.c:
 56 static struct ip_protocol tcp_protocol =
 57 {
 58    tcp_rcv,
 59    tcp_err,
 60    NULL,
 61    IPPROTO_TCP,
 62    0, /* copy */
 63    NULL
 64 };
58行的tcp_rcv对应的就是tcp注册的handler方法
不过在分析该方法之前,还是先分析一些相关的数据结构和方法。
net/tcp/sock.c

1738 /* This routine must find a socket given a tcp header.  Everyhting
1739    is assumed to be in net order. */
 sk=get_sock(&tcp_prot, net16(th->dest), saddr, th->source, daddr);
1741 volatile struct sock *get_sock (struct proto *prot, unsigned short num,
1742                 unsigned long raddr,
1743                 unsigned short rnum, unsigned long laddr)
1744 {
通过对比上面的函数调用可以推测各个参数的意义如下:
@prot:协议
@num:目的端口
@raddr:源IP地址
@rnum:源端口
@laddr:目的IP地址

1745   volatile struct sock *s;
1746   PRINTK ("get_sock (prot=%X, num=%d, raddr=%X, rnum=%d, laddr=%X)\n",
1747       prot, num, raddr, rnum, laddr);
1748
1749   /* SOCK_ARRAY_SIZE must be a power of two.  This will work better
1750      than a prime unless 3 or more sockets end up using the same
1751      array entry.  This should not be a problem because most
1752      well known sockets don't overlap that much, and for
1753      the other ones, we can just be careful about picking our
1754      socket number when we choose an arbitrary one. */
1756   for (s=prot->sock_array[num&(SOCK_ARRAY_SIZE-1)]; s != NULL; s=s->next)
1757     {
1758       if (s->num == num)     //目的端口是否一致
1759     {
1760       /* we need to see if this is the socket that we want. */
1761       if (!ip_addr_match (s->daddr, raddr))
1762         continue;
             检查sock的目的地址与参数中的源地址是否一致,不一致则跳过继续比较其他的。
             该函数首先检查两个参数是否完全一致,如果是说明匹配,返回匹配。否则,自右向左逐个字节进行比较,如果相同,继续比较下一个字节,如果不同,就要看第一个参数前面的字节是否全为0,如果是,也可以匹配,如果不是返回不匹配。也就是全为0的前缀可以匹配同样长度的任何网络地址。
1763       if (s->dummy_th.dest != rnum && s->dummy_th.dest!= 0)
1764         continue;
由于要匹配对应的sock需要源IP:源端口 与 目的IP:目的端口均匹配,这里就是比较源端口

1765       if (!ip_addr_match (s->saddr, laddr))
1766         continue;
检查sock的源地址与数据报的目的地址是否匹配,如果不匹配,跳过
1767       return (s);
找到了匹配的sock(源地址,目的地址均匹配)
1768     }
1769     }
1770   return (NULL);
1771 }
图形化表示如下:

现在总结一下这个函数:我们在计算机网络书籍中也已经知道这样的事实,套接字如果匹配那么必须源IP:源端口与目的IP:目的端口完全匹配。在该函数中我们根据目的端口号从prot的sock_array数组中得到相应的链,也就是说同一个目的端口的sock会链接到一个链上保存到prot的sock_array的num&(SOCK_ARRAY_SIZE-1)索引位置。现在目的端口号已经匹配了,下面就遍历每一个sock,判断它的目的地址,源地址,源端口等是否匹配,如果找到这样的sock将其返回,否则返回NULL。
多说几句,在服务器编程时,每到一个请求,我们会为其创建一个sock(第一次创建,之后查询),在sock里面就保存了一对套接字信息,然后根据目的端口把它挂到相应协议(prot)的sock_array中相应链表中。

tcp_sequence (sk, th, len, opt, saddr)
2635 /* this functions checks to see if the tcp header is actually
2636    acceptible. */
这个函数用来判断到达的数据报是否是可接受的
2638 static  int
2639 tcp_sequence (volatile struct sock *sk, struct tcp_header *th, short len,
2640           struct options *opt, unsigned long saddr)
2641 {
2642    /* this isn't quite right.  sk->acked_seq could be more recent
2643       than sk->window.  This is however close enough.  We will accept
2644       slightly more packets than we should, but it should not cause
2645       problems unless someone is trying to forge packets. */
2647   PRINTK ("tcp_sequence (sk=%X, th=%X, len = %d, opt=%d, saddr=%X)\n",
2648       sk, th, len, opt, saddr);
2650   if (between(th->seq, sk->acked_seq, sk->acked_seq + sk->window)||
接收到的数据报序号在确认号之后,并落在窗口内,可接受
2651       between(th->seq + len-sizeof (*th), sk->acked_seq,
2652           sk->acked_seq + sk->window) ||
接收到数据报的序号加上数据长度(len减去TCP首部结构的大小),也就是数据末尾的序号,落在确认号之后的窗口内,可接受
2653       (before (th->seq, sk->acked_seq) &&
2654        after (th->seq + len - sizeof (*th), sk->acked_seq + sk->window)))
接收到的数据报序号在确认号之前,同时接收数据延伸到了窗口外面,这时也是可接收的
2655     {
2656        return (1);
2657     }
对应这几种情况图示如下:

2661   /* if it's too far ahead, send an ack to let the other end
2662      know what we expect. */
2663   if (after (th->seq, sk->acked_seq + sk->window))
2664     {
2665        tcp_send_ack (sk->send_seq, sk->acked_seq, sk, th, saddr);
2666        return (0);
2667     }
如接收到的数据报的序号都落在了窗口外侧,这可能是由于窗口在中间更新过,这时需要重新通知对方确认信息(包括了窗口信息)

2669   /* in case it's just a late ack, let it through */
2670   if (th->ack && len == th->doff*4 && after (th->seq, sk->acked_seq - 32767) &&
2671       !th->fin && !th->syn) return (1);
迟到的确认报文(late ack,只含首部、没有设置FIN和SYN),也是可接收的

2673   if (!th->rst)
2674     {
2675        /* try to resync things. */
2676        tcp_send_ack (net32(th->ack_seq), sk->acked_seq, sk, th, saddr);
2677     }
运行到2673行说明,数据报不可接收,那么如果没有设置复位标记的话就发送确认数据报,使得发送方可以得知较新的信息
2680   return (0);
2681 }



1873 /* This routine deals with incoming acks, but not outgoing ones. */
//该函数只处理接收的确认报文,而不管发送的确认报文
1875 static  int
1876 tcp_ack (volatile struct sock *sk, struct tcp_header *th, unsigned long saddr)
1877 {
1878   unsigned long ack;
1879   ack = net32(th->ack_seq);

1884   if (after (ack, sk->send_seq+1) || before (ack, sk->rcv_ack_seq-1))
1885     {     //如果接收到的确认号大于发送报文的序号加1  或者 小于已接受的确认-1.
1886       if (after (ack, sk->send_seq) || (sk->state != TCP_ESTABLISHED &&
1887                     sk->state != TCP_CLOSE_WAIT))
1888     {
1889       return (0);
1890     }
到这里说明收到一个合法的确认报文
1891       if (sk->keepopen)
1892     reset_timer ((struct timer *)&sk->time_wait);
             每次收到一个合法确认报文都会重置定时器
1893       sk->retransmits = 0;
不需要重传
1894       return (1);
1895     }
===============================================================
第一步:对确认号不在[rcv_ack_seq-1,send_seq+1]之间的确认报文的处理,也就是下图阴影部分:

它包括两种情况
1.对还未发送报文的确认报文
2.对已经确认报文的确认
对于第1种情况的处理是直接返回0;对于第2种情况,如果既不是TCP_ESTABLISHED状态也不是TCP_CLOSE_WAIT状态,直接返回0。如果是其中一种状态,执行1891-1894行,其中1892行重设定时器(如果设置了sk->keepopen),1893行设置重传标记为0,最后返回1。


到这里说明接收到的是[rcv_ack_seq-1,send_seq+1]之间的确认报文,即下图阴影部分:

1897   /* see if our window has been shrunk. */
1898   if (after (sk->window_seq, ack+net16(th->window))) //为true说明窗口对方窗口缩减了
1899     {
1900       /* we may need to move packets from the send queue to the
1901      write queue.if the window has been shrunk on us. */
1902       /* the rfc says you are not allowed to shrink your window like
1903      this, but if the other end does, you must be able to deal
1904      with it. */

1906       struct sk_buff *skb;
1907       struct sk_buff *skb2=NULL;
1908       struct sk_buff *wskb=NULL;
 
1910       sk->window_seq = ack + net16(th->window);
1911       cli();

1912       for (skb = sk->send_head; skb != NULL; skb=skb->link3)
1913     {     //遍历发送队列
1914       if (after( skb->h.seq, sk->window_seq))     //落在发送窗口外部
1915         {         
1917           /* remove it from the send queue. */
1918           if (skb2 == NULL)
1919         {
1920           sk->send_head = skb->link3;
1921         }
1922           else
1923         {
1924           skb2->link3 = skb->link3;
1925         }
             link3字段连接的是发送队列,上面所做的工作就是从发送队列中删除落在窗口外部的数据报
1926           if (sk->send_tail == skb)
1927                sk->send_tail = skb2;

1929           /* we may need to remove this from the dev send list. */
1930           if (skb->next != NULL)
1931         {
1932           int i;
1933           if (skb->next != skb)    
1934             {
1935               skb->next->prev = skb->prev;
1936               skb->prev->next = skb->next;
1937             }
                     从链表中移除(这里prev与next链接的数据报是什么?)
                     这个问题可以从《linux0.99网络模块-数据链路层(发送)》中得到解答,那里我们发现链路层在将数据报发送给设备时,如果设备正忙,那么数据报就会挂到发送缓冲区中,由prev和next来链接。

1938           for (i = 0; i < DEV_NUMBUFFS; i++)
1939           {
1940               if (skb->dev->buffs[i] == skb)
1941             {
1942               if (skb->next == skb)
1943                 skb->dev->buffs[i] = NULL;
1944               else
1945                 skb->dev->buffs[i] = skb->next;
1946               break;
1947             }
                     从设备缓冲区中移除该报文
这里为什么要从设备缓冲区中移除呢?设备缓冲区的作用?
设备缓冲区中保存的是将从设备发送出去的数据报,由于窗口缩减,之前已经放到设备缓冲区中的数据报现在可能会变得不能被发送,那么就要将这样的数据报从设备缓冲区中移除
1948           }//for

1949         if (arp_q == skb)
1950         {
1951                 if (skb->next == skb)
1952                 arp_q = NULL;
1953         else
1954             arp_q = skb->next;
1955         }
              从arp队列中删除
1956     } //skb->next != NULL

1958           /* now add it to the write_queue. */
1959           skb->magic = TCP_WRITE_QUEUE_MAGIC;
1960           if (wskb == NULL)
1961         {
1962           skb->next = sk->wfront;
1963           sk->wfront = skb;
1964         }
1965           else
1966         {
1967           skb->next = wskb->next;
1968           wskb->next = skb;
1969         }
加入写队列头部(这里加入头部是比较合理的)
这里的写队列与上面的设备缓冲区之间有什么联系?写队列与发送队列之间有什么关系?
因为上面是将发送窗口之外的数据报放到了写队列中,可以推测,写队列中的数据报就是应用程序提交的,但是还不可被发送的数据报。因此,写队列与发送队列之间的关系就是写队列中的数据报会填充到发送队列中,至于填充多少是根据发送窗口大小决定的。

1970           wskb = skb;
1971         }
1972       else     //对应前面发送队列中的对象在窗口内的情况
1973         {
1974           skb2 = skb;
1975         }
1976     }
1977       sti();
1978     }
===========================================================
第二步:处理发送方窗口缩小的情况:
遍历发送队列,分两种情况处理:
     如果当前数据报的序号没有超出更新后的发送窗口,则把它记为前驱节点(skb2),继续遍历下一个数据报
     如果当前数据报的序号在更新后的发送窗口外部,这些数据报由于窗口缩减变得不能继续发送。
    从发送队列中将其删除,从设备缓冲区中将其删除,并且从arp队列中删除。随后将其加入写队列的首部。


1980   sk->window_seq = ack + net16(th->window);
更新窗口
1982   /* we don't want too many packets out there. */
1983   if (sk->cong_window < 2048 && ack != sk->rcv_ack_seq)
1984     {
1985        if (sk->exp_growth)
1986          sk->cong_window *= 2;
1987        else
1988          sk->cong_window++;
1989     }
上面的cong_window(cwnd)是拥塞窗口,1986行是指数增加的方式,这时处于慢启动过程,1988行是拥塞避免。(处于慢启动过程时,在继续发送之前需要接收到确认)。
慢启动算法初始设置cwnd为1个报文段,每接收到一个确认就加1.
拥塞避免算法要求每次收到一个确认将cwnd增加1/cwnd.
1991   PRINTK ("tcp_ack: Updating rcv ack sequence. \n");
1992   sk->rcv_ack_seq = ack;
更新已收到确认报文序号。
====================================================
第三步:慢启动与拥塞避免
根据当前窗口是否达到门限值,来决定采用慢启动还是拥塞避免。如果对于上面的1986行和1988行有疑问,请考虑一下累积确认。


1994   /* see if we can take anything off of the retransmit queue. */
1995   while (sk->send_head != NULL)
1996     {
1997       if (before (sk->send_head->h.seq, ack+1))
1998     {
重传队列中的报文已经收到确认,为什么需要重传队列呢?这是因为发出去的报文不一定能达到接收方,接收方因此会要求重传,所以我们不能把发出去的报文立刻从发送队列中删掉,只有当收到其确认时才可以这么做。
1999       struct sk_buff *oskb;
2000       /* we have one less packet out there. */
2001       sk->packets_out --;
2002       PRINTK ("skb=%X acked\n", sk->send_head);
2003       /* wake up the process, it can probably
2004          write more. */
2005       if (!sk->dead)
2006         wake_up (sk->sleep);
当删除掉确认的报文后,空余出新的空间,这时要唤醒等待的进程,因为可能有的进程因为发送队列满而在等待
2008       cli();
2010       oskb = sk->send_head;
2011       /* estimate the rtt. */
2012       sk->rtt += ((jiffies - oskb->when) - sk->rtt)/2;
2013       if (sk->rtt < 30) sk->rtt = 30;
这里并没有直接用新的往返时间替换原来的,而是求二者的平均
2014       sk->send_head = oskb->link3;
2015       if (sk->send_head == NULL)
2016         {
2017           sk->send_tail = NULL;
2018         }

2019       /* we may need to remove this from the dev send list. */
2020       if (oskb->next != NULL)
2021         {
2022            int i;
2023            if (oskb->next != oskb)
2024          {
2025             oskb->next->prev = oskb->prev;
2026             oskb->prev->next = oskb->next;
2027          }
2028            for (i = 0; i < DEV_NUMBUFFS; i++)
2029          {
2030            if (oskb->dev->buffs[i] == oskb)
2031              {
2032                if (oskb== oskb->next)
2033              oskb->dev->buffs[i]= NULL;
2034                else
2035              oskb->dev->buffs[i] = oskb->next;
2036                break;
2037              }
2038          }//for
2039            if (arp_q == oskb)
2040          {
2041            if (oskb == oskb->next)
2042              arp_q = NULL;
2043            else
2044              arp_q = oskb->next;
2045          }
2046       }

2047       oskb->magic = 0;
2048       kfree_skb  (oskb, FREE_WRITE); /* write. */
2049       sti();
2050       if (!sk->dead)
2051         wake_up(sk->sleep);
2052     }//
2053       else
2054     {
2055       break;     //不需要继续遍历
2056     }
2058     }//while遍历发送队列完毕
===================================================
第四步:处理已经被发送并且确认的情况。
遍历发送队列,从发送队列中删除已经确认的数据报,并且从设备发送队列中将其删除,从设备缓冲区中将其删除,从arp队列中将其删除。释放数据报空间,唤醒等待队列。


2061   /* at this point we need to check to see if we have anything
2062      which needs to be retransmiteed.  If we have failed to get
2063      some acks i.e. had to retransmit something, and we succeded, we
2064      should then attempt to retransmit everything right now. */
2065
2066   if (sk->retransmits && sk->send_head != NULL)
2067     {
2068       PRINTK ("retransmitting\n");
2069       sk->prot->retransmit (sk,1);
2070     }
2071   sk->retransmits = 0;
================================
第五步:处理重传



2073   /* maybe we can take some stuff off of thewrite queue, and put it onto
2074      the xmit queue. */
2075   if (sk->wfront != NULL && sk->packets_out < sk->cong_window)
2076     {
2077           if (after (sk->window_seq, sk->wfront->h.seq))
2078         {
2079               tcp_write_xmit (sk);
2080         }
2081     }
2082   else
2083   {
2084       if (sk->send_head == NULL && sk->ack_backlog == 0 &&
2085        sk->state != TCP_TIME_WAIT && !sk->keepopen)
2086      {//数据已经全部发送出去,也没有需要被确认的数据报,同时TCP的状态也不是TIME_WAIT,同时没有设置保持打开状态,也就是处于无事可做的状态
2087        PRINTK ("Nothing to do, going to sleep.\n");
2088         if (!sk->dead)
2089           wake_up (sk->sleep);
2090
2091         /* Lets send a probe once in a while. */
2092         sk->time_wait.len = TCP_PROBEWAIT_LEN;
2093         sk->timeout = TIME_KEEPOPEN;
2094         reset_timer((struct timer *)&sk->time_wait);
2095         sk->timeout = 0;
这里的定时器对应《TCP/IP详解卷一》中第22章的坚持定时器。我们知道,窗口大小为0会阻止发送方发送数据,直到发送方接收到一个数据报(ACK报文),它的窗口被更新为大于0的数之后,才可以继续发送数据。假如该ACK报文丢失就会造成死锁。为了防止这种情况,发送方使用一个坚持定时器来周期性的向接收方查询,以便发现窗口是否已增大。
2096      }
2097        else
2098      {
2099         if (sk->state == TCP_TIME_WAIT)
2100           {
2101                  sk->time_wait.len = TCP_TIMEWAIT_LEN;
2102                  sk->timeout = TIME_CLOSE;
2103           }
2104         sk->timeout = TIME_WRITE;
2105         sk->time_wait.len = sk->rtt*2;
2106         reset_timer ((struct timer *)&sk->time_wait);
2107      }
2108     }
=========================================
第六步:处理发送队列
我们在第三步更新了拥塞窗口,现在可以发送更多的数据报了,这里就是来处理这种情况的。
处理过程:
如果写出的数据报(就是放到待发送队列中的数据报)数量小于拥塞窗口指定的大小,并且当前数据报又属于发送窗口内的,那么就把它写入到xmit队列。
如果写队列为空或者写出的数据报数量已经达到拥塞窗口指定的大小:
     如果发送队列为空,同时累积确认队列(ack_backlog)为空,同时状态不是TCP_TIME_WAIT,并且sk->keepopen为false,那么这时需要发送试探帧。
     否则执行2099-2107行,这几行是什么用途?

2111   if (sk->packets_out == 0 && sk->send_tmp != NULL &&
2112       sk->wfront == NULL && sk->send_head == NULL)
2113     {
2114       tcp_send_partial (sk);
2115     }

2117   /* see if we are done. */
2118   if ( sk->state == TCP_TIME_WAIT)
2119     {
2120        if (sk->rcv_ack_seq == sk->send_seq &&
2121        sk->acked_seq == sk->fin_seq);
2122        if (!sk->dead) wake_up (sk->sleep);
2123        sk->state = TCP_CLOSE;
2124     }
处理TCP_TIME_WAIT状态:TCP_TIME_WAIT是客户端在接收到服务端的最终确认后所处的状态。现在接收到确认后,设置TCP状态为TCP_CLOSE。注意2120-2121行的if语句末尾多了一个分号,这个判断什么也没做(只执行了一条空语句),所以无论条件是否成立,2122-2123行都会被执行。
2126   if (sk->state == TCP_LAST_ACK || sk->state == TCP_FIN_WAIT2)
2127     {
2128       if (sk->rcv_ack_seq == sk->send_seq)  //如果发送的都被确认了
2129         {   
2130            if (sk->acked_seq != sk->fin_seq)  //如果是客户端,就是TCP_FIN_WAIT2;如果是服务端,就是TCP_LAST_ACK
2131              {
2132                 tcp_time_wait(sk);
如果是客户端,会进入TIME_WAIT状态,也称为2MSL等待状态。MSL是任何报文段丢弃前在网络中的最长时间。等待2MSL可以让TCP再次发送最后的ACK以防止其丢失(但是如果客户端发生了断电,这就不起作用了,下面会分析这种情况)。另外,在2MSL等待期间,定义这个连接的插口不能再被使用。
如果是服务端,这时需要设置定时器,因为(参照下图)如果客户端发生了断电,那么它再也不可能给服务器发送ACK了,如果没有定时器会导致服务端TCP连接无法关闭
2133              }
2134            else
2135              {
2136                 tcp_send_ack (sk->send_seq, sk->acked_seq, sk, th, sk->daddr);
2137                 sk->state = TCP_CLOSE;
                        对于处于TCP_LAST_ACK状态(sk->state == TCP_LAST_ACK)的服务端来说,它收到了最后的确认就可以关闭了;
                        对于处于TCP_FIN_WAIT2状态(sk->state == TCP_FIN_WAIT2)的客户端来说,它的FIN报文一定已经被确认,它收到最后确认后也可以关闭了。
总之,在TCP_LAST_ACK和TCP_FIN_WAIT2这两种状态下,如果发送的报文都已经确认,就进行这里的处理。
2138              }
2139         } //
2140       if (!sk->dead) wake_up (sk->sleep);
2141     }


关于TCP正常连接建立和终止所对应的图如下:

2143   PRINTK ("leaving tcp_ack\n");
2145   return (1);
2146 }


2148 /* This routine handles the data.  If there is room in the buffer, it
2149    will be have already been moved into it.  If there is no room,
2150    then we will just have to discard the packet. */
2151
2152 static  int
2153 tcp_data (struct sk_buff *skb, volatile struct sock *sk,
2154       unsigned long saddr, unsigned short len)
2155 {
2156   struct sk_buff *skb1, *skb2;
2157   struct tcp_header *th;
2158
2159   th = skb->h.th;
2160   print_th (th);
2161   skb->len = len - (th->doff*4);//其中doff是数据偏移
2162
2163   PRINTK("tcp_data len = %d sk = %X:\n",skb->len, sk);
2164   print_sk(sk);
2165  
2166   sk->bytes_rcv += skb->len;

2168   if (skb->len == 0 && !th->fin && !th->urg && !th->psh)
2169     {
2170       /* don't want to keep passing ack's back and fourth. */
2171       if (!th->ack)
2172       tcp_send_ack (sk->send_seq, sk->acked_seq,sk, th, saddr);
2173       kfree_skb(skb, FREE_READ);
2174       return (0);
2175     }
2176
2177   if (sk->shutdown & RCV_SHUTDOWN)
2178     {
2179        /* just ack everything. */
2180        sk->acked_seq = th->seq + skb->len + th->syn + th->fin;
2181        tcp_send_ack (sk->send_seq, sk->acked_seq, sk, skb->h.th, saddr);
2182        kfree_skb (skb, FREE_READ);
2183        if (sk->acked_seq == sk->fin_seq)
2184      {
2185         if (!sk->dead) wake_up (sk->sleep);
2186         if (sk->state == TCP_TIME_WAIT || sk->state == TCP_LAST_ACK
2187         || sk->state == TCP_FIN_WAIT2)
2188           sk->state = TCP_CLOSE;
2189      }
2190        return (0);
2191     }

2193   /* now we have to walk the chain, and figure out where this one
2194      goes into it
.  This is set up so that the last packet we received
2195      will be the first one we look at, that way if everything comes
2196      in order, there will be no performance loss, and if they come
2197      out of order we will be able to fit things in nicely. */
注释中已经说的比较清楚了,数据报以队列的形式组织,这样在所有数据报都按序到达的情况下,是没有性能损失的。即使有一部分没有按序到达,效率也是比较高的
2199   if (sk->rqueue == NULL)
2200     {
2201        PRINTK ("tcp_data: skb = %X:\n",skb);
2202        print_skb (skb);
2203
2204        sk->rqueue = skb;
2205        skb->next = skb;
2206        skb->prev = skb;
2207        skb1= NULL;
2208     }
2209   else
2210     {
2211       PRINTK ("tcp_data adding to chain sk = %X:\n",sk);
2212       print_sk (sk);
2213
2214       for (skb1=sk->rqueue; ; skb1=skb1->prev)
2215     {
2216       PRINTK ("skb1=%X\n",skb1);
2217       print_skb(skb1);
2218       PRINTK ("skb1->h.th->seq = %d\n", skb1->h.th->seq);
2219       if (after ( th->seq+1, skb1->h.th->seq))
2220         {
2221           skb->prev = skb1;
2222           skb->next = skb1->next;
2223           skb->next->prev = skb;
2224           skb1->next = skb;
2225           if (skb1 == sk->rqueue)
2226           sk->rqueue = skb;
2227           break;
2228         }
2229       if  ( skb1->prev == sk->rqueue)
2230         {
2231            skb->next= skb1;
2232            skb->prev = skb1->prev;
2233            skb->prev->next = skb;
2234            skb1->prev = skb;
2235            skb1 = NULL; /* so we know we might be able to ack stuff. */
2236            break;
2237         }
2238     }//for
for循环的处理过程如下图所示:


2240       PRINTK ("skb = %X:\n",skb);
2241       print_skb (skb);
2242       PRINTK ("sk now equals:\n");
2243       print_sk (sk);
2244
2245     }

2247   th->ack_seq = th->seq + skb->len; //期望接收对方的下一个报文段的第一个字节的序号
2248   if (th->syn) th->ack_seq ++;
2249   if (th->fin) th->ack_seq ++;
2250
2251   if (before (sk->acked_seq, sk->copied_seq))
2252     {
2253        PRINTK ("*** tcp.c:tcp_data bug acked < copied\n");
2254        sk->acked_seq = sk->copied_seq;
2255     }

2257   /* now figure out if we can ack anything. */
2258   if (skb1 == NULL || skb1->acked || before (th->seq, sk->acked_seq+1))
2259     {
2260       if (before (th->seq, sk->acked_seq+1))
2261     {
2262       if (after (th->ack_seq, sk->acked_seq))
2263           sk->acked_seq = th->ack_seq;
2264       skb->acked = 1;
2265
2266       /* when we ack the fin, we turn on the RCV_SHUTDOWN flag. */
2267       if (skb->h.th->fin)
2268         {
2269           sk->shutdown |= RCV_SHUTDOWN;
2270         }
2271
2272       for (skb2=skb->next; skb2 != sk->rqueue->next; skb2=skb2->next)
2273         {
2274            if (before(skb2->h.th->seq, sk->acked_seq+1))
2275          {
2276            if (after (skb2->h.th->ack_seq, sk->acked_seq))
2277              sk->acked_seq = skb2->h.th->ack_seq;          //更新确认号
2278             skb2->acked = 1;
2279
2280            /* when we ack the fin, we turn on the RCV_SHUTDOWN flag. */
2281            if (skb2->h.th->fin)
2282              {
2283                sk->shutdown |= RCV_SHUTDOWN;
2284              }
2285
2286             /* force an immediate ack. */
2287             sk->ack_backlog = sk->max_ack_backlog;
2288          }
2289            else
2290          {
2291            break;
2292          }
2293         }
2295       /* this also takes care of updating the window. */
2296       /* this if statement needs to be simplified. */
2297
2298       if (!sk->delay_acks ||
2299           sk->ack_backlog >= sk->max_ack_backlog ||
2300           sk->bytes_rcv > sk->max_unacked ||
2301           th->fin)
2302         {
2303         tcp_send_ack (sk->send_seq, sk->acked_seq,sk,th, saddr);
2304         }
2305       else
2306         {
2307            sk->ack_backlog++;
2308            sk->time_wait.len = TCP_ACK_TIME;
2309            sk->timeout = TIME_WRITE;
2310            reset_timer ((struct timer *)&sk->time_wait);
2311         }
2312        }
2313    }
2314   else
2315     {
2316        /* we missed a packet.  Send an ack to try to resync things. */
2317        tcp_send_ack (sk->send_seq, sk->acked_seq, sk, th, saddr);
2318     }
2319    
2320   /* now tell the user we may have some data. */
2321   if (!sk->dead)
2322     {
2323        wake_up (sk->sleep);
2324     }
2325   else
2326     {
2327        PRINTK ("data received on dead socket. \n");
2328     }
2329
2330   if (sk->state == TCP_FIN_WAIT2 && sk->acked_seq == sk->fin_seq)
2331     {
2332       tcp_send_ack (sk->send_seq, sk->acked_seq, sk, th, saddr);
2333       sk->state = TCP_LAST_ACK;
2334     }
2335
2336   return (0);
2337 }

在分析tcp_rcv(TCP接收处理函数)之前,我们介绍了上面的函数,现在对其中的一些易混淆的点进行总结。首先,关于写队列、发送队列、接收队列、重传队列是如何表示的以及它们之间的关系。我们前面已经讲过,sk_buff是用来描述数据报的;sock是用来描述TCP连接的。发送队列是保存在sock的send_head开始的链表中,其中的sk_buff用link3指针进行连接。需要注意的是,重传队列与发送队列是在一个共同的链表中的,那些发出去的还未接收到确认的数据报不能删除,因为它可能会丢失而需要重传。写队列是由wfront指向的,其中的数据报是由应用程序提交的,它会根据窗口情况填充到发送队列中。另外还有一种就是窗口缩小的情况,这时需要把发送队列中的数据报摘下来重新加入到写队列中。读队列是由rqueue指向的。
关于慢启动与拥塞避免可以直接看上面的介绍。
关于如何根据目的端口号找到对应的sock,最开始的函数也已经介绍过了。

现在我们就来看一下这个方法:
net/tcp/tcp.c
775        ipprot->handler (skb2, dev, &opt, iph->daddr,
776             net16(iph->tot_len) - iph->ihl*4,
777             iph->saddr, 0, ipprot);


2697 int
2698 tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
2699     unsigned long daddr, unsigned short len,
2700     unsigned long saddr, int redo, struct ip_protocol * protocol)
2701 {

2702   struct tcp_header *th;
2703   volatile struct sock *sk;
2704
2705   if (!skb)
2706     {
2707       PRINTK ("tcp.c: tcp_rcv skb = NULL\n");
2708       return (0);
2709     }
2710 #if 0 /* it's ok for protocol to be NULL */
2711   if (!protocol)
2712     {
2713       PRINTK ("tcp.c: tcp_rcv protocol = NULL\n");
2714       return (0);
2715     }
2717   if (!opt) /* it's ok for opt to be NULL */
2718     {
2719       PRINTK ("tcp.c: tcp_rcv opt = NULL\n");
2720     }
2721 #endif
上面没有什么可说的,#if 0 #endif起到注释的作用。
2722   if (!dev)
2723     {
2724       PRINTK ("tcp.c: tcp_rcv dev = NULL\n");
2725       return (0);
2726     }
2728   th = skb->h.th;
2730   /* find the socket. */
2731   sk=get_sock(&tcp_prot, net16(th->dest), saddr, th->source, daddr);
根据参数信息,也可以说是根据参数和传输层首部信息,寻找对应的sock。
2732   PRINTK("<<\n");
2733   PRINTK("len = %d, redo = %d, skb=%X\n", len, redo, skb);
2735   if (sk)
2736     {
2737       PRINTK ("sk = %X:\n",sk);
2738       print_sk (sk);
2739     }
找到后打印一下sock的信息。
2741   if (!redo)     //对于tcp来说,这个参数为0,也就是一定会执行下面的内容
2742     {
2743        if (th->check && tcp_check (th, len, saddr, daddr ))
2744      {     //校验
2745         skb->sk = NULL;
2746         PRINTK ("packet dropped with bad checksum.\n");
2747         kfree_skb (skb, 0);
2748         /* we don't release the socket because it was never
2749            marked in use. */
2750         return (0);
2751      }
2752
2753        /*See if we know about the socket. */
2754        if (sk == NULL)     //没有找到对应的socket
2755     {
2756       if (!th->rst)     //重置
2757         tcp_reset (daddr, saddr, th, &tcp_prot, opt,dev);
2758       skb->sk = NULL;
2759       kfree_skb (skb, 0);
2760       return (0);
2761     }

2763        skb->len = len;                           //tcp数据报数据+首部长度
2764        skb->sk = sk;                             //把数据报与相应的sock关联起来
2765        skb->acked = 0;
2766        skb->used = 0;
2767        skb->free = 0;
2768        skb->urg_used = 0;
2769        skb->saddr = daddr;
2770        skb->daddr = saddr;

2772        th->seq = net32(th->seq);
数据报序号
2774        cli();
关中断
2776        /* we may need to add it to the backlog here. */
2777        if (sk->inuse)     //如果sock正在被使用
2778      {
2779         if (sk->back_log == NULL)     //如果没有积压队列就创建
2780           {
2781          sk->back_log = skb;
2782          skb->next = skb;
2783          skb->prev = skb;
2784           }
2785         else     //有的话就添加进去
2786           {
2787          skb->next = sk->back_log;
2788          skb->prev = sk->back_log->prev;
2789          skb->prev->next = skb;
2790          skb->next->prev = skb;
2791           }
2792         sti();
2793         return (0);     //挂到积压队列上就可以返回了
2794      }
运行到这里说明sock此时没有被使用
2795        sk->inuse = 1;     //设置标记,
2796        sti();
2797      }     //if(!redo)
2798   else
2799     {
2800       if (!sk)
2801     {
2802       PRINTK ("tcp.c: tcp_rcv bug sk=NULL redo = 1\n");
2803       return (0);
2804     }
2805     }

2807   if (!sk->prot)
2808     {
2809       PRINTK ("tcp.c: tcp_rcv sk->prot = NULL \n");
2810       return (0);
2811     }

2813   /* charge the memory to the socket. */
//socket内存管理,
2814   if (sk->rmem_alloc + skb->mem_len >= SK_RMEM_MAX)
2815     {
2816        skb->sk = NULL;
2817        PRINTK ("dropping packet due to lack of buffer space.\n");
2818        kfree_skb (skb, 0);
2819        release_sock (sk);
2820        return (0);
2821     }
2823   sk->rmem_alloc += skb->mem_len;
更新长度(rmem_alloc就是读内存分配量)
2825   PRINTK ("About to do switch. \n");

2827   /* now deal with it.*/
2829   switch (sk->state)
2830     {
2831        /* this should close the system down if it's waiting for an
2832       ack that is never going to be sent. */
2833     case TCP_LAST_ACK:
2834       if (th->rst)
2835     {
2836       sk->err = ECONNRESET;
2837       sk->state = TCP_CLOSE;
2838       if (!sk->dead)
2839         {
2840           wake_up (sk->sleep);
2841         }
2842       kfree_skb (skb, FREE_READ);
2843       release_sock(sk);
2844       return (0);
2845     }
处于TCP_LAST_ACK状态的服务端可能永远收不到客户端的确认,这种情况可能是由于服务端发送的报文(FIN)丢失。
2847     case TCP_ESTABLISHED:
2848     case TCP_CLOSE_WAIT:
2849     case TCP_FIN_WAIT1:
2850     case TCP_FIN_WAIT2:
2851     case TCP_TIME_WAIT:
              
2853       if (!tcp_sequence (sk, th, len, opt, saddr))
2854     {
2855        kfree_skb (skb, FREE_READ);
2856        release_sock(sk);
2857        return (0);
                该数据报不可接收
2858     }

2860       if (th->rst)     //如果设置了复位字段
2861     {     //关于RST标记的说明可以参考TCP协议
2862       sk->err = ECONNRESET;
2863       sk->state = TCP_CLOSE;
2864       if (!sk->dead)
2865         {
2866           wake_up (sk->sleep);
2867         }
2868       kfree_skb (skb, FREE_READ);
2869       release_sock(sk);
2870       return (0);
2871     }

2872       if (opt && (opt->security != 0 || opt->compartment != 0 || th->syn))
2873     {
2874        sk->err = ECONNRESET;
2875        sk->state = TCP_CLOSE;
2876        tcp_reset (daddr, saddr,  th, sk->prot, opt,dev);
2877        if (!sk->dead)
2878          {
2879         wake_up (sk->sleep);
2880          }
2881        kfree_skb (skb, FREE_READ);
2882        release_sock(sk);
2883        return (0);
2884     }

2886       if (th->ack)
2887     {     //说明是一个确认报文段
2888        if(!tcp_ack (sk, th, saddr))
2889         {
2890            kfree_skb (skb, FREE_READ);
2891            release_sock(sk);
2892            return (0);
2893        }
2894     }
对确认报文段进行处理,这个过程已经在上面详细分析了。
2895       if (th->urg)
2896     {
2897       if (tcp_urg (sk, th, saddr))
2898         {
2899            kfree_skb (skb, FREE_READ);
2900            release_sock(sk);
2901            return (0);
2902         }
2903     }
处理紧急字段

2905       if ( tcp_data (skb, sk, saddr, len))
2906     {
2907        kfree_skb (skb, FREE_READ);
2908        release_sock(sk);
2909        return (0);
2910     }
2911
2912       if (!th->fin)
2913     {
2914       release_sock(sk);
2915       return (0);
2916     }
2918       tcp_fin (sk, th, saddr, dev);
2919       release_sock(sk);
2920       return (0);
2922     case TCP_CLOSE:
2923
2924       if (sk->dead || sk->daddr)
2925     {
2926        PRINTK ("packet received for closed,dead socket\n");
2927        kfree_skb (skb, FREE_READ);
2928        release_sock (sk);
2929        return (0);
2930     }
2931
2932       if (!th->rst)
2933     {
2934       if (!th->ack)
2935         th->ack_seq=0;
2936       tcp_reset (daddr, saddr, th, sk->prot, opt,dev);
2937     }
2938       kfree_skb (skb, FREE_READ);
2939       release_sock(sk);
2940       return (0);
2941
2942     case TCP_LISTEN:
2943       if (th->rst)
2944     {
2945        kfree_skb (skb, FREE_READ);
2946        release_sock(sk);
2947        return (0);
2948     }
2949       if (th->ack)
2950     {
2951       tcp_reset (daddr, saddr, th, sk->prot, opt,dev );
2952       kfree_skb (skb, FREE_READ);
2953       release_sock(sk);
2954       return (0);
2955     }
2956
2957       if (th->syn)
2958     {
2959 /*    if (opt->security != 0 || opt->compartment != 0)
2960         {
2961           tcp_reset (daddr, saddr, th, prot, opt,dev);
2962           release_sock(sk);
2963           return (0);
2964         } */
2965
2966       /* now we just put the whole thing including the header
2967          and saddr, and protocol pointer into the buffer.
2968          We can't respond until the user tells us to accept
2969          the connection. */
2970
2971       tcp_conn_request (sk, skb, daddr, saddr, opt, dev);

这个函数是用来处理TCP连接请求的,它会判断当前的等待应用层接收的TCP连接是否已经达到最大值,如果达到最大值,就忽略当前的连接请求;否则,为新的TCP连接建立套接字sock并添加到sock_array中,并把接收到的数据报,添加到读队列rqueue中。该函数的分析见《linux0.99网络模块-应用层 or 传输层读写》。
2972
2973       release_sock(sk);
2974       return (0);
2975     }
2976
2977       kfree_skb (skb, FREE_READ);
2978       release_sock(sk);
2979       return (0);
2980
2981     default:
2982       if (!tcp_sequence (sk, th, len, opt, saddr))
2983     {
2984        kfree_skb (skb, FREE_READ);
2985        release_sock(sk);
2986        return (0);
2987     }
2988
2989     case TCP_SYN_SENT:
2990       if (th->rst)
2991     {
2992       sk->err = ECONNREFUSED;
2993       sk->state = TCP_CLOSE;
2994       if (!sk->dead)
2995         {
2996           wake_up (sk->sleep);
2997         }
2998       kfree_skb (skb, FREE_READ);
2999       release_sock(sk);
3000       return (0);
3016       if (!th->ack)
3017     {
3018       if (th->syn)
3019         {
3020           sk->state = TCP_SYN_RECV;
3021         }
3022
3023       kfree_skb (skb, FREE_READ);
3024       release_sock(sk);
3025       return (0);
3026     }
3027
3028       switch (sk->state)
3029     {
3030     case TCP_SYN_SENT:
3031       if (!tcp_ack(sk, th, saddr))
3032         {
3033           tcp_reset(daddr, saddr, th, sk->prot, opt,dev);
3034           kfree_skb (skb, FREE_READ);
3035           release_sock(sk);
3036           return (0);
3037         }
3038
3039       /* if the syn bit is also set, switch to tcp_syn_recv,
3040          and then to established. */
3041
3042       if (!th->syn)
3043         {
3044           kfree_skb (skb, FREE_READ);
3045           release_sock (sk);
3046           return (0);
3047         }
3048
3049       /* ack the syn and fall through. */
3050       sk->acked_seq = th->seq+1;
3051       sk->fin_seq = th->seq;
3052       tcp_send_ack (sk->send_seq, th->seq+1, sk,
3053                  th, sk->daddr);
3054
3055     case TCP_SYN_RECV:
3056       if (!tcp_ack(sk, th, saddr))
3057         {
3058           tcp_reset(daddr, saddr, th, sk->prot, opt, dev);
3059           kfree_skb (skb, FREE_READ);
3060           release_sock(sk);
3061           return (0);
3062         }
3063
3064       sk->state = TCP_ESTABLISHED;
3065       /* now we need to finish filling out some of the tcp
3066          header. */
3067
3068       /* we need to check for mtu info. */
3069       tcp_options(sk, th);
3070       sk->dummy_th.dest = th->source;
3071       sk->copied_seq = sk->acked_seq-1;
3072       if (!sk->dead)
3073         {
3074           wake_up (sk->sleep);
3075         }
3076
3077       /* now process the rest like we were already in the established
3078          state. */
3079       if (th->urg)
3080         if (tcp_urg (sk, th, saddr))
3081           {
3082          kfree_skb (skb, FREE_READ);
3083          release_sock(sk);
3084          return (0);
3085           }
3086       if (tcp_data (skb, sk, saddr, len))
3087         kfree_skb (skb, FREE_READ);
3088
3089       if (th->fin)
3090         tcp_fin(sk, th, saddr, dev);
3091
3092       release_sock(sk);
3093       return (0);
3094     }
3096       if (th->urg)
3097     {
3098       if (tcp_urg (sk, th, saddr))
3099         {
3100            kfree_skb (skb, FREE_READ);
3101            release_sock (sk);
3102            return (0);
3103         }
3104     }
3105
3106       if (tcp_data (skb, sk, saddr, len))
3107     {
3108        kfree_skb (skb, FREE_READ);
3109        release_sock (sk);
3110        return (0);
3111     }
3112
3113       if (!th->fin)
3114     {
3115       release_sock(sk);
3116       return (0);
3117     }
3118       tcp_fin (sk, th, saddr, dev);
3119       release_sock(sk);
3120       return (0);
3121     }
3122 }







你可能感兴趣的:(linux0.99网络模块-传输层(TCP接收))