上一篇文章《linux0.99网络模块-网络层(接收)》中我们提到过,注册到IP层的协议有ICMP,TCP,UDP。本文就来分析TCP处理数据报的过程。
我们记得上一篇中网络层通过调用下面的函数来把数据报传递给TCP。
775 ipprot->handler (skb2, dev, &opt, iph->daddr,
776 net16(iph->tot_len) - iph->ihl*4,
777 iph->saddr, 0, ipprot);
net/tcp/protocols.c:
56 static struct ip_protocol tcp_protocol =
57 {
58 tcp_rcv,
59 tcp_err,
60 NULL,
61 IPPROTO_TCP,
62 0, /* copy */
63 NULL
64 };
58行的tcp_rcv对应的就是tcp注册的handler方法
不过在分析该方法之前,还是先分析一下相关的数据结构和方法
net/tcp/sock.c
1738 /* This routine must find a socket given a tcp header. Everyhting
1739 is assumed to be in net order. */
sk=get_sock(&tcp_prot, net16(th->dest), saddr, th->source, daddr);
1741 volatile struct sock *get_sock (struct proto *prot, unsigned short num,
1742 unsigned long raddr,
1743 unsigned short rnum, unsigned long laddr)
1744 {
通过对比上面的函数调用可以推测各个参数的意义如下:
@prot:协议
@num:目的端口
@raddr:源IP地址
@rnum:源端口
@laddr:目的IP地址
1745 volatile struct sock *s;
1746 PRINTK ("get_sock (prot=%X, num=%d, raddr=%X, rnum=%d, laddr=%X)\n",
1747 prot, num, raddr, rnum, laddr);
1748
1749 /* SOCK_ARRAY_SIZE must be a power of two. This will work better
1750 than a prime unless 3 or more sockets end up using the same
1751 array entry. This should not be a problem because most
1752 well known sockets don't overlap that much, and for
1753 the other ones, we can just be careful about picking our
1754 socket number when we choose an arbitrary one. */
1756 for (s=prot->sock_array[num&(SOCK_ARRAY_SIZE-1)]; s != NULL; s=s->next)
1757 {
1758 if (s->num == num) //目的端口是否一致
1759 {
1760 /* we need to see if this is the socket that we want. */
1761 if (!ip_addr_match (s->daddr, raddr))
1762 continue;
检查sock的目的地址与参数中的源地址是否一致,不一致则跳过继续比较其他的。
该函数首先检查两个参数是否完全一致,如果是说明匹配,返回匹配。否则,自右向左逐个字节进行比较,如果相同,继续比较下一个字节,如果不同,就要看第一个参数前面的字节是否全为0,如果是,也可以匹配,如果不是返回不匹配。也就是全为0的前缀可以匹配同样长度的任何网络地址。
1763 if (s->dummy_th.dest != rnum && s->dummy_th.dest!= 0)
1764 continue;
由于要匹配对应的sock需要源IP:源端口 与 目的IP:目的端口均匹配,这里就是比较源端口
1765 if (!ip_addr_match (s->saddr, laddr))
1766 continue;
检查sock的源地址与数据报的目的地址是否匹配,如果不匹配,跳过
1767 return (s);
找到了匹配的sock(源地址,目的地址均匹配)
1768 }
1769 }
1770 return (NULL);
1771 }
图形化表示如下:
现在总结一下这个函数:我们在计算机网络书籍中也已经知道这样的事实,套接字如果匹配那么必须源IP:源端口与目的IP:目的端口完全匹配。在该函数中我们根据目的端口号从prot的sock_array数组中得到相应的链,也就是说同一个目的端口的sock会链接到一个链上保存到prot的sock_array的num&(SOCK_ARRAY_SIZE-1)索引位置。现在目的端口号已经匹配了,下面就遍历每一个sock,判断它的目的地址,源地址,源端口等是否匹配,如果找到这样的sock将其返回,否则返回NULL。
多说几句,在服务器编程时,每到一个请求,我们会为其创建一个sock(第一次创建,之后查询),在sock里面就保存了一对套接字信息,然后根据目的端口把它挂到相应协议(prot)的sock_array中相应链表中。
tcp_sequence (sk, th, len, opt, saddr)
2635 /* this functions checks to see if the tcp header is actually
2636 acceptible. */
这个函数用来判断到达的数据报是否是可接受的
2638 static int
2639 tcp_sequence (volatile struct sock *sk, struct tcp_header *th, short len,
2640 struct options *opt, unsigned long saddr)
2641 {
2642 /* this isn't quite right. sk->acked_seq could be more recent
2643 than sk->window. This is however close enough. We will accept
2644 slightly more packets than we should, but it should not cause
2645 problems unless someone is trying to forge packets. */
2647 PRINTK ("tcp_sequence (sk=%X, th=%X, len = %d, opt=%d, saddr=%X)\n",
2648 sk, th, len, opt, saddr);
2650 if (between(th->seq, sk->acked_seq, sk->acked_seq + sk->window)||
接收到的数据报序号在确认号之后,并落在窗口内,可接受
2651 between(th->seq + len-sizeof (*th), sk->acked_seq,
2652 sk->acked_seq + sk->window) ||
接收到数据报的序号加上数据长度(len减去首部长度),即报文段末尾的序号,落在确认号之后的窗口内,可接受
2653 (before (th->seq, sk->acked_seq) &&
2654 after (th->seq + len - sizeof (*th), sk->acked_seq + sk->window)))
接收到的数据报序号在确认号之前,同时接收数据延伸到了窗口外面,这时也是可接收的
2655 {
2656 return (1);
2657 }
对应这几种情况图示如下:
2661 /* if it's too far ahead, send an ack to let the other end
2662 know what we expect. */
2663 if (after (th->seq, sk->acked_seq + sk->window))
2664 {
2665 tcp_send_ack (sk->send_seq, sk->acked_seq, sk, th, saddr);
2666 return (0);
2667 }
如接收到的数据报的序号都落在了窗口外侧,这可能是由于窗口在中间更新过,这时需要重新通知对方确认信息(包括了窗口信息)
2669 /* in case it's just a late ack, let it through */
2670 if (th->ack && len == th->doff*4 && after (th->seq, sk->acked_seq - 32767) &&
2671 !th->fin && !th->syn) return (1);
迟到的确认报文(late ack),也是可接收的
2673 if (!th->rst)
2674 {
2675 /* try to resync things. */
2676 tcp_send_ack (net32(th->ack_seq), sk->acked_seq, sk, th, saddr);
2677 }
运行到2673行说明,数据报不可接收,那么如果没有设置复位标记的话就发送确认数据报,使得发送方可以得知较新的信息
2680 return (0);
2681 }
1873 /* This routine deals with incoming acks, but not outgoing ones. */
//该函数只处理接收的确认报文,而不管发送的确认报文
1875 static int
1876 tcp_ack (volatile struct sock *sk, struct tcp_header *th, unsigned long saddr)
1877 {
1878 unsigned long ack;
1879 ack = net32(th->ack_seq);
1884 if (after (ack, sk->send_seq+1) || before (ack, sk->rcv_ack_seq-1))
1885 { //如果接收到的确认号大于发送报文的序号加1 或者 小于已接受的确认-1.
1886 if (after (ack, sk->send_seq) || (sk->state != TCP_ESTABLISHED &&
1887 sk->state != TCP_CLOSE_WAIT))
1888 {
1889 return (0);
1890 }
到这里说明收到一个合法的确认报文
1891 if (sk->keepopen)
1892 reset_timer ((struct timer *)&sk->time_wait);
每次收到一个合法确认报文都会重置定时器
1893 sk->retransmits = 0;
不需要重传
1894 return (1);
1895 }
===============================================================
第一步:对确认号不在[rcv_ack_seq-1,send_seq+1]之间的确认报文的处理,也就是下图阴影部分:
它包括两种情况
1.对还未发送报文的确认报文
2.对已经确认报文的确认
对于第1种情况的处理是直接返回0;对于第2种情况,如果不是TCP_ESTABLISHED和TCP_CLOSE_WAIT状态,直接返回0. 如果是其中一种状态,执行1891-1894行,其中1892行重设定时器(如果连接没有关闭),1893行设置重传标记为0.最后返回1。
到这里说明接收到的是[rcv_ack_seq-1,send_seq+1]之间的确认报文,即下图阴影部分:
1897 /* see if our window has been shrunk. */
1898 if (after (sk->window_seq, ack+net16(th->window))) //为true说明对方窗口缩减了
1899 {
1900 /* we may need to move packets from the send queue to the
1901 write queue.if the window has been shrunk on us. */
1902 /* the rfc says you are not allowed to shrink your window like
1903 this, but if the other end does, you must be able to deal
1904 with it. */
1906 struct sk_buff *skb;
1907 struct sk_buff *skb2=NULL;
1908 struct sk_buff *wskb=NULL;
1910 sk->window_seq = ack + net16(th->window);
1911 cli();
1912 for (skb = sk->send_head; skb != NULL; skb=skb->link3)
1913 { //遍历发送队列
1914 if (after( skb->h.seq, sk->window_seq)) //落在发送窗口外部
1915 {
1917 /* remove it from the send queue. */
1918 if (skb2 == NULL)
1919 {
1920 sk->send_head = skb->link3;
1921 }
1922 else
1923 {
1924 skb2->link3 = skb->link3;
1925 }
link3字段连接的是发送队列,上面所做的工作就是从发送队列中删除落在窗口外部的数据报
1926 if (sk->send_tail == skb)
1927 sk->send_tail = skb2;
1929 /* we may need to remove this from the dev send list. */
1930 if (skb->next != NULL)
1931 {
1932 int i;
1933 if (skb->next != skb)
1934 {
1935 skb->next->prev = skb->prev;
1936 skb->prev->next = skb->next;
1937 }
从链表中移除(这里prev与next链接的数据报是?)
这个问题可以从《linux0.99网络模块-数据链路层(发送)》中得到解答,那里我们发现链路层在将数据报发送给设备时,如果设备正忙,那么数据报就会挂到发送缓冲区中,由prev和next来链接。
1938 for (i = 0; i < DEV_NUMBUFFS; i++)
1939 {
1940 if (skb->dev->buffs[i] == skb)
1941 {
1942 if (skb->next == skb)
1943 skb->dev->buffs[i] = NULL;
1944 else
1945 skb->dev->buffs[i] = skb->next;
1946 break;
1947 }
从设备缓冲区中移除该报文
这里为什么要从设备缓冲区中移除呢?设备缓冲区的作用?
设备缓冲区中保存的是将从设备发送出去的数据报,由于窗口缩减,之前已经放到设备缓冲区中的数据报现在可能会变得不能被发送,那么就要将这样的数据报从设备缓冲区中移除
1948 }//for
1949 if (arp_q == skb)
1950 {
1951 if (skb->next == skb)
1952 arp_q = NULL;
1953 else
1954 arp_q = skb->next;
1955 }
从arp队列中删除
1956 } //skb->next != NULL
1958 /* now add it to the write_queue. */
1959 skb->magic = TCP_WRITE_QUEUE_MAGIC;
1960 if (wskb == NULL)
1961 {
1962 skb->next = sk->wfront;
1963 sk->wfront = skb;
1964 }
1965 else
1966 {
1967 skb->next = wskb->next;
1968 wskb->next = skb;
1969 }
加入写队列头部(这里加入头部是比较合理的)
这里的写队列与上面的设备缓冲区之间有什么联系?写队列与发送队列之间有什么关系?
因为上面是将发送窗口之外的数据报放到了写队列中,可以推测,写队列中的数据报就是应用程序提交的,但是还不可被发送的数据报。因此,写队列与发送队列之间的关系就是写队列中的数据报会填充到发送队列中,至于填充多少是根据发送窗口大小决定的。
1970 wskb = skb;
1971 }
1972 else //对应前面发送队列中的对象在窗口内的情况
1973 {
1974 skb2 = skb;
1975 }
1976 }
1977 sti();
1978 }
===========================================================
第二步:处理发送方窗口缩小的情况:
遍历发送队列,分两种情况处理:
如果当前数据报的序号仍在更新后的发送窗口之内,它保留在发送队列中(用skb2记下它,作为后续节点的前驱),然后继续遍历下一个
如果当前数据报的序号在更新后的发送窗口外部,这些数据报由于窗口缩减变得不能继续发送。
从发送队列中将其删除,从设备缓冲区中将其删除,并且从arp队列中删除。随后将其加入写队列的首部。
1980 sk->window_seq = ack + net16(th->window);
更新窗口
1982 /* we don't want too many packets out there. */
1983 if (sk->cong_window < 2048 && ack != sk->rcv_ack_seq)
1984 {
1985 if (sk->exp_growth)
1986 sk->cong_window *= 2;
1987 else
1988 sk->cong_window++;
1989 }
上面的cong_window(cwnd)是拥塞窗口,1986行是指数增加的方式,这时处于慢启动过程,1988行是拥塞避免。(处于慢启动过程时,在继续发送之前需要接收到确认)。
慢启动算法初始设置cwnd为1个报文段,每接收到一个确认就加1.
拥塞避免算法要求每次收到一个确认将cwnd增加1/cwnd.
1991 PRINTK ("tcp_ack: Updating rcv ack sequence. \n");
1992 sk->rcv_ack_seq = ack;
更新已收到确认报文序号。
====================================================
第三步:慢启动与拥塞控制
根据当前窗口是否达到门限值(代码中通过sk->exp_growth标志来区分),来决定采用慢启动还是拥塞避免。如果对于上面的1986行和1988行有疑问,请考虑一下累积确认。
1994 /* see if we can take anything off of the retransmit queue. */
1995 while (sk->send_head != NULL)
1996 {
1997 if (before (sk->send_head->h.seq, ack+1))
1998 {
重传队列中的报文已经收到确认,为什么需要重传队列呢?这是因为发出去的报文不一定能到达接收方,一旦丢失就需要重传,所以我们不能把发出去的报文立刻从发送队列中删掉,只有当收到其确认时才可以这么做。
1999 struct sk_buff *oskb;
2000 /* we have one less packet out there. */
2001 sk->packets_out --;
2002 PRINTK ("skb=%X acked\n", sk->send_head);
2003 /* wake up the process, it can probably
2004 write more. */
2005 if (!sk->dead)
2006 wake_up (sk->sleep);
当删除掉确认的报文后,空余出新的空间,这时要唤醒等待的进程,因为可能有的进程因为发送队列满而在等待
2008 cli();
2010 oskb = sk->send_head;
2011 /* estimate the rtt. */
2012 sk->rtt += ((jiffies - oskb->when) - sk->rtt)/2;
2013 if (sk->rtt < 30) sk->rtt = 30;
这里并没有直接用新的往返时间替换原来的,而是求二者的平均
2014 sk->send_head = oskb->link3;
2015 if (sk->send_head == NULL)
2016 {
2017 sk->send_tail = NULL;
2018 }
2019 /* we may need to remove this from the dev send list. */
2020 if (oskb->next != NULL)
2021 {
2022 int i;
2023 if (oskb->next != oskb)
2024 {
2025 oskb->next->prev = oskb->prev;
2026 oskb->prev->next = oskb->next;
2027 }
2028 for (i = 0; i < DEV_NUMBUFFS; i++)
2029 {
2030 if (oskb->dev->buffs[i] == oskb)
2031 {
2032 if (oskb== oskb->next)
2033 oskb->dev->buffs[i]= NULL;
2034 else
2035 oskb->dev->buffs[i] = oskb->next;
2036 break;
2037 }
2038 }//for
2039 if (arp_q == oskb)
2040 {
2041 if (oskb == oskb->next)
2042 arp_q = NULL;
2043 else
2044 arp_q = oskb->next;
2045 }
2046 }
2047 oskb->magic = 0;
2048 kfree_skb (oskb, FREE_WRITE); /* write. */
2049 sti();
2050 if (!sk->dead)
2051 wake_up(sk->sleep);
2052 }//
2053 else
2054 {
2055 break; //不需要继续遍历
2056 }
2058 }//while遍历发送队列完毕
===================================================
第四步:处理已经被发送并且确认的情况。
遍历发送队列,从发送队列中删除已经确认的数据报,并且从设备发送队列中将其删除,从设备缓冲区中将其删除,从arp队列中将其删除。释放数据报空间,唤醒等待队列。
2061 /* at this point we need to check to see if we have anything
2062 which needs to be retransmiteed. If we have failed to get
2063 some acks i.e. had to retransmit something, and we succeded, we
2064 should then attempt to retransmit everything right now. */
2065
2066 if (sk->retransmits && sk->send_head != NULL)
2067 {
2068 PRINTK ("retransmitting\n");
2069 sk->prot->retransmit (sk,1);
2070 }
2071 sk->retransmits = 0;
================================
第五步:处理重传
2073 /* maybe we can take some stuff off of thewrite queue, and put it onto
2074 the xmit queue. */
2075 if (sk->wfront != NULL && sk->packets_out < sk->cong_window)
2076 {
2077 if (after (sk->window_seq, sk->wfront->h.seq))
2078 {
2079 tcp_write_xmit (sk);
2080 }
2081 }
2082 else
2083 {
2084 if (sk->send_head == NULL && sk->ack_backlog == 0 &&
2085 sk->state != TCP_TIME_WAIT && !sk->keepopen)
2086 {//数据已经全部发送出去,也没有需要被确认的数据报,同时TCP的状态也不是TIME_WAIT,同时没有设置保持打开状态,也就是处于无事可做的状态
2087 PRINTK ("Nothing to do, going to sleep.\n");
2088 if (!sk->dead)
2089 wake_up (sk->sleep);
2090
2091 /* Lets send a probe once in a while. */
2092 sk->time_wait.len = TCP_PROBEWAIT_LEN;
2093 sk->timeout = TIME_KEEPOPEN;
2094 reset_timer((struct timer *)&sk->time_wait);
2095 sk->timeout = 0;
这里的定时器对应《TCP/IP详解卷一》中第22章的坚持定时器。我们知道,窗口大小为0会阻止发送方发送数据,直到发送方接收到一个数据报(ACK报文),它的窗口被更新为大于0的数之后,才可以继续发送数据。假如该ACK报文丢失就会造成死锁。为了防止这种情况,发送方使用一个坚持定时器来周期性的向接收方查询,以便发现窗口是否已增大。
2096 }
2097 else
2098 {
2099 if (sk->state == TCP_TIME_WAIT)
2100 {
2101 sk->time_wait.len = TCP_TIMEWAIT_LEN;
2102 sk->timeout = TIME_CLOSE;
2103 }
2104 sk->timeout = TIME_WRITE;
2105 sk->time_wait.len = sk->rtt*2;
2106 reset_timer ((struct timer *)&sk->time_wait);
2107 }
2108 }
=========================================
第六步:处理发送队列
我们在第三步更新了拥塞窗口,现在可以发送更多的数据报了,这里就是来处理这种情况的。
处理过程:
如果写出的数据报(就是放到待发送队列中的数据报)数量小于拥塞窗口指定的大小,并且当前数据报又属于发送窗口内的,那么就把它写入到xmit队列。
如果写队列为空或者写出的数据报数量已经达到拥塞窗口指定的大小:
如果发送队列为空同时累积确认队列(ack_backlog)为空同时状态不是TCP_TIME_WAIT,sk->keepopen为false,那么这时需要发送试探帧。
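否则执行2099-2107行,这几行是什么用途?它们用来重设定时器:2104-2106行把超时类型设为TIME_WRITE、时长设为sk->rtt*2并重置定时器。需要注意的是,2099-2103行为TCP_TIME_WAIT状态设置的TIME_CLOSE和TCP_TIMEWAIT_LEN,会被紧随其后的2104-2105行无条件覆盖(2103行之后没有else)。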
2111 if (sk->packets_out == 0 && sk->send_tmp != NULL &&
2112 sk->wfront == NULL && sk->send_head == NULL)
2113 {
2114 tcp_send_partial (sk);
2115 }
2117 /* see if we are done. */
2118 if ( sk->state == TCP_TIME_WAIT)
2119 {
2120 if (sk->rcv_ack_seq == sk->send_seq &&
2121 sk->acked_seq == sk->fin_seq);
2122 if (!sk->dead) wake_up (sk->sleep);
2123 sk->state = TCP_CLOSE;
2124 }
处理TCP_TIME_WAIT,TCP_TIME_WAIT是客户端在接收到服务端的最终确认后所处的状态。现在接收到确认后,设置TCP状态为TCP_CLOSE。需要注意,2121行末尾多了一个分号,使2120-2121行的if语句体为空,这个判断实际上什么也没做;因此只要处于TCP_TIME_WAIT状态,2122-2123行就会无条件执行。
2126 if (sk->state == TCP_LAST_ACK || sk->state == TCP_FIN_WAIT2)
2127 {
2128 if (sk->rcv_ack_seq == sk->send_seq) //如果发送的都被确认了
2129 {
2130 if (sk->acked_seq != sk->fin_seq) //如果是客户端,就是TCP_FIN_WAIT2;如果是服务端,就是TCP_LAST_ACK
2131 {
2132 tcp_time_wait(sk);
如果是客户端,会进入TIME_WAIT状态,也称为2MSL等待状态。MSL是任何报文段被丢弃前在网络中存活的最长时间。等待2MSL可以让TCP再次发送最后的ACK以防其丢失(但是如果客户端发生了断电,这就不起作用了,下面会分析这种情况)。另外,在2MSL等待期间,定义这个连接的插口不能再被使用。
如果是服务端,这时需要设置定时器,因为(参照下图)如果客户端发生了断电,那么它再也不可能给服务器发送ACK了,如果没有定时器会导致服务端TCP连接无法关闭
2133 }
2134 else
2135 {
2136 tcp_send_ack (sk->send_seq, sk->acked_seq, sk, th, sk->daddr);
2137 sk->state = TCP_CLOSE;
TCP_LAST_ACK和TCP_FIN_WAIT2状态:在这两种情况下,如果发送的报文都已经被确认,就进行下面的处理:
对于处于sk->state == TCP_LAST_ACK的服务端来说,它收到了最后的确认就可以关闭了;
对于处于sk->state == TCP_FIN_WAIT2的客户端来说,它的fin报文一定是被确认了,它收到最后确认后也可以关闭了。
2138 }
2139 } //
2140 if (!sk->dead) wake_up (sk->sleep);
2141 }
关于TCP正常连接建立和终止所对应的图如下:
2143 PRINTK ("leaving tcp_ack\n");
2145 return (1);
2146 }
2148 /* This routine handles the data. If there is room in the buffer, it
2149 will be have already been moved into it. If there is no room,
2150 then we will just have to discard the packet. */
2151
2152 static int
2153 tcp_data (struct sk_buff *skb, volatile struct sock *sk,
2154 unsigned long saddr, unsigned short len)
2155 {
2156 struct sk_buff *skb1, *skb2;
2157 struct tcp_header *th;
2158
2159 th = skb->h.th;
2160 print_th (th);
2161 skb->len = len - (th->doff*4);//其中doff是数据偏移
2162
2163 PRINTK("tcp_data len = %d sk = %X:\n",skb->len, sk);
2164 print_sk(sk);
2165
2166 sk->bytes_rcv += skb->len;
2168 if (skb->len == 0 && !th->fin && !th->urg && !th->psh)
2169 {
2170 /* don't want to keep passing ack's back and fourth. */
2171 if (!th->ack)
2172 tcp_send_ack (sk->send_seq, sk->acked_seq,sk, th, saddr);
2173 kfree_skb(skb, FREE_READ);
2174 return (0);
2175 }
2176
2177 if (sk->shutdown & RCV_SHUTDOWN)
2178 {
2179 /* just ack everything. */
2180 sk->acked_seq = th->seq + skb->len + th->syn + th->fin;
2181 tcp_send_ack (sk->send_seq, sk->acked_seq, sk, skb->h.th, saddr);
2182 kfree_skb (skb, FREE_READ);
2183 if (sk->acked_seq == sk->fin_seq)
2184 {
2185 if (!sk->dead) wake_up (sk->sleep);
2186 if (sk->state == TCP_TIME_WAIT || sk->state == TCP_LAST_ACK
2187 || sk->state == TCP_FIN_WAIT2)
2188 sk->state = TCP_CLOSE;
2189 }
2190 return (0);
2191 }
2193 /* now we have to walk the chain, and figure out where this one
2194 goes into it. This is set up so that the last packet we received
2195 will be the first one we look at, that way if everything comes
2196 in order, there will be no performance loss, and if they come
2197 out of order we will be able to fit things in nicely. */
注释中已经说的比较清楚了,数据报以队列的形式组织,这样在所有数据报都按序到达的情况下,是没有性能损失的。即使有一部分没有按序到达,效率也是比较高的
2199 if (sk->rqueue == NULL)
2200 {
2201 PRINTK ("tcp_data: skb = %X:\n",skb);
2202 print_skb (skb);
2203
2204 sk->rqueue = skb;
2205 skb->next = skb;
2206 skb->prev = skb;
2207 skb1= NULL;
2208 }
2209 else
2210 {
2211 PRINTK ("tcp_data adding to chain sk = %X:\n",sk);
2212 print_sk (sk);
2213
2214 for (skb1=sk->rqueue; ; skb1=skb1->prev)
2215 {
2216 PRINTK ("skb1=%X\n",skb1);
2217 print_skb(skb1);
2218 PRINTK ("skb1->h.th->seq = %d\n", skb1->h.th->seq);
2219 if (after ( th->seq+1, skb1->h.th->seq))
2220 {
2221 skb->prev = skb1;
2222 skb->next = skb1->next;
2223 skb->next->prev = skb;
2224 skb1->next = skb;
2225 if (skb1 == sk->rqueue)
2226 sk->rqueue = skb;
2227 break;
2228 }
2229 if ( skb1->prev == sk->rqueue)
2230 {
2231 skb->next= skb1;
2232 skb->prev = skb1->prev;
2233 skb->prev->next = skb;
2234 skb1->prev = skb;
2235 skb1 = NULL; /* so we know we might be able to ack stuff. */
2236 break;
2237 }
2238 }//for
for循环的处理过程如下图所示:
2240 PRINTK ("skb = %X:\n",skb);
2241 print_skb (skb);
2242 PRINTK ("sk now equals:\n");
2243 print_sk (sk);
2244
2245 }
2247 th->ack_seq = th->seq + skb->len; //期望接收对方的下一个报文段的第一个字节的序号
2248 if (th->syn) th->ack_seq ++;
2249 if (th->fin) th->ack_seq ++;
2250
2251 if (before (sk->acked_seq, sk->copied_seq))
2252 {
2253 PRINTK ("*** tcp.c:tcp_data bug acked < copied\n");
2254 sk->acked_seq = sk->copied_seq;
2255 }
2257 /* now figure out if we can ack anything. */
2258 if (skb1 == NULL || skb1->acked || before (th->seq, sk->acked_seq+1))
2259 {
2260 if (before (th->seq, sk->acked_seq+1))
2261 {
2262 if (after (th->ack_seq, sk->acked_seq))
2263 sk->acked_seq = th->ack_seq;
2264 skb->acked = 1;
2265
2266 /* when we ack the fin, we turn on the RCV_SHUTDOWN flag. */
2267 if (skb->h.th->fin)
2268 {
2269 sk->shutdown |= RCV_SHUTDOWN;
2270 }
2271
2272 for (skb2=skb->next; skb2 != sk->rqueue->next; skb2=skb2->next)
2273 {
2274 if (before(skb2->h.th->seq, sk->acked_seq+1))
2275 {
2276 if (after (skb2->h.th->ack_seq, sk->acked_seq))
2277 sk->acked_seq = skb2->h.th->ack_seq; //更新确认号
2278 skb2->acked = 1;
2279
2280 /* when we ack the fin, we turn on the RCV_SHUTDOWN flag. */
2281 if (skb2->h.th->fin)
2282 {
2283 sk->shutdown |= RCV_SHUTDOWN;
2284 }
2285
2286 /* force an immediate ack. */
2287 sk->ack_backlog = sk->max_ack_backlog;
2288 }
2289 else
2290 {
2291 break;
2292 }
2293 }
2295 /* this also takes care of updating the window. */
2296 /* this if statement needs to be simplified. */
2297
2298 if (!sk->delay_acks ||
2299 sk->ack_backlog >= sk->max_ack_backlog ||
2300 sk->bytes_rcv > sk->max_unacked ||
2301 th->fin)
2302 {
2303 tcp_send_ack (sk->send_seq, sk->acked_seq,sk,th, saddr);
2304 }
2305 else
2306 {
2307 sk->ack_backlog++;
2308 sk->time_wait.len = TCP_ACK_TIME;
2309 sk->timeout = TIME_WRITE;
2310 reset_timer ((struct timer *)&sk->time_wait);
2311 }
2312 }
2313 }
2314 else
2315 {
2316 /* we missed a packet. Send an ack to try to resync things. */
2317 tcp_send_ack (sk->send_seq, sk->acked_seq, sk, th, saddr);
2318 }
2319
2320 /* now tell the user we may have some data. */
2321 if (!sk->dead)
2322 {
2323 wake_up (sk->sleep);
2324 }
2325 else
2326 {
2327 PRINTK ("data received on dead socket. \n");
2328 }
2329
2330 if (sk->state == TCP_FIN_WAIT2 && sk->acked_seq == sk->fin_seq)
2331 {
2332 tcp_send_ack (sk->send_seq, sk->acked_seq, sk, th, saddr);
2333 sk->state = TCP_LAST_ACK;
2334 }
2335
2336 return (0);
2337 }
在分析tcp_rcv(TCP接收处理函数)之前,我们介绍了上面的函数,现在对其中的一些易混淆的点进行总结。首先是写队列、发送队列、接收队列、重传队列是如何表示的,以及它们之间的关系。我们前面已经讲过,sk_buff是用来描述数据报的;sock是用来描述TCP连接的。发送队列是保存在sock的send_head开始的链表中,其中的sk_buff用link3指针进行连接。需要注意的是,重传队列与发送队列是在一个共同的链表中的,那些发出去的还未接收到确认的数据报不能删除,因为它可能会丢失而需要重传。写队列是由wfront指向的,其中的数据报是由应用程序提交的,它会根据窗口情况填充到发送队列中。另外还有一种就是窗口缩小的情况,这时需要把发送队列中的数据报摘下来重新加入到写队列中。读队列是由rqueue指向的。
关于慢启动与拥塞避免可以直接看上面的介绍。
关于如何根据目的端口号找到对应的sock,最开始的函数也已经介绍过了。
现在我们就来看一下这个方法:
net/tcp/tcp.c
775 ipprot->handler (skb2, dev, &opt, iph->daddr,
776 net16(iph->tot_len) - iph->ihl*4,
777 iph->saddr, 0, ipprot);
2697 int
2698 tcp_rcv(struct sk_buff *skb, struct device *dev, struct options *opt,
2699 unsigned long daddr, unsigned short len,
2700 unsigned long saddr, int redo, struct ip_protocol * protocol)
2701 {
2702 struct tcp_header *th;
2703 volatile struct sock *sk;
2704
2705 if (!skb)
2706 {
2707 PRINTK ("tcp.c: tcp_rcv skb = NULL\n");
2708 return (0);
2709 }
2710 #if 0 /* it's ok for protocol to be NULL */
2711 if (!protocol)
2712 {
2713 PRINTK ("tcp.c: tcp_rcv protocol = NULL\n");
2714 return (0);
2715 }
2717 if (!opt) /* it's ok for opt to be NULL */
2718 {
2719 PRINTK ("tcp.c: tcp_rcv opt = NULL\n");
2720 }
2721 #endif
上面没有什么可说的,#if 0 #endif起到注释的作用。
2722 if (!dev)
2723 {
2724 PRINTK ("tcp.c: tcp_rcv dev = NULL\n");
2725 return (0);
2726 }
2728 th = skb->h.th;
2730 /* find the socket. */
2731 sk=get_sock(&tcp_prot, net16(th->dest), saddr, th->source, daddr);
根据参数信息,也可以说是根据参数和传输层首部信息,寻找对应的sock。
2732 PRINTK("<<\n");
2733 PRINTK("len = %d, redo = %d, skb=%X\n", len, redo, skb);
2735 if (sk)
2736 {
2737 PRINTK ("sk = %X:\n",sk);
2738 print_sk (sk);
2739 }
找到后打印一下sock的信息。
2741 if (!redo) //对于tcp来说,这个参数为0,也就是一定会执行下面的内容
2742 {
2743 if (th->check && tcp_check (th, len, saddr, daddr ))
2744 { //校验
2745 skb->sk = NULL;
2746 PRINTK ("packet dropped with bad checksum.\n");
2747 kfree_skb (skb, 0);
2748 /* we don't release the socket because it was never
2749 marked in use. */
2750 return (0);
2751 }
2752
2753 /*See if we know about the socket. */
2754 if (sk == NULL) //没有找到对应的socket
2755 {
2756 if (!th->rst) //重置
2757 tcp_reset (daddr, saddr, th, &tcp_prot, opt,dev);
2758 skb->sk = NULL;
2759 kfree_skb (skb, 0);
2760 return (0);
2761 }
2763 skb->len = len; //tcp数据报数据+首部长度
2764 skb->sk = sk; //把数据报与相应的sock关联起来
2765 skb->acked = 0;
2766 skb->used = 0;
2767 skb->free = 0;
2768 skb->urg_used = 0;
2769 skb->saddr = daddr;
2770 skb->daddr = saddr;
2772 th->seq = net32(th->seq);
数据报序号
2774 cli();
关中断
2776 /* we may need to add it to the backlog here. */
2777 if (sk->inuse) //如果sock正在被使用
2778 {
2779 if (sk->back_log == NULL) //如果没有积压队列就创建
2780 {
2781 sk->back_log = skb;
2782 skb->next = skb;
2783 skb->prev = skb;
2784 }
2785 else //有的话就添加进去
2786 {
2787 skb->next = sk->back_log;
2788 skb->prev = sk->back_log->prev;
2789 skb->prev->next = skb;
2790 skb->next->prev = skb;
2791 }
2792 sti();
2793 return (0); //挂到积压队列上就可以返回了
2794 }
运行到这里说明sock此时没有被使用
2795 sk->inuse = 1; //设置标记,
2796 sti();
2797 } //if(!redo)
2798 else
2799 {
2800 if (!sk)
2801 {
2802 PRINTK ("tcp.c: tcp_rcv bug sk=NULL redo = 1\n");
2803 return (0);
2804 }
2805 }
2807 if (!sk->prot)
2808 {
2809 PRINTK ("tcp.c: tcp_rcv sk->prot = NULL \n");
2810 return (0);
2811 }
2813 /* charge the memory to the socket. */
//socket内存管理,
2814 if (sk->rmem_alloc + skb->mem_len >= SK_RMEM_MAX)
2815 {
2816 skb->sk = NULL;
2817 PRINTK ("dropping packet due to lack of buffer space.\n");
2818 kfree_skb (skb, 0);
2819 release_sock (sk);
2820 return (0);
2821 }
2823 sk->rmem_alloc += skb->mem_len;
更新长度(rmem_alloc就是读内存分配量)
2825 PRINTK ("About to do switch. \n");
2827 /* now deal with it.*/
2829 switch (sk->state)
2830 {
2831 /* this should close the system down if it's waiting for an
2832 ack that is never going to be sent. */
2833 case TCP_LAST_ACK:
2834 if (th->rst)
2835 {
2836 sk->err = ECONNRESET;
2837 sk->state = TCP_CLOSE;
2838 if (!sk->dead)
2839 {
2840 wake_up (sk->sleep);
2841 }
2842 kfree_skb (skb, FREE_READ);
2843 release_sock(sk);
2844 return (0);
2845 }
处于TCP_LAST_ACK状态的服务端可能永远收不到客户端的确认,这种情况可能是由于服务端发送的报文(FIN)丢失。
2847 case TCP_ESTABLISHED:
2848 case TCP_CLOSE_WAIT:
2849 case TCP_FIN_WAIT1:
2850 case TCP_FIN_WAIT2:
2851 case TCP_TIME_WAIT:
2853 if (!tcp_sequence (sk, th, len, opt, saddr))
2854 {
2855 kfree_skb (skb, FREE_READ);
2856 release_sock(sk);
2857 return (0);
该数据报不可接收
2858 }
2860 if (th->rst) //如果设置了复位字段
2861 { //关于RST标记的说明可以参考TCP协议
2862 sk->err = ECONNRESET;
2863 sk->state = TCP_CLOSE;
2864 if (!sk->dead)
2865 {
2866 wake_up (sk->sleep);
2867 }
2868 kfree_skb (skb, FREE_READ);
2869 release_sock(sk);
2870 return (0);
2871 }
2872 if (opt && (opt->security != 0 || opt->compartment != 0 || th->syn))
2873 {
2874 sk->err = ECONNRESET;
2875 sk->state = TCP_CLOSE;
2876 tcp_reset (daddr, saddr, th, sk->prot, opt,dev);
2877 if (!sk->dead)
2878 {
2879 wake_up (sk->sleep);
2880 }
2881 kfree_skb (skb, FREE_READ);
2882 release_sock(sk);
2883 return (0);
2884 }
2886 if (th->ack)
2887 { //说明是一个确认报文段
2888 if(!tcp_ack (sk, th, saddr))
2889 {
2890 kfree_skb (skb, FREE_READ);
2891 release_sock(sk);
2892 return (0);
2893 }
2894 }
对确认报文段进行处理,这个过程已经在上面详细分析了。
2895 if (th->urg)
2896 {
2897 if (tcp_urg (sk, th, saddr))
2898 {
2899 kfree_skb (skb, FREE_READ);
2900 release_sock(sk);
2901 return (0);
2902 }
2903 }
处理紧急字段
2905 if ( tcp_data (skb, sk, saddr, len))
2906 {
2907 kfree_skb (skb, FREE_READ);
2908 release_sock(sk);
2909 return (0);
2910 }
2911
2912 if (!th->fin)
2913 {
2914 release_sock(sk);
2915 return (0);
2916 }
2918 tcp_fin (sk, th, saddr, dev);
2919 release_sock(sk);
2920 return (0);
2922 case TCP_CLOSE:
2923
2924 if (sk->dead || sk->daddr)
2925 {
2926 PRINTK ("packet received for closed,dead socket\n");
2927 kfree_skb (skb, FREE_READ);
2928 release_sock (sk);
2929 return (0);
2930 }
2931
2932 if (!th->rst)
2933 {
2934 if (!th->ack)
2935 th->ack_seq=0;
2936 tcp_reset (daddr, saddr, th, sk->prot, opt,dev);
2937 }
2938 kfree_skb (skb, FREE_READ);
2939 release_sock(sk);
2940 return (0);
2941
2942 case TCP_LISTEN:
2943 if (th->rst)
2944 {
2945 kfree_skb (skb, FREE_READ);
2946 release_sock(sk);
2947 return (0);
2948 }
2949 if (th->ack)
2950 {
2951 tcp_reset (daddr, saddr, th, sk->prot, opt,dev );
2952 kfree_skb (skb, FREE_READ);
2953 release_sock(sk);
2954 return (0);
2955 }
2956
2957 if (th->syn)
2958 {
2959 /* if (opt->security != 0 || opt->compartment != 0)
2960 {
2961 tcp_reset (daddr, saddr, th, prot, opt,dev);
2962 release_sock(sk);
2963 return (0);
2964 } */
2965
2966 /* now we just put the whole thing including the header
2967 and saddr, and protocol pointer into the buffer.
2968 We can't respond until the user tells us to accept
2969 the connection. */
2970
2971 tcp_conn_request (sk, skb, daddr, saddr, opt, dev);
这个函数是用来处理TCP连接请求的,它会判断当前的等待应用层接收的TCP连接是否已经达到最大值,如果达到最大值,就忽略当前的连接请求;否则,为新的TCP连接建立套接字sock并添加到sock_array中,并把接收到的数据报,添加到读队列rqueue中。该函数的分析见《linux0.99网络模块-应用层 or 传输层读写》。
2972
2973 release_sock(sk);
2974 return (0);
2975 }
2976
2977 kfree_skb (skb, FREE_READ);
2978 release_sock(sk);
2979 return (0);
2980
2981 default:
2982 if (!tcp_sequence (sk, th, len, opt, saddr))
2983 {
2984 kfree_skb (skb, FREE_READ);
2985 release_sock(sk);
2986 return (0);
2987 }
2988
2989 case TCP_SYN_SENT:
2990 if (th->rst)
2991 {
2992 sk->err = ECONNREFUSED;
2993 sk->state = TCP_CLOSE;
2994 if (!sk->dead)
2995 {
2996 wake_up (sk->sleep);
2997 }
2998 kfree_skb (skb, FREE_READ);
2999 release_sock(sk);
3000 return (0);
3016 if (!th->ack)
3017 {
3018 if (th->syn)
3019 {
3020 sk->state = TCP_SYN_RECV;
3021 }
3022
3023 kfree_skb (skb, FREE_READ);
3024 release_sock(sk);
3025 return (0);
3026 }
3027
3028 switch (sk->state)
3029 {
3030 case TCP_SYN_SENT:
3031 if (!tcp_ack(sk, th, saddr))
3032 {
3033 tcp_reset(daddr, saddr, th, sk->prot, opt,dev);
3034 kfree_skb (skb, FREE_READ);
3035 release_sock(sk);
3036 return (0);
3037 }
3038
3039 /* if the syn bit is also set, switch to tcp_syn_recv,
3040 and then to established. */
3041
3042 if (!th->syn)
3043 {
3044 kfree_skb (skb, FREE_READ);
3045 release_sock (sk);
3046 return (0);
3047 }
3048
3049 /* ack the syn and fall through. */
3050 sk->acked_seq = th->seq+1;
3051 sk->fin_seq = th->seq;
3052 tcp_send_ack (sk->send_seq, th->seq+1, sk,
3053 th, sk->daddr);
3054
3055 case TCP_SYN_RECV:
3056 if (!tcp_ack(sk, th, saddr))
3057 {
3058 tcp_reset(daddr, saddr, th, sk->prot, opt, dev);
3059 kfree_skb (skb, FREE_READ);
3060 release_sock(sk);
3061 return (0);
3062 }
3063
3064 sk->state = TCP_ESTABLISHED;
3065 /* now we need to finish filling out some of the tcp
3066 header. */
3067
3068 /* we need to check for mtu info. */
3069 tcp_options(sk, th);
3070 sk->dummy_th.dest = th->source;
3071 sk->copied_seq = sk->acked_seq-1;
3072 if (!sk->dead)
3073 {
3074 wake_up (sk->sleep);
3075 }
3076
3077 /* now process the rest like we were already in the established
3078 state. */
3079 if (th->urg)
3080 if (tcp_urg (sk, th, saddr))
3081 {
3082 kfree_skb (skb, FREE_READ);
3083 release_sock(sk);
3084 return (0);
3085 }
3086 if (tcp_data (skb, sk, saddr, len))
3087 kfree_skb (skb, FREE_READ);
3088
3089 if (th->fin)
3090 tcp_fin(sk, th, saddr, dev);
3091
3092 release_sock(sk);
3093 return (0);
3094 }
3096 if (th->urg)
3097 {
3098 if (tcp_urg (sk, th, saddr))
3099 {
3100 kfree_skb (skb, FREE_READ);
3101 release_sock (sk);
3102 return (0);
3103 }
3104 }
3105
3106 if (tcp_data (skb, sk, saddr, len))
3107 {
3108 kfree_skb (skb, FREE_READ);
3109 release_sock (sk);
3110 return (0);
3111 }
3112
3113 if (!th->fin)
3114 {
3115 release_sock(sk);
3116 return (0);
3117 }
3118 tcp_fin (sk, th, saddr, dev);
3119 release_sock(sk);
3120 return (0);
3121 }
3122 }