假期结束, 根据上次讲的 ip_local_deliver
/*
 * Deliver an IPv4 packet to the local host.
 *
 * If the IP header marks the skb as a fragment, it is first passed to
 * ip_defrag() for reassembly; a non-zero result from ip_defrag() ends
 * processing here and 0 is returned.  Otherwise the skb is run through the
 * NF_INET_LOCAL_IN netfilter hook with ip_local_deliver_finish() as the
 * continuation, and the hook's result is returned.
 */
int ip_local_deliver(struct sk_buff *skb)
{
	/* "More fragments" flag set or a non-zero fragment offset. */
	const int is_fragment =
		(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) != 0;

	/* ip_defrag() is only called for fragments (short-circuit). */
	if (is_fragment && ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
		return 0;

	return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
		       ip_local_deliver_finish);
}
/*
 * Abridged excerpt of ip_local_deliver_finish(), the continuation passed to
 * NF_HOOK by ip_local_deliver().
 *
 * Visible steps: pull ip_hdrlen(skb) bytes off the front of the skb, reset
 * the transport header, then (under rcu_read_lock) offer the packet to
 * raw_local_deliver() and call the handler of the matching net_protocol
 * entry.  The "//....." markers stand for code the article's author left
 * out, which is why `ret` has no visible declaration and the braces do not
 * balance.
 *
 * NOTE(review): the excerpt sits on one physical line, so everything after
 * the first "//....." is lexically part of a line comment; restore the line
 * breaks (and the omitted code) before trying to compile this.
 */
static int ip_local_deliver_finish(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); __skb_pull(skb, ip_hdrlen(skb)); /* strip the L3 (IP) header */ /* Point into the IP datagram, just past the header. */ skb_reset_transport_header(skb); rcu_read_lock(); { int protocol = ip_hdr(skb)->protocol; int hash, raw; const struct net_protocol *ipprot; resubmit: raw = raw_local_deliver(skb, protocol);/* let a raw handler see the packet first, if there is one */ //..... hash = protocol & (MAX_INET_PROTOS - 1); ret = ipprot->handler(skb);/* the key call; for TCP the handler is tcp_v4_rcv */ //..... kfree_skb(skb); } } out: rcu_read_unlock(); return 0; }
对于 TCP(IPv4),这里注册的 handler 就是 tcp_v4_rcv(struct sk_buff *skb)
下面我们就来看看今天的主角
int tcp_v4_rcv(struct sk_buff *skb) { const struct iphdr *iph; struct tcphdr *th; struct sock *sk; int ret; struct net *net = dev_net(skb->dev); if (skb->pkt_type != PACKET_HOST)/*检查属主*/ goto discard_it; /* Count it even if it's bad */ TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);/*snmp:oid 之前说过了*/ if (!pskb_may_pull(skb, sizeof(struct tcphdr)))/*头部>=存放的结构体就丢弃*/ goto discard_it; th = tcp_hdr(skb); /**//*首部4bit 且最多可以60表示字节 所以/4*/ if (th->doff < sizeof(struct tcphdr) / 4) goto bad_packet; if (!pskb_may_pull(skb, th->doff * 4)) goto discard_it; /* An explanation is required here, I think. * Packet length and doff are validated by header prediction, * provided case of th->doff==0 is eliminated. * So, we defer the checks. *//*校验和*/ if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb)) goto bad_packet; th = tcp_hdr(skb);/*下面就是把头部信息保存在sk_buff的那个48字节cb中*/ iph = ip_hdr(skb); TCP_SKB_CB(skb)->seq = ntohl(th->seq); TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff * 4);/*期待的序号,就是下一次的序号*/ TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);/*确认序号*/ TCP_SKB_CB(skb)->when = 0;/*通告窗口*/ TCP_SKB_CB(skb)->flags = iph->tos;/*tos居然也到L4了。。*/ TCP_SKB_CB(skb)->sacked = 0; /*找到链路对于的sturct sock 后面再看__inet_lookup_established*/ sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest); if (!sk) goto no_tcp_socket; process: if (sk->sk_state == TCP_TIME_WAIT)/*链路已经进入time-wait状态,收到的包也没啥意思了*/ goto do_time_wait; /*用户通过do_ip_setsockopt 设置了IP_MINTTL 比包里面的TTL要大 说明他不喜欢这个包~*/ if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); goto discard_and_relse; } /*又见安全框架 检查策略*/ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; nf_reset(skb); /*一个状态机 ,给上层tcmpdump这样的程序用的 sock_setsockopt:SO_ATTACH_FILTER*/ if (sk_filter(sk, skb)) goto discard_and_relse; skb->dev = NULL; /*禁止软中断 因为涉及到DMA操作 */ bh_lock_sock_nested(sk); ret = 0;/*这里是关键 ,如果当前sock 没被用户锁定<一些syscall的副作用>*/ if (!sock_owned_by_user(sk)) 
{/*就先用prequeue处理<mostly>*/ #ifdef CONFIG_NET_DMA struct tcp_sock *tp = tcp_sk(sk); if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY); if (tp->ucopy.dma_chan) ret = tcp_v4_do_rcv(sk, skb); else #endif { /*这个函数下面看 挺有意思*/ if (!tcp_prequeue(sk, skb)) ret = tcp_v4_do_rcv(sk, skb);/*这个函数 关键流程 后面分析。。*/ } } else if (unlikely(sk_add_backlog(sk, skb))) {/*除了ucopy.prequeue;当然还有一条sk_backlog*/ bh_unlock_sock(sk); NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); goto discard_and_relse; } bh_unlock_sock(sk); sock_put(sk); return ret; no_tcp_socket: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard_it; if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { bad_packet: TCP_INC_STATS_BH(net, TCP_MIB_INERRS); } else { tcp_v4_send_reset(NULL, skb);/*因为会很干净 这个函数后面说*/ } discard_it: /* Discard frame. */ kfree_skb(skb); return 0; discard_and_relse: sock_put(sk); goto discard_it; do_time_wait: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { inet_twsk_put(inet_twsk(sk)); goto discard_it; } if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { TCP_INC_STATS_BH(net, TCP_MIB_INERRS); inet_twsk_put(inet_twsk(sk)); goto discard_it; }/*关键就是这个函数 对于处在time_wait时各个状态的处理 你可以先跳到后面看完这个函数*/ switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { case TCP_TW_SYN: { struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),/*接受这个链接 函数后面讲*/ &tcp_hashinfo, iph->daddr, th->dest, inet_iif(skb)); if (sk2) { inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row); inet_twsk_put(inet_twsk(sk)); sk = sk2; goto process;/*这条链路又去处理包了*/ } /* Fall through to ACK */ } case TCP_TW_ACK: tcp_v4_timewait_ack(sk, skb);/*给予ACK确认*/ break; case TCP_TW_RST: goto no_tcp_socket;/*我很喜欢这种情况*/ case TCP_TW_SUCCESS:; } goto discard_it; }
欢迎你先过来看 tcp_timewait_state_process
先说几句:你在看源码的过程中如果看到 inet_twsk_put() 调用,你应该十分高兴,因为 time_wait sock 被真正关闭、不再占用 kernel memory 了。
同样如果你看到return TCP_TW_SUCCESS 也应该很高兴,因为一切都到尽头了。 如果是别的可能就分支下去了
先只要了解: 更详细的看后面分析
还有一个函数 inet_twsk_schedule(),
用来调度 time_wait 的存活时间,这个在下面说
/*
 * tcp_timewait_state_process - handle a segment that matched a timewait
 * socket (substate FIN_WAIT2 or real TIME_WAIT).
 *
 * @tw:  the timewait socket the segment was matched to.
 * @skb: the incoming segment; its control block (seq/end_seq) is read, and
 *       its `when` field is written on the TCP_TW_SYN path.
 * @th:  TCP header of the segment.
 *
 * Returns, as seen in the code below:
 *   TCP_TW_SUCCESS - nothing more to do; inet_twsk_put(tw) was called here.
 *   TCP_TW_RST     - tw was descheduled and put here (kill_with_rst).
 *   TCP_TW_ACK     - tw is NOT put here (see the comment at the last
 *                    TCP_TW_ACK return: it is released by the caller).
 *   TCP_TW_SYN     - a new SYN is acceptable; the proposed ISN is left in
 *                    TCP_SKB_CB(skb)->when and tw is not put here.
 */
enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, const struct tcphdr *th)
{
	struct tcp_options_received tmp_opt;
	u8 *hash_location;
	/* TCP-specific view of the timewait sock (author: an extension of the minimal inet_timewait_sock). */
	struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
	int paws_reject = 0;

	tmp_opt.saw_tstamp = 0;
	/* Header longer than the basic one, i.e. TCP options are present. */
	if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
		tcp_parse_options(skb, &tmp_opt, &hash_location, 0);

		if (tmp_opt.saw_tstamp) {
			tmp_opt.ts_recent = tcptw->tw_ts_recent;
			tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
			/* PAWS check on the segment's timestamp. */
			paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
		}
	}

	/*
	 * Still waiting for the peer's FIN (author's remark: the peer
	 * application is most likely stuck in CLOSE_WAIT).
	 */
	if (tw->tw_substate == TCP_FIN_WAIT2) {
		/* Just repeat all the checks of tcp_rcv_state_process() */

		/* Case F.1: PAWS reject or sequence outside the window - just ACK. */
		if (paws_reject ||
		    !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
				   tcptw->tw_rcv_nxt,
				   tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
			return TCP_TW_ACK;

		/* Case F.2: RST - tear the timewait sock down (label kill below). */
		if (th->rst)
			goto kill;

		/* Case F.3: SYN whose seq is not before rcv_nxt - answer with RST. */
		if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
			goto kill_with_rst;

		/* Dup ACK? */
		/* Case F.4: no ACK flag, nothing beyond rcv_nxt, or empty segment. */
		if (!th->ack ||
		    !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
		    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
			inet_twsk_put(tw);
			return TCP_TW_SUCCESS;
		}

		/* New data or FIN. If new data arrive after half-duplex close,
		 * reset.
		 */
		/* Case F.5: not a FIN, or more than just the FIN arrived. */
		if (!th->fin ||
		    TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
kill_with_rst:
			inet_twsk_deschedule(tw, &tcp_death_row);
			inet_twsk_put(tw);
			return TCP_TW_RST;
		}

		/* FIN arrived, enter true time-wait state.
		 */
		/* Case F.6: move the substate to TIME_WAIT. */
		tw->tw_substate = TCP_TIME_WAIT;
		tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;	/* last segment */
		if (tmp_opt.saw_tstamp) {
			tcptw->tw_ts_recent_stamp = get_seconds();
			tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
		}

		/*
		 * Recycling enabled (author: the tcp_tw_recycle proc setting):
		 * schedule with tw->tw_timeout instead of the full
		 * TCP_TIMEWAIT_LEN.
		 * NOTE(review): the author says tcp_tw_remember_stamp() resets
		 * inet_peer->dtime and that tw_timeout depends on the RTO -
		 * neither is visible here, confirm.
		 */
		if (tcp_death_row.sysctl_tw_recycle &&
		    tcptw->tw_ts_recent_stamp &&
		    tcp_tw_remember_stamp(tw))
			inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout,
					   TCP_TIMEWAIT_LEN);
		else
			inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
					   TCP_TIMEWAIT_LEN);
		return TCP_TW_ACK;
	}

	/*
	 *	Now real TIME-WAIT state.
	 *
	 *	RFC 1122:
	 *	"When a connection is [...] on TIME-WAIT state [...]
	 *	[a TCP] MAY accept a new SYN from the remote TCP to
	 *	reopen the connection directly, if it:
	 *	(author: explained where it comes up below)
	 *
	 *	(1)  assigns its initial sequence number for the new
	 *	connection to be larger than the largest sequence
	 *	number it used on the previous connection incarnation,
	 *	and
	 *
	 *	(2)  returns to TIME-WAIT state if the SYN turns out
	 *	to be an old duplicate".
	 */

	/* From here on the sock is in the real TIME_WAIT substate. */
	if (!paws_reject &&
	    (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&	/* seq is exactly the expected one */
	     (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
		/* In window segment, it may be only reset or bare ack. */

		if (th->rst) {	/* handle RST first */
			/* This is TIME_WAIT assassination, in two flavors.
			 * Oh well... nobody has a sufficient solution to this
			 * protocol bug yet.
			 */
			/*
			 * Whether an RST ends TIME_WAIT depends on
			 * sysctl_tcp_rfc1337 (author: the tcp_rfc1337 proc
			 * setting).
			 */
			if (sysctl_tcp_rfc1337 == 0) {
kill:
				/* Also reached from FIN_WAIT2 on RST (goto kill above). */
				inet_twsk_deschedule(tw, &tcp_death_row);
				inet_twsk_put(tw);
				return TCP_TW_SUCCESS;
			}
		}
		inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
				   TCP_TIMEWAIT_LEN);

		if (tmp_opt.saw_tstamp) {
			tcptw->tw_ts_recent = tmp_opt.rcv_tsval;
			tcptw->tw_ts_recent_stamp = get_seconds();	/* refresh the timestamp */
		}

		inet_twsk_put(tw);
		return TCP_TW_SUCCESS;
	}

	/* Out of window segment.
	   (Below: handling of segments with an unexpected seq.)

	   All the segments are ACKed immediately.

	   The only exception is new SYN. We accept it, if it is
	   not old duplicate and we are not in danger to be killed
	   by delayed old duplicates. RFC check is that it has
	   newer sequence number works at rates <40Mbit/sec.
	   However, if paws works, it is reliable AND even more,
	   we even may relax silly seq space cutoff.

	   RED-PEN: we violate main RFC requirement, if this SYN will appear
	   old duplicate (i.e. we receive RST in reply to SYN-ACK),
	   we must return socket to time-wait state. It is not good,
	   but not fatal yet.
	 */

	/*
	 * Author's note: accepting a new SYN whose seq is beyond the last
	 * saved receive seq (rather than dropping it) is not a BSD socket
	 * quirk, as the author once thought, but what RFC 1122 specifies.
	 */
	if (th->syn && !th->rst && !th->ack && !paws_reject &&
	    (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
	     (tmp_opt.saw_tstamp &&
	      (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
		u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
		if (isn == 0)
			isn++;
		/* Hand the proposed ISN back through the skb control block. */
		TCP_SKB_CB(skb)->when = isn;
		return TCP_TW_SYN;	/* the caller's TCP_TW_SYN case takes over */
	}

	if (paws_reject)
		NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED);

	if (!th->rst) {	/* unexpected data segments, SYNs etc. are simply ACKed */
		/* In this case we must reset the TIMEWAIT timer.
		 *
		 * If it is ACKless SYN it may be both old duplicate
		 * and new good SYN with random sequence number <rcv_nxt.
		 * Do not reschedule in the last case.
		 */
		/* A bare SYN must not extend the TIME_WAIT lifetime. */
		if (paws_reject || th->ack)
			inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN,
					   TCP_TIMEWAIT_LEN);

		/* Send ACK. Note, we do not put the bucket,
		 * it will be released by caller,
		 * i.e. the caller is responsible for the put.
		 */
		return TCP_TW_ACK;
	}
	inet_twsk_put(tw);
	return TCP_TW_SUCCESS;
}