Tcpdump抓包内核代码分析


注册pf_packet协议

   .create函数在PF_PACKET类型的socket创建时被调用,调用时会注册钩子函数,具体见packet_create函数的实现。

static const struct net_proto_familypacket_family_ops = {

         .family=   PF_PACKET,

         .create=  packet_create,

         .owner     =       THIS_MODULE,

};

 

/*
 * Module init for the packet socket family (abridged excerpt).
 * Only the step relevant to the article is shown: registering
 * packet_family_ops so that PF_PACKET sockets can be created.
 */
static int __init packet_init(void)
{
	/* ... earlier initialisation elided in this excerpt ... */

	sock_register(&packet_family_ops);

	/* ... remaining code elided in this excerpt ... */
}

创建SOCK_PACKET sock时注册回调函数

/*

 *     Create a packet of type SOCK_PACKET.

 */

static int packet_create(struct net *net,struct socket *sock, int protocol,

                             int kern)

{

         structsock *sk;

         structpacket_sock *po;

         __be16proto = (__force __be16)protocol; /* weird, but documented */

         interr;

 

         if(!ns_capable(net->user_ns, CAP_NET_RAW))

                   return-EPERM;

         if(sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&

             sock->type != SOCK_PACKET)

                   return-ESOCKTNOSUPPORT;

 

         sock->state= SS_UNCONNECTED;

 

         err= -ENOBUFS;

         sk= sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);

         if(sk == NULL)

                   gotoout;

 

         sock->ops= &packet_ops;

         if(sock->type == SOCK_PACKET)

                   sock->ops= &packet_ops_spkt;

 

         sock_init_data(sock,sk);

 

         po= pkt_sk(sk);

         sk->sk_family= PF_PACKET;

         po->num= proto;

 

         err= packet_alloc_pending(po);

         if(err)

                   gotoout2;

 

         packet_cached_dev_reset(po);

 

         sk->sk_destruct= packet_sock_destruct;

         sk_refcnt_debug_inc(sk);

 

         /*

          *     Attacha protocol block

          */

 

         spin_lock_init(&po->bind_lock);

         mutex_init(&po->pg_vec_lock);

         po->prot_hook.func= packet_rcv;

 

         //注册处理函数

         if (sock->type == SOCK_PACKET)

                   po->prot_hook.func =packet_rcv_spkt;

 

         po->prot_hook.af_packet_priv= sk;

 

         if (proto) {

                   po->prot_hook.type =proto;

                   将这个socket挂载到ptype_all链表上

                   register_prot_hook(sk);

         }

 

         mutex_lock(&net->packet.sklist_lock);

         sk_add_node_rcu(sk,&net->packet.sklist);

         mutex_unlock(&net->packet.sklist_lock);

 

         preempt_disable();

         sock_prot_inuse_add(net,&packet_proto, 1);

         preempt_enable();

 

         return0;

out2:

         sk_free(sk);

out:

         returnerr;

}

 

接收方向内核抓包函数

    两个调用场景,一个是网卡启用NAPI,在轮询流程中调用process_backlog;另外一个是非NAPI场景,直接netif_receive_skb接收数据报文,递交给网络层。

/*
 * Core receive path for one skb.
 *
 * In order, the function: resets the header offsets, strips an outer
 * 802.1Q/802.1AD tag, walks the ptype_all taps (skipped when
 * @pfmemalloc is set), runs ingress classification when
 * CONFIG_NET_CLS_ACT is enabled, handles an accelerated VLAN tag,
 * calls the device rx_handler if one is set, and finally walks the
 * ptype_base bucket selected by skb->protocol.
 *
 * Delivery is deferred by one step through pt_prev: each matching
 * handler is remembered, the previously remembered one is passed to
 * deliver_skb(), and the last one is invoked directly via ->func().
 *
 * @skb:        buffer being received
 * @pfmemalloc: when true, taps are skipped and the skb is dropped
 *              unless skb_pfmemalloc_protocol() accepts it
 *
 * Returns NET_RX_SUCCESS when the rx_handler consumed the skb,
 * NET_RX_DROP when the skb is dropped at the end, and otherwise the
 * value left in ret by the last deliver_skb()/->func() call (ret is
 * initialised to NET_RX_DROP).
 */
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
	struct packet_type *ptype, *pt_prev;
	rx_handler_func_t *rx_handler;
	struct net_device *orig_dev;
	struct net_device *null_or_dev;
	bool deliver_exact = false;
	int ret = NET_RX_DROP;
	__be16 type;

	net_timestamp_check(!netdev_tstamp_prequeue, skb);

	trace_netif_receive_skb(skb);

	orig_dev = skb->dev;

	skb_reset_network_header(skb);
	if (!skb_transport_header_was_set(skb))
		skb_reset_transport_header(skb);
	skb_reset_mac_len(skb);

	pt_prev = NULL;

another_round:
	skb->skb_iif = skb->dev->ifindex;

	__this_cpu_inc(softnet_data.processed);

	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
		skb = vlan_untag(skb);
		if (unlikely(!skb))
			goto out;
	}

#ifdef CONFIG_NET_CLS_ACT
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	if (pfmemalloc)
		goto skip_taps;

	/*
	 * Walk the hooks on ptype_all -- per the article, this is where
	 * the hook installed when the tcpdump socket was created is found.
	 */
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (!ptype->dev || ptype->dev == skb->dev) {
			if (pt_prev)
				/* Article note: "copy the packet" to the tap. */
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

skip_taps:
#ifdef CONFIG_NET_CLS_ACT
	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;
ncls:
#endif

	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
		goto drop;

	if (skb_vlan_tag_present(skb)) {
		/* Flush the pending handler before the skb may be changed. */
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		if (vlan_do_receive(&skb))
			goto another_round;
		else if (unlikely(!skb))
			goto out;
	}

	rx_handler = rcu_dereference(skb->dev->rx_handler);
	if (rx_handler) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		switch (rx_handler(&skb)) {
		case RX_HANDLER_CONSUMED:
			ret = NET_RX_SUCCESS;
			goto out;
		case RX_HANDLER_ANOTHER:
			goto another_round;
		case RX_HANDLER_EXACT:
			deliver_exact = true;
			/* fallthrough */
		case RX_HANDLER_PASS:
			break;
		default:
			BUG();
		}
	}

	if (unlikely(skb_vlan_tag_present(skb))) {
		if (skb_vlan_tag_get_id(skb))
			skb->pkt_type = PACKET_OTHERHOST;
		/* Note: we might in the future use prio bits
		 * and set skb->priority like in vlan_do_receive()
		 * For the time being, just ignore Priority Code Point
		 */
		skb->vlan_tci = 0;
	}

	/* deliver only exact match when indicated */
	null_or_dev = deliver_exact ? skb->dev : NULL;

	type = skb->protocol;
	/*
	 * The real protocol processing path: handlers registered for this
	 * protocol type.  Article note: for IP this ends up in ip_rcv (the
	 * registration of that handler is not shown here).
	 */
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type &&
		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
		     ptype->dev == orig_dev)) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

	if (pt_prev) {
		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
			goto drop;
		else
			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
drop:
		atomic_long_inc(&skb->dev->rx_dropped);
		kfree_skb(skb);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

out:
	return ret;
}

发送方向内核抓包函数

         数据发送也存在两个分支,一个是调用dev_queue_xmit直接将数据递交到网卡(没有配置qdisc);另外一个分支是如果配置了qdisc,dev_queue_xmit流程检查是否配置了queue,如果配置了将调用__dev_xmit_skb函数将数据放入到了qdisc队列中,然后等待发送中断函数net_tx_action轮询调用,进而触发拷贝调用流程。

/*

 *     Support routine. Sends outgoing frames toany network

 *     taps currently in use.

 */

 

static void dev_queue_xmit_nit(structsk_buff *skb, struct net_device *dev)

{

         structpacket_type *ptype;

         structsk_buff *skb2 = NULL;

         structpacket_type *pt_prev = NULL;

 

         rcu_read_lock();

        //遍历tcpdumpsocket创建时挂载的钩子

         list_for_each_entry_rcu(ptype,&ptype_all, list) {

                   /* Never send packets back tothe socket

                    * they originated from - MvS([email protected])

                    */

                   if ((ptype->dev == dev ||!ptype->dev) &&

                       (!skb_loop_sk(ptype, skb))) {

                            if (pt_prev) {

                                     //拷贝数据报文

                                     deliver_skb(skb2,pt_prev, skb->dev);

                                     pt_prev =ptype;

                                     continue;

                            }

 

                            skb2 =skb_clone(skb, GFP_ATOMIC);

                            if (!skb2)

                                     break;

 

                            net_timestamp_set(skb2);

 

                            /* skb->nh shouldbe correctly

                               set by sender, so that the second statementis

                               just protection against buggy protocols.

                             */

                            skb_reset_mac_header(skb2);

 

                            if(skb_network_header(skb2) < skb2->data ||

                                skb_network_header(skb2) >skb_tail_pointer(skb2)) {

                                     net_crit_ratelimited("protocol%04x is buggy, dev %s\n",

                                                             ntohs(skb2->protocol),

                                                             dev->name);

                                     skb_reset_network_header(skb2);

                            }

 

                            skb2->transport_header= skb2->network_header;

                            skb2->pkt_type =PACKET_OUTGOING;

                            pt_prev = ptype;

                   }

         }

         if(pt_prev)

                   pt_prev->func(skb2,skb->dev, pt_prev, skb->dev);

         rcu_read_unlock();

}

 

 

 

销毁SOCK_PACKET sock时注销回调函数

当SOCK_PACKET类型的socket关闭时会调用release函数(packet_release),这时会摘掉之前注册的钩子函数。

static int packet_release(struct socket*sock)

{

         structsock *sk = sock->sk;

         structpacket_sock *po;

         structnet *net;

         uniontpacket_req_u req_u;

 

         if(!sk)

                   return0;

 

         net= sock_net(sk);

         po= pkt_sk(sk);

 

         mutex_lock(&net->packet.sklist_lock);

         sk_del_node_init_rcu(sk);

         mutex_unlock(&net->packet.sklist_lock);

 

         preempt_disable();

         sock_prot_inuse_add(net,sk->sk_prot, -1);

         preempt_enable();

         spin_lock(&po->bind_lock);

         //从ptype_all函数中摘掉注册的钩子函数

         unregister_prot_hook(sk, false);

         packet_cached_dev_reset(po);

 

         if(po->prot_hook.dev) {

                   dev_put(po->prot_hook.dev);

                   po->prot_hook.dev= NULL;

         }

         spin_unlock(&po->bind_lock);

 

         packet_flush_mclist(sk);

 

         if(po->rx_ring.pg_vec) {

                   memset(&req_u,0, sizeof(req_u));

                   packet_set_ring(sk,&req_u, 1, 0);

         }

 

         if(po->tx_ring.pg_vec) {

                   memset(&req_u,0, sizeof(req_u));

                   packet_set_ring(sk,&req_u, 1, 1);

         }

 

         fanout_release(sk);

 

         synchronize_net();

         /*

          *     Nowthe socket is dead. No more input will appear.

          */

         sock_orphan(sk);

         sock->sk= NULL;

 

         /*Purge queues */

 

         skb_queue_purge(&sk->sk_receive_queue);

         packet_free_pending(po);

         sk_refcnt_debug_release(sk);

 

         sock_put(sk);

         return0;

}

总结

        Tcpdump抓包时创建SOCK_PACKET类型的socket,在socket创建流程中会调用packet_family_ops的.create函数(packet_create),进而将抓包的钩子函数注册到ptype_all链表;在数据接收方向,__netif_receive_skb_core函数会调用已注册的钩子函数,将数据报文拷贝到af_packet.c文件中的具体处理函数;同样,在发送方向的dev_queue_xmit_nit函数中也会调用钩子函数实现数据报文拷贝。


Jensonqiu[email protected] 2018/05/08


你可能感兴趣的:(linux,TCP协议)