The .create callback is invoked when a PF_PACKET socket is created; at that point the capture hook gets registered. See the implementation of packet_create below.
static const struct net_proto_family packet_family_ops = {
	.family	= PF_PACKET,
	.create	= packet_create,
	.owner	= THIS_MODULE,
};

static int __init packet_init(void)
{
	...
	sock_register(&packet_family_ops);
	...
}
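For context, this is roughly what a capture tool such as tcpdump does in user space to reach packet_create (a minimal sketch, error handling omitted): opening an AF_PACKET socket goes through sys_socket/__sock_create, which looks up the family registered by sock_register() above and invokes its .create callback.

#include <sys/socket.h>
#include <linux/if_ether.h>	/* ETH_P_ALL */
#include <arpa/inet.h>		/* htons */

/* Opening this socket is what ends up calling packet_create() in the kernel:
 * __sock_create() looks up the PF_PACKET entry registered via sock_register()
 * above and invokes its .create callback. Requires CAP_NET_RAW.
 */
int open_capture_socket(void)
{
	/* ETH_P_ALL makes the hook land on ptype_all, so the socket
	 * sees every protocol on every device.
	 */
	return socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
}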
/*
 *	Create a packet of type SOCK_PACKET.
 */
static int packet_create(struct net *net, struct socket *sock, int protocol,
			 int kern)
{
	struct sock *sk;
	struct packet_sock *po;
	__be16 proto = (__force __be16)protocol; /* weird, but documented */
	int err;

	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
	    sock->type != SOCK_PACKET)
		return -ESOCKTNOSUPPORT;

	sock->state = SS_UNCONNECTED;

	err = -ENOBUFS;
	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
	if (sk == NULL)
		goto out;

	sock->ops = &packet_ops;
	if (sock->type == SOCK_PACKET)
		sock->ops = &packet_ops_spkt;

	sock_init_data(sock, sk);

	po = pkt_sk(sk);
	sk->sk_family = PF_PACKET;
	po->num = proto;

	err = packet_alloc_pending(po);
	if (err)
		goto out2;

	packet_cached_dev_reset(po);

	sk->sk_destruct = packet_sock_destruct;
	sk_refcnt_debug_inc(sk);

	/*
	 *	Attach a protocol block
	 */

	spin_lock_init(&po->bind_lock);
	mutex_init(&po->pg_vec_lock);

	po->prot_hook.func = packet_rcv;
	// register the receive handler
	if (sock->type == SOCK_PACKET)
		po->prot_hook.func = packet_rcv_spkt;

	po->prot_hook.af_packet_priv = sk;

	if (proto) {
		po->prot_hook.type = proto;
		// hook this socket onto the ptype_all list
		register_prot_hook(sk);
	}

	mutex_lock(&net->packet.sklist_lock);
	sk_add_node_rcu(sk, &net->packet.sklist);
	mutex_unlock(&net->packet.sklist_lock);

	preempt_disable();
	sock_prot_inuse_add(net, &packet_proto, 1);
	preempt_enable();

	return 0;
out2:
	sk_free(sk);
out:
	return err;
}
__netif_receive_skb_core is reached from two receive scenarios. NAPI drivers call netif_receive_skb (or napi_gro_receive) from their poll routine; legacy non-NAPI drivers queue frames with netif_rx onto the per-CPU backlog, which the backlog poll function process_backlog later drains. Both paths end up in __netif_receive_skb_core, where the packet is handed up to the protocol layers, roughly as sketched below.
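A rough call-graph sketch of the two entry paths (simplified; exact helpers vary by kernel version):

/* NAPI driver:
 *   driver poll()    --> netif_receive_skb(skb) / napi_gro_receive()
 *                    --> __netif_receive_skb(skb)
 *                    --> __netif_receive_skb_core(skb, pfmemalloc)
 *
 * Legacy (non-NAPI) driver:
 *   driver IRQ       --> netif_rx(skb)       (skb queued on per-CPU backlog)
 *   NET_RX softirq   --> process_backlog()   (the backlog device's poll)
 *                    --> __netif_receive_skb(skb)
 *                    --> __netif_receive_skb_core(skb, pfmemalloc)
 */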
static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
	struct packet_type *ptype, *pt_prev;
	rx_handler_func_t *rx_handler;
	struct net_device *orig_dev;
	struct net_device *null_or_dev;
	bool deliver_exact = false;
	int ret = NET_RX_DROP;
	__be16 type;

	net_timestamp_check(!netdev_tstamp_prequeue, skb);

	trace_netif_receive_skb(skb);

	orig_dev = skb->dev;

	skb_reset_network_header(skb);
	if (!skb_transport_header_was_set(skb))
		skb_reset_transport_header(skb);
	skb_reset_mac_len(skb);

	pt_prev = NULL;

another_round:
	skb->skb_iif = skb->dev->ifindex;

	__this_cpu_inc(softnet_data.processed);

	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
		skb = vlan_untag(skb);
		if (unlikely(!skb))
			goto out;
	}

#ifdef CONFIG_NET_CLS_ACT
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	if (pfmemalloc)
		goto skip_taps;

	// walk the hooks registered when the tcpdump socket was created
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (!ptype->dev || ptype->dev == skb->dev) {
			if (pt_prev)
				// deliver a copy of the packet to the previous tap
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

skip_taps:
#ifdef CONFIG_NET_CLS_ACT
	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
	if (!skb)
		goto out;
ncls:
#endif

	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
		goto drop;

	if (skb_vlan_tag_present(skb)) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		if (vlan_do_receive(&skb))
			goto another_round;
		else if (unlikely(!skb))
			goto out;
	}

	rx_handler = rcu_dereference(skb->dev->rx_handler);
	if (rx_handler) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		switch (rx_handler(&skb)) {
		case RX_HANDLER_CONSUMED:
			ret = NET_RX_SUCCESS;
			goto out;
		case RX_HANDLER_ANOTHER:
			goto another_round;
		case RX_HANDLER_EXACT:
			deliver_exact = true;
		case RX_HANDLER_PASS:
			break;
		default:
			BUG();
		}
	}

	if (unlikely(skb_vlan_tag_present(skb))) {
		if (skb_vlan_tag_get_id(skb))
			skb->pkt_type = PACKET_OTHERHOST;
		/* Note: we might in the future use prio bits
		 * and set skb->priority like in vlan_do_receive()
		 * For the time being, just ignore Priority Code Point
		 */
		skb->vlan_tci = 0;
	}

	/* deliver only exact match when indicated */
	null_or_dev = deliver_exact ? skb->dev : NULL;

	type = skb->protocol;
	// the real protocol delivery: for an IP packet this ends up calling ip_rcv()
	list_for_each_entry_rcu(ptype,
			&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
		if (ptype->type == type &&
		    (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
		     ptype->dev == orig_dev)) {
			if (pt_prev)
				ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = ptype;
		}
	}

	if (pt_prev) {
		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
			goto drop;
		else
			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
drop:
		atomic_long_inc(&skb->dev->rx_dropped);
		kfree_skb(skb);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

out:
	return ret;
}
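deliver_skb(), which the tap loop above relies on, is a thin wrapper: it takes an extra reference on the skb and calls the tap's handler (packet_rcv or packet_rcv_spkt for a packet socket), which then queues a copy of the data on the socket's receive queue. A sketch of what net/core/dev.c of this era does:

static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
		return -ENOMEM;
	atomic_inc(&skb->users);	/* extra reference for this tap */
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}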
Transmission also has two branches. If the device has no qdisc configured, dev_queue_xmit hands the frame straight to the driver. Otherwise dev_queue_xmit sees that a queue is configured and calls __dev_xmit_skb to enqueue the frame on the qdisc; the queue is drained later (for instance when the TX softirq handler net_tx_action reschedules it), and on the way down to the driver the copy path for the taps is triggered.
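A heavily trimmed sketch of that branch in __dev_queue_xmit (based on net/core/dev.c of roughly this kernel generation; names and signatures are approximate):

/* Trimmed sketch of the transmit-side branch described above. */
static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc;

	txq = netdev_pick_tx(dev, skb, accel_priv);
	q = rcu_dereference_bh(txq->qdisc);

	if (q->enqueue) {
		/* qdisc configured: enqueue, drained later (possibly from the
		 * NET_TX softirq, net_tx_action) via qdisc_run().
		 */
		rc = __dev_xmit_skb(skb, q, dev, txq);
		return rc;
	}

	/* no qdisc (e.g. loopback): hand the skb straight to the driver;
	 * dev_hard_start_xmit() is where dev_queue_xmit_nit() below gets called
	 * and the taps receive their copy.
	 */
	rc = dev_hard_start_xmit(skb, dev, txq);
	return rc;
}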
/*
 *	Support routine. Sends outgoing frames to any network
 *	taps currently in use.
 */

static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;
	struct sk_buff *skb2 = NULL;
	struct packet_type *pt_prev = NULL;

	rcu_read_lock();
	// walk the hooks registered when the tcpdump socket was created
	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		/* Never send packets back to the socket
		 * they originated from - MvS ([email protected])
		 */
		if ((ptype->dev == dev || !ptype->dev) &&
		    (!skb_loop_sk(ptype, skb))) {
			if (pt_prev) {
				// deliver a copy of the packet to the previous tap
				deliver_skb(skb2, pt_prev, skb->dev);
				pt_prev = ptype;
				continue;
			}

			skb2 = skb_clone(skb, GFP_ATOMIC);
			if (!skb2)
				break;

			net_timestamp_set(skb2);

			/* skb->nh should be correctly
			   set by sender, so that the second statement is
			   just protection against buggy protocols.
			 */
			skb_reset_mac_header(skb2);

			if (skb_network_header(skb2) < skb2->data ||
			    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
				net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
						     ntohs(skb2->protocol),
						     dev->name);
				skb_reset_network_header(skb2);
			}

			skb2->transport_header = skb2->network_header;
			skb2->pkt_type = PACKET_OUTGOING;
			pt_prev = ptype;
		}
	}
	if (pt_prev)
		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
	rcu_read_unlock();
}
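dev_queue_xmit_nit() is invoked from the driver hand-off path. A simplified sketch of where this happens, based on dev_hard_start_xmit() of this kernel generation (the exact signature changed across versions, so treat this as approximate):

/* Trimmed sketch: just before the skb is given to the driver's
 * ndo_start_xmit, every outgoing frame is offered to the taps.
 */
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
			struct netdev_queue *txq)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int rc;

	if (!list_empty(&ptype_all))
		dev_queue_xmit_nit(skb, dev);	/* feed the transmit taps */

	rc = ops->ndo_start_xmit(skb, dev);	/* actual hardware transmit */
	return rc;
}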
When the packet socket is closed, its release function is called, and the previously registered hook is removed.
static int packet_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po;
	struct net *net;
	union tpacket_req_u req_u;

	if (!sk)
		return 0;

	net = sock_net(sk);
	po = pkt_sk(sk);

	mutex_lock(&net->packet.sklist_lock);
	sk_del_node_init_rcu(sk);
	mutex_unlock(&net->packet.sklist_lock);

	preempt_disable();
	sock_prot_inuse_add(net, sk->sk_prot, -1);
	preempt_enable();

	spin_lock(&po->bind_lock);
	// remove the registered hook from the ptype_all list
	unregister_prot_hook(sk, false);
	packet_cached_dev_reset(po);

	if (po->prot_hook.dev) {
		dev_put(po->prot_hook.dev);
		po->prot_hook.dev = NULL;
	}
	spin_unlock(&po->bind_lock);

	packet_flush_mclist(sk);

	if (po->rx_ring.pg_vec) {
		memset(&req_u, 0, sizeof(req_u));
		packet_set_ring(sk, &req_u, 1, 0);
	}

	if (po->tx_ring.pg_vec) {
		memset(&req_u, 0, sizeof(req_u));
		packet_set_ring(sk, &req_u, 1, 1);
	}

	fanout_release(sk);

	synchronize_net();
	/*
	 *	Now the socket is dead. No more input will appear.
	 */
	sock_orphan(sk);
	sock->sk = NULL;

	/* Purge queues */
	skb_queue_purge(&sk->sk_receive_queue);
	packet_free_pending(po);
	sk_refcnt_debug_release(sk);

	sock_put(sk);
	return 0;
}
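unregister_prot_hook() is the counterpart of register_prot_hook() shown earlier: if the hook is running it calls __unregister_prot_hook(), which takes it off the global list and drops the socket reference taken at registration. A simplified sketch from af_packet.c of this generation (details approximate):

/* Simplified sketch: take the hook off ptype_all/ptype_base and drop
 * the socket reference held since registration.
 */
static void __unregister_prot_hook(struct sock *sk, bool sync)
{
	struct packet_sock *po = pkt_sk(sk);

	po->running = 0;
	if (po->fanout)
		__fanout_unlink(sk, po);
	else
		__dev_remove_pack(&po->prot_hook);
	__sock_put(sk);

	if (sync) {
		spin_unlock(&po->bind_lock);
		synchronize_net();	/* wait out in-flight readers of the list */
		spin_lock(&po->bind_lock);
	}
}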
To summarize: when tcpdump starts a capture it creates a PF_PACKET socket (SOCK_RAW, or the legacy SOCK_PACKET type). The socket creation path invokes the .create callback of packet_family_ops, i.e. packet_create, which registers the capture hook on the ptype_all list. On the receive side, __netif_receive_skb_core calls the registered hook, and a copy of each packet is delivered to the handlers in af_packet.c; on the transmit side, dev_queue_xmit_nit calls the same hook, so outgoing frames are copied to the capture socket as well.
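Putting it together, a minimal user-space sniffer that exercises this whole path (a sketch; needs CAP_NET_RAW, error handling trimmed):

#include <stdio.h>
#include <sys/socket.h>
#include <linux/if_packet.h>	/* struct sockaddr_ll */
#include <linux/if_ether.h>	/* ETH_P_ALL */
#include <arpa/inet.h>		/* htons, ntohs */

int main(void)
{
	unsigned char buf[2048];
	struct sockaddr_ll from;
	socklen_t fromlen;

	/* packet_create() runs here and registers packet_rcv on ptype_all */
	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	for (;;) {
		fromlen = sizeof(from);
		/* each frame was copied to this socket's receive queue by
		 * packet_rcv(), called from __netif_receive_skb_core() (RX)
		 * or dev_queue_xmit_nit() (TX)
		 */
		ssize_t n = recvfrom(fd, buf, sizeof(buf), 0,
				     (struct sockaddr *)&from, &fromlen);
		if (n < 0)
			break;
		printf("ifindex %d, %zd bytes, proto 0x%04x\n",
		       from.sll_ifindex, n, ntohs(from.sll_protocol));
	}
	return 0;
}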
Jensonqiu[email protected] 2018/05/08