数据包的读写包括两个部分:从网卡读取数据包到内核,以及从内核将数据包传递到用户空间。越往下分析,就越接近底层。前面已经分析了PF_RING内核补丁的大部分内容,完成上述工作之后,就可以读取数据包了。有时间我会以Intel e1000e-1.2.17驱动为例,讲解网卡驱动是怎样将数据包导入内核的。
在PF_RING的初始化过程中注册了prot_hook,其func指针指向packet_rcv函数,代码如下:
/*
 * Register the PF_RING packet handler with the Linux network stack.
 *
 * Nothing is registered unless transparent_mode is standard_linux_path.
 * In that mode prot_hook is pointed at packet_rcv for all protocols
 * (ETH_P_ALL) and handed to dev_add_pack().
 */
void register_device_handler(void) {
  if(transparent_mode != standard_linux_path)
    return;

  prot_hook.func = packet_rcv;
  prot_hook.type = htons(ETH_P_ALL);
  dev_add_pack(&prot_hook);
}
这里有一个疑问:如果transparent_mode不是standard_linux_path(第一种模式),数据包又是怎样被接收的呢?这一点还需要继续研究。当数据报文进入Linux网络协议栈队列时,netif_receive_skb会遍历这些已注册的Hook:
int netif_receive_skb(structsk_buff *skb)
{
list_for_each_entry_rcu(ptype, &ptype_all, list) {
if (ptype->dev ==null_or_orig || ptype->dev == skb->dev ||
ptype->dev == orig_dev) {
if (pt_prev)
ret =deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype;
}
}
}
相应的Hook函数得到调用:
static inlineint deliver_skb(struct sk_buff *skb,
structpacket_type *pt_prev,
struct net_device*orig_dev)
{
atomic_inc(&skb->users);
return pt_prev->func(skb,skb->dev, pt_prev, orig_dev);
}
pt_prev->func其实就是我们在pf_ring.c中定义的packet_rcv函数,源码如下:
/*********************************** */
staticint packet_rcv(struct sk_buff *skb, structnet_device *dev,
struct packet_type *pt
#if(LINUX_VERSION_CODE>= KERNEL_VERSION(2,6,16))
, struct net_device *orig_dev
#endif
)
{
int rc;
//环回包不处理
if(skb->pkt_type != PACKET_LOOPBACK) {
rc = skb_ring_handler(skb,
(skb->pkt_type == PACKET_OUTGOING) ? 0 :1,
1, UNKNOWN_RX_CHANNEL,UNKNOWN_NUM_RX_CHANNELS);
} else
rc = 0;
kfree_skb(skb);
return(rc);
}
/************************************/
packet_rcv其实是调用skb_ring_handler,让数据包入队列。skb_ring_handler是捕获数据包的主函数:当通过netif_receive_skb接收数据包时,实际上就是调用skb_ring_handler来接收数据包。
/* PF_RING main entry point */
/*
 * skb_ring_handler()
 *
 * Offer a packet to every PF_RING socket that may want it: first the
 * unclustered sockets listed in ring_table, then the socket clusters
 * listed in ring_cluster_list.
 *
 * skb              packet to capture
 * recv_packet      non-zero for a received packet, 0 for a transmitted one;
 *                  transmitted packets are skipped unless enable_tx_capture
 * real_skb         1=real skb, 0=faked skb
 * channel_id       RX channel; replaced by skb_get_rx_queue(skb) on
 *                  kernels >= 2.6.30
 * num_rx_channels  forwarded unchanged to add_skb_to_ring()
 *
 * Return code:
 * 0  packet not handled
 * 1  a matching ring was found and transparent_mode is
 *    driver2pf_ring_non_transparent (in that case a received real skb
 *    has also been released with kfree_skb())
 */
static int skb_ring_handler(struct sk_buff *skb, /* packet to capture */
                            u_char recv_packet,
                            u_char real_skb /* 1=real skb, 0=faked skb */,
                            u_int8_t channel_id,
                            u_int8_t num_rx_channels)
{
  struct sock *skElement;
  int rc = 0, is_ip_pkt;
  struct list_head *ptr;
  struct pfring_pkthdr hdr;
  int displ;
  struct sk_buff *skk = NULL;
  struct sk_buff *orig_skb = skb;

  /* Invalid skb: stop before any of the code below dereferences it */
  if(skb == NULL)
    return(rc);

  /* Check if there's at least one PF_RING ring defined that
     could receive the packet: if none just stop here */
  if(ring_table_size == 0)
    return(rc);

  if(enable_debug) {
    if(skb->dev && (skb->dev->ifindex < MAX_NUM_IFIDX))
      printk("[PF_RING] -->skb_ring_handler(%s): %d rings [num_any_rings=%d]\n",
             skb->dev->name, num_rings_per_device[skb->dev->ifindex], num_any_rings);
  }

  /* No 'any' ring and no ring bound to this device: nothing to do */
  if((num_any_rings == 0)
     && (skb->dev
         && (skb->dev->ifindex < MAX_NUM_IFIDX)
         && (num_rings_per_device[skb->dev->ifindex] == 0)))
    return(rc);

#ifdef PROFILING
  uint64_t rdt = _rdtsc(), rdt1, rdt2;
#endif

#if(LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,30))
  channel_id = skb_get_rx_queue(skb);
#endif

  /* This test must not depend on enable_debug: it decides whether
     transmitted packets are captured at all */
  if((!enable_tx_capture) && (!recv_packet)) {
    /* An outgoing packet is about to be sent out
       but we decided not to handle transmitted
       packets.
    */
    return(0);
  }

  if(enable_debug) {
    struct timeval tv;

    skb_get_timestamp(skb, &tv);
    printk("[PF_RING] skb_ring_handler()[skb=%p][%u.%u][len=%d][dev=%s][csum=%u]\n",
           skb, (unsigned int)tv.tv_sec, (unsigned int)tv.tv_usec,
           skb->len, skb->dev == NULL ? "<NULL>" : skb->dev->name,
           skb->csum);
  }

#ifdef PROFILING
  rdt1 = _rdtsc();
#endif

  if(recv_packet) {
    /* Hack for identifying a packet received by the e1000 */
    if(real_skb)
      displ = SKB_DISPLACEMENT;
    else
      displ = 0; /* Received by the e1000 wrapper */
  } else
    displ = 0;

  is_ip_pkt = parse_pkt(skb, displ, &hdr, 1);

  if(enable_ip_defrag) {
    if(real_skb
       && is_ip_pkt
       && recv_packet
       && (ring_table_size > 0)) {
      skb = defrag_skb(skb, displ, &hdr);

      if(skb == NULL)
        return(0);
    }
  }

  /* BD - API changed for time keeping */
#if(LINUX_VERSION_CODE < KERNEL_VERSION(2,6,14))
  if(skb->stamp.tv_sec == 0)
    do_gettimeofday(&skb->stamp); /* If timestamp is missing add it */
  hdr.ts.tv_sec = skb->stamp.tv_sec, hdr.ts.tv_usec = skb->stamp.tv_usec;
  hdr.extended_hdr.timestamp_ns = 0; /* No nsec for old kernels */
#elif(LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22))
  if(skb->tstamp.off_sec == 0)
    __net_timestamp(skb); /* If timestamp is missing add it */
  hdr.ts.tv_sec = skb->tstamp.off_sec, hdr.ts.tv_usec = skb->tstamp.off_usec;
  hdr.extended_hdr.timestamp_ns = 0; /* No nsec for old kernels */
#else /* 2.6.22 and above */
  if(skb->tstamp.tv64 == 0)
    __net_timestamp(skb); /* If timestamp is missing add it */
  hdr.ts = ktime_to_timeval(skb->tstamp);
#if(LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,30))
  {
    /* Use hardware timestamps when present. If not, just use software timestamps */
    hdr.extended_hdr.timestamp_ns = ktime_to_ns(skb_hwtstamps(skb)->hwtstamp);

    if(enable_debug)
      printk("[PF_RING]hwts=%llu/dev=%s\n",
             hdr.extended_hdr.timestamp_ns,
             skb->dev ? skb->dev->name : "???");
  }
#endif
  if(hdr.extended_hdr.timestamp_ns == 0)
    hdr.extended_hdr.timestamp_ns = ktime_to_ns(skb->tstamp);
#endif

  hdr.len = hdr.caplen = skb->len + displ;

  if(skb->dev)
    hdr.extended_hdr.if_index = skb->dev->ifindex;
  else
    hdr.extended_hdr.if_index = -1;

  /* Avoid the ring to be manipulated while playing with it */
  ring_read_lock();

  /* [1] Check unclustered sockets */
  list_for_each(ptr, &ring_table) {
    struct pf_ring_socket *pfr;
    struct ring_element *entry;

    entry = list_entry(ptr, struct ring_element, list);

    skElement = entry->sk;
    pfr = ring_sk(skElement);

    if((pfr != NULL)
       && ((pfr->ring_netdev->dev == skb->dev)
           || (pfr->ring_netdev == &any_device_element) /* Socket bound to 'any' */
           /* skb->dev may be NULL (see if_index above): test it before use */
           || (skb->dev && (skb->dev->flags & IFF_SLAVE)
               && (pfr->ring_netdev->dev == skb->dev->master)))
       && (pfr->ring_netdev != &none_device_element) /* Not a dummy socket bound to "none" */
       && (pfr->cluster_id == 0 /* No cluster */ )
       && (pfr->ring_slots != NULL)
       && is_valid_skb_direction(pfr->direction, recv_packet)
       ) {
      /* We've found the ring where the packet can be stored */
      int old_caplen = hdr.caplen; /* Keep old length */

      /* caplen is clamped per ring and restored for the next candidate */
      hdr.caplen = min(hdr.caplen, pfr->bucket_len);
      add_skb_to_ring(skb, pfr, &hdr, is_ip_pkt, displ, channel_id, num_rx_channels);
      hdr.caplen = old_caplen;
      rc = 1; /* Ring found: we've done our job */
    }
  }

  /* [2] Check socket clusters */
  list_for_each(ptr, &ring_cluster_list) {
    ring_cluster_element *cluster_ptr;
    struct pf_ring_socket *pfr;

    cluster_ptr = list_entry(ptr, ring_cluster_element, list);

    if(cluster_ptr->cluster.num_cluster_elements > 0) {
      u_int skb_hash = hash_pkt_cluster(cluster_ptr, &hdr);
      u_short num_iterations;

      /* We try to add the packet to the right cluster
         element, but if we're working in round-robin and this
         element is full, we try to add this to the next available
         element. If none with at least a free slot can be found
         then we give up :-(
      */
      for(num_iterations = 0;
          num_iterations < cluster_ptr->cluster.num_cluster_elements;
          num_iterations++) {
        skElement = cluster_ptr->cluster.sk[skb_hash];

        if(skElement != NULL) {
          pfr = ring_sk(skElement);

          if((pfr != NULL)
             && (pfr->ring_slots != NULL)
             && ((pfr->ring_netdev->dev == skb->dev)
                 || (skb->dev && (skb->dev->flags & IFF_SLAVE)
                     && (pfr->ring_netdev->dev == skb->dev->master)))
             && is_valid_skb_direction(pfr->direction, recv_packet)
             ) {
            if(check_and_init_free_slot(pfr, pfr->slots_info->insert_off) /* Not full */) {
              /* We've found the ring where the packet can be stored */
              add_skb_to_ring(skb, pfr, &hdr, is_ip_pkt, displ, channel_id, num_rx_channels);
              rc = 1; /* Ring found: we've done our job */
              break;
            }
          }
        }

        if(cluster_ptr->cluster.hashing_mode != cluster_round_robin)
          break;
        else
          skb_hash = (skb_hash + 1) % cluster_ptr->cluster.num_cluster_elements;
      }
    }
  } /* Clustering */

  ring_read_unlock();

#ifdef PROFILING
  rdt1 = _rdtsc() - rdt1;
  rdt2 = _rdtsc();
#endif

  /* Fragment handling */
  /* NOTE(review): skk is never assigned in this function, so this branch
     is currently dead; confirm whether the skb returned by defrag_skb()
     is meant to be released here */
  if(skk != NULL)
    kfree_skb(skk);

  if(rc == 1) {
    if(transparent_mode != driver2pf_ring_non_transparent) { /* mode=2 PF_aware driver only */
      rc = 0;
    } else {
      if(recv_packet && real_skb) {
        if(enable_debug)
          printk("[PF_RING] kfree_skb()\n");

        kfree_skb(orig_skb);
      }
    }
  }

#ifdef PROFILING
  rdt2 = _rdtsc() - rdt2;
  rdt = _rdtsc() - rdt;

  if(enable_debug)
    printk("[PF_RING] # cycles: %d [lockcosted %d %d%%][free costed %d %d%%]\n",
           (int)rdt, rdt - rdt1,
           (int)((float)((rdt - rdt1) * 100) / (float)rdt), rdt2,
           (int)((float)(rdt2 * 100) / (float)rdt));
#endif

  //printk("[PF_RING] Returned %d\n",rc);
  return(rc); /* 0 = packet not handled */
}
后续的操作是通过add_skb_to_ring函数来完成的,add_skb_to_ring函数的源码如下:
/*
 * add_skb_to_ring()
 *
 * Add the specified skb to the ring so that userland apps/plugins
 * can use the packet. The packet goes through, in order: the socket's
 * BPF filter, the perfect (hash) filtering rules, the wildcard filtering
 * rules and packet sampling; if it survives and hdr->caplen > 0 it is
 * queued through add_pkt_to_ring().
 *
 * pfr->num_ring_users is set to 1 while the packet is being processed
 * and reset to 0 on every exit path taken after that point.
 *
 * Return code:
 *  0  processing completed (this includes packets that the filtering
 *     rules decided not to forward)
 * -1  packet discarded early: PF_RING disabled, ring not active,
 *     rejected by the BPF filter, or dropped by sampling
 *
 */
static int add_skb_to_ring(struct sk_buff *skb,
                           struct pf_ring_socket *pfr,
                           struct pfring_pkthdr *hdr,
                           int is_ip_pkt, int displ,
                           u_int8_t channel_id,
                           u_int8_t num_rx_channels)
{
  int fwd_pkt = 0;
  /* This is a memory holder for storing parsed packet information
     that will then be freed when the packet has been handled
  */
  struct parse_buffer *parse_memory_buffer[MAX_PLUGIN_ID] = { NULL };
  u_int8_t free_parse_mem = 0;
  u_int last_matched_plugin = 0, debug = 0;
  u_int8_t hash_found = 0;

  if(enable_debug)
    printk("[PF_RING] -->add_skb_to_ring(len=%d) [channel_id=%d/%d][active=%d][%s]\n",
           hdr->len, channel_id, num_rx_channels,
           pfr->ring_active, pfr->ring_netdev->dev->name);

  if((!pfring_enabled) || ((!pfr->ring_active) && (pfr->master_ring == NULL)))
    return(-1);

  pfr->num_rx_channels = num_rx_channels; /* Constantly updated */
  hdr->extended_hdr.parsed_pkt.last_matched_rule_id = (u_int16_t)-1;

  atomic_set(&pfr->num_ring_users, 1);

  /* [1] BPF Filtering (from af_packet.c) */
  if(pfr->bpfFilter != NULL) {
    unsigned res = 1;

    /* The filter is run with skb->data moved back by displ bytes;
       the pointer is restored right after */
    skb->data -= displ;
    res = sk_run_filter(skb, pfr->bpfFilter->insns,
                        pfr->bpfFilter->len);
    skb->data += displ;

    if(res == 0) {
      /* Filter failed */
      if(enable_debug)
        printk("[PF_RING]add_skb_to_ring(skb): Filter failed [len=%d][tot=%llu]"
               "[insert_off=%d][pkt_type=%d][cloned=%d]\n",
               (int)skb->len, pfr->slots_info->tot_pkts,
               pfr->slots_info->insert_off, skb->pkt_type,
               skb->cloned);

      atomic_set(&pfr->num_ring_users, 0);
      return(-1);
    }
  }

  if(enable_debug) {
    /* Test the device pointer itself: skb->dev may be NULL */
    printk("[PF_RING] add_skb_to_ring:[%s][displ=%d][len=%d][caplen=%d]"
           "[is_ip_pkt=%d][%d -> %d][%p/%p]\n",
           (skb->dev != NULL) ? skb->dev->name : "<NULL>",
           displ, hdr->len, hdr->caplen,
           is_ip_pkt, hdr->extended_hdr.parsed_pkt.l4_src_port,
           hdr->extended_hdr.parsed_pkt.l4_dst_port, skb->dev,
           pfr->ring_netdev);

    /* ************************************* */

    printk("[PF_RING] add_skb_to_ring(skb)[len=%d][tot=%llu][insert_off=%d]"
           "[pkt_type=%d][cloned=%d]\n",
           (int)skb->len, pfr->slots_info->tot_pkts,
           pfr->slots_info->insert_off, skb->pkt_type, skb->cloned);
  }

  /* Extensions */
  fwd_pkt = pfr->sw_filtering_rules_default_accept_policy;
  /* printk("[PF_RING]rules_default_accept_policy: [fwd_pkt=%d]\n", fwd_pkt); */

  /* ************************** */

  /* [2] Filter packet according to rules */

  if(debug)
    printk("[PF_RING] About to evaluatepacket [len=%d][tot=%llu][insert_off=%d]"
           "[pkt_type=%d][cloned=%d]\n", (int)skb->len,
           pfr->slots_info->tot_pkts, pfr->slots_info->insert_off,
           skb->pkt_type, skb->cloned);

  /* [2.1] Search the hash */
  if(pfr->sw_filtering_hash != NULL)
    hash_found = check_perfect_rules(skb, pfr, hdr, &fwd_pkt, &free_parse_mem,
                                     parse_memory_buffer, displ, &last_matched_plugin);

  /* [2.2] Search rules list (only when the hash did not match) */
  if((!hash_found) && (pfr->num_sw_filtering_rules > 0)) {
    int rc = check_wildcard_rules(skb, pfr, hdr, &fwd_pkt, &free_parse_mem,
                                  parse_memory_buffer, displ, &last_matched_plugin);

    if(rc != 0)
      fwd_pkt = 0;
  }

  if(fwd_pkt) {
    /* We accept the packet: it needs to be queued */
    if(debug)
      printk("[PF_RING] Forwarding packetto userland\n");

    /* [3] Packet sampling */
    if(pfr->sample_rate > 1) {
      write_lock_bh(&pfr->ring_index_lock);
      pfr->slots_info->tot_pkts++;

      if(pfr->pktToSample <= 1) {
        /* This packet is kept; restart the countdown */
        pfr->pktToSample = pfr->sample_rate;
      } else {
        /* This packet is skipped */
        pfr->pktToSample--;

        if(enable_debug)
          printk("[PF_RING] add_skb_to_ring(skb): sampled packet[len=%d]"
                 "[tot=%llu][insert_off=%d][pkt_type=%d][cloned=%d]\n",
                 (int)skb->len, pfr->slots_info->tot_pkts,
                 pfr->slots_info->insert_off, skb->pkt_type,
                 skb->cloned);

        write_unlock_bh(&pfr->ring_index_lock);

        if(free_parse_mem)
          free_parse_memory(parse_memory_buffer);

        atomic_set(&pfr->num_ring_users, 0);
        return(-1);
      }

      write_unlock_bh(&pfr->ring_index_lock);
    }

    if(hdr->caplen > 0) {
      /* Copy the packet into the bucket */
      int offset;
      void *mem;

      if((last_matched_plugin > 0)
         && (parse_memory_buffer[last_matched_plugin] != NULL)) {
        /* The plugin's parsed data is stored ahead of the packet payload */
        offset = hdr->extended_hdr.parsed_header_len = parse_memory_buffer[last_matched_plugin]->mem_len;

        hdr->extended_hdr.parsed_pkt.last_matched_plugin_id = last_matched_plugin;

        if(enable_debug)
          printk("[PF_RING]--> [last_matched_plugin = %d][extended_hdr.parsed_header_len=%d]\n",
                 last_matched_plugin, hdr->extended_hdr.parsed_header_len);

        /* Never let the parsed header exceed the bucket */
        if(offset > pfr->bucket_len)
          offset = hdr->extended_hdr.parsed_header_len = pfr->bucket_len;

        mem = parse_memory_buffer[last_matched_plugin]->mem;
      } else
        offset = 0, hdr->extended_hdr.parsed_header_len = 0, mem = NULL;

      add_pkt_to_ring(skb, pfr, hdr, displ, channel_id, offset, mem);
    }
  }

  if(enable_debug)
    printk("[PF_RING] [pfr->slots_info->insert_off=%d]\n",
           pfr->slots_info->insert_off);

  if(free_parse_mem)
    free_parse_memory(parse_memory_buffer);

  atomic_set(&pfr->num_ring_users, 0);

  return(0);
}
下面接着讲解add_pkt_to_ring的源码:add_skb_to_ring正是通过调用它来完成数据包的入队列操作。
/* ***********************************/

/*
 * add_pkt_to_ring()
 *
 * Queue a packet on a ring. When _pfr has a master ring, the master ring
 * is used instead. The packet is silently dropped when the ring is not
 * active, when skb is NULL, or when both the ring and the packet carry a
 * specific channel and the packet's channel bit is not set in
 * pfr->channel_id.
 *
 * hdr->caplen is clamped to the room left in the bucket after 'offset'
 * bytes of plugin data. If a kernel consumer plugin with a packet reader
 * is registered, the packet is handed to it (under ring_index_lock)
 * instead of being copied with copy_data_to_ring().
 */
static void add_pkt_to_ring(struct sk_buff *skb,
                            struct pf_ring_socket *_pfr,
                            struct pfring_pkthdr *hdr,
                            int displ, u_int8_t channel_id,
                            int offset, void *plugin_mem)
{
  struct pf_ring_socket *pfr = (_pfr->master_ring != NULL) ? _pfr->master_ring : _pfr;

  if(enable_debug)
    printk("[PF_RING] -->add_pkt_to_ring(len=%d) [pfr->channel_id=%d][channel_id=%d]\n",
           hdr->len, pfr->channel_id, channel_id);

  if((!pfr->ring_active) || (!skb))
    return;

  if((pfr->channel_id != RING_ANY_CHANNEL)
     && (channel_id != RING_ANY_CHANNEL)) {
    u_int32_t the_bit;

    /* The bit is computed only when it is needed, and on an unsigned
       value: shifting by 32 or more (channel_id is 8 bit wide), or
       shifting a signed 1 into the sign bit, is undefined behaviour.
       NOTE(review): assumes pfr->channel_id is a 32 bit mask, as the
       original int32_t bit suggested - confirm */
    if(channel_id >= 32)
      return; /* Wrong channel: cannot be part of a 32 bit mask */

    the_bit = ((u_int32_t)1) << channel_id;

    if((pfr->channel_id & the_bit) != the_bit)
      return; /* Wrong channel */
  }

  hdr->caplen = min(pfr->bucket_len - offset, hdr->caplen);

  if(pfr->kernel_consumer_plugin_id
     && plugin_registration[pfr->kernel_consumer_plugin_id]->pfring_packet_reader) {
    write_lock_bh(&pfr->ring_index_lock); /* Serialize */
    plugin_registration[pfr->kernel_consumer_plugin_id]->pfring_packet_reader(pfr, skb, channel_id, hdr, displ);
    pfr->slots_info->tot_pkts++; /* Total packet counter */
    write_unlock_bh(&pfr->ring_index_lock);
    return;
  }

  copy_data_to_ring(skb, pfr, hdr, displ, offset, plugin_mem, NULL, 0);
}
接着来看copy_data_to_ring这个函数,它的作用是将skb或者原始(raw)内存块拷贝到环形缓冲区:
/* Generic function for copying either a skb or a raw
   memory block to the ring buffer.

   Its main job is to insert the sk_buff (skb) into the ring buffer.
   Example calls:
     add_pkt_to_ring(skb, pfr, hdr, 0, RING_ANY_CHANNEL, displ, NULL);
     copy_data_to_ring(skb, pfr, hdr, displ, offset, plugin_mem, NULL, 0);

   skb           packet to copy, or NULL to use raw data copy mode
   pfr           destination ring
   hdr           packet header, copied at the start of the slot
   displ         the skb is copied starting at offset -displ
   offset        number of bytes of plugin_mem stored ahead of the packet
   plugin_mem    plugin data (skb copy mode only), may be NULL
   raw_data      raw memory block (used when skb is NULL)
   raw_data_len  length of raw_data, clamped to pfr->bucket_len

   If the slot at insert_off is not free the packet is counted in
   tot_lost and dropped.
*/
inline void copy_data_to_ring(struct sk_buff *skb,
                              struct pf_ring_socket *pfr,
                              struct pfring_pkthdr *hdr,
                              int displ, int offset, void *plugin_mem,
                              void *raw_data, uint raw_data_len) {
  char *ring_bucket;
  u_int32_t off, taken;

  /* ring_slots points to ring_memory+sizeof(FlowSlotInfo); if it is NULL
     the ring buffer memory is not usable */
  if(pfr->ring_slots == NULL) return;

  write_lock_bh(&pfr->ring_index_lock);
  // smp_rmb();

  off = pfr->slots_info->insert_off; /* Offset where the packet is inserted */
  pfr->slots_info->tot_pkts++;       /* Total packet counter */

  if(!check_and_init_free_slot(pfr, off)) /* Full */ {
    /* No room left */
    pfr->slots_info->tot_lost++; /* Lost packet counter */

    if(enable_debug)
      printk("[PF_RING] ==>slot(off=%d) is full[insert_off=%u][remove_off=%u][slot_len=%u][num_queued_pkts=%u]\n",
             off, pfr->slots_info->insert_off, pfr->slots_info->remove_off, pfr->slots_info->slot_len, num_queued_pkts(pfr));

    write_unlock_bh(&pfr->ring_index_lock);
    return;
  }

  ring_bucket = get_slot(pfr, off); /* Address of the slot */

  if(skb != NULL) {
    /* skb copy mode */

    if((plugin_mem != NULL) && (offset > 0)) {
      /* Plugin data goes right after the slot header */
      memcpy(&ring_bucket[pfr->slot_header_len], plugin_mem, offset);
    }

    if(hdr->caplen > 0) {
      if(enable_debug)
        printk("[PF_RING] -->[caplen=%d][len=%d][displ=%d][extended_hdr.parsed_header_len=%d][bucket_len=%d][sizeof=%d]\n",
               hdr->caplen, hdr->len, displ, hdr->extended_hdr.parsed_header_len, pfr->bucket_len,
               pfr->slot_header_len);

      /* Copy hdr->caplen bytes of the skb, starting at offset -displ,
         into the slot after the slot header and the plugin data.
         NOTE(review): the return value of skb_copy_bits() is not checked */
      skb_copy_bits(skb, -displ, &ring_bucket[pfr->slot_header_len + offset], hdr->caplen);
    } else {
      if(hdr->extended_hdr.parsed_header_len >= pfr->bucket_len) {
        static u_char print_once = 0;

        if(!print_once) {
          printk("[PF_RING] WARNING: the bucket len is [%d] shorter than theplugin parsed header [%d]\n",
                 pfr->bucket_len, hdr->extended_hdr.parsed_header_len);
          print_once = 1;
        }
      }
    }
  } else {
    /* Raw data copy mode */
    raw_data_len = min(raw_data_len, pfr->bucket_len); /* Avoid overruns */
    memcpy(&ring_bucket[pfr->slot_header_len], raw_data, raw_data_len); /* Copy raw data if present */
    hdr->len = hdr->caplen = raw_data_len, hdr->extended_hdr.if_index = FAKE_PACKET;
    /* printk("[PF_RING] Copied raw dataat slot with offset %d [len=%d]\n", off, raw_data_len); */
  }

  memcpy(ring_bucket, hdr, pfr->slot_header_len); /* Copy extended packet header */

  pfr->slots_info->insert_off = get_next_slot_offset(pfr, off, &taken); /* Advance to the next slot */

  if(enable_debug)
    printk("[PF_RING] ==>insert_off=%d\n", pfr->slots_info->insert_off);

  /* NOTE: smp_* barriers are _compiler_ barriers on UP, mandatory barriers on SMP
     a consumer _must_ see the new value of tot_insert only after the buffer update completes
  */
  smp_wmb();
  pfr->slots_info->tot_insert++; /* Total inserted packet counter */

  write_unlock_bh(&pfr->ring_index_lock);

  /* Wake up whoever sleeps on ring_slots_waitqueue once the number of
     queued packets reaches the poll watermark (per the original author's
     note, this wait queue is initialised in ring_create()) */
  if(waitqueue_active(&pfr->ring_slots_waitqueue)
     && (num_queued_pkts(pfr) >= pfr->poll_num_pkts_watermark))
    wake_up_interruptible(&pfr->ring_slots_waitqueue);

#if(LINUX_VERSION_CODE > KERNEL_VERSION(2,6,32))
  /* Signaling on vPFRing's eventfd ctx when needed */
  if(pfr->vpfring_ctx && (!(pfr->slots_info->vpfring_guest_flags & VPFRING_GUEST_NO_INTERRUPT))) {
    eventfd_signal(pfr->vpfring_ctx, 1);
  }
#endif
}
/* get_slot: return the address of element 'off' of pfr->ring_slots,
   i.e. the start of the slot located at that offset */
static inline char* get_slot(struct pf_ring_socket *pfr, u_int32_t off)
{
  return(&(pfr->ring_slots[off]));
}
以上都是在transparent_mode=0的情况下进行的分析,也就是正常数据包的处理流程。