dpvs中ARP协议

  • arp_init

    • neigh_table per-lcore,hash桶初始化
    • 注册ARP协议的pkt_type,用于处理接收的arp数据包
    • 注册loop任务neigh_process_ring:ARP数据包会通过ring广播到所有的lcore,各lcore在该任务中从自己的ring里取出ARP数据包并处理
    /*
     * Initialise the neighbour (ARP) subsystem.
     *
     * Steps, in order: empty every per-lcore neighbour hash bucket, remember
     * the calling lcore as master, register the ARP packet type and the
     * neighbour sockopts, create the neighbour rings, then register the
     * ring-processing job once for forwarding workers and once for the master.
     *
     * Returns EDPVS_OK, or the error code of the first registration that fails.
     */
    static int arp_init(void)
    {
        int lcore, bucket;
        int err;

        /* each lcore keeps its own local ARP cache table: start with empty buckets */
        for (lcore = 0; lcore < DPVS_MAX_LCORE; lcore++) {
            for (bucket = 0; bucket < NEIGH_TAB_SIZE; bucket++) {
                INIT_LIST_HEAD(&neigh_table[lcore][bucket]);
            }
        }

        master_cid = rte_lcore_id();

        /* register the packet type that handles received ARP frames */
        arp_pkt_type.type = rte_cpu_to_be_16(ETHER_TYPE_ARP);
        err = netif_register_pkt(&arp_pkt_type);
        if (err != EDPVS_OK)
            return err;

        /* register the control-plane (sockopt) callbacks */
        err = sockopt_register(&neigh_sockopts);
        if (err != EDPVS_OK)
            return err;

        /* create the neighbour rings used to pass ARP information to every lcore */
        neigh_ring_init();

        /* forwarding workers: slow job, skip_loops = NEIGH_PROCESS_MAC_RING_INTERVAL */
        neigh_jobs[0].func = neigh_process_ring;
        neigh_jobs[0].data = NULL;
        neigh_jobs[0].type = LCORE_JOB_SLOW;
        neigh_jobs[0].skip_loops = NEIGH_PROCESS_MAC_RING_INTERVAL;
        snprintf(neigh_jobs[0].name, sizeof(neigh_jobs[0].name) - 1, "%s", "neigh_sync");
        err = dpvs_lcore_job_register(&neigh_jobs[0], LCORE_ROLE_FWD_WORKER);
        if (err != EDPVS_OK)
            return err;

        /* master lcore: same handler, registered as an ordinary loop job */
        neigh_jobs[1].func = neigh_process_ring;
        neigh_jobs[1].data = NULL;
        neigh_jobs[1].type = LCORE_JOB_LOOP;
        snprintf(neigh_jobs[1].name, sizeof(neigh_jobs[1].name) - 1, "%s", "neigh_sync");
        err = dpvs_lcore_job_register(&neigh_jobs[1], LCORE_ROLE_MASTER);
        if (err != EDPVS_OK)
            return err;

        return EDPVS_OK;
    }
    
  • 看下netif_deliver_mbuf中对ARP包的特殊处理

    • 如果是arp包,并且不是从其他lcore中投递过来的数据包,则进入处理逻辑,此处防止类似arp风暴,重复在dpvs中循环投递
    • 如果是ARP应答包(reply),则克隆后投递到其他lcore:dpvs中每个lcore独立维护自己的邻居子系统状态,而ARP应答包不一定会在发出ARP请求的那个lcore上被接收到
    • 接着接收到arp的lcore上也继续往下处理,会查找pkt_table,找到arp消息处理函数
    /*
     * Excerpt of netif_deliver_mbuf(): only the ARP-specific part is shown,
     * the `...` lines stand for code omitted from this listing.
     *
     * For an ARP frame that did not itself arrive through a ring
     * (!pkts_from_ring), an ARP reply is cloned once for every other
     * forwarding lcore and enqueued on that lcore's arp_ring. The current
     * lcore (cid), non-forwarding lcores and the master lcore are skipped.
     */
    static inline int netif_deliver_mbuf(struct rte_mbuf *mbuf,
                                         uint16_t eth_type,
                                         struct netif_port *dev,
                                         struct netif_queue_conf *qconf,
                                         bool forward2kni,
                                         lcoreid_t cid,
                                         bool pkts_from_ring)
    {
    		...
    		if (pt->type == rte_cpu_to_be_16(ETHER_TYPE_ARP) && !pkts_from_ring)
        {
            struct rte_mempool *mbuf_pool;
            struct rte_mbuf *   mbuf_clone;
            uint8_t             i;
            struct arp_hdr *    arp;
            unsigned            socket_id;
    
            socket_id = rte_socket_id();
            mbuf_pool = pktmbuf_pool[socket_id];
    
            /* Step over the Ethernet header to locate the ARP header, then
             * restore the data pointer.
             * NOTE(review): neither return value is checked; confirm the frame
             * is always long enough for rte_pktmbuf_adj() to succeed. */
            rte_pktmbuf_adj(mbuf, sizeof(struct ether_hdr));
            arp = rte_pktmbuf_mtod(mbuf, struct arp_hdr *);
            rte_pktmbuf_prepend(mbuf, (uint16_t)sizeof(struct ether_hdr));
            /* only ARP replies are fanned out: clone the mbuf and enqueue one
             * copy per remaining lcore */
            if (rte_be_to_cpu_16(arp->arp_op) == ARP_OP_REPLY)
            {
                for (i = 0; i < DPVS_MAX_LCORE; i++)
                {
                    if ((i == cid) || (!is_lcore_id_fwd(i)) ||
                        (i == rte_get_master_lcore()))
                    {
                        continue;
                    }
                    /*rte_pktmbuf_clone will not clone pkt.data, just copy pointer!*/
                    mbuf_clone = rte_pktmbuf_clone(mbuf, mbuf_pool);
                    if (mbuf_clone)
                    {
                        int ret = rte_ring_enqueue(arp_ring[i], mbuf_clone);
                        /* -EDQUOT only warns and does not free the clone;
                         * presumably the object was still enqueued (see the
                         * DPDK ring API) - TODO confirm for the DPDK in use */
                        if (unlikely(-EDQUOT == ret))
                        {
                            RTE_LOG(WARNING, NETIF, "%s: arp ring of lcore %d quota exceeded\n",
                                    __func__, i);
                        }
                        else if (ret < 0)
                        {
                            RTE_LOG(WARNING, NETIF, "%s: arp ring of lcore %d enqueue failed\n",
                                    __func__, i);
                            rte_pktmbuf_free(mbuf_clone);
                        }
                    }
                }
            }
        }
    	...
    }
    
  • ARP协议pkt_type

    /*
     * Packet-type descriptor for ARP: matches ether type ETHER_TYPE_ARP
     * (stored in network byte order) and names neigh_resolve_input as the
     * handler. No port is set.
     */
    static struct pkt_type arp_pkt_type = {
        .port       = NULL,
        .func       = neigh_resolve_input,
        .type       = rte_cpu_to_be_16(ETHER_TYPE_ARP),
    };
    
  • ARP数据包

    dpvs中ARP协议_第1张图片

    /**
     * ARP header IPv4 payload: the sender/target address pairs that follow
     * the fixed part of the ARP header.
     */
    struct rte_arp_ipv4 {
        struct rte_ether_addr arp_sha;  /**< sender hardware address */
        uint32_t              arp_sip;  /**< sender IP address */
        struct rte_ether_addr arp_tha;  /**< target hardware address */
        uint32_t              arp_tip;  /**< target IP address */
    } __attribute__((__packed__, aligned(2)));
    
    /* Value for rte_arp_hdr.arp_hardware */
    #define RTE_ARP_HRD_ETHER     1  /* ARP Ethernet address format */

    /* Values for rte_arp_hdr.arp_opcode */
    #define	RTE_ARP_OP_REQUEST    1 /* request to resolve address */
    #define	RTE_ARP_OP_REPLY      2 /* response to previous request */
    #define	RTE_ARP_OP_REVREQUEST 3 /* request proto addr given hardware */
    #define	RTE_ARP_OP_REVREPLY   4 /* response giving protocol address */
    #define	RTE_ARP_OP_INVREQUEST 8 /* request to identify peer */
    #define	RTE_ARP_OP_INVREPLY   9 /* response identifying peer */

    /**
     * ARP header: fixed fields followed by the IPv4 address payload.
     */
    struct rte_arp_hdr {
        uint16_t arp_hardware;    /* format of hardware address */
        uint16_t arp_protocol;    /* format of protocol address */
        uint8_t  arp_hlen;        /* length of hardware address */
        uint8_t  arp_plen;        /* length of protocol address */
        uint16_t arp_opcode;      /* ARP opcode (command) */

        struct rte_arp_ipv4 arp_data;
    } __attribute__((__packed__, aligned(2)));
    
  • neigh_resolve_input

    int neigh_resolve_input(struct rte_mbuf *m, struct netif_port *port)
    {
        //arp指向接收包的arp首部
        struct arp_hdr *arp = rte_pktmbuf_mtod(m, struct arp_hdr *);
        struct ether_hdr *eth;
        uint32_t ipaddr;
        struct neighbour_entry *neighbour = NULL;
        unsigned int hashkey;
        struct inet_ifaddr *ifa;
        //根据ARP数据包中的arp_tip获取对应的IP信息配置块
        ifa = inet_addr_ifa_get(AF_INET, port, (union inet_addr*)&arp->arp_data.arp_tip);
        if (!ifa)
            return EDPVS_KNICONTINUE;
        inet_addr_ifa_put(ifa);
        //eth指向L2层首部
        eth = (struct ether_hdr *)rte_pktmbuf_prepend(m,
                                         (uint16_t)sizeof(struct ether_hdr));
        //判断ARP请求类型,如果是ARP_OP_REQUEST,生成ARP应答包(此处复用接收到的ARP请求包),调用netif_xmit发送出去
        if (rte_be_to_cpu_16(arp->arp_op) == ARP_OP_REQUEST) {
            //填充回复包的L2层地址
            ether_addr_copy(ð->s_addr, ð->d_addr);
            rte_memcpy(ð->s_addr, &port->addr, 6);
            //arp包中的操作类型变为ARP_OP_REPLY
            arp->arp_op = rte_cpu_to_be_16(ARP_OP_REPLY);
    
            ether_addr_copy(&arp->arp_data.arp_sha, &arp->arp_data.arp_tha);
            ether_addr_copy(ð->s_addr, &arp->arp_data.arp_sha);
    
            ipaddr = arp->arp_data.arp_sip;
            arp->arp_data.arp_sip = arp->arp_data.arp_tip;
            arp->arp_data.arp_tip = ipaddr;
            m->l2_len = sizeof(struct ether_hdr);
            m->l3_len = sizeof(struct arp_hdr);
    
            netif_xmit(m, port);
            return EDPVS_OK;
    
        } else if (arp->arp_op == htons(ARP_OP_REPLY)) {
            ipaddr = arp->arp_data.arp_sip;
            //如果数据包是ARP_OP_REPLY,根据源ip和网卡生成hashkey
            hashkey = neigh_hashkey(AF_INET, (union inet_addr *)&ipaddr, port);
            //查询neighbour条目
            neighbour = neigh_lookup_entry(AF_INET, (union inet_addr *)&ipaddr,
                                           port, hashkey);
            //如果查找到邻居项缓存,并且不是STATIC类型,则首先更新缓存项中的邻居mac地址
            if (neighbour && !(neighbour->flag & NEIGHBOUR_STATIC)) {
                neigh_edit(neighbour, &arp->arp_data.arp_sha);
            } else {
                //否则,创建新的邻居缓存项并加入邻居项hash表中
                neighbour = neigh_add_table(AF_INET, (union inet_addr *)&ipaddr,
                                        &arp->arp_data.arp_sha, port, hashkey, 0);
                if (!neighbour) {
                    RTE_LOG(ERR, NEIGHBOUR, "%s: add neighbour wrong\\n", __func__);
                    rte_pktmbuf_free(m);
                    return EDPVS_NOMEM;
                }
            }
            //更新邻居项状态
            neigh_entry_state_trans(neighbour, 1);
            //将缓存在邻居项等待队列中的数据报发送出去
            neigh_send_mbuf_cach(neighbour);
            return EDPVS_KNICONTINUE;
        } else {
            //其他操作类型,dpvs不处理
            rte_pktmbuf_free(m);
            return EDPVS_DROP;
        }
    }
    
  • 邻居项定义

    /* One entry of the per-lcore neighbour (ARP/ND) table. */
    struct neighbour_entry {
        /* address family */
        int                 af;
        /* link in a neigh_table hash bucket */
        struct list_head    neigh_list;
        /* neighbour IP address */
        union inet_addr     ip_addr;
        /* neighbour MAC address */
        struct ether_addr   eth_addr;
        /* egress port */
        struct netif_port   *port;
        /* timer that drives the ARP state machine */
        struct dpvs_timer   timer;
        /* queue of neighbour_mbuf_entry items waiting to be sent through this
         * neighbour; each neighbour_mbuf_entry holds a pointer to an mbuf */
        struct list_head    queue_list;
        /* presumably the number of items on queue_list - TODO confirm */
        uint32_t            que_num;
        /* neighbour state (DPVS_NUD_S_*) */
        uint32_t            state;
        /* seconds timestamp recorded when the state timer is updated */
        uint32_t            ts;
        /* flag bits; NEIGHBOUR_STATIC is tested against this field */
        uint8_t             flag;
    } __rte_cache_aligned;
    
  • 邻居项状态迁移

    • 状态迁移表的组织方式类似TCP状态机:每一行对应一种事件(发送ARP、收到ARP、确认、mbuf引用、超时),每一列对应邻居项的当前状态,表项为迁移后的状态
    /* Neighbour (NUD) states; DPVS_NUD_S_MAX is the number of real states. */
    enum {
        DPVS_NUD_S_NONE        = 0,
        DPVS_NUD_S_SEND,
        DPVS_NUD_S_REACHABLE,
        DPVS_NUD_S_PROBE,
        DPVS_NUD_S_DELAY,
        DPVS_NUD_S_MAX /*Reserved*/
    };

    /*
     * DPVS_NUD_S_KEEP is not a real state: a table cell holding it means
     * "stay in the current state and leave the timer alone". It only has to
     * differ from every real state, so the reserved DPVS_NUD_S_MAX value is
     * used. The guard lets an existing definition take precedence.
     */
    #ifndef DPVS_NUD_S_KEEP
    #define DPVS_NUD_S_KEEP DPVS_NUD_S_MAX
    #endif

    /* short aliases that keep the transition table readable */
    #define sNNO DPVS_NUD_S_NONE
    #define sNSD DPVS_NUD_S_SEND
    #define sNRE DPVS_NUD_S_REACHABLE
    #define sNPR DPVS_NUD_S_PROBE
    #define sNDE DPVS_NUD_S_DELAY
    #ifndef sNKP
    #define sNKP DPVS_NUD_S_KEEP
    #endif

    /* One row of the transition table: next state for each current state. */
    struct nud_state {
        int next_state[DPVS_NUD_S_MAX];
    };

    /*
     * Transition table: nud_states[event].next_state[current] is the state to
     * move to. Rows are events (the index is the `idx` argument of
     * neigh_entry_state_trans), columns are current states.
     */
    static struct nud_state nud_states[] = {
    /* current state:        sNNO, sNSD, sNRE, sNPR, sNDE */
    /* 0: send arp    */ { {sNSD, sNSD, sNKP, sNDE, sNDE} },
    /* 1: recv arp    */ { {sNRE, sNRE, sNRE, sNRE, sNRE} },
    /* 2: ack confirm */ { {sNKP, sNKP, sNRE, sNRE, sNRE} },
    /* 3: mbuf ref    */ { {sNKP, sNKP, sNKP, sNPR, sNKP} },
    /* 4: timeout     */ { {sNNO, sNNO, sNPR, sNNO, sNNO} },
    };
    
    • 邻居项不同状态超时时间

      /* Timeout used for the REACHABLE state, in seconds. */
      #define DPVS_NEIGH_TIMEOUT_DEF 60

      /* Timer duration for each neighbour state, in seconds, indexed by
       * DPVS_NUD_S_*. */
      static int nud_timeouts[DPVS_NUD_S_MAX] = {
          [DPVS_NUD_S_REACHABLE]   = DPVS_NEIGH_TIMEOUT_DEF,
          [DPVS_NUD_S_PROBE]       = 30,
          [DPVS_NUD_S_SEND]        = 3,
          [DPVS_NUD_S_DELAY]       = 3,
          [DPVS_NUD_S_NONE]        = 2,
      };
      
    • 状态迁移处理

      void neigh_entry_state_trans(struct neighbour_entry *neighbour, int idx)
      {
          struct timeval timeout;
      
          /* DPVS_NUD_S_KEEP is not a real state, just use it to keep original state */
          //如果状态迁移后保持原有状态或者邻居项的状态为STATIC(一般是系统管理员配置的),则不作任何处理
          if ((nud_states[idx].next_state[neighbour->state] != DPVS_NUD_S_KEEP)
              && !(neighbour->flag & NEIGHBOUR_STATIC)) {
              //首先获取原有状态
              int old_state = neighbour->state;
              struct timespec now = { 0 };
              //设置邻居项的新状态
              neighbour->state = nud_states[idx].next_state[neighbour->state];
              if (neighbour->state == old_state) {
                  if (likely(clock_gettime(CLOCK_REALTIME_COARSE, &now)) == 0)
                      /* frequent timer updates hurt performance,
                       * do not update timer unless half timeout passed */
                      if ((now.tv_sec - neighbour->ts) * 2 < nud_timeouts[old_state])
                          return;
              }
              //重新获取邻居项的超时时间,更新超时定时器
              timeout.tv_sec = nud_timeouts[neighbour->state];
              timeout.tv_usec = 0;
              dpvs_time_rand_delay(&timeout, 200000); /* delay 200ms randomly to avoid timer performance problem */
              dpvs_timer_update_nolock(&neighbour->timer, &timeout, false);
              neighbour->ts = now.tv_sec;
      #ifdef CONFIG_DPVS_NEIGH_DEBUG
              if (neighbour->state != old_state)
              {
                  char buf[512];
                  dump_neigh_entry(neighbour, buf, sizeof(buf));
                  RTE_LOG(INFO, NEIGHBOUR, "[%02d] neighbor (%s) trans state: %s -> %s, idx:%d.\\n",
                          rte_lcore_id(), buf, nud_state_name(old_state),
                          nud_state_name(neighbour->state), idx);
              }
      #endif
          }
      }
      
    • 创建新的邻居项

      • neigh_add_table

        /*
         * Allocate a neighbour entry, fill it in and insert it into the
         * calling lcore's neighbour table.
         *
         * @af        address family of @ipaddr.
         * @ipaddr    neighbour IP address (copied).
         * @eth_addr  neighbour MAC address, or NULL when it is not known yet.
         *            With a MAC the entry starts as DPVS_NUD_S_REACHABLE,
         *            without one as DPVS_NUD_S_NONE.
         * @port      egress port.
         * @hashkey   bucket index, as computed by neigh_hashkey().
         * @flag      entry flags; no timer is armed for NEIGHBOUR_STATIC.
         *
         * Returns the new entry, or NULL if the pool allocation fails.
         */
        struct neighbour_entry *neigh_add_table(int af, const union inet_addr *ipaddr,
                                                const struct ether_addr *eth_addr,
                                                struct netif_port *port,
                                                unsigned int hashkey, int flag)
        {
            struct neighbour_entry *new_neighbour=NULL;
            struct timeval delay;
            lcoreid_t cid = rte_lcore_id();

            /* allocate the new entry from the neighbour mempool */
            new_neighbour = dpvs_mempool_get(neigh_mempool, sizeof(struct neighbour_entry));
            if (unlikely(new_neighbour == NULL))
                return NULL;

            /* Clear the object before use. Fields not assigned below would
             * otherwise keep whatever the pool object held before: ts (read by
             * neigh_entry_state_trans), eth_addr when no MAC is supplied, and
             * timer for static entries. */
            memset(new_neighbour, 0, sizeof(*new_neighbour));

            rte_memcpy(&new_neighbour->ip_addr, ipaddr,
                        sizeof(union inet_addr));
            new_neighbour->flag = flag;
            new_neighbour->af   = af;
            /* A NULL eth_addr means the MAC is still unknown and the entry
             * starts in DPVS_NUD_S_NONE; otherwise the entry is created from a
             * received reply for which no entry existed, and is reachable. */
            if (eth_addr) {
                rte_memcpy(&new_neighbour->eth_addr, eth_addr,
                           sizeof(new_neighbour->eth_addr));
                new_neighbour->state = DPVS_NUD_S_REACHABLE;
            } else {
                new_neighbour->state = DPVS_NUD_S_NONE;
            }

            new_neighbour->port = port;
            new_neighbour->que_num = 0;
            /* the initial timer duration depends on the initial state */
            delay.tv_sec = nud_timeouts[new_neighbour->state];
            delay.tv_usec = 0;

            INIT_LIST_HEAD(&new_neighbour->queue_list);
            /* Arm the state timer; if it fires while the entry is still in
             * DPVS_NUD_S_NONE, neighbour_timer_event removes the entry. */
            if (!(new_neighbour->flag & NEIGHBOUR_STATIC)) {
                dpvs_time_rand_delay(&delay, 200000); /* delay 200ms randomly to avoid timer performance problem */
                dpvs_timer_sched(&new_neighbour->timer, &delay,
                        neighbour_timer_event, new_neighbour, false);
            }
            /* insert the entry into the neighbour table */
            neigh_hash(new_neighbour, hashkey);
            neigh_nums[cid]++;

        #ifdef CONFIG_DPVS_NEIGH_DEBUG
            {
                char buf[512];
                dump_neigh_entry(new_neighbour, buf, sizeof(buf));
                RTE_LOG(INFO, NEIGHBOUR, "[%02d] add neigh entry: %s\n", cid, buf);
            }
        #endif

            return new_neighbour;
        }
        
      • 邻居项超时处理

        • 在新建邻居项时设置处理函数为neighbour_timer_event
        /*
         * Timer callback of a neighbour entry (@data is the entry).
         * An entry that times out in DPVS_NUD_S_NONE is torn down; any other
         * state receives event 4 ("timeout") of the state table.
         */
        static int neighbour_timer_event(void *data)
        {
            struct neighbour_entry *entry = data;

            if (entry->state != DPVS_NUD_S_NONE) {
                /* let the state machine pick the next state */
                neigh_entry_state_trans(entry, 4);
                return DTIMER_OK;
            }

            /* nothing was learned for this entry: remove it */
            return neigh_entry_expire(entry);
        }
        
      • 邻居项清理

        /*
         * Remove @neighbour from the calling lcore's table and release it.
         *
         * Cancels the entry's timer, unhashes it, frees every packet still
         * queued on it, returns the entry to the mempool and decrements the
         * per-lcore entry count. Asserts that it is not run on the master
         * lcore. Always returns DTIMER_STOP.
         */
        static int neigh_entry_expire(struct neighbour_entry *neighbour)
        {
            struct neighbour_mbuf_entry *mbuf, *mbuf_next;
            lcoreid_t cid = rte_lcore_id();
            assert(cid != master_cid);
            /* cancel the timer first */
            dpvs_timer_cancel_nolock(&neighbour->timer, false);
            /* unlink the entry from the hash table */
            neigh_unhash(neighbour);

        #ifdef CONFIG_DPVS_NEIGH_DEBUG
            {
                char buf[512];
                dump_neigh_entry(neighbour, buf, sizeof(buf));
                RTE_LOG(INFO, NEIGHBOUR, "%s:[%02d] del neigh entry: %s\n", __func__, cid, buf);
            }
        #endif

            /* release pkts saved in neighbour entry */
            list_for_each_entry_safe(mbuf, mbuf_next,
                      &neighbour->queue_list, neigh_mbuf_list) {
                list_del(&mbuf->neigh_mbuf_list);
                rte_pktmbuf_free(mbuf->m);
                dpvs_mempool_put(neigh_mempool, mbuf);
            }
            /* release the entry itself */
            dpvs_mempool_put(neigh_mempool, neighbour);
            neigh_nums[cid]--;

            return DTIMER_STOP;
        }
        
      • 邻居项确认

        • 传输层收到数据包时,对nexthop邻居项的确认
        /*
         * Confirm the neighbour used as @nexthop on @port: every matching,
         * non-static entry in the calling lcore's table receives event 2
         * ("ack confirm") of the state table.
         */
        void neigh_confirm(int af, union inet_addr *nexthop, struct netif_port *port)
        {
            struct neighbour_entry *entry;
            lcoreid_t cid = rte_lcore_id();
            unsigned int bucket;

            /*find nexhop/neighbour to confirm, no matter whether it is the route in*/
            bucket = neigh_hashkey(af, nexthop, port);
            list_for_each_entry(entry, &neigh_table[cid][bucket], neigh_list) {
                if (!neigh_key_cmp(af, entry, nexthop, port))
                    continue;
                if (entry->flag & NEIGHBOUR_STATIC)
                    continue;
                neigh_entry_state_trans(entry, 2);
            }
        }
        
      • arp请求

        /*
         * Probe @neighbour: send an ARP request (AF_INET) or a neighbour
         * solicitation (AF_INET6) for its address, using a source address
         * chosen by inet_addr_select() on the entry's port.
         *
         * A missing source address is logged, but the probe is still sent.
         */
        static void neigh_state_confirm(struct neighbour_entry *neighbour)
        {
            union inet_addr saddr, daddr;

            memset(&saddr, 0, sizeof(saddr));
            /* only one member of daddr is assigned below; clear the rest
             * before handing it to inet_addr_select() */
            memset(&daddr, 0, sizeof(daddr));

            if (neighbour->af == AF_INET) {
                daddr.in.s_addr = neighbour->ip_addr.in.s_addr;
                /* choose the source address for the egress port */
                inet_addr_select(AF_INET, neighbour->port, &daddr, 0, &saddr);
                if (!saddr.in.s_addr)
                    RTE_LOG(ERR, NEIGHBOUR, "%s: no source ip\n", __func__);
                /* send the ARP request */
                if (neigh_send_arp(neighbour->port, saddr.in.s_addr,
                                   daddr.in.s_addr) != EDPVS_OK)
                    RTE_LOG(ERR, NEIGHBOUR, "%s: send arp failed\n", __func__);
            } else if (neighbour->af == AF_INET6) {
                ipv6_addr_copy(&daddr.in6, &neighbour->ip_addr.in6);
                inet_addr_select(AF_INET6, neighbour->port, &daddr, 0, &saddr);

                if (ipv6_addr_any(&saddr.in6))
                    RTE_LOG(ERR, NEIGHBOUR, "%s: no source ip\n", __func__);

                ndisc_solicit(neighbour, &saddr.in6);
            }
        }
        
        /*
         * Build a broadcast ARP request asking for @dst_ip and transmit it on
         * @port.
         *
         * @port    egress port; its address is used as the sender MAC.
         * @src_ip  sender IP, copied into the packet as given.
         * @dst_ip  target IP, copied into the packet as given.
         *
         * Returns EDPVS_NOMEM if no mbuf can be allocated, EDPVS_OK otherwise
         * (the result of netif_xmit is not inspected).
         */
        static int neigh_send_arp(struct netif_port *port, uint32_t src_ip, uint32_t dst_ip)
        {
            /* The frame length is fixed at 60 bytes; the space after the ARP
             * header is zero padding. */
            enum { ARP_FRAME_LEN = 60 };
            const size_t pad_len = ARP_FRAME_LEN - sizeof(struct ether_hdr)
                                                 - sizeof(struct arp_hdr);
            struct rte_mbuf *m;
            struct ether_hdr *eth;
            struct arp_hdr *arp;

            uint32_t addr;

            m = rte_pktmbuf_alloc(port->mbuf_pool);
            if (unlikely(m == NULL)) {
                return EDPVS_NOMEM;
            }
            m->userdata = NULL;

            /* the ARP header follows the Ethernet header directly */
            eth = rte_pktmbuf_mtod(m, struct ether_hdr *);
            arp = (struct arp_hdr *)&eth[1];

            /* L2: broadcast destination, this port as source */
            memset(&eth->d_addr, 0xFF, sizeof(eth->d_addr));
            ether_addr_copy(&port->addr, &eth->s_addr);
            eth->ether_type = htons(ETHER_TYPE_ARP);

            /* ARP payload: sender = this port / src_ip, target MAC unknown */
            memset(arp, 0, sizeof(struct arp_hdr));
            rte_memcpy(&arp->arp_data.arp_sha, &port->addr, 6);
            addr = src_ip;
            inetAddrCopy(&arp->arp_data.arp_sip, &addr);

            memset(&arp->arp_data.arp_tha, 0, 6);
            addr = dst_ip;
            inetAddrCopy(&arp->arp_data.arp_tip, &addr);

            arp->arp_hrd = htons(ARP_HRD_ETHER);
            arp->arp_pro = htons(ETHER_TYPE_IPv4);
            arp->arp_hln = 6;
            arp->arp_pln = 4;
            arp->arp_op  = htons(ARP_OP_REQUEST);
            m->pkt_len   = ARP_FRAME_LEN;
            m->data_len  = ARP_FRAME_LEN;
            m->l2_len    = sizeof(struct ether_hdr);
            m->l3_len    = sizeof(struct arp_hdr);

            /* zero the padding behind the ARP header */
            memset(&arp[1], 0, pad_len);

        #ifdef CONFIG_DPVS_NEIGH_DEBUG
            dump_arp_hdr("send", arp, port->id);
        #endif

            netif_xmit(m, port);
            return EDPVS_OK;
        }
        

你可能感兴趣的:(dpvs源码阅读笔记)