The introduction earlier gave a preliminary look at the parallelism mechanisms. In DPDK, support for parallelism is an end-to-end design: externally there is multi-core parallelism together with core pinning; internally there are multiple threads, multi-queue NICs and lock-free programming; and on the algorithmic side there is automatic control such as load balancing. Only the combination of all of these yields the best overall parallel effect.
It is precisely the combined use of this series of techniques that opens the way to tens of millions of concurrent connections.
1. CPU affinity and core isolation
The Linux kernel controls a process's CPU affinity through a bit mask, and it exposes the corresponding system call interfaces:
sched_setaffinity() // set the affinity bit mask
sched_getaffinity() // get the current bit mask
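As a minimal illustration (not taken from the kernel or DPDK sources; the helper name pin_to_cpu2 is made up), pinning the calling process to CPU 2 with this API looks roughly like:
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
/* Sketch: restrict the calling process to CPU 2 via the affinity bit mask. */
int pin_to_cpu2(void)
{
    cpu_set_t set;
    CPU_ZERO(&set);   /* clear the bit mask */
    CPU_SET(2, &set); /* allow only CPU 2 */
    /* pid 0 means "the calling process" */
    if (sched_setaffinity(0, sizeof(set), &set) != 0) {
        perror("sched_setaffinity");
        return -1;
    }
    return 0;
}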
For threads that go through the POSIX library, DPDK achieves the same thing as shown below (this wrapper intercepts pthread_setaffinity_np and redirects the affinity request onto an lthread scheduler):
int pthread_setaffinity_np(pthread_t thread, size_t cpusetsize,
const rte_cpuset_t *cpuset)
{
if (override) {
/* we only allow affinity with a single CPU */
if (CPU_COUNT(cpuset) != 1)
return POSIX_ERRNO(EINVAL);
/* we only allow the current thread to set its own affinity */
struct lthread *lt = (struct lthread *)thread;
if (lthread_current() != lt)
return POSIX_ERRNO(EINVAL);
/* determine the CPU being requested */
int i;
for (i = 0; i < LTHREAD_MAX_LCORES; i++) {
if (!CPU_ISSET(i, cpuset))
continue;
break;
}
/* check requested core is allowed */
if (i == LTHREAD_MAX_LCORES)
return POSIX_ERRNO(EINVAL);
/* finally we can set affinity to the requested lcore */
lthread_set_affinity(i);
return 0;
}
return _sys_pthread_funcs.f_pthread_setaffinity_np(thread, cpusetsize,
cpuset);
}
/*
* migrate the current thread to another scheduler running
* on the specified lcore.
*/
int lthread_set_affinity(unsigned lcoreid)
{
struct lthread *lt = THIS_LTHREAD;
struct lthread_sched *dest_sched;
if (unlikely(lcoreid >= LTHREAD_MAX_LCORES))
return POSIX_ERRNO(EINVAL);
DIAG_EVENT(lt, LT_DIAG_LTHREAD_AFFINITY, lcoreid, 0);
dest_sched = schedcore[lcoreid];
if (unlikely(dest_sched == NULL))
return POSIX_ERRNO(EINVAL);
if (likely(dest_sched != THIS_SCHED)) {
lt->sched = dest_sched;
lt->pending_wr_queue = dest_sched->pready;
_affinitize();
return 0;
}
return 0;
}
static void
compute_ctrl_threads_cpuset(struct internal_config *internal_cfg)
{
rte_cpuset_t *cpuset = &internal_cfg->ctrl_cpuset;
rte_cpuset_t default_set;
unsigned int lcore_id;
for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
if (rte_lcore_has_role(lcore_id, ROLE_OFF))
continue;
RTE_CPU_OR(cpuset, cpuset, &lcore_config[lcore_id].cpuset);
}
RTE_CPU_NOT(cpuset, cpuset);
if (pthread_getaffinity_np(pthread_self(), sizeof(rte_cpuset_t),
&default_set))
CPU_ZERO(&default_set);
RTE_CPU_AND(cpuset, cpuset, &default_set);
/* if no remaining cpu, use master lcore cpu affinity */
if (!CPU_COUNT(cpuset)) {
memcpy(cpuset, &lcore_config[rte_get_master_lcore()].cpuset,
sizeof(*cpuset));
}
}
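Outside the lthread shim, an ordinary pthread reaches the same effect with the standard glibc call; a minimal sketch (not DPDK code, the helper name is hypothetical) that pins the calling thread to CPU 1:
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
/* Sketch: bind the calling thread itself to CPU 1. */
static int pin_self_to_cpu1(void)
{
    cpu_set_t set;
    CPU_ZERO(&set);
    CPU_SET(1, &set);
    /* pthread_self() selects the calling thread */
    return pthread_setaffinity_np(pthread_self(), sizeof(set), &set);
}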
In addition, to support pinning, DPDK provides the configuration and the related data structures:
typedef unsigned long __cpu_mask;
typedef struct {
__cpu_mask __bits[16];
} cpu_set_t;
typedef cpu_set_t rte_cpuset_t;
/**
* Structure storing internal configuration (per-lcore)
*/
struct lcore_config {
pthread_t thread_id; /**< pthread identifier */
int pipe_master2slave[2]; /**< communication pipe with master */
int pipe_slave2master[2]; /**< communication pipe with master */
lcore_function_t * volatile f; /**< function to call */
void * volatile arg; /**< argument of function */
volatile int ret; /**< return value of function */
volatile enum rte_lcore_state_t state; /**< lcore state */
unsigned int socket_id; /**< physical socket id for this lcore */
unsigned int core_id; /**< core number on socket for this lcore */
int core_index; /**< relative index, starting from 0 */
uint8_t core_role; /**< role of core eg: OFF, RTE, SERVICE */
uint8_t detected; /**< true if lcore was detected */
rte_cpuset_t cpuset; /**< cpu set which the lcore affinity to */
};
extern struct lcore_config lcore_config[RTE_MAX_LCORE];
/**
* The global RTE configuration structure.
*/
struct rte_config {
uint32_t master_lcore; /**< Id of the master lcore */
uint32_t lcore_count; /**< Number of available logical cores. */
uint32_t numa_node_count; /**< Number of detected NUMA nodes. */
uint32_t numa_nodes[RTE_MAX_NUMA_NODES]; /**< List of detected NUMA nodes. */
uint32_t service_lcore_count;/**< Number of available service cores. */
enum rte_lcore_role_t lcore_role[RTE_MAX_LCORE]; /**< State of cores. */
/** Primary or secondary configuration */
enum rte_proc_type_t process_type;
/** PA or VA mapping mode */
enum rte_iova_mode iova_mode;
/**
* Pointer to memory configuration, which may be shared across multiple
* DPDK instances
*/
struct rte_mem_config *mem_config;
} __attribute__((__packed__));
The variables in these data structures are processed in the rte_eal_cpu_init function; this configuration determines exactly how the cores and threads will work.
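After rte_eal_init() has completed, an application can read back the result of this detection through the lcore API; a minimal sketch (the dump_lcore_layout helper is made up for illustration):
#include <stdio.h>
#include <rte_lcore.h>
/* Sketch: print the socket placement recorded for every enabled lcore. */
static void dump_lcore_layout(void)
{
    unsigned int lcore_id;
    RTE_LCORE_FOREACH(lcore_id) {
        printf("lcore %u -> socket %u\n",
               lcore_id, rte_lcore_to_socket_id(lcore_id));
    }
}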
Exclusive use of cores by DPDK threads relies on the isolcpus boot parameter provided by the kernel, handled as a kernel command-line configuration option.
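For example (a hypothetical layout in which CPUs 2-5 are reserved for the data plane), the kernel command line and the EAL core list might be set up as follows:
# kernel boot parameter, e.g. appended to GRUB_CMDLINE_LINUX:
isolcpus=2-5
# then run the DPDK application only on the isolated cores:
./your-dpdk-app -l 2-5 -n 4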
2. Management and scheduling
As analyzed earlier, DPDK allows both static management through configuration and dynamic management at run time through its API, which makes it better suited to real environments. The EAL wraps this in the lcore abstraction, which gives finer control over parallel execution:
/*
* Check that every SLAVE lcores are in WAIT state, then call
* rte_eal_remote_launch() for all of them. If call_master is true
* (set to CALL_MASTER), also call the function on the master lcore.
*/
int
rte_eal_mp_remote_launch(int (*f)(void *), void *arg,
enum rte_rmt_call_master_t call_master)
{
int lcore_id;
int master = rte_get_master_lcore();
/* check state of lcores */
RTE_LCORE_FOREACH_SLAVE(lcore_id) {
if (lcore_config[lcore_id].state != WAIT)
return -EBUSY;
}
/* send messages to cores */
RTE_LCORE_FOREACH_SLAVE(lcore_id) {
rte_eal_remote_launch(f, arg, lcore_id);
}
if (call_master == CALL_MASTER) {
lcore_config[master].ret = f(arg); /* invoke the registered callback on the master lcore */
lcore_config[master].state = FINISHED;
}
return 0;
}
Tasks are registered for managed scheduling and are then executed as callbacks in the corresponding lcore threads.
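To make this registration-plus-callback pattern concrete, here is a minimal sketch of how an application typically hands work to the lcores (the worker body is hypothetical):
#include <stdio.h>
#include <rte_eal.h>
#include <rte_lcore.h>
#include <rte_launch.h>
/* hypothetical per-lcore worker: just report where it is running */
static int worker(void *arg)
{
    (void)arg;
    printf("worker running on lcore %u\n", rte_lcore_id());
    return 0;
}
int main(int argc, char **argv)
{
    if (rte_eal_init(argc, argv) < 0)
        return -1;
    /* register worker() on every lcore, including the master lcore */
    rte_eal_mp_remote_launch(worker, NULL, CALL_MASTER);
    /* block until every slave lcore has returned to the WAIT state */
    rte_eal_mp_wait_lcore();
    return 0;
}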
3. Elastic scaling
At initialization time this can be driven by the configuration:
/* Launch threads, called at application init(). */
int
rte_eal_init(int argc __rte_unused, char **argv __rte_unused)
{
int i;
/* create a map of all processors in the system */
eal_create_cpu_map();
if (rte_eal_cpu_init() < 0) {
rte_eal_init_alert("Cannot detect lcores.");
rte_errno = ENOTSUP;
return -1;
}
eal_thread_init_master(rte_config.master_lcore);
RTE_LCORE_FOREACH_SLAVE(i) {
/*
* create communication pipes between master thread
* and children
*/
if (_pipe(lcore_config[i].pipe_master2slave,
sizeof(char), _O_BINARY) < 0)
rte_panic("Cannot create pipe\n");
if (_pipe(lcore_config[i].pipe_slave2master,
sizeof(char), _O_BINARY) < 0)
rte_panic("Cannot create pipe\n");
lcore_config[i].state = WAIT;
/* create a thread for each lcore */
if (eal_thread_create(&lcore_config[i].thread_id) != 0)
rte_panic("Cannot create thread\n");
}
/*
* Launch a dummy function on all slave lcores, so that master lcore
* knows they are all ready when this function returns.
*/
rte_eal_mp_remote_launch(sync_func, NULL, SKIP_MASTER);
rte_eal_mp_wait_lcore();
return 0;
}
/*
* Create a map of all processors and associated cores on the system
*/
void
eal_create_cpu_map()
{
wcpu_map.total_procs =
GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
LOGICAL_PROCESSOR_RELATIONSHIP lprocRel;
DWORD lprocInfoSize = 0;
BOOL ht_enabled = FALSE;
/* First get the processor package information */
lprocRel = RelationProcessorPackage;
/* Determine the size of buffer we need (pass NULL) */
GetLogicalProcessorInformationEx(lprocRel, NULL, &lprocInfoSize);
wcpu_map.proc_sockets = lprocInfoSize / 48;
lprocInfoSize = 0;
/* Next get the processor core information */
lprocRel = RelationProcessorCore;
GetLogicalProcessorInformationEx(lprocRel, NULL, &lprocInfoSize);
wcpu_map.proc_cores = lprocInfoSize / 48;
if (wcpu_map.total_procs > wcpu_map.proc_cores)
ht_enabled = TRUE;
/* Distribute the socket and core ids appropriately
* across the logical cores. For now, split the cores
* equally across the sockets.
*/
unsigned int lcore = 0;
for (unsigned int socket = 0; socket <
wcpu_map.proc_sockets; ++socket) {
for (unsigned int core = 0;
core < (wcpu_map.proc_cores / wcpu_map.proc_sockets);
++core) {
wcpu_map.wlcore_map[lcore]
.socket_id = socket;
wcpu_map.wlcore_map[lcore]
.core_id = core;
lcore++;
if (ht_enabled) {
wcpu_map.wlcore_map[lcore]
.socket_id = socket;
wcpu_map.wlcore_map[lcore]
.core_id = core;
lcore++;
}
}
}
}
/*
* Parse /sys/devices/system/cpu to get the number of physical and logical
* processors on the machine. The function will fill the cpu_info
* structure.
*/
int
rte_eal_cpu_init(void)
{
/* pointer to global configuration */
struct rte_config *config = rte_eal_get_configuration();
unsigned lcore_id;
unsigned count = 0;
unsigned int socket_id, prev_socket_id;
int lcore_to_socket_id[RTE_MAX_LCORE];
/*
* Parse the maximum set of logical cores, detect the subset of running
* ones and enable them by default.
*/
for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
lcore_config[lcore_id].core_index = count;
/* init cpuset for per lcore config */
CPU_ZERO(&lcore_config[lcore_id].cpuset);
/* find socket first */
socket_id = eal_cpu_socket_id(lcore_id);
lcore_to_socket_id[lcore_id] = socket_id;
/* in 1:1 mapping, record related cpu detected state */
lcore_config[lcore_id].detected = eal_cpu_detected(lcore_id);
if (lcore_config[lcore_id].detected == 0) {
config->lcore_role[lcore_id] = ROLE_OFF;
lcore_config[lcore_id].core_index = -1;
continue;
}
/* By default, lcore 1:1 map to cpu id */
CPU_SET(lcore_id, &lcore_config[lcore_id].cpuset);
/* By default, each detected core is enabled */
config->lcore_role[lcore_id] = ROLE_RTE;
lcore_config[lcore_id].core_role = ROLE_RTE;
lcore_config[lcore_id].core_id = eal_cpu_core_id(lcore_id);
lcore_config[lcore_id].socket_id = socket_id;
RTE_LOG(DEBUG, EAL, "Detected lcore %u as "
"core %u on socket %u\n",
lcore_id, lcore_config[lcore_id].core_id,
lcore_config[lcore_id].socket_id);
count++;
}
/* Set the count of enabled logical cores of the EAL configuration */
config->lcore_count = count;
RTE_LOG(DEBUG, EAL,
"Support maximum %u logical core(s) by configuration.\n",
RTE_MAX_LCORE);
RTE_LOG(INFO, EAL, "Detected %u lcore(s)\n", config->lcore_count);
/* sort all socket id's in ascending order */
qsort(lcore_to_socket_id, RTE_DIM(lcore_to_socket_id),
sizeof(lcore_to_socket_id[0]), socket_id_cmp);
prev_socket_id = -1;
config->numa_node_count = 0;
for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
socket_id = lcore_to_socket_id[lcore_id];
if (socket_id != prev_socket_id)
config->numa_nodes[config->numa_node_count++] =
socket_id;
prev_socket_id = socket_id;
}
RTE_LOG(INFO, EAL, "Detected %u NUMA nodes\n", config->numa_node_count);
return 0;
}
The CPU configuration files generally live under the /sys/devices/system/cpu/cpuX/ directory.
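For example, the topology fields can be inspected directly on a typical Linux system:
# physical core id of logical CPU 0
cat /sys/devices/system/cpu/cpu0/topology/core_id
# whether logical CPU 1 is currently online
cat /sys/devices/system/cpu/cpu1/online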
Of course, the lcore layout can also be specified with a command-line parameter:
--lcores='<lcore_set>[@cpu_set][,<lcore_set>[@cpu_set],...]'
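A hypothetical mapping could be:
# lcore 0 pinned to CPU 8, lcore 1 pinned to CPU 9,
# lcores 2 and 3 allowed to float over CPUs 10-11
--lcores='0@8,1@9,(2-3)@(10-11)'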
DPDK also takes advantage of how well the Linux kernel's cgroup (control group) mechanism cooperates with pthread to manage CPUs dynamically. Anyone with Docker experience knows that Docker likewise relies on the cgroup mechanism for resource management; interested readers can consult the related material.
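Purely as an illustration (assuming a cgroup v1 cpuset hierarchy mounted at /sys/fs/cgroup/cpuset; this is not code from DPDK), restricting a running DPDK process to CPUs 2-3 could look like:
mkdir /sys/fs/cgroup/cpuset/dpdk
echo 2-3       > /sys/fs/cgroup/cpuset/dpdk/cpuset.cpus
echo 0         > /sys/fs/cgroup/cpuset/dpdk/cpuset.mems
echo $DPDK_PID > /sys/fs/cgroup/cpuset/dpdk/cgroup.procs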
It is worth emphasizing that multi-core and affinity techniques do not automatically improve overall performance; this still has to be judged in the actual scenario. Parallelizing a task is not simply a matter of several people doing different jobs: in a great many applications the parallel tasks still have to cooperate, and this severely limits how linearly they can scale. The set of techniques adopted in DPDK maximizes parallelism precisely by reducing contention between parallel tasks and minimizing the parts that cannot be parallelized. Even so, readers should analyze these techniques in the concrete context of network I/O and take from them whatever can be reused.