azeqjz OpenStack: 红帽OSP10 NFV配置指南: 4.2. 配置双网口OVS-DPDK与VLAN Tunneling
原文:
NETWORK FUNCTIONS VIRTUALIZATION CONFIGURATION GUIDE > Chapter 4. Configure DPDK Accelerated Open vSwitch (OVS) for Networking
4.2. 配置双网口OVS-DPDK与VLAN Tunneling
*双OVS-DPDK数据面 双OVS网桥 双端口 *
本节介绍如何配置并部署带有两个数据面端口的OVS-DPDK,并为OpenStack环境配置控制面Linux绑定(bonding)。
4.2.1. 修改 first-boot.yaml
- 增加额外资源。
# Bundle the four script resources of first-boot.yaml into a single
# multipart user-data payload.
resources:
  userdata:
    type: OS::Heat::MultipartMime
    properties:
      parts:
        - config: {get_resource: set_ovs_config}
        - config: {get_resource: set_dpdk_params}
        - config: {get_resource: install_tuned}
        - config: {get_resource: compute_kernel_args}
- OVS配置。
# On hosts whose name matches the compute hostname format, set
# RuntimeDirectoryMode=0775, Group=qemu and UMask=0002 in the OVS systemd
# unit file, and make ovs-ctl start ovs-vswitchd under "umask 0002".
# Heat substitutes the keys listed under "params" into the template text
# before the script runs.
set_ovs_config:
  type: OS::Heat::SoftwareConfig
  properties:
    config:
      str_replace:
        template: |
          #!/bin/bash
          FORMAT=$COMPUTE_HOSTNAME_FORMAT
          if [[ -z $FORMAT ]] ; then
            FORMAT="compute" ;
          else
            # Assumption: only %index% and %stackname% are the variables in Host name format
            FORMAT=$(echo $FORMAT | sed 's/\%index\%//g' | sed 's/\%stackname\%//g') ;
          fi
          if [[ $(hostname) == *$FORMAT* ]] ; then
            # Use whichever of the two known unit files exists.
            ovs_service_path=""
            if [ -f /usr/lib/systemd/system/openvswitch-nonetwork.service ]; then
              ovs_service_path="/usr/lib/systemd/system/openvswitch-nonetwork.service"
            elif [ -f /usr/lib/systemd/system/ovs-vswitchd.service ]; then
              ovs_service_path="/usr/lib/systemd/system/ovs-vswitchd.service"
            fi
            # Without this guard, grep/sed/>> would run with an empty file
            # operand when neither unit file is present.
            if [ -n "$ovs_service_path" ]; then
              # Replace an existing RuntimeDirectoryMode line, else append one.
              grep -q "RuntimeDirectoryMode=.*" $ovs_service_path
              if [ "$?" -eq 0 ]; then
                sed -i 's/RuntimeDirectoryMode=.*/RuntimeDirectoryMode=0775/' $ovs_service_path
              else
                echo "RuntimeDirectoryMode=0775" >> $ovs_service_path
              fi
              # Append Group/UMask only if the exact line is not already there.
              grep -Fxq "Group=qemu" $ovs_service_path
              if [ ! "$?" -eq 0 ]; then
                echo "Group=qemu" >> $ovs_service_path
              fi
              grep -Fxq "UMask=0002" $ovs_service_path
              if [ ! "$?" -eq 0 ]; then
                echo "UMask=0002" >> $ovs_service_path
              fi
            else
              echo "No OVS systemd unit file found; skipping unit file changes" >&2
            fi
            # Prefix the ovs-vswitchd start_daemon call with "umask 0002"
            # unless ovs-ctl has already been patched.
            ovs_ctl_path='/usr/share/openvswitch/scripts/ovs-ctl'
            grep -q "umask 0002 \&\& start_daemon \"\$OVS_VSWITCHD_PRIORITY\"" $ovs_ctl_path
            if [ ! "$?" -eq 0 ]; then
              sed -i 's/start_daemon \"\$OVS_VSWITCHD_PRIORITY.*/umask 0002 \&\& start_daemon \"$OVS_VSWITCHD_PRIORITY\" \"$OVS_VSWITCHD_WRAPPER\" \"$@\"/' $ovs_ctl_path
            fi
          fi
        params:
          $COMPUTE_HOSTNAME_FORMAT: {get_param: ComputeHostnameFormat}
- 设置DPDK参数。
# On hosts whose name matches the compute hostname format, convert the PMD
# and host core lists into hex CPU masks and store them, together with the
# socket memory, in the Open_vSwitch table's other_config column.
# Heat substitutes the keys listed under "params" into the template text
# before the script runs, so the core lists arrive as literal shell words.
set_dpdk_params:
  type: OS::Heat::SoftwareConfig
  properties:
    config:
      str_replace:
        template: |
          #!/bin/bash
          set -x
          # get_mask LIST
          # Print the hexadecimal CPU mask for a comma-separated list of
          # core ids. An "a-b" item is expanded to every core from a to b.
          # Cores are grouped into 32-bit words; the highest word is printed
          # first without padding, lower words are zero-padded to 8 digits.
          get_mask()
          {
            local list=$1
            local mask=0
            local item core first last index temp hex i
            local max_idx=0
            local -a bm=()
            local -a cores=()
            # Expand ranges so the loops below only see single core ids.
            for item in $(echo $list | sed 's/,/ /g')
            do
              if [[ $item == *-* ]]; then
                first=${item%%-*}
                last=${item##*-}
                for ((core=first; core<=last; core++))
                do
                  cores+=("$core")
                done
              else
                cores+=("$item")
              fi
            done
            # Find the highest 32-bit word that is needed.
            for core in "${cores[@]}"
            do
              index=$(($core/32))
              if [ $max_idx -lt $index ]; then
                max_idx=$index
              fi
            done
            # Zero every word up to the highest one so gaps print as 00000000.
            for ((i=max_idx; i>=0; i--))
            do
              bm[$i]=0
            done
            # Set one bit per core.
            for core in "${cores[@]}"
            do
              index=$(($core/32))
              temp=$((1<<$(($core % 32))))
              bm[$index]=$((${bm[$index]} | $temp))
            done
            printf -v mask "%x" "${bm[$max_idx]}"
            for ((i=max_idx-1; i>=0; i--))
            do
              printf -v hex "%08x" "${bm[$i]}"
              mask+=$hex
            done
            printf "%s" "$mask"
          }
          FORMAT=$COMPUTE_HOSTNAME_FORMAT
          if [[ -z $FORMAT ]] ; then
            FORMAT="compute" ;
          else
            # Assumption: only %index% and %stackname% are the variables in Host name format
            FORMAT=$(echo $FORMAT | sed 's/\%index\%//g' | sed 's/\%stackname\%//g') ;
          fi
          if [[ $(hostname) == *$FORMAT* ]] ; then
            pmd_cpu_mask=$( get_mask $PMD_CORES )
            host_cpu_mask=$( get_mask $LCORE_LIST )
            # Strip any single quotes carried in the parameter value.
            socket_mem=$(echo $SOCKET_MEMORY | sed s/\'//g )
            ovs-vsctl --no-wait set Open_vSwitch . other_config:dpdk-init=true
            ovs-vsctl --no-wait set Open_vSwitch . other_config:dpdk-socket-mem=$socket_mem
            ovs-vsctl --no-wait set Open_vSwitch . other_config:pmd-cpu-mask=$pmd_cpu_mask
            ovs-vsctl --no-wait set Open_vSwitch . other_config:dpdk-lcore-mask=$host_cpu_mask
          fi
        params:
          $COMPUTE_HOSTNAME_FORMAT: {get_param: ComputeHostnameFormat}
          $LCORE_LIST: {get_param: HostCpusList}
          $PMD_CORES: {get_param: NeutronDpdkCoreList}
          $SOCKET_MEMORY: {get_param: NeutronDpdkSocketMemory}
- 设置CPU亲和性的tuned配置。
# On hosts whose name matches the compute hostname format, write the
# isolated core list into the tuned cpu-partitioning variables file and
# activate the cpu-partitioning profile. Nothing is done when the isolated
# core list is empty.
# Heat substitutes the keys listed under "params" into the template text
# before the script runs, which is why the single-quoted sed expression
# still receives the real core list.
install_tuned:
  type: OS::Heat::SoftwareConfig
  properties:
    config:
      str_replace:
        template: |
          #!/bin/bash
          FORMAT=$COMPUTE_HOSTNAME_FORMAT
          if [[ -z $FORMAT ]] ; then
            FORMAT="compute" ;
          else
            # Assumption: only %index% and %stackname% are the variables in Host name format
            FORMAT=$(echo $FORMAT | sed 's/\%index\%//g' | sed 's/\%stackname\%//g') ;
          fi
          if [[ $(hostname) == *$FORMAT* ]] ; then
            tuned_conf_path="/etc/tuned/cpu-partitioning-variables.conf"
            if [ -n "$TUNED_CORES" ]; then
              # Replace an existing isolated_cores line, else append one.
              grep -q "^isolated_cores" $tuned_conf_path
              if [ "$?" -eq 0 ]; then
                sed -i 's/^isolated_cores=.*/isolated_cores=$TUNED_CORES/' $tuned_conf_path
              else
                echo "isolated_cores=$TUNED_CORES" >> $tuned_conf_path
              fi
              tuned-adm profile cpu-partitioning
            fi
          fi
        params:
          $COMPUTE_HOSTNAME_FORMAT: {get_param: ComputeHostnameFormat}
          $TUNED_CORES: {get_param: HostIsolatedCoreList}
- 设置内核参数。
# On hosts whose name matches the compute hostname format, append the
# extra kernel arguments and an isolcpus= entry to GRUB_CMDLINE_LINUX in
# /etc/default/grub, regenerate the grub configuration and reboot.
# Heat substitutes the keys listed under "params" into the template text
# before the script runs.
compute_kernel_args:
  type: OS::Heat::SoftwareConfig
  properties:
    config:
      str_replace:
        template: |
          #!/bin/bash
          FORMAT=$COMPUTE_HOSTNAME_FORMAT
          if [[ -z $FORMAT ]] ; then
            FORMAT="compute" ;
          else
            # Assumption: only %index% and %stackname% are the variables in Host name format
            FORMAT=$(echo $FORMAT | sed 's/\%index\%//g' | sed 's/\%stackname\%//g') ;
          fi
          if [[ $(hostname) == *$FORMAT* ]] ; then
            # Insert the new arguments just before the closing quote.
            sed 's/^\(GRUB_CMDLINE_LINUX=".*\)"/\1 $KERNEL_ARGS isolcpus=$TUNED_CORES"/g' -i /etc/default/grub ;
            grub2-mkconfig -o /etc/grub2.cfg
            reboot
          fi
        params:
          $KERNEL_ARGS: {get_param: ComputeKernelArgs}
          $COMPUTE_HOSTNAME_FORMAT: {get_param: ComputeHostnameFormat}
          $TUNED_CORES: {get_param: HostIsolatedCoreList}
4.2.2. 修改post-install.yaml
- 设置CPU亲和性的tuned配置。
# On hosts whose name matches the compute hostname format, edit
# tuned.service so that network.target is removed from its After= line and
# "network.target openvswitch.service" is listed in Before=, then reload
# systemd.
# Heat substitutes the keys listed under "params" into the template text
# before the script runs.
ExtraConfig:
  type: OS::Heat::SoftwareConfig
  properties:
    group: script
    config:
      str_replace:
        template: |
          #!/bin/bash
          set -x
          FORMAT=$COMPUTE_HOSTNAME_FORMAT
          if [[ -z $FORMAT ]] ; then
            FORMAT="compute" ;
          else
            # Assumption: only %index% and %stackname% are the variables in Host name format
            FORMAT=$(echo $FORMAT | sed 's/\%index\%//g' | sed 's/\%stackname\%//g') ;
          fi
          if [[ $(hostname) == *$FORMAT* ]] ; then
            tuned_service=/usr/lib/systemd/system/tuned.service
            # Drop network.target from the After= ordering.
            grep -q "network.target" $tuned_service
            if [ "$?" -eq 0 ]; then
              sed -i '/After=.*/s/network.target//g' $tuned_service
            fi
            # Add the Before= ordering unless it is already present:
            # extend an existing Before= line, else insert one above After.
            grep -q "Before=.*network.target" $tuned_service
            if [ ! "$?" -eq 0 ]; then
              grep -q "Before=.*" $tuned_service
              if [ "$?" -eq 0 ]; then
                sed -i 's/^\(Before=.*\)/\1 network.target openvswitch.service/g' $tuned_service
              else
                sed -i '/After/i Before=network.target openvswitch.service' $tuned_service
              fi
            fi
            systemctl daemon-reload
          fi
        params:
          $COMPUTE_HOSTNAME_FORMAT: {get_param: ComputeHostnameFormat}
4.2.3. 修改network-environment.yaml
- 在
resource_registry
下添加OVS-DPDK自定义资源。
# Map the TripleO resource types to the custom templates of this section.
resource_registry:
  # Specify the relative/absolute path to the config files you want to use for override the default.
  OS::TripleO::Compute::Net::SoftwareConfig: nic-configs/compute-ovs-dpdk.yaml
  OS::TripleO::Controller::Net::SoftwareConfig: nic-configs/controller.yaml
  OS::TripleO::NodeUserData: first-boot.yaml
  OS::TripleO::NodeExtraConfigPost: post-install.yaml
- 在
parameter_defaults
下,关闭tunnel类型(设置值为""
),设置网络类型为vlan
。
NeutronTunnelTypes: ""
NeutronNetworkType: 'vlan'
- 在
parameter_defaults
下,映射物理网络到虚拟网桥。
NeutronBridgeMappings: 'dpdk0:br-link0,dpdk1:br-link1'
- 在
parameter_defaults
下,设置OpenStack网络ML2与OVS VLAN映射范围。
NeutronNetworkVLANRanges: 'dpdk0:22:22,dpdk1:25:25'
- 在
parameter_defaults
下,设置OVS-DPDK配置参数。
注意:NeutronDpdkCoreList 和 NeutronDpdkMemoryChannels 是必要的配置。如果部署DPDK时这些参数的值不正确,部署会失败,或者导致系统不稳定。
ⅰ 提供可用作DPDK轮询模式驱动(DPDK poll mode drivers,PMDs)的CPU核列表,格式为[allowed_pattern: "'[0-9,-]+'"]
。
NeutronDpdkCoreList: "'4,6,20,22'"
可通过以下选项优化OVS-DPDK性能:
- 选择与DPDK接口的NUMA节点关联CPU。
使用cat /sys/class/net/<interface>/device/numa_node
列出与接口关联的NUMA节点,使用lscpu
列出与NUMA节点关联的CPU。
- 超线程情况下把CPU sibling放到同个组里( 什么是CPU sibling? )。
使用cat /sys/devices/system/cpu/<cpu>/topology/thread_siblings_list
查询CPU sibling。
- 为主机进程预留CPU 0。
- 隔离分配给PMD的CPU,保证主机进程不使用这些CPU。(以下第12点,HostCpusList)
- 使用
NovaVcpuPinSet
把分配给PMD的CPU从计算调度中排除。(以下第8点,即虚拟机可以使用的vCPU)
》 Type 1: DPDK PMD使用,NeutronDpdkCoreList;Type 2:宿主机进程使用,HostCpusList;Type 3:虚拟机使用,NovaVcpuPinSet。
》 NovaVcpuPinSet + NeutronDpdkCoreList = HostIsolatedCoreList
ⅱ提供内存通道的数量,格式[allowed_pattern: "[0-9]+"]
NeutronDpdkMemoryChannels: "4"
ⅲ 设置从CPU socket的大页池中预分配的内存。
NeutronDpdkSocketMemory: "2048,2048"
这是用逗号分隔的字符串,按照CPU socket升序排列。如果只有一个NUMA节点,则设置为 1024,0 。
ⅳ 设置DPDK驱动类型与数据通道类型。
NeutronDpdkDriverType: "vfio-pci"
NeutronDatapathType: "netdev"
- 在
parameter_defaults
下设置OVS的vhost-user socket目录。
NeutronVhostuserSocketDir: "/var/run/openvswitch"
- 在
parameter_defaults
下预留给主机进程的RAM。
NovaReservedHostMemory: 2048
- 在
parameter_defaults
下,设置预留给虚拟机进程的物理CPU核范围,以逗号分隔。
NovaVcpuPinSet: "8,10,12,14,18,24,26,28,30"
- 在
parameter_defaults
下,列出应用的过滤器。
Nova scheduler使用这些列出来的过滤器。优先列出最有拘束力的过滤器,以使节点的过滤进程更加高效运行。
NovaSchedulerDefaultFilters: "RamFilter,ComputeFilter,AvailabilityZoneFilter,ComputeCapabilitiesFilter,ImagePropertiesFilter,PciPassthroughFilter,NUMATopologyFilter"
- 在
parameter_defaults
下,增加ComputeKernelArgs
参数,以在初次启动时增加这些参数到默认的grub
文件中。
ComputeKernelArgs: "default_hugepagesz=1GB hugepagesz=1G hugepages=32 iommu=pt intel_iommu=on"
注意:这些大页内存会被虚拟机消耗使用,也会被OVS-DPDK使用,如在此步骤中的NeutronDpdkSocketMemory参数所示。可以被虚拟机使用的大页内存页数是引导参数减去NeutronDpdkSocketMemory。
需要在使用DPDK的虚拟机实例flavor中添加hw:mem_page_size=1GB
。如果没有做这一步,虚拟机实例会无法获取DHCP分配(大页内存?)。
- 在
parameter_defaults
下,设置需要tuned的物理CPU核范围。
参数在附录调整文档cpu-partitioning
中。
HostIsolatedCoreList: "2,4,6,8,10,12,14,18,20,22,24,26,28,30"
- A set list or range of cores (and their sibling threads) to be appended to the tuned cpu-partitioning profile and isolated from the host.
- These cores will be isolated from any host processes
- Assuming you want to isolate nova cores from all system processes, NovaVcpuPinSet + NeutronDpdkCoreList = HostIsolatedCoreList
- 在
parameter_defaults
下,设置逻辑OVS-DPDK核列表。这些CPU核必须要手工从NeutronDpdkCoreList
与NovaVcpuPinSet
列表中排除出去。一般分配每个NUMA节点第一个物理核与对应进程,不管DPDK接口的NUMA位置
HostCpusList: "'3,5,7,19,21,23'"
4.2.4. 修改 controller.yaml
- 创建分离的provisioning接口。
# Controller provisioning interface: static control-plane address, a host
# route to the metadata address and the default route.
-
  type: interface
  name: nic1
  use_dhcp: false
  addresses:
    -
      ip_netmask:
        list_join:
          - '/'
          - - {get_param: ControlPlaneIp}
            - {get_param: ControlPlaneSubnetCidr}
  routes:
    -
      ip_netmask: 169.254.169.254/32
      next_hop: {get_param: EC2MetadataIp}
    -
      default: true
      next_hop: {get_param: ExternalInterfaceDefaultRoute}
- 为隔离网络创建控制面Linux绑定。
# Active-backup Linux bond over nic2 (primary) and nic3.
-
  type: linux_bond
  name: bond_api
  bonding_options: "mode=active-backup"
  use_dhcp: false
  dns_servers: {get_param: DnsServers}
  members:
    -
      type: interface
      name: nic2
      primary: true
    -
      type: interface
      name: nic3
- 分配VLAN给Linux绑定。
# VLAN interfaces on bond_api: internal API, tenant, storage, storage
# management and external networks.
-
  type: vlan
  vlan_id: {get_param: InternalApiNetworkVlanID}
  device: bond_api
  addresses:
    -
      ip_netmask: {get_param: InternalApiIpSubnet}
-
  type: vlan
  vlan_id: {get_param: TenantNetworkVlanID}
  device: bond_api
  addresses:
    -
      ip_netmask: {get_param: TenantIpSubnet}
-
  type: vlan
  vlan_id: {get_param: StorageNetworkVlanID}
  device: bond_api
  addresses:
    -
      ip_netmask: {get_param: StorageIpSubnet}
-
  type: vlan
  vlan_id: {get_param: StorageMgmtNetworkVlanID}
  device: bond_api
  addresses:
    -
      ip_netmask: {get_param: StorageMgmtIpSubnet}
-
  type: vlan
  vlan_id: {get_param: ExternalNetworkVlanID}
  device: bond_api
  addresses:
    -
      ip_netmask: {get_param: ExternalIpSubnet}
- 创建两个用于连接计算节点的OVS网桥。(4.1是创建一个OVS网桥)
# Two OVS bridges on the controller: br-link0 on nic4, br-link1 on nic5.
-
  type: ovs_bridge
  name: br-link0
  use_dhcp: false
  members:
    -
      type: interface
      name: nic4
-
  type: ovs_bridge
  name: br-link1
  use_dhcp: false
  members:
    -
      type: interface
      name: nic5
4.2.5. 修改compute.yaml
复制默认的compute.yaml
为compute-ovs-dpdk.yaml
,并且修改以下内容:
- 创建分离的provisioning接口。
# Compute provisioning interface: static control-plane address, a host
# route to the metadata address and the control-plane default route.
-
  type: interface
  name: nic1
  use_dhcp: false
  addresses:
    -
      ip_netmask:
        list_join:
          - '/'
          - - {get_param: ControlPlaneIp}
            - {get_param: ControlPlaneSubnetCidr}
  routes:
    -
      ip_netmask: 169.254.169.254/32
      next_hop: {get_param: EC2MetadataIp}
    -
      default: true
      next_hop: {get_param: ControlPlaneDefaultRoute}
- 为隔离的网络创建控制面Linux绑定。
# Active-backup Linux bond over nic2 (primary) and nic3.
-
  type: linux_bond
  name: bond_api
  bonding_options: "mode=active-backup"
  use_dhcp: false
  dns_servers: {get_param: DnsServers}
  members:
    -
      type: interface
      name: nic2
      primary: true
    -
      type: interface
      name: nic3
- 分配VLAN给这个Linux绑定。
# VLAN interfaces on bond_api: internal API, tenant and storage networks.
-
  type: vlan
  vlan_id: {get_param: InternalApiNetworkVlanID}
  device: bond_api
  addresses:
    -
      ip_netmask: {get_param: InternalApiIpSubnet}
-
  type: vlan
  vlan_id: {get_param: TenantNetworkVlanID}
  device: bond_api
  addresses:
    -
      ip_netmask: {get_param: TenantIpSubnet}
-
  type: vlan
  vlan_id: {get_param: StorageNetworkVlanID}
  device: bond_api
  addresses:
    -
      ip_netmask: {get_param: StorageIpSubnet}
- 设置有DPDK接口的两个网桥,以连接到控制节点。
# Two user-space OVS bridges, each holding one DPDK port:
# br-link0 -> dpdk0 on nic4, br-link1 -> dpdk1 on nic5.
-
  type: ovs_user_bridge
  name: br-link0
  use_dhcp: false
  members:
    -
      type: ovs_dpdk_port
      name: dpdk0
      members:
        -
          type: interface
          name: nic4
-
  type: ovs_user_bridge
  name: br-link1
  use_dhcp: false
  members:
    -
      type: ovs_dpdk_port
      name: dpdk1
      members:
        -
          type: interface
          name: nic5
注意:
如果有多个DPDK设备,为每个需要添加的DPDK设备复制一遍type
字段即可。
注意:
使用OVS-DPDK时,同一个计算节点上的所有网桥类型应该为ovs_user_bridge
。当不是这个类型时,虽然Director可能会接受这个配置,但是Red Hat OpenStack Platform不支持同个节点上同时有ovs_bridge
和ovs_user_bridge
。
4.2.6. 执行 overcloud_deploy.sh 脚本
以下示例展示了bash脚本中用于OVS-DPDK环境的openstack overcloud deploy
命令:
#!/bin/bash
# Deploy the overcloud with three environment files: network isolation,
# the default OVS-DPDK environment, and the local network-environment.yaml.
openstack overcloud deploy --templates \
-e /usr/share/openstack-tripleo-heat-templates/environments/network-isolation.yaml \
-e /usr/share/openstack-tripleo-heat-templates/environments/neutron-ovs-dpdk.yaml \
-e /home/stack/ospd-10-vlan-ovs-dpdk-2port-ctlplane/network-environment.yaml
以上最后一行不同于4.1
/usr/share/openstack-tripleo-heat-templates/environments/neutron-ovs-dpdk.yaml
是默认neutron-ovs-dpdk.yaml
文件的位置,它为计算节点启用OVS-DPDK参数。/home/stack/<relative-directory>/network-environment.yaml
是network-environment.yaml
文件的路径。使用这个文件来覆盖neutron-ovs-dpdk.yaml
文件的默认值。
注意:
overcloud部署后,需要重启计算节点以执行tuned文件。
注意:
此OVS-DPDK配置不支持安全组与热迁移。