Host Installation for non-Kubernetes Environments
1) Configure the br-dpu bridge as described in the network configuration prerequisite guide.
1. Install DOCA
# Set DOCA repository URL
export DOCA_URL="https://linux.mellanox.com/public/repo/doca/2.10.0/ubuntu24.04/x86_64/"

# Add NVIDIA Mellanox GPG key.
# The keyring is written through `sudo tee` because an output redirection
# (`sudo gpg ... > file`) is performed by the calling, unprivileged shell and
# would fail on a root-owned directory.
curl -fsSL https://linux.mellanox.com/public/repo/doca/GPG-KEY-Mellanox.pub \
  | gpg --dearmor \
  | sudo tee /etc/apt/trusted.gpg.d/GPG-KEY-Mellanox.pub >/dev/null

# Add DOCA repository
echo "deb [signed-by=/etc/apt/trusted.gpg.d/GPG-KEY-Mellanox.pub] $DOCA_URL ./" \
  | sudo tee /etc/apt/sources.list.d/doca.list

# Update and install required packages
sudo apt-get update
sudo apt-get -y install doca-all doca-networking
2. Set Up Required Files
Make sure you have the following files under directory /opt/dpf/
hostnetwork.sh
#!/bin/bash
# 2024 NVIDIA CORPORATION & AFFILIATES
#
# Licensed under the Apache License, Version 2.0 (the License);
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -euo pipefail

# --- Defaults applied when the matching command-line option is absent ---
readonly DEFAULT_NUM_OF_VFS=16
readonly DEFAULT_DELETE_FLAG="false"
readonly DEFAULT_BRIDGE_CHECK_FLAG="false"
readonly DEFAULT_RESTORE_VF_FLAG="false"

# --- Bridge and sysfs locations ---
readonly BRIDGE_NAME="br-dpu"
readonly BR_DPU_DIR="/sys/class/net/${BRIDGE_NAME}"
readonly PCI_SYS_DIR="/sys/bus/pci/devices"

# PCI device IDs that main() compares against the second function (".1") of
# the device before creating VFs on it.
readonly -a DPU_DEVICE_LIST=("0xa2dc" "0xa2d6")

# --- Persistent state written and read by this script ---
readonly DMS_DIR="/var/lib/dpf/dms"
readonly VF_CONFIG_FILE="${DMS_DIR}/vf-config"
# When this file exists, bridge_check skips the default-route verification.
readonly SKIP_DEFAULT_ROUTE_CHECK_COND_FILE="${DMS_DIR}/hostnetwork-skip-default-route-check"
# Print usage instructions
# Print the command-line help to stderr and terminate the script.
# Globals:   DEFAULT_NUM_OF_VFS, VF_CONFIG_FILE (read)
# Outputs:   help text on stderr
# Returns:   never returns; always exits with status 1
usage() {
  cat <<EOF >&2
Usage: $(basename "$0") [OPTIONS]

Options:
  --device_pci_address <address>  PCI address of the DPU device (e.g. "0000:03:00")
  --num_of_vfs <num>              Number of Virtual Functions to create (default: ${DEFAULT_NUM_OF_VFS})
  --control_plane_mtu <mtu>       MTU size for the control plane network (e.g. 1500)
  --delete                        Delete existing network configuration
  --check-bridge                  Check if bridge exists and is configured properly
  --restore-vf                    Restore VF configuration from ${VF_CONFIG_FILE}
  --help                          Show this help message

Examples:
  # Configure networking with default values
  $(basename "$0") --device_pci_address 0000:03:00 --control_plane_mtu 1500

  # Configure with custom number of VFs and MTU
  $(basename "$0") --device_pci_address 0000:03:00 --num_of_vfs 8 --control_plane_mtu 9000

  # Delete existing configuration
  $(basename "$0") --device_pci_address 0000:03:00 --delete

  # Check if bridge exists and is configured properly
  $(basename "$0") --check-bridge

  # Restore VF configuration from file
  $(basename "$0") --restore-vf
EOF
  exit 1
}
# Log function
# Emit one informational line on stdout, tagged with the script name.
# Arguments: $1 - message text
log() {
  printf '%s\n' "[hostnetwork] $1"
}
# Error function with improved formatting for log messages.
# Prints the message on stderr together with the calling function name and
# line number, runs cleanup when a well-formed device PCI address is known,
# then terminates the script.
# Globals:   device_pci_address (read; may be unset)
# Arguments: $1 - error message
# Returns:   never returns; always exits with status 1
error() {
  local pci_regex='^[0-9a-fA-F]{4}:[0-9a-fA-F]{2}:[0-9a-fA-F]{2}$'
  # error() can be reached before argument parsing has assigned
  # device_pci_address; default to empty so `set -u` does not abort here and
  # hide the real message.
  local pci_address=${device_pci_address:-}

  # Print error message with function name and line number
  echo "[hostnetwork] Error (${FUNCNAME[1]:-main}:${BASH_LINENO[0]:-0}): $1" >&2

  # Perform cleanup if device_pci_address exists and is valid
  if [[ -n "${pci_address}" && "${pci_address}" =~ ${pci_regex} ]]; then
    cleanup
  fi
  exit 1
}
# Parse command line arguments
# Parse and validate the command line.
# Globals written: device_pci_address, num_of_vfs, control_plane_mtu,
#                  delete_flag, check_bridge_flag, restore_vf_flag
# Globals read:    DEFAULT_* constants, VF_CONFIG_FILE
# Arguments: the script's command-line arguments
# Returns:   0 when the arguments are usable. Exits through usage/error on
#            invalid input, and exits 0 when --restore-vf is requested but no
#            saved configuration file exists.
parse_arguments() {
  local pci_regex='^[0-9a-fA-F]{4}:[0-9a-fA-F]{2}:[0-9a-fA-F]{2}$'

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --device_pci_address | --num_of_vfs | --control_plane_mtu)
        # These options take a value; fail with the help text instead of an
        # "unbound variable" abort when it is missing.
        if [[ $# -lt 2 ]]; then
          log "Option $1 requires a value" >&2
          usage
        fi
        case "$1" in
          --device_pci_address) device_pci_address=$2 ;;
          --num_of_vfs) num_of_vfs=$2 ;;
          --control_plane_mtu) control_plane_mtu=$2 ;;
        esac
        shift 2
        ;;
      --delete)
        delete_flag="true"
        shift
        ;;
      --check-bridge)
        check_bridge_flag="true"
        shift
        ;;
      --restore-vf)
        restore_vf_flag="true"
        shift
        ;;
      --help)
        usage
        ;;
      *)
        log "Unknown option: $1" >&2
        usage
        ;;
    esac
  done

  restore_vf_flag=${restore_vf_flag:-$DEFAULT_RESTORE_VF_FLAG}
  delete_flag=${delete_flag:-$DEFAULT_DELETE_FLAG}
  check_bridge_flag=${check_bridge_flag:-$DEFAULT_BRIDGE_CHECK_FLAG}

  # Check for mutually exclusive flags. Plain assignments are used because
  # `((n++))` returns a non-zero status when n is 0.
  local flag_count=0
  if [[ "${delete_flag}" == "true" ]]; then flag_count=$((flag_count + 1)); fi
  if [[ "${check_bridge_flag}" == "true" ]]; then flag_count=$((flag_count + 1)); fi
  if [[ "${restore_vf_flag}" == "true" ]]; then flag_count=$((flag_count + 1)); fi
  if [[ ${flag_count} -gt 1 ]]; then
    error "Flags --delete, --check-bridge, and --restore-vf are mutually exclusive. Only one can be specified."
  fi

  # Handle restore VF configuration if flag is set
  if [[ "${restore_vf_flag}" == "true" ]]; then
    if [[ ! -f "${VF_CONFIG_FILE}" ]]; then
      log "VF config file ${VF_CONFIG_FILE} not found"
      exit 0
    fi

    # Remember what was given explicitly so those values survive sourcing the
    # saved configuration.
    local cli_pci=${device_pci_address:-}
    local cli_vfs=${num_of_vfs:-}
    local cli_mtu=${control_plane_mtu:-}
    if [[ -n "${cli_pci}" || -n "${cli_vfs}" || -n "${cli_mtu}" ]]; then
      log "Warning: Command line arguments will take precedence over values from ${VF_CONFIG_FILE}"
    fi

    # Source the config file to get the variables
    # shellcheck source=/dev/null
    source "${VF_CONFIG_FILE}"
    device_pci_address=${cli_pci:-${device_pci_address:-}}
    num_of_vfs=${cli_vfs:-${num_of_vfs:-}}
    control_plane_mtu=${cli_mtu:-${control_plane_mtu:-}}

    if [[ -z "${device_pci_address}" || -z "${num_of_vfs}" || -z "${control_plane_mtu}" ]]; then
      error "Invalid VF config file. Missing required parameters."
    fi
    log "Restored configuration from ${VF_CONFIG_FILE}"
    return 0
  fi

  # Ensure default values for arguments that were not provided
  device_pci_address=${device_pci_address:-}
  num_of_vfs=${num_of_vfs:-$DEFAULT_NUM_OF_VFS}
  control_plane_mtu=${control_plane_mtu:-}

  # --check-bridge needs no device information.
  if [[ "${check_bridge_flag}" == "true" ]]; then
    return 0
  fi

  # Check required arguments and validate format
  if [[ -z "${device_pci_address}" || ! "${device_pci_address}" =~ ${pci_regex} ]]; then
    error "Invalid or missing device_pci_address. Expected format: xxxx:xx:xx where x is a hexadecimal character."
  fi

  # --delete only needs the device address.
  if [[ "${delete_flag}" == "true" ]]; then
    return 0
  fi

  if [[ -z "${control_plane_mtu}" ]]; then
    error "control_plane_mtu is a required argument"
  fi
}
# Update control plane MTU
# Align the MTU of the bridge and of every interface attached to it with the
# requested control-plane MTU.
# Globals:   BR_DPU_DIR, BRIDGE_NAME, control_plane_mtu (read)
# Returns:   0 on success; calls error (which exits) when `ip link set` fails
update_control_plane_mtu() {
  local bridge_mtu port_path port_name port_mtu

  # Set MTU to the br-dpu.
  bridge_mtu=$(cat "${BR_DPU_DIR}/mtu")
  if [ "${bridge_mtu}" -ne "${control_plane_mtu}" ]; then
    if ! ip link set dev "${BRIDGE_NAME}" mtu "${control_plane_mtu}"; then
      error "Failed to update bridge MTU"
    fi
  fi

  # Set MTU to the interfaces under br-dpu.
  for port_path in "${BR_DPU_DIR}"/brif/*; do
    # With no attached interface the glob stays literal; skip it rather than
    # trying to read the MTU of an interface called "*".
    [[ -e "${port_path}" ]] || continue
    port_name=$(basename "${port_path}")
    port_mtu=$(cat "/sys/class/net/${port_name}/mtu")
    if [ "${port_mtu}" -ne "${control_plane_mtu}" ]; then
      if ! ip link set dev "${port_name}" mtu "${control_plane_mtu}"; then
        error "Failed to update interface MTU"
      fi
    fi
  done
}
# Check that a default route goes through the bridge.
# The matching route line(s) are printed on stdout, as before.
# Globals:   BRIDGE_NAME (read)
# Returns:   0 when a default route uses ${BRIDGE_NAME}, 1 otherwise
verify_default_route() {
  # Anchor on "default" at the start of the line and on the exact device
  # name, so a route over e.g. "${BRIDGE_NAME}2" is not accepted.
  if ! ip route show | grep -E "^default .*dev ${BRIDGE_NAME}( |\$)"; then
    return 1
  fi
}
# Check if the bridge is ready.
# Waits for the bridge device to appear and then, unless the skip-condition
# file exists, for a default route through it. Each wait makes up to 10
# attempts, 5 seconds apart.
# Globals:   BR_DPU_DIR, BRIDGE_NAME, SKIP_DEFAULT_ROUTE_CHECK_COND_FILE (read)
# Returns:   0 when the bridge is ready; calls error (which exits) when an
#            attempt budget is exhausted
bridge_check() {
  local max_retries=10
  local attempt

  for ((attempt = 1; attempt <= max_retries; attempt++)); do
    if [ -d "${BR_DPU_DIR}" ]; then
      log "${BRIDGE_NAME} is created"
      break
    fi
    if [ "${attempt}" -eq "${max_retries}" ]; then
      error "${BRIDGE_NAME} bridge does not exist"
    fi
    log "Bridge not found, retrying in 5 seconds (attempt ${attempt}/${max_retries})"
    sleep 5
  done

  if [ -f "${SKIP_DEFAULT_ROUTE_CHECK_COND_FILE}" ]; then
    log "condition file ${SKIP_DEFAULT_ROUTE_CHECK_COND_FILE} exists, skipping bridge default route check"
    return 0
  fi

  for ((attempt = 1; attempt <= max_retries; attempt++)); do
    if verify_default_route; then
      return 0
    fi
    if [ "${attempt}" -eq "${max_retries}" ]; then
      error "Default route for bridge ${BRIDGE_NAME} does not exist"
    fi
    log "Default route not found, retrying in 5 seconds (attempt ${attempt}/${max_retries})"
    sleep 5
  done
}
# Create VFs for a PF device.
# Writes the requested VF count to the PF's sriov_numvfs file, but only when
# the PF currently reports 0 VFs; an existing non-zero count is left as is.
# Globals:   PCI_SYS_DIR, num_of_vfs (read)
# Arguments: $1 - PF PCI address including the function (e.g. 0000:03:00.0)
# Returns:   0; calls error (which exits) when the write fails
create_VFs() {
  local pf_device=$1
  local numvfs_file="${PCI_SYS_DIR}/${pf_device}/sriov_numvfs"
  local vf_num

  vf_num=$(cat "${numvfs_file}")
  if [ "${vf_num}" -eq 0 ]; then
    if ! echo "${num_of_vfs}" >"${numvfs_file}"; then
      error "Failed to set the number of VFs ${num_of_vfs}"
    fi
    log "Set the number of VFs to ${num_of_vfs}."
  else
    log "The number of VFs is set before to ${vf_num}"
  fi
  return 0
}
# Add a VF to the bridge
# Attach the network interface of the PF's first VF (virtfn0) to the bridge
# and bring it up. Does nothing when the interface is already a bridge port.
# Globals:   PCI_SYS_DIR, BRIDGE_NAME (read)
# Arguments: $1 - PF PCI address including the function (e.g. 0000:03:00.0)
# Returns:   0; calls error (which exits) when the VF cannot be found,
#            attached or brought up
add_vf_to_bridge() {
  local pf_device=$1
  local vf_net_dir="${PCI_SYS_DIR}/${pf_device}/virtfn0/net"
  local vf_device vf_name bridge_ports

  if ! vf_device=$(find "${vf_net_dir}" -mindepth 1 -maxdepth 1 -type d); then
    error "Failed to search for VF device in ${pf_device}"
  fi
  if [ -z "${vf_device}" ]; then
    error "No VFs found for ${pf_device}"
  fi
  vf_name=$(basename "${vf_device}")

  # Capture the port list before searching it: piping `ip` straight into
  # `grep -q` can report failure under `pipefail` when grep exits early.
  bridge_ports=$(ip link show master "${BRIDGE_NAME}") || bridge_ports=""

  # -w/-F: match the interface name as a whole word, not as a substring of
  # another interface name.
  if grep -qwF -- "${vf_name}" <<<"${bridge_ports}"; then
    log "VF ${vf_name} is already part of bridge ${BRIDGE_NAME}"
    return 0
  fi

  if ! ip link set dev "${vf_name}" master "${BRIDGE_NAME}"; then
    error "Failed to add VF to bridge"
  fi
  if ! ip link set dev "${vf_name}" up; then
    error "Failed to bring VF up"
  fi
  log "Added VF ${vf_name} to bridge ${BRIDGE_NAME}"
  return 0
}
# Remove a VF from the bridge
# Detach the network interface of the PF's first VF (virtfn0) from the
# bridge. Every "nothing to do" situation is logged and treated as success.
# Globals:   PCI_SYS_DIR, BRIDGE_NAME (read)
# Arguments: $1 - PF PCI address including the function (e.g. 0000:03:00.0)
# Returns:   0; calls error (which exits) when detaching fails
delete_vf_from_bridge() {
  local pf_device=$1
  local vf_net_dir="${PCI_SYS_DIR}/${pf_device}/virtfn0/net"
  local vf_device vf_name bridge_ports

  if [ ! -d "${vf_net_dir}" ]; then
    log "VF net directory ${vf_net_dir} does not exist, no need to delete VF from ${pf_device}"
    return 0
  fi

  vf_device=$(find "${vf_net_dir}" -mindepth 1 -maxdepth 1 -type d 2>/dev/null)
  if [ -z "${vf_device}" ]; then
    log "No VF found, no need to delete VF from ${pf_device}"
    return 0
  fi
  vf_name=$(basename "${vf_device}")

  # Capture the port list before searching it: piping `ip` straight into
  # `grep -q` can report failure under `pipefail` when grep exits early,
  # which would wrongly skip the detach.
  bridge_ports=$(ip link show master "${BRIDGE_NAME}") || bridge_ports=""
  if ! grep -qwF -- "${vf_name}" <<<"${bridge_ports}"; then
    log "VF ${vf_name} is not connected to the bridge, no need to delete VF from ${pf_device}"
    return 0
  fi

  if ! ip link set "${vf_name}" nomaster; then
    error "Failed to disconnect VF ${vf_name} from bridge"
  fi
  log "Disconnected VF ${vf_name} from bridge"
  return 0
}
# Function to perform cleanup
# Undo the host network configuration: detach the VF of function 0 from the
# bridge (when a device address is known) and delete the saved VF
# configuration file (when present).
# Globals:   device_pci_address (read; may be unset), VF_CONFIG_FILE (read)
cleanup() {
  local pci_address=${device_pci_address:-}

  log "Cleaning up..."

  if [[ -n "${pci_address}" ]]; then
    delete_vf_from_bridge "${pci_address}.0"
  fi

  # Nothing more to do when no configuration was saved.
  [[ -f "${VF_CONFIG_FILE}" ]] || return 0

  rm -f "${VF_CONFIG_FILE}"
  log "Removed VF configuration file ${VF_CONFIG_FILE}"
}
# Function that is used in trap to be called on different signals received
# Handler installed by main() for signals, ERR and EXIT: undo the partial
# configuration and report failure.
# Returns:   never returns; always exits with status 1
trap_handler() {
  cleanup
  # Need to return 1 so that the provisioning controller understands if the
  # script was ran successfully or not
  exit 1
}
# Main program
# Main program: parse arguments, then either delete the configuration, only
# check the bridge, or configure VFs, bridge membership and MTU.
# Arguments: the script's command-line arguments
# Returns:   0 on success; the bridge_check status for --check-bridge;
#            exits non-zero on any failure
main() {
  if ! parse_arguments "$@"; then
    exit 1
  fi

  # If the delete flag is set then perform cleanup and exit immediately.
  if [[ "${delete_flag}" == "true" ]]; then
    cleanup
    exit 0
  fi

  # Call bridge_check and store its return code. `|| rc=$?` is required to
  # observe a failure at all: under `set -e` a bare failing call would end
  # the script before `$?` could be read.
  local rc=0
  bridge_check || rc=$?

  # If check-bridge flag is set, return the exit code from bridge_check.
  # A failed check also stops a normal run here.
  if [[ "${check_bridge_flag}" == "true" || "${rc}" -ne 0 ]]; then
    return "${rc}"
  fi

  # From here on every abnormal termination rolls the configuration back.
  trap trap_handler INT TERM ILL FPE SEGV ALRM ERR EXIT

  local p0="${device_pci_address}.0"
  create_VFs "${p0}"

  # Create VFs on the second function only when it exists and its device ID
  # is one of the known DPU IDs.
  local p1="${device_pci_address}.1"
  if [[ -d "${PCI_SYS_DIR}/${p1}" ]]; then
    local device_id dpu_device
    device_id=$(cat "${PCI_SYS_DIR}/${p1}/device")
    for dpu_device in "${DPU_DEVICE_LIST[@]}"; do
      if [[ "${dpu_device}" == "${device_id}" ]]; then
        create_VFs "${p1}"
        break
      fi
    done
  fi

  add_vf_to_bridge "${p0}"
  update_control_plane_mtu

  if [[ "${restore_vf_flag}" != "true" ]]; then
    if ! mkdir -p "${DMS_DIR}"; then
      error "Failed to create directory ${DMS_DIR}"
    fi
    # Save VF configuration to file for future restoration (overwrites
    # existing file if any)
    cat >"${VF_CONFIG_FILE}" <<EOF
device_pci_address=${device_pci_address}
num_of_vfs=${num_of_vfs}
control_plane_mtu=${control_plane_mtu}
EOF
    log "Saved VF configuration to ${VF_CONFIG_FILE}"
  fi

  log "Main function completed successfully"
  # Remove exactly the handlers installed above (KILL cannot be trapped and
  # was never set).
  trap - INT TERM ILL FPE SEGV ALRM ERR EXIT
  return 0
}
# Script entry point: forward all command-line arguments to main.
main "$@"
dmsinit.sh
#!/bin/bash
# 2025 NVIDIA CORPORATION & AFFILIATES
# Licensed under the Apache License, Version 2.0 (the License);
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -euo pipefail

# Define constants
readonly DEFAULT_EXTERNAL_CERTIFICATE='none'
readonly DEFAULT_K8S_ENV='true'
readonly DEFAULT_DMS_IP='0.0.0.0'
readonly DEFAULT_DMS_PORT=9339
readonly DEFAULT_NAMESPACE='dpf-operator-system'
readonly DEFAULT_ISSUER='dpf-provisioning-issuer'
readonly DEFAULT_KUBERNETES_VERSION='1.32.0'
readonly DEFAULT_NODE_REBOOT_METHOD='gNOI'
readonly -a VALID_NODE_REBOOT_METHODS=('gNOI' 'external' 'script')
readonly PCISYS_DIR="/sys/bus/pci/devices"
# BlueField-3 and BlueField-2 device IDs
readonly -a DEVICE_LIST=("0xa2dc" "0xa2d6")
readonly DEFAULT_DMS_CONF_DIR="/opt/dpf/dms"
readonly DEFAULT_PCI_ADDR_FILE="pci_addr"
readonly DEFAULT_DMS_CONF_FILE="dms.conf"
readonly DEFAULT_DMS_IMAGE_DIR="bfb"
readonly DEFAULT_DMS_SYSTEMD_FILE="dmsd.service"
readonly CMD_REGISTER="register"
readonly CMD_CHECK_RSHIM_NOT_OCCUPIED="check-rshim-not-occupied"
readonly -a VALID_CMDS=("$CMD_REGISTER" "$CMD_CHECK_RSHIM_NOT_OCCUPIED")

# Directory containing this script. Assigned separately from `readonly` so a
# failing `cd` is not masked by the exit status of `readonly`.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
readonly SCRIPT_DIR
# hostnetwork.sh is expected to live next to this script.
readonly HOSTNETWORK_SCRIPT="$SCRIPT_DIR/hostnetwork.sh"

# Global arrays for DPU discovery
declare -a pci_addr_list=()
declare -a pf0_name_list=()
declare -A number_of_pfs=()
# Log function
# Emit one informational line on stdout, tagged with the script name.
# Arguments: $1 - message text
log() {
  printf '%s\n' "[dmsinit] $1"
}
# Error function
# Report a fatal error on stderr, tagged with the calling function name and
# line number, and terminate.
# Arguments: $1 - error message
# Returns:   never returns; always exits with status 1 (when invoked inside
#            a command substitution only that subshell exits)
error() {
  echo "[dmsinit] Error (${FUNCNAME[1]:-main}:${BASH_LINENO[0]:-0}): $1" >&2
  exit 1
}
# Function to format PCI address
# Convert a PCI address into the label form used for resource names.
# Arguments: $1 - PCI address, e.g. 0000:4b:00.0
# Outputs:   the address without its function number and with colons
#            replaced by hyphens, e.g. 0000-4b-00 (domain-bus-device)
format_pci_address() {
  local pci_address=$1
  # Drop the shortest trailing ".<something>", i.e. the function number.
  local without_function=${pci_address%.*}
  # Convert colons to hyphens
  echo "${without_function//:/-}"
}
# Function to get PF0 name
# Print the name of the first network interface exposed by a PCI function.
# Globals:   PCISYS_DIR (read)
# Arguments: $1 - PCI address; hyphens are accepted in place of colons
# Outputs:   the interface name on stdout
# Returns:   calls error (which exits) when the function has no net
#            directory or the directory is empty
get_pf0_name() {
  local pci_address=$1
  local net_dir="${PCISYS_DIR}/${pci_address//-/:}/net"
  local entry pf0_name=""

  if [ ! -d "$net_dir" ]; then
    error "Error: Network directory not found: $net_dir"
  fi

  # Take the first entry in glob (sorted) order instead of parsing `ls`.
  for entry in "$net_dir"/*; do
    [ -e "$entry" ] || continue
    pf0_name=${entry##*/}
    break
  done

  if [ -z "$pf0_name" ]; then
    error "Error: No network interface found in $net_dir"
  fi
  echo "$pf0_name"
}
# Write the given PCI addresses, one per line, to the PCI address file,
# replacing any previous content.
# Globals:   pci_addr_file (read)
# Arguments: the PCI addresses to store (may be none)
store_pci_addresses() {
  local addr

  # Output PCI addresses to file for DMS
  log "Writing PCI addresses to $pci_addr_file"

  # Create directory if it doesn't exist
  mkdir -p "$(dirname "$pci_addr_file")"

  # Clear the file first
  : >"$pci_addr_file"

  # Write each PCI address to the file
  for addr in "$@"; do
    printf '%s\n' "$addr" >>"$pci_addr_file"
  done

  log "Wrote $# PCI addresses to $pci_addr_file"
}
# Function to discover DPUs
# Function to discover DPUs.
# Scans every PCI device and, for each one whose device ID is in
# DEVICE_LIST, records it per formatted address (domain-bus-device): the
# first function seen contributes the address and its interface name, every
# further function of the same address only increments the PF counter.
# Globals written: pci_addr_list, pf0_name_list, number_of_pfs
# Globals read:    PCISYS_DIR, DEVICE_LIST
# Side effects:    stores the discovered addresses via store_pci_addresses
#                  when at least one DPU is found
discover_dpu() {
  local device_dir device_id known_device is_dpu
  local pci_name pci_addr pf0_name

  # Clear arrays before use
  pci_addr_list=()
  pf0_name_list=()
  number_of_pfs=()

  # Read all PCI devices
  for device_dir in "${PCISYS_DIR}"/*; do
    [ -d "$device_dir" ] || continue
    [ -f "${device_dir}/device" ] || continue

    # Read device ID; unreadable entries are skipped.
    device_id=$(cat "${device_dir}/device" 2>/dev/null) || continue

    # Check if device is in our list
    is_dpu=false
    for known_device in "${DEVICE_LIST[@]}"; do
      if [ "$device_id" = "$known_device" ]; then
        is_dpu=true
        break
      fi
    done
    [ "$is_dpu" = true ] || continue

    # Get PCI address
    pci_name=$(basename "$device_dir")
    pci_addr=$(format_pci_address "$pci_name")

    # Already discovered: this is a further function of the same device.
    if [[ -n "${number_of_pfs[$pci_addr]:-}" ]]; then
      number_of_pfs[$pci_addr]=$((${number_of_pfs[$pci_addr]} + 1))
      log "update number of PFs for $pci_addr: ${number_of_pfs[$pci_addr]}"
      continue
    fi

    # Get PF0 name. get_pf0_name reports its own error; because it runs in a
    # command substitution its `exit` only ends that subshell, so stop here
    # explicitly instead of registering a DPU without an interface name.
    pf0_name=$(get_pf0_name "$pci_name") || exit 1

    # Add PCI address to array
    pci_addr_list+=("$pci_addr")
    pf0_name_list+=("$pf0_name")
    # Initialize with 1 PF
    number_of_pfs[$pci_addr]="1"

    log "Discovered DPU:"
    log " PCI Address: $pci_addr"
    log " Device ID: $device_id"
    log " PF0 Name: $pf0_name"
  done

  local dpu_count=${#pci_addr_list[@]}
  log "Total DPUs discovered: $dpu_count"

  # If no DPUs found, exit early
  if [ "$dpu_count" -eq 0 ]; then
    log "No DPUs found on this system"
    return
  fi

  store_pci_addresses "${pci_addr_list[@]}"
}
# Define functions
# Make sure kubectl is usable: download it to /tmp/doca-bin and prepend that
# directory to PATH when it is not installed, then verify that
# `kubectl version` succeeds with the configured command line.
# Globals:   DEFAULT_KUBERNETES_VERSION, kubectl_cmd (read); PATH (modified)
# Returns:   0 on success; calls error (which exits) on download or
#            verification failure
check_and_install_kubectl() {
  local install_dir="/tmp/doca-bin"
  local output

  if ! command -v kubectl &>/dev/null; then
    log "kubectl not found, downloading and installing kubectl $DEFAULT_KUBERNETES_VERSION in /tmp/doca-bin"
    mkdir -p "$install_dir"
    if ! curl -sSfL -o "$install_dir/kubectl" "https://dl.k8s.io/release/v$DEFAULT_KUBERNETES_VERSION/bin/linux/amd64/kubectl"; then
      error "Failed to download kubectl"
    fi
    chmod +x "$install_dir/kubectl" || error "Failed to set execute permissions for kubectl"
    export PATH="$install_dir:$PATH"
  else
    log "kubectl already installed"
  fi

  # kubectl_cmd may carry extra options ("kubectl --kubeconfig <path>") and
  # is therefore expanded unquoted on purpose.
  # shellcheck disable=SC2086
  if ! output=$($kubectl_cmd version 2>&1); then
    error "Failed to check kubectl version: $output"
  fi
}
# Test whether a namespaced resource can be fetched with kubectl.
# Globals:   kubectl_cmd, namespace (read)
# Arguments: $1 - resource type, $2 - resource name
# Returns:   0 (and logs) when `kubectl get` succeeds, 1 otherwise
check_resource_exists() {
  local resource_type=$1
  local resource_name=$2

  # kubectl_cmd may carry extra options and is expanded unquoted on purpose.
  # shellcheck disable=SC2086
  if ! $kubectl_cmd -n "$namespace" get -o json "$resource_type" "$resource_name" >/dev/null 2>&1; then
    return 1
  fi

  log "Resource $resource_name of type $resource_type already exists, skipping creation"
  return 0
}
# Create (if missing) the cert-manager Certificate for the DMS server, wait
# for its secret to be issued, and write ca.crt, tls.crt and tls.key into
# $dms_conf_dir/certs.
# Globals:   kubectl_cmd, namespace, issuer, dpu_node_name, dms_ip,
#            dms_conf_dir (read)
# Returns:   0 on success; calls error (which exits) when the certificate
#            cannot be applied, the secret does not become available within
#            30 attempts (2 seconds apart), or a file cannot be extracted
create_certificate() {
  local cert_name="$dpu_node_name-dms-server-cert"
  local yaml output

  if ! check_resource_exists "Certificate" "$cert_name"; then
    log "Creating certificate"
    yaml=$(
      cat <<EOF
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: $cert_name
  namespace: $namespace
spec:
  secretName: $dpu_node_name-server-secret
  commonName: $cert_name
  duration: 8760h
  renewBefore: 4360h
  issuerRef:
    name: $issuer
    kind: Issuer
  usages:
    - server auth
  ipAddresses:
    - "$dms_ip"
EOF
    )
    # kubectl_cmd may carry extra options and is expanded unquoted on purpose.
    # shellcheck disable=SC2086
    if ! output=$($kubectl_cmd apply -f - <<<"$yaml" 2>&1); then
      error "Failed to create certificate $cert_name: $output"
    fi
    log "Certificate $cert_name created successfully"
  else
    log "Certificate $cert_name already exists, skipping creation"
  fi

  # Wait until the secret is created and populated. Reading only
  # .spec.secretName is not enough: that field is set as soon as the
  # Certificate object exists, before any secret has been issued.
  local secret_name="" tls_crt=""
  local max_attempts=30
  local attempt=0
  while [ "$attempt" -lt "$max_attempts" ]; do
    # shellcheck disable=SC2086
    secret_name=$($kubectl_cmd get cert -n "$namespace" "$cert_name" -o jsonpath='{.spec.secretName}') || secret_name=""
    if [ -n "$secret_name" ]; then
      # shellcheck disable=SC2086
      tls_crt=$($kubectl_cmd get secret -n "$namespace" "$secret_name" -o jsonpath='{.data.tls\.crt}' 2>/dev/null) || tls_crt=""
      if [ -n "$tls_crt" ]; then
        break
      fi
    fi
    log "Waiting for certificate secret to be created... (attempt $((attempt + 1))/$max_attempts)"
    sleep 2
    attempt=$((attempt + 1))
  done
  if [ -z "$secret_name" ] || [ -z "$tls_crt" ]; then
    error "Timed out waiting for certificate secret to be created"
  fi
  log "Certificate secret name: $secret_name"

  # pull certificate files from secret
  local certs_dir="$dms_conf_dir/certs"
  local -a cert_files=('ca.crt' 'tls.crt' 'tls.key')
  local -a cert_paths=('{.data.ca\.crt}' '{.data.tls\.crt}' '{.data.tls\.key}')
  local i
  mkdir -p "$certs_dir"
  for i in "${!cert_files[@]}"; do
    # shellcheck disable=SC2086
    if ! $kubectl_cmd get secret -n "$namespace" "$secret_name" -o "jsonpath=${cert_paths[$i]}" \
      | base64 --decode >"$certs_dir/${cert_files[$i]}"; then
      error "Failed to extract ${cert_files[$i]} from secret $secret_name"
    fi
  done
}
# Create (if missing) the DPUDevice object for one discovered DPU and
# remember its name for the later DPUNode creation.
# Globals:   kubectl_cmd, namespace, dpu_node_name (read);
#            dpudevice_names (appended)
# Arguments: $1 - formatted PCI address (e.g. 0000-4b-00)
#            $2 - PF0 interface name
#            $3 - number of PFs
# Returns:   0 on success; calls error (which exits) when kubectl apply fails
create_dpu_device() {
  local pci_addr=$1
  local pf0_name=$2
  local pf_count=$3
  local name="$dpu_node_name-$pci_addr"
  local yaml output

  if ! check_resource_exists "DPUDevice" "$name"; then
    log "Creating DPUDevice for PCI address $pci_addr"
    yaml=$(
      cat <<EOF
apiVersion: provisioning.dpu.nvidia.com/v1alpha1
kind: DPUDevice
metadata:
  name: $name
  namespace: $namespace
spec:
  pciAddress: $pci_addr
  pf0Name: $pf0_name
  numberOfPFs: $pf_count
EOF
    )
    # kubectl_cmd may carry extra options and is expanded unquoted on purpose.
    # shellcheck disable=SC2086
    if ! output=$($kubectl_cmd apply -f - <<<"$yaml" 2>&1); then
      error "Failed to create DPUDevice $name: $output"
    fi
    log "DPUDevice $name applied successfully"
  fi

  # Add the DPUDevice name to the array for later use in DPUNode creation
  dpudevice_names+=("$name")
  log "Added $name to dpudevice_names array"
}
# Create the DPUNode object unless it already exists. The manifest is
# assembled line by line so each optional section keeps its YAML nesting:
#   - the dpu-oob-bridge-configured label, outside Kubernetes environments
#   - an ownerReference to the Node, when a kube node reference is set
#   - the selected node reboot method
#   - the DMS address, when both IP and port are set
#   - the list of DPUDevice names collected by create_dpu_device
# Globals:   kubectl_cmd, namespace, dpu_node_name, k8s_env, kube_node_ref,
#            node_reboot_method, script_name, dms_ip, dms_port,
#            dpudevice_names (read)
# Returns:   0 on success or when the DPUNode exists; calls error (which
#            exits) on any failure
create_dpunode() {
  local -a doc
  local yaml output uid device

  if check_resource_exists "DPUNode" "$dpu_node_name"; then
    return 0
  fi

  log "Creating DPUNode"
  doc=(
    "apiVersion: provisioning.dpu.nvidia.com/v1alpha1"
    "kind: DPUNode"
    "metadata:"
    "  name: $dpu_node_name"
    "  namespace: $namespace"
  )

  # in k8s env, the label is set by dpu-detector
  if [ "$k8s_env" != "true" ]; then
    doc+=(
      "  labels:"
      "    feature.node.kubernetes.io/dpu-oob-bridge-configured: \"true\""
    )
  fi

  if [ -n "$kube_node_ref" ]; then
    # kubectl_cmd may carry extra options and is expanded unquoted on purpose.
    # shellcheck disable=SC2086
    uid=$($kubectl_cmd get node "$kube_node_ref" -o jsonpath='{.metadata.uid}') || uid=""
    if [ -z "$uid" ]; then
      error "Failed to get UID for node $kube_node_ref"
    fi
    doc+=(
      "  ownerReferences:"
      "    - apiVersion: v1"
      "      kind: Node"
      "      name: $kube_node_ref"
      "      uid: $uid"
      "      controller: true"
    )
  fi

  doc+=(
    "spec:"
    "  nodeRebootMethod:"
  )
  case "$node_reboot_method" in
    gNOI)
      doc+=("    gNOI: {}")
      ;;
    external)
      doc+=("    external: {}")
      ;;
    script)
      # The option that supplies this value is --script-name.
      if [ -z "${script_name:-}" ]; then
        error "when script reboot method is used, the --script-name must be specified"
      fi
      doc+=(
        "    script:"
        "      name: \"$script_name\""
      )
      ;;
  esac

  if [ -n "$dms_ip" ] && [ -n "$dms_port" ]; then
    doc+=(
      "  nodeDMSAddress:"
      "    ip: $dms_ip"
      "    port: $dms_port"
    )
  fi

  if [ "${#dpudevice_names[@]}" -gt 0 ]; then
    doc+=("  dpus:")
    for device in "${dpudevice_names[@]}"; do
      doc+=("    - name: $device")
    done
  fi

  yaml=$(printf '%s\n' "${doc[@]}")
  # shellcheck disable=SC2086
  if ! output=$($kubectl_cmd apply -f - <<<"$yaml" 2>&1); then
    error "Failed to create DPUNode $dpu_node_name: $output"
  fi
  log "DPUNode $dpu_node_name applied successfully"
}
# Function to get the first PCI address from file and format it to "0000:00:00.0"
# Print the first PCI address stored in a file, with hyphens converted to
# colons (a stored "0000-4b-00" is printed as "0000:4b:00"; nothing is
# appended to the stored value).
# Arguments: $1 - path of the PCI address file
# Outputs:   the converted address on stdout
# Returns:   calls error (which exits) when the file is missing or its first
#            line is empty
get_the_first_pci_address() {
  local pci_file=$1
  local dms_pci=""

  if [ ! -f "$pci_file" ]; then
    error "PCI address file not found: $pci_file"
  fi

  # Read the first line; `read` reports failure on a missing final newline
  # even though it has filled the variable, hence `|| true`.
  IFS= read -r dms_pci <"$pci_file" || true
  if [ -z "$dms_pci" ]; then
    error "PCI address file exists but is empty"
  fi

  # Replace hyphens with colons in the PCI address
  echo "${dms_pci//-/:}"
}
# Generate the DMS command-line configuration file and the dmsd systemd unit
# file inside $dms_conf_dir.
# Globals:   dms_conf_dir, dms_ip, dms_port, dms_image_dir, pci_addr_file,
#            DEFAULT_DMS_CONF_FILE, DEFAULT_DMS_SYSTEMD_FILE (read)
# Returns:   0 on success; exits 1 when the first PCI address cannot be read
create_dms_config() {
  local conf_file="$dms_conf_dir/$DEFAULT_DMS_CONF_FILE"
  local unit_file="$dms_conf_dir/$DEFAULT_DMS_SYSTEMD_FILE"
  local dms_pci

  mkdir -p "$dms_conf_dir"

  # get_the_first_pci_address reports its own error; because it runs in a
  # command substitution its `exit` only ends that subshell, so stop here
  # explicitly rather than writing a configuration without a target.
  dms_pci=$(get_the_first_pci_address "$pci_addr_file") || exit 1

  cat <<EOF >"$conf_file"
-bind_address $dms_ip:$dms_port -v 99
-auth cert -ca $dms_conf_dir/certs/ca.crt -tls_key_file $dms_conf_dir/certs/tls.key -tls_cert_file $dms_conf_dir/certs/tls.crt -password admin -username admin -image_folder $dms_image_dir -target_pci $dms_pci -exec_timeout 900
-disable_unbind_at_activate -reboot_status_check none -debug_command=true
EOF
  log "DMS configuration created at $conf_file"
  log "DMS configuration: $(cat "$conf_file")"

  # `\$` keeps the shell expressions literal in the generated unit file.
  cat <<EOF >"$unit_file"
[Unit]
Description=DMS server
Wants=network.target
After=network-pre.target

[Service]
Type=exec
#in case we have a bash script file, which needs to be source'd before
Environment="DMS_CONF=$conf_file"
ExecStart=/usr/bin/sh -c '/opt/mellanox/doca/services/dms/dmsd \$([ -f \$DMS_CONF ] && cat \$DMS_CONF)'
#This prevents systemd from terminating dmsd prematurely before it
#finishes its termination routines.
TimeoutStopSec=30
Restart=on-failure
RestartSec=5
#the following configuration means, that if dmsd was restarted 5 times
#in the last 120 seconds, it will not be restarted again
#indication for a serious configuration issue
StartLimitIntervalSec=120
StartLimitBurst=5
#Error codes below are considered critical errors, and not causing systemd to restart
RestartPreventExitStatus=1 6 SIGABRT
# mounts everythig as RO, apart from /dev, /proc, and /sys
ProtectSystem=strict
ProtectHome=read-only
PrivateTmp=true
# allow read/write access to BFB folder and hostnetwork config
ReadWritePaths=$dms_image_dir /var/lib

[Install]
WantedBy=multi-user.target
EOF
  log "DMS systemd service file created at $unit_file"
}
# Function to create and enable restore-vf systemd service
# Function to create and enable restore-vf systemd service.
# Writes /etc/systemd/system/dpf-restore-vf.service, a oneshot unit that runs
# the hostnetwork script with --restore-vf, then reloads systemd and enables
# the unit.
# Globals:   HOSTNETWORK_SCRIPT (read)
# Returns:   0 on success; calls error (which exits) when systemctl fails
create_restore_vf_service() {
  local service_dir="/etc/systemd/system"
  local service_name="dpf-restore-vf.service"
  local service_file="${service_dir}/${service_name}"

  # Create the service file
  cat <<EOF >"$service_file"
[Unit]
Description=DPF - Restore VF Configuration
After=network-online.target

[Service]
Type=oneshot
ExecStart=${HOSTNETWORK_SCRIPT} --restore-vf
RemainAfterExit=yes

[Install]
WantedBy=multi-user.target
EOF

  # Reload systemd to recognize the new service
  if ! systemctl daemon-reload; then
    error "Failed to reload systemd daemon"
  fi

  # Enable the service
  if ! systemctl enable "$service_name"; then
    error "Failed to enable $service_name"
  fi

  log "Created and enabled $service_name at $service_file"
}
# Print the command-line help on stdout.
# Globals:   DEFAULT_NAMESPACE, DEFAULT_DMS_CONF_DIR, DEFAULT_DMS_IMAGE_DIR,
#            CMD_REGISTER, VALID_CMDS (read)
usage() {
  cat <<EOF
Usage: $0 [options]

Options:
  --kubeconfig <path>             Path to the kubeconfig file
  --external-certificate <cert>   External certificate (default: none)
  --kube-node-ref <ref>           Kube node reference
  --dms-ip <ip>                   DMS IP address (default: 0.0.0.0)
  --dms-port <port>               DMS port (default: 9339)
  --k8s-env <bool>                Whether to use K8s environment (default: true)
  --namespace <namespace>         Namespace of the created resources (default: $DEFAULT_NAMESPACE)
  --issuer <issuer>               Issuer name (default: dpf-provisioning-issuer)
  --node-reboot-method <method>   Node reboot method (default: gNOI). Valid options: gNOI, external, script
  --script-name <name>            The name of ConfigMap which includes the pod template for executing custom reboot script
  --dms-conf-dir DIR              DMS configuration directory (default: $DEFAULT_DMS_CONF_DIR)
  --dms-image-dir DIR             DMS image directory (default: <dms-conf-dir>/$DEFAULT_DMS_IMAGE_DIR)
  --cmd <cmd>                     Command (default: $CMD_REGISTER). Valid options: ${VALID_CMDS[*]}
EOF
}
# Parse the command line, apply defaults and derive dependent settings.
# Globals written: kubeconfig, external_certificate, kube_node_ref, dms_ip,
#                  dms_port, k8s_env, namespace, issuer, node_reboot_method,
#                  script_name (only when given), cmd, kubectl_cmd,
#                  dpu_node_name (register command only), dpudevice_names,
#                  dms_conf_dir, pci_addr_file, dms_image_dir
# Side effects:    creates the DMS image directory
# Arguments: the script's command-line arguments
# Returns:   0 on success; calls error (which exits) on invalid input;
#            prints the help and exits 0 for --help / -h
parse_arguments() {
  local opt var candidate known

  # Initialize variables with defaults
  node_reboot_method=$DEFAULT_NODE_REBOOT_METHOD

  while [[ $# -gt 0 ]]; do
    opt=$1
    # Map each option to the variable that receives its value.
    case "$opt" in
      --help | -h)
        usage
        exit 0
        ;;
      --kubeconfig) var=kubeconfig ;;
      --external-certificate) var=external_certificate ;;
      --kube-node-ref) var=kube_node_ref ;;
      --dms-ip) var=dms_ip ;;
      --dms-port) var=dms_port ;;
      --k8s-env) var=k8s_env ;;
      --namespace) var=namespace ;;
      --issuer) var=issuer ;;
      --node-reboot-method) var=node_reboot_method ;;
      --script-name) var=script_name ;;
      --dms-conf-dir) var=dms_conf_dir ;;
      --dms-image-dir) var=dms_image_dir ;;
      --cmd) var=cmd ;;
      *)
        usage
        error "Unknown option: $opt"
        ;;
    esac
    # Every remaining option takes a value; report a missing one clearly
    # instead of aborting on an unbound "$2".
    if [[ $# -lt 2 ]]; then
      usage
      error "Option $opt requires a value"
    fi
    printf -v "$var" '%s' "$2"
    shift 2
  done

  # Validate after the loop so the result does not depend on the order in
  # which --node-reboot-method and --script-name were given.
  known=false
  for candidate in "${VALID_NODE_REBOOT_METHODS[@]}"; do
    if [[ "$candidate" == "$node_reboot_method" ]]; then
      known=true
      break
    fi
  done
  if [[ "$known" != true ]]; then
    error "Invalid node reboot method: $node_reboot_method. Valid options are: ${VALID_NODE_REBOOT_METHODS[*]}"
  fi
  if [[ -n "${script_name:-}" && "$node_reboot_method" != "script" ]]; then
    error "--script-name is only used when the --node-reboot-method is script"
  fi

  cmd=${cmd:-$CMD_REGISTER}
  known=false
  for candidate in "${VALID_CMDS[@]}"; do
    if [[ "$candidate" == "$cmd" ]]; then
      known=true
      break
    fi
  done
  if [[ "$known" != true ]]; then
    error "Invalid cmd: $cmd. Valid options are: ${VALID_CMDS[*]}"
  fi

  # Update variable assignments section
  external_certificate=${external_certificate:-$DEFAULT_EXTERNAL_CERTIFICATE}
  kube_node_ref=${kube_node_ref:-}
  dms_ip=${dms_ip:-$DEFAULT_DMS_IP}
  dms_port=${dms_port:-$DEFAULT_DMS_PORT}
  k8s_env=${k8s_env:-$DEFAULT_K8S_ENV}
  namespace=${namespace:-$DEFAULT_NAMESPACE}
  issuer=${issuer:-$DEFAULT_ISSUER}
  kubeconfig=${kubeconfig:-}

  # kubectl_cmd is later expanded unquoted so the option is passed along.
  kubectl_cmd="kubectl"
  if [ -n "$kubeconfig" ]; then
    kubectl_cmd="kubectl --kubeconfig $kubeconfig"
  fi

  if [ "$cmd" = "$CMD_REGISTER" ]; then
    if [ "$k8s_env" = true ]; then
      if [ -z "$kube_node_ref" ]; then
        error "in k8s env, kube-node-ref is required"
      fi
      dpu_node_name=$kube_node_ref
    else
      dpu_node_name=$(hostname | tr '[:upper:]' '[:lower:]')
    fi
  fi

  dpudevice_names=()

  # Set the dependent variables after dms_conf_dir is finalized
  dms_conf_dir=${dms_conf_dir:-$DEFAULT_DMS_CONF_DIR}
  pci_addr_file=${dms_conf_dir}/${DEFAULT_PCI_ADDR_FILE}
  dms_image_dir=${dms_image_dir:-${dms_conf_dir}/${DEFAULT_DMS_IMAGE_DIR}}
  log "DMS image directory: $dms_image_dir"
  mkdir -p "$dms_image_dir"
}
# Fail when a local rshim device reports the DPU's PCI address.
# For every /dev entry whose name contains "rshim" followed by a digit,
# 'DISPLAY_LEVEL 1' is written to its misc file and the file is searched for
# the first stored PCI address. On a match the address is written to
# /dev/termination-log (when that file exists, else stderr) and the script
# exits with status 1.
# Globals:   pci_addr_list, pci_addr_file (read)
# Returns:   0 when no rshim device matches; calls error (which exits) when
#            no DPU was discovered
check_rshim_not_occupied() {
  local output=/dev/stderr
  local pci_addr dev_path

  # Check if any DPUs were discovered
  if [ "${#pci_addr_list[@]}" -eq 0 ]; then
    error "No DPUs found on this system, exiting"
  fi

  # Use the termination log when it exists
  if [ -f "/dev/termination-log" ]; then
    log "Using /dev/termination-log for output"
    output=/dev/termination-log
  fi

  # An empty address would make the grep below match every device, so a
  # failed lookup must stop the check.
  pci_addr=$(get_the_first_pci_address "$pci_addr_file") || exit 1
  log "Get PCI address: $pci_addr"

  # Same selection as `ls /dev | egrep 'rshim.*[0-9]+'`, without parsing ls.
  for dev_path in /dev/*rshim*[0-9]*; do
    [ -e "$dev_path" ] || continue
    if echo 'DISPLAY_LEVEL 1' >"$dev_path/misc" && grep -qF -- "$pci_addr" "$dev_path/misc"; then
      echo -n "$pci_addr" >"$output"
      exit 1
    fi
  done
}
# Check if bridge is configured correctly by running the hostnetwork script
# with --check-bridge.
# Globals:   HOSTNETWORK_SCRIPT (read)
# Returns:   0 when the check passes, otherwise the script's exit status;
#            calls error (which exits) when the script is missing or not
#            executable
check_bridge() {
  local ret_code=0

  log "Checking bridge br-dpu configuration"

  if [ ! -x "$HOSTNETWORK_SCRIPT" ]; then
    error "Hostnetwork script not found or not executable: $HOSTNETWORK_SCRIPT"
  fi

  # Run hostnetwork script with check-bridge option
  log "Running $HOSTNETWORK_SCRIPT --check-bridge"
  "$HOSTNETWORK_SCRIPT" --check-bridge || ret_code=$?

  if [ "$ret_code" -ne 0 ]; then
    log "Bridge check failed with return code $ret_code"
    return "$ret_code"
  fi

  log "Bridge check passed"
  return 0
}
# Run the selected command.
#   register:                  discover DPUs, make sure kubectl works, create
#                              the certificate (unless an external one is
#                              used), the DPUDevice and DPUNode objects, the
#                              DMS configuration and - outside Kubernetes -
#                              the restore-vf service, then apply the devlink
#                              grace-period workaround.
#   check-rshim-not-occupied:  discover DPUs and run the rshim check.
# Outside Kubernetes environments the bridge is checked first.
# Globals:   cmd, k8s_env, external_certificate, pci_addr_file,
#            pci_addr_list, pf0_name_list, number_of_pfs (read)
main() {
  local i addr first_pci_addr pci_device

  log "Running cmd $cmd"

  if [ "$k8s_env" != "true" ]; then
    # in k8s env, this check is performed by dpu-detector
    if ! check_bridge; then
      error "Bridge br-dpu check failed"
    fi
  fi

  discover_dpu

  if [ "$cmd" = "$CMD_REGISTER" ]; then
    # Check if any DPUs were discovered
    if [ "${#pci_addr_list[@]}" -eq 0 ]; then
      error "No DPUs found on this system, exiting"
    fi

    check_and_install_kubectl

    if [ "$external_certificate" = "none" ]; then
      create_certificate
    fi

    # Create DPU device objects
    for i in "${!pci_addr_list[@]}"; do
      addr=${pci_addr_list[$i]}
      create_dpu_device "$addr" "${pf0_name_list[$i]}" "${number_of_pfs[$addr]}"
    done
    log "DPU devices created successfully"

    create_dpunode
    create_dms_config

    # Create and enable restore-vf service only in non-Kubernetes environment
    if [ "$k8s_env" != true ]; then
      create_restore_vf_service
    else
      log "Skipping restore-vf service creation in Kubernetes environment"
    fi

    # set grace_period
    # THIS IS ONLY A TEMPORARY WORKAROUND AND WILL BE REMOVED WITH THE JANUARY RELEASE!
    # This workaround addresses a firmware protection mechanism issue where devices
    # are not automatically recovered after a fatal error.
    # Fatal errors can occur during a SW_RESET, which is triggered by the DMS Pod (e.g., bfb-install).
    #
    # Workflow:
    # 1. Set the `grace_period` to 0 to bypass the firmware protection mechanism and ensure devices are recovered.
    # 2. Explicitly trigger a recovery operation on each device before proceeding with BF installation.
    #
    # Notes:
    # - Physical Functions (PFs) must be handled before Virtual Functions (VFs) to avoid errors.
    #   Improved sorting ensures PFs are processed first based on their device names.
    # - We will only handle `p0` and `p1` PFs (representing the first two functions of the PCI device).
    #   This is consistent with the behavior of the hostnetwork Pod, which also limits its handling to these PFs.
    # - All VFs associated with these PFs will also be handled and recovered, but only after their corresponding PFs
    #   have been successfully processed to ensure a stable recovery sequence.
    first_pci_addr=$(get_the_first_pci_address "$pci_addr_file") || exit 1
    log "Get PCI address: $first_pci_addr"
    # Resolve functions .0/.1 and their virtfn* links to PCI device names,
    # de-duplicate them, and run both devlink commands on each.
    readlink /sys/bus/pci/devices/"$first_pci_addr".[01] /sys/bus/pci/devices/"$first_pci_addr".[01]/virtfn* \
      | xargs -n1 basename \
      | sort -u \
      | while read -r pci_device; do
        devlink health set "pci/$pci_device" reporter fw_fatal grace_period 0
        devlink health recover "pci/$pci_device" reporter fw_fatal
      done
  elif [ "$cmd" = "$CMD_CHECK_RSHIM_NOT_OCCUPIED" ]; then
    check_rshim_not_occupied
  fi

  log "Main function completed successfully"
}
# Script entry point.
parse_arguments "$@"

# Capture main's own exit status: `$?` read right after `! main` is the
# status of the negation and therefore always 0. Like the `if ! main` form,
# running main on the left of `||` keeps `set -e` suspended inside it.
main_rc=0
main || main_rc=$?
if [ "$main_rc" -ne 0 ]; then
  error "Failed to complete main function: $main_rc"
fi
kube.config: a kubeconfig to access the K8s cluster where DPF is deployed
3. Initialize DMS
# Configure environment variables
export KUBECONFIG=/opt/dpf/kube.config
# IPv4 address of the interface that carries the default route. `head -1`
# after grep keeps a single device when several default routes exist.
export DMS_IP=$(ip -o -4 addr show "$(ip route show default | grep -Po '(?<=dev )[^ ]+' | head -1)" | head -1 | awk '{print $4}' | cut -d/ -f1)

# Set the reboot method - options: 'gNOI', 'external', or 'script'
# Note: gNOI is not fully supported right now. Working on it
export REBOOT_METHOD="external"

# Run DMS initialization script with preserved environment variables
sudo -E /opt/dpf/dmsinit.sh --dms-ip "${DMS_IP}" --k8s-env false \
  --node-reboot-method "${REBOOT_METHOD}" --kubeconfig "${KUBECONFIG}"
4. Configure and Start System Services
# Copy the DMS service definition
sudo cp /opt/dpf/dms/dmsd.service /etc/systemd/system/

# Enable and start the rshim service
sudo systemctl enable rshim
sudo systemctl start rshim

# Enable and start the DMS service
sudo systemctl enable dmsd
sudo systemctl start dmsd
To verify the installation is working correctly:
# Check services status
sudo systemctl status rshim
sudo systemctl status dmsd
When your worker hosts are not managed by Kubernetes, you will need to perform a manual reboot step during the DPU provisioning process:
1) When the DPU reaches the rebooting phase, you will need to manually power-cycle the worker host.
2) After the worker node has rebooted, remove the provisioning.dpu.nvidia.com/dpunode-external-reboot-required annotation from the DPUNode using kubectl:
kubectl annotate dpunode <node-name> provisioning.dpu.nvidia.com/dpunode-external-reboot-required-
This step is only required for hosts that are not managed by Kubernetes.