#!/bin/bash
# autopkgtest QEMU backend with support for AMD GPU pass-through
#
# This is just a thin wrapper around autopkgtest-virt-qemu. It adds all the
# arguments necessary for using AMD GPUs in the VM.
#
# Author: Christian Kastner <ckk@kvr.at>
# License: MIT
set -eu

function usage() {
	cat >&2 <<- EOF

	autopkgtest QEMU backend with support for AMD GPU pass-through

	This is a thin wrapper around autopkgtest-virt-qemu(1).

	By default, all AMD GPUs assigned to vfio-pci are passed through. This can
	be overridden with --gpu, which can be specified multiple times [NOTE:
	multi-GPU is untested as of yet]. By default, the VM will be allocated 75%
	of the host's cores, and 75% of the host's memory.

	All other options on the command line are passed on directly to
	autopkgtest-virt-qemu, so the reader is referred to its man page. For
	example, use --ram-size or --cpus to deviate from the 75% default mentioned
	above.

	This will always boot an image in EFI mode. It will also always request a
	clean poweroff after a test (--timeout-poweroff), rather than the default
	of sending SIGTERM.

	This utility assumes that the invoking user has all the necessary
	permissions required in order to effectively and efficiently operate the
	VM, most notably that the user has access to /dev/kvm. When in doubt, try
	running the script as root.

	Synopsis:
	  $0 -h

	  $0 autopkgtest [...] -- qemu+rocm [--gpu GPU] [options] image [ro-image ...]

	Options:
	  --gpu GPU     PCI slot ID of the GPU to pass through (eg: 09:00.0). Can
	                used multiple times.

	Examples:

	  # Create an image first

	  \$ sudo rocm-qemu-create unstable.img
	  \$ sudo chown \$USER: unstable.img

	  # Run autopkgtests for src:rocrand, using packages from the Archive

	  \$ autopkgtest -B rocrand -- qemu+rocm unstable.img

	  # Like above, but limit to the GPU in slot 09:00.0

	  \$ autopkgtest -B rocrand -- qemu+rocm --gpu 09:00.0 unstable.img

	EOF
	exit 0
}

# Show the help text (and exit) when -h is the first argument
if [ "${1:-}" = "-h" ]
then
	usage
fi

# Anyone but root must be a member of every group listed below
if [ "$UID" -ne 0 ]
then
	for required_group in kvm render
	do
		if ! groups | grep -q "\b${required_group}\b"
		then
			echo "Must be either root, or in group $required_group to use this." >&2
			exit 1
		fi
	done
fi

# Print the QEMU "-device vfio-pci,..." argument for a single host PCI device.
#
# Arguments:
#   $1 - host PCI address, something like 0000:07:00.1
#   $2 - id of the QEMU bus the device is attached to
# Outputs:
#   The device string on stdout, or an error message on stderr.
#
# Only VGA (0300), Display (0380) and Audio (0403) device classes are
# supported; any other class ends the (sub)shell with exit status 1.
generate_qemu_device() {
	local host_dev="$1"
	local parent_bus="$2"
	local slot_fn
	local class_code

	# Third colon-separated field of the address, e.g. "00.1"
	slot_fn="$(cut -d: -f3 <<< "${host_dev%% *}")"
	# Second column of the numeric lspci output is the class, e.g. "0300:"
	class_code="$(lspci -s "$host_dev" -n | cut -d ' ' -f2)"

	case "$class_code" in
		0300:|0380:)
			echo "-device vfio-pci,host=$host_dev,bus=$parent_bus,addr=$slot_fn,multifunction=on,x-vga=off"
			;;
		0403:)
			echo "-device vfio-pci,host=$host_dev,bus=$parent_bus,addr=$slot_fn"
			;;
		*)
			echo "$host_dev: Unsupported device class $class_code." >&2
			exit 1
			;;
	esac
}

# We need to remember this, as we cd a few times below
initial_cwd="$PWD"

# 75% of the host's cores and memory.
# /proc/cpuinfo may lack a 'cpu cores' line, or could list more than one
# distinct value; take the first value reported and fall back to nproc when
# there is none, so the arithmetic below always sees a single integer.
Ncores=$(grep -m1 'cpu cores' /proc/cpuinfo 2>/dev/null | grep -Eo '[[:digit:]]+' || true)
if [ -z "$Ncores" ]
then
	Ncores=$(nproc)
fi
Ncores=$((Ncores * 3 / 4))
# Integer division yields 0 on a single-core host; the VM needs at least 1
if [ "$Ncores" -lt 1 ]
then
	Ncores=1
fi
# Physical memory in MiB
Nmem=$(($(getconf _PHYS_PAGES) * $(getconf PAGE_SIZE) / (1024 * 1024)))
Nmem=$((Nmem * 3 / 4))

# We use --qemu-options, but so might our caller, so we need to merge here.
qemu_options="-vga none"
# List of GPU devices we will use
gpus=
# Associative array of other devices either on the same card, or in the same
# IOMMU group as the primary device. These must be passed through, too
declare -A subdevices=()
# Devices our user might have requested via --gpu
gpus_input=

# We can't use getopt for option parsing, as it has no way to ignore unknown
# options, specifically: the options we just pass on to autopkgtest-virt-qemu.
declare -a newargs=()

# Split a command line into the options this wrapper consumes and the
# arguments that are handed on untouched.
#
# Arguments:
#   "$@" - the command line to split
# Globals written:
#   qemu_options - values of --qemu-options are appended (space-separated)
#   gpus_input   - values of --gpu are appended (space-separated)
#   newargs      - every other argument is appended, in order
# Exits with status 1 if --gpu or --qemu-options is the last argument and
# therefore has no value.
parse_wrapper_args() {
	local arg
	# Name of an option whose value is expected in the next argument
	local pending=

	for arg in "$@"
	do
		if [ -n "$pending" ]
		then
			case "$pending" in
				--qemu-options) qemu_options+=" $arg" ;;
				--gpu) gpus_input+=" $arg" ;;
			esac
			pending=
			continue
		fi

		# Patterns are anchored: only arguments that ARE the option match,
		# not arguments that merely contain the option name somewhere.
		case "$arg" in
			# --qemu-options="foo bar baz" is one positional argument
			--qemu-options=*) qemu_options+=" ${arg#--qemu-options=}" ;;
			--gpu=*) gpus_input+=" ${arg#--gpu=}" ;;
			# --qemu-options "foo bar baz" are two positional arguments
			--qemu-options|--gpu) pending="$arg" ;;
			*) newargs+=( "$arg" ) ;;
		esac
	done

	if [ -n "$pending" ]
	then
		echo "Option $pending requires an argument." >&2
		exit 1
	fi
}

parse_wrapper_args "$@"

# First, determine the list of GPUs to pass through
if [ -n "$gpus_input" ]
then
	# The user explicitly asked for these GPUs, so any one of them that cannot
	# be passed through is a hard failure
	for requested in $gpus_input
	do
		# Canonical device ID (with PCI domain), as printed by lspci -D
		cangpu=$(lspci -s "$requested" -D | cut -d' ' -f1)

		if [ -z "$cangpu" ]
		then
			echo "No such device: $requested" >&2
			exit 1
		fi

		if ! lspci -s "$cangpu" | grep -q -E '(VGA compatible|Display) controller'
		then
			echo "Device $cangpu is not a GPU" >&2
			exit 1
		fi

		if ! lspci -s "$cangpu" -k | grep -q 'Kernel driver in use: vfio-pci'
		then
			echo "Device $cangpu not assigned to vfio-pci" >&2
			exit 1
		fi

		# Add each GPU only once, even when it was requested repeatedly
		case "$gpus" in
			*"$cangpu"*) ;;
			*) gpus+=" $cangpu" ;;
		esac
	done
else
	# Nothing requested -- take every AMD GPU that is bound to vfio-pci
	# 1002=AMD, 0300=VGA compatible controller, 0380=Display controller
	for class_code in 0300 0380
	do
		for candidate in $(lspci -D -d "1002::$class_code" | cut -d' ' -f1)
		do
			if lspci -s "$candidate" -k | grep -q 'Kernel driver in use: vfio-pci'
			then
				gpus+=" $candidate"
			fi
		done
	done
fi

# Then, find all of the GPU's subdevices
# nullglob: a pattern without matches must expand to nothing, not to itself
shopt -s nullglob
for card in $gpus
do
	sysfs_dir="/sys/bus/pci/devices/$card"
	subdevices[$card]=""
	candidates=""

	# Consumer devices of this GPU (like the audio device on the card) show up
	# as consumer:pci:<address> entries; keep only the address part
	cd "$sysfs_dir"
	for link in consumer:pci:*
	do
		candidates+=" $(cut -d: -f3,4,5 <<< "$link")"
	done

	# Every other member of the GPU's IOMMU group
	cd "$sysfs_dir/iommu_group/devices"
	for peer in *
	do
		[ "$card" == "$peer" ] || candidates+=" $peer"
	done

	# Record each candidate once, consumers first
	for candidate in $candidates
	do
		[[ ${subdevices[$card]} == *$candidate* ]] || subdevices[$card]+=" $candidate"
	done
done
shopt -u nullglob

# Every GPU gets a PCIe root port of its own, shared with its subdevices, and
# the devices keep an address mirroring the host address. chassis + slot need
# to be unique; we use chassis=1 and incremental slots.
increment=1
for card in $gpus
do
	port_id="rp${increment}"
	qemu_options+=" -device pcie-root-port,id=${port_id},chassis=1,slot=${increment},multifunction=on"

	# The GPU itself first, followed by everything that has to accompany it
	for function_dev in "$card" ${subdevices[$card]}
	do
		qemu_options+=" $(generate_qemu_device "$function_dev" "$port_id")"
	done

	increment=$((increment + 1))
done

# All QEMU options (ours and the caller's) travel as one merged argument
newargs+=( "--qemu-options=$qemu_options" )

# Return to where we were started from before handing over
cd "$initial_cwd"

# If a user explicitly specifies --ram-size or --cpus (in newargs), that will
# override our values here, as the user's arguments come last
virt_qemu_cmd=(
	autopkgtest-virt-qemu
	--boot=efi
	--timeout-poweroff=30
	--cpus "$Ncores"
	--ram-size "$Nmem"
	"${newargs[@]}"
)
exec "${virt_qemu_cmd[@]}"
