#!/bin/bash
# Boot an amd64 QEMU VM with EFI boot
#
# Author:  Christian Kastner <ckk@kvr.at>
# License: MIT

# Exit on error. Set in the body (not on the shebang line) so it also takes
# effect when the script is invoked as `bash <script>`.
set -e

# Option defaults
portfwd=
virtfs=
snapshot=off
gpus_input=
# Use 75% of CPU cores and memory by default.
# "cpu cores" is the per-socket physical core count; some kernels (e.g. many
# ARM builds) do not provide that field, so fall back to nproc(1) then.
Ncores=$(grep -m1 'cpu cores' /proc/cpuinfo | grep -Eo '[[:digit:]]+' || true)
[ -n "$Ncores" ] || Ncores=$(nproc)
Ncores=$((Ncores * 3 / 4))
# Never scale down to zero guest CPUs on small hosts (-smp 0 is invalid)
[ "$Ncores" -ge 1 ] || Ncores=1
Nmem=$(($(getconf _PHYS_PAGES) * $(getconf PAGE_SIZE) / (1024 * 1024)))
Nmem=$((Nmem * 3 / 4))
# Other defaults
qemu_binary=qemu-system-x86_64
ovmf_code_file=/usr/share/OVMF/OVMF_CODE_4M.fd
ovmf_vars_file=/usr/share/OVMF/OVMF_VARS_4M.fd

# Print the help text to stderr and exit successfully.
# Note: the here-doc uses <<- so leading TABs (and only tabs) are stripped.
function usage() {
	cat >&2 <<- EOF

	Boot an amd64 QEMU VM to a console, passing through GPUs.

	By default, all GPUs assigned to vfio-pci are passed through. This can be
	overridden with -g, which can be specified multiple times. [NOTE: multi-GPU
	is untested, and therefore disabled for now.]

	The VM will boot an amd64 image in EFI mode. The EFIVARS file will be a
	clean copy at every boot. The image will be writeable, so use qemu-img(1)
	to create a snapshot first if you'd like to be able to revert your changes.

	All [qemu options] are passed on to the QEMU command line, in case you want
	to add more configuration to the VM.

	This utility assumes that the invoking user has all the necessary
	permissions required in order to effectively and efficiently operate the
	VM, most notably that the user has access to /dev/kvm. When in doubt, try
	running the script as root.

	Synopsis:
	  $0 -h

	  $0 [-c CPUs] [-d DIR] [-g GPU] [-m MEM] [-p SSHPORT] [-s] image [qemu options]

	Options:
	  -h         Show this help
	  -c CPUS    Number of guest CPUs (default: 75% of CPU cores)
	  -d DIR     Mount host dir DIR as /shared in the guest
	  -g GPU     PCI slot ID of GPU to pass through (eg: 09:00.0)
	  -m MEM     Guest RAM (default: 75% of memory)
	  -p SSHPORT Forward host port SSHPORT to guest port 22
	  -s         Boot the VM in snapshot mode (changes are discarded)

	Examples:

	  # Configure the system for GPU pass-through

	  \$ rocm-qemu-setup -u <user>

	  # Create an image

	  \$ sudo rocm-qemu-create unstable.img
	  \$ sudo chown \$USER: unstable.img

	  # Boot the image above to a console
	  # This will pass through all GPUs assigned to vfio-pci

	  \$ $0 unstable.img

	  # Like above, but use 4 CPUs and 8GB of RAM

	  \$ $0 -c 4 -m 8192 unstable.img

	  # Like above, but only pass through GPU 09:00.0

	  \$ $0 unstable.img -g 09:00.0

	  # Forward host port 10022 to guest port 22. Connect with:
	  # ssh -p 10022 -o CheckHostIP=no -o StrictHostKeyChecking=no root@localhost

	  \$ $0 -p 10022 -d /some/host/dir unstable-amd64.img

	  # Snapshot mode (no changes to the image will be saved)

	  \$ $0 -s unstable.img

	  # Share host directory /tmp/foo
	  # The directory will be mounted at /shared within the guest

	  \$ $0 -d /tmp/foo unstable.img

	EOF
	exit 0
}

# Parse command-line options (see usage() for their meaning)
while getopts "c:d:g:hm:p:s" OPTNAME
do
	case $OPTNAME in
	c)	Ncores="$OPTARG";;
	d)	virtfs="-virtfs local,path=$OPTARG,id=rocm-guest,mount_tag=rocm-guest,security_model=none";;
	g)	gpus_input+=" $OPTARG";;
	h)	usage;;
	m)	Nmem="$OPTARG";;
	p)	portfwd="-net nic,model=virtio -net user,hostfwd=tcp::$OPTARG-:22";;
	s)	snapshot=on;;
	?)	usage;;
	esac
done
shift $((OPTIND - 1))

# An image argument is required. Check this before calling realpath(1),
# which would fail on an empty argument and -- under `set -e` -- abort the
# script without ever showing the usage text.
[ $# -ge 1 ] || usage
imagefile="$(realpath "$1")"
[ -n "$imagefile" ] || usage
# Because of the "$@" below
shift 1

# Operating /dev/kvm and the VFIO device nodes requires membership in these
# groups (unless running as root)
for groupname in kvm render
do
	if [ "$UID" -ne 0 ] && ! id -nG | grep -qw "$groupname"
	then
		echo "Must be either root, or in group $groupname to use this." >&2
		exit 1
	fi
done

# Given a slot ID like 0000:07:00.1, generates a QEMU device string
# Only works for VGA (0300), Display (0380) and Audio (0403) device classes
#
# Arguments: $1 - fully-qualified PCI slot ID (domain:bus:slot.function)
#            $2 - QEMU bus ID of the pcie-root-port to attach the device to
# Outputs:   the "-device vfio-pci,..." string on stdout; a warning on stderr
#            (and nothing on stdout) for unsupported device classes
generate_qemu_device() {
	local pci_slot="$1"
	local bus="$2"
	local addr
	local class

	# The "slot.function" part of the ID (e.g. 00.1) doubles as the device
	# address on the guest bridge
	addr="${pci_slot##*:}"
	# Numeric device class as reported by lspci -n, e.g. "0300:" for VGA
	class="$(lspci -s "$pci_slot" -n | cut -d ' ' -f2)"
	if [ "$class" = "0300:" ] || [ "$class" = "0380:" ]
	then
		echo "-device vfio-pci,host=$pci_slot,bus=$bus,addr=$addr,multifunction=on,x-vga=off"
	elif [ "$class" = "0403:" ]
	then
		echo "-device vfio-pci,host=$pci_slot,bus=$bus,addr=$addr"
	else
		# Best-effort: warn, but don't fail the whole launch
		echo "Unsupported device class $class for $pci_slot." >&2
	fi
}

# List of GPU devices (slot IDs) we will use
gpus=
# Associative array of other devices either on the same card, or some IOMMU
# group as the primary device. These must be passed through, too
declare -A subdevices

# First, determine the list of GPUs to pass through
if [ -z "$gpus_input" ]
then
	# No GPUs specified -- use all AMD GPUs assigned to vfio
	# 1002=AMD, 0300=VGA compatible controller, 0380 = Display controller
	for gpu in $( { lspci -D -d 1002::0300 ; lspci -D -d 1002::0380 ; } | cut -d' ' -f1)
	do
		if lspci -s "$gpu" -k | grep -q 'Kernel driver in use: vfio-pci'
		then
			gpus+=" $gpu"
		fi
	done
else
	# User explicitly requested this GPU, so we treat it as a hard failure if
	# it cannot be passed through
	for gpu in $gpus_input
	do
		# Get canonical slot ID (with PCI domain prefix)
		slotID=$(lspci -s "$gpu" -D | cut -d' ' -f1)
		if [ -z "$slotID" ]
		then
			echo "No such device: $gpu" >&2
			exit 1
		elif ! lspci -s "$slotID" | grep -q -E '(VGA compatible|Display) controller'
		then
			echo "Device $slotID is not a GPU" >&2
			exit 1
		elif ! lspci -s "$slotID" -k | grep -q 'Kernel driver in use: vfio-pci'
		then
			echo "Device $slotID not assigned to vfio-pci" >&2
			exit 1
		fi
		# De-duplicate in case the same GPU was given more than once
		[[ "$gpus"  == *$slotID* ]] || gpus+=" $slotID"
	done
fi

# Then, find all of the GPU's subdevices
# nullglob so that globs with no match expand to nothing instead of themselves
shopt -s nullglob
for gpu in $gpus
do
	subdevices[$gpu]=""

	# First, all consumer devices of this GPU (like the audio device on the card)
	# Guard the cd: a failure would otherwise leave us globbing in the wrong
	# directory when errexit is not in effect
	cd /sys/bus/pci/devices/"$gpu" || { echo "Cannot access sysfs entry for $gpu" >&2; exit 1; }
	for consumer_raw in consumer:pci:*
	do
		consumer=$(echo "$consumer_raw" | cut -d: -f3,4,5)
		[[ ${subdevices[$gpu]} == *$consumer* ]] || subdevices[$gpu]+=" $consumer"
	done

	# Then, all devices in the same IOMMU group
	cd /sys/bus/pci/devices/"$gpu"/iommu_group/devices || { echo "Cannot access IOMMU group of $gpu" >&2; exit 1; }
	for member in *
	do
		[ "$gpu" == "$member" ] && continue
		[[ ${subdevices[$gpu]} == *$member* ]] || subdevices[$gpu]+=" $member"
	done
done
shopt -u nullglob

# Disable the emulated VGA adapter; the passed-through GPUs are the only
# display devices the guest gets
qemu_options="-vga none"

# Every passed-through device hangs off its own PCIe root port, with an
# address mirroring the host address. The chassis/slot pair must be unique
# per port, so we fix chassis=1 and hand out slot numbers sequentially.
slot_no=1
for gpu in $gpus
do
	bridge_id="rp$slot_no"
	qemu_options+=" -device pcie-root-port,id=$bridge_id,chassis=1,slot=$slot_no,multifunction=on"
	slot_no=$((slot_no + 1))

	# The GPU itself, followed by its subdevices, all on the same root port
	qemu_options+=" $(generate_qemu_device "$gpu" "$bridge_id")"
	for dev in ${subdevices[$gpu]}
	do
		qemu_options+=" $(generate_qemu_device "$dev" "$bridge_id")"
	done
done

# The guest needs a writable EFI vars file; work on a throw-away copy so the
# pristine OVMF template is never modified.
# NB: `$(mktemp || exit 1)` would only exit the command-substitution subshell,
# not the script, so check the assignment's status instead.
efivars_tmpfile=$(mktemp) || exit 1
cp -f "$ovmf_vars_file" "$efivars_tmpfile" || exit 1
cleanup() {
	rm -f "$efivars_tmpfile"
}
# Remove the temporary copy on any exit path
trap cleanup EXIT

# $virtfs and $portfwd need to be expanded
# shellcheck disable=SC2086
"$qemu_binary" $virtfs $portfwd $qemu_options "$@" \
	-enable-kvm \
	-machine q35 \
	-m "$Nmem" \
	-cpu host \
	-smp "$Ncores" \
	-nographic \
	-device virtio-serial \
	-drive "index=0,file=$imagefile,snapshot=$snapshot" \
	-drive "if=pflash,format=raw,unit=0,read-only=on,file=$ovmf_code_file" \
	-drive "if=pflash,format=raw,unit=1,file=$efivars_tmpfile"
