From mboxrd@z Thu Jan  1 00:00:00 1970
Received: from bombadil.infradead.org (bombadil.infradead.org [198.137.202.133])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by smtp.subspace.kernel.org (Postfix) with ESMTPS id 0FD3272605
	for <kdevops@lists.linux.dev>; Sun,  7 Sep 2025 04:23:26 +0000 (UTC)
Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.137.202.133
ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116;
	t=1757219008; cv=none; b=fvT/z7flM5CDjNJpN5Ulz4bvKuU61Xef8M3oXBkeBtbE5Ho3gNpi8IvNn+VgT6CubWBpm+BHJrtVEKMZczMTwdCHQFo6OWF+vL0tdJmx9sa879a/t+BrAsASqfHnBvSktPINB9HrWkFpk3qHgk+td/Lg1yVTQCti3fbSN/BosfU=
ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org;
	s=arc-20240116; t=1757219008; c=relaxed/simple;
	bh=2wtCJELlzuqLP3fryliQ0KnFgY2N9TJiL1kGCKtNyew=;
	h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References:
	 MIME-Version; b=IpiYNYR7huynlvZbYe1jHOgB/fAuygW0pfVVsQJtzEp4WimPIIZpBGBBtGyqwq1fVshqoXetDq09S/Z3PG5sXS3nLjI7iuhJfMnCWFOBM27j8ZDrcYTb0poJUYBol4Jqd88UZ/CXYct0BwLURP0b6aGY8isM4SKOdqtOdc6QxSQ=
ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=fail (p=quarantine dis=none) header.from=kernel.org; spf=none smtp.mailfrom=infradead.org; dkim=pass (2048-bit key) header.d=infradead.org header.i=@infradead.org header.b=TBbJzQxY; arc=none smtp.client-ip=198.137.202.133
Authentication-Results: smtp.subspace.kernel.org; dmarc=fail (p=quarantine dis=none) header.from=kernel.org
Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=infradead.org
Authentication-Results: smtp.subspace.kernel.org;
	dkim=pass (2048-bit key) header.d=infradead.org header.i=@infradead.org header.b="TBbJzQxY"
DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed;
	d=infradead.org; s=bombadil.20210309; h=Sender:Content-Transfer-Encoding:
	MIME-Version:References:In-Reply-To:Message-ID:Date:Subject:Cc:To:From:
	Reply-To:Content-Type:Content-ID:Content-Description;
	bh=Z7PkhE7yZF9wzjUxQFi2C58THLu/cB779am8azAyMTY=; b=TBbJzQxY1cm3FMiUC+cj0EGsL0
	W9xGbzc6xKh2p1Lcp9lfwDEHU0W9IJiREfoDsoTK1S5LuxYDjZv7l+Vm3ltRmom4dm0E/1Bbh97j3
	uoGRjWnEArL1TzKwfSWHxzbXHflp4T/gI/On2M22q71GS1WRETVYu/8YKHIj7bicQNHXd+1lfqVu+
	5l/6b3Ntykpw7T2FhZyBGlbed1XIqA4myNbTq0SAIfWJBUfdqxnUkGhYYXiue/gDugfbL5KSxtRGg
	xJYdXzi8pxGdAjP8s7poGdtysQ84/G00xB+wl8EfCQmDBrJXfjUiTS0nInLjWyT+AGqSrK6bOvAR7
	Pb2cAQWw==;
Received: from mcgrof by bombadil.infradead.org with local (Exim 4.98.2 #2 (Red Hat Linux))
	id 1uv6w2-00000009Lq5-1klM;
	Sun, 07 Sep 2025 04:23:26 +0000
From: Luis Chamberlain <mcgrof@kernel.org>
To: Chuck Lever <cel@kernel.org>,
	Daniel Gomez <da.gomez@kruces.com>,
	kdevops@lists.linux.dev
Cc: Luis Chamberlain <mcgrof@kernel.org>
Subject: [PATCH 2/2] aws: enable GPU AMI support for GPU instances
Date: Sat,  6 Sep 2025 21:23:23 -0700
Message-ID: <20250907042325.2228868-3-mcgrof@kernel.org>
X-Mailer: git-send-email 2.49.0
In-Reply-To: <20250907042325.2228868-1-mcgrof@kernel.org>
References: <20250907042325.2228868-1-mcgrof@kernel.org>
Precedence: bulk
X-Mailing-List: kdevops@lists.linux.dev
List-Id: <kdevops.lists.linux.dev>
List-Subscribe: <mailto:kdevops+subscribe@lists.linux.dev>
List-Unsubscribe: <mailto:kdevops+unsubscribe@lists.linux.dev>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
Sender: Luis Chamberlain <mcgrof@infradead.org>

Add support for GPU-optimized AMIs when using GPU instance types.
This includes:

- AWS Deep Learning AMI with pre-installed NVIDIA drivers, CUDA, and
  ML frameworks
- NVIDIA Deep Learning AMI option for NGC containers
- Custom GPU AMI support for specialized images
- Automatic detection of GPU instance types
- Conditional display of GPU AMI options only for GPU instances
- Update terraform.tfvars template to use GPU AMI when configured
- Add defconfig for AWS G6e.2xlarge GPU instance with Deep Learning AMI

The system automatically detects when you select a GPU instance family
(like G6E) and provides appropriate GPU-optimized AMI options including
the AWS Deep Learning AMI with all necessary drivers and frameworks
pre-installed.

Generated-by: Claude AI
Signed-off-by: Luis Chamberlain <mcgrof@kernel.org>
---
 defconfigs/aws-gpu-g6e-ai                     | 42 +++++++++++++++++++
 .../templates/aws/terraform.tfvars.j2         |  5 +++
 scripts/aws_api.py                            |  4 +-
 scripts/dynamic-cloud-kconfig.Makefile        |  6 +++
 terraform/aws/kconfigs/Kconfig.compute        |  5 +++
 5 files changed, 60 insertions(+), 2 deletions(-)
 create mode 100644 defconfigs/aws-gpu-g6e-ai

diff --git a/defconfigs/aws-gpu-g6e-ai b/defconfigs/aws-gpu-g6e-ai
new file mode 100644
index 00000000..028b0c5e
--- /dev/null
+++ b/defconfigs/aws-gpu-g6e-ai
@@ -0,0 +1,42 @@
+# AWS G6e.2xlarge GPU instance with Deep Learning AMI for AI/ML workloads
+# This configuration sets up an AWS G6e.2xlarge instance with NVIDIA L40S GPU
+# optimized for machine learning, AI inference, and GPU-accelerated workloads
+
+# Cloud provider configuration
+CONFIG_KDEVOPS_ENABLE_TERRAFORM=y
+CONFIG_TERRAFORM=y
+CONFIG_TERRAFORM_AWS=y
+
+# AWS Dynamic configuration (required for G6E instance family and GPU AMIs)
+CONFIG_TERRAFORM_AWS_USE_DYNAMIC_CONFIG=y
+
+# AWS Instance configuration - G6E family with NVIDIA L40S GPU
+# G6E.2XLARGE specifications:
+# - 8 vCPUs (3rd Gen AMD EPYC processors)
+# - 64 GiB system RAM
+# - 1x NVIDIA L40S Tensor Core GPU
+# - 48 GB GPU memory
+# - Up to 20 Gbps network performance
+# - Up to 5 Gbps EBS bandwidth
+CONFIG_TERRAFORM_AWS_INSTANCE_TYPE_G6E=y
+CONFIG_TERRAFORM_AWS_INSTANCE_G6E_2XLARGE=y
+
+# AWS Region - US East (N. Virginia) - primary availability for G6E
+CONFIG_TERRAFORM_AWS_REGION_US_EAST_1=y
+
+# GPU-optimized Deep Learning AMI
+# Includes: NVIDIA drivers 535+, CUDA 12.x, cuDNN, PyTorch (per the AMI name below)
+CONFIG_TERRAFORM_AWS_USE_GPU_AMI=y
+CONFIG_TERRAFORM_AWS_GPU_AMI_DEEP_LEARNING=y
+CONFIG_TERRAFORM_AWS_GPU_AMI_NAME="Deep Learning OSS Nvidia Driver AMI GPU PyTorch*Ubuntu 22.04*"
+CONFIG_TERRAFORM_AWS_GPU_AMI_OWNER="amazon"
+
+# Storage configuration optimized for ML workloads
+# 200 GB for datasets, models, and experiment artifacts
+CONFIG_TERRAFORM_AWS_DATA_VOLUME_SIZE=200
+
+# Note: After provisioning, the instance will have:
+# - Jupyter notebook server ready for ML experiments
+# - Pre-installed deep learning frameworks
+# - NVIDIA GPU drivers and CUDA toolkit
+# - Docker with NVIDIA Container Toolkit for containerized ML workloads
diff --git a/playbooks/roles/gen_tfvars/templates/aws/terraform.tfvars.j2 b/playbooks/roles/gen_tfvars/templates/aws/terraform.tfvars.j2
index d880254b..f8f4c842 100644
--- a/playbooks/roles/gen_tfvars/templates/aws/terraform.tfvars.j2
+++ b/playbooks/roles/gen_tfvars/templates/aws/terraform.tfvars.j2
@@ -1,8 +1,13 @@
 aws_profile = "{{ terraform_aws_profile }}"
 aws_region = "{{ terraform_aws_region }}"
 aws_availability_zone = "{{ terraform_aws_av_zone }}"
+{% if terraform_aws_use_gpu_ami is defined and terraform_aws_use_gpu_ami %}
+aws_name_search = "{{ terraform_aws_gpu_ami_name }}"
+aws_ami_owner = "{{ terraform_aws_gpu_ami_owner }}"
+{% else %}
 aws_name_search = "{{ terraform_aws_ns }}"
 aws_ami_owner = "{{ terraform_aws_ami_owner }}"
+{% endif %}
 aws_instance_type = "{{ terraform_aws_instance_type }}"
 aws_ebs_volumes_per_instance = "{{ terraform_aws_ebs_volumes_per_instance }}"
 aws_ebs_volume_size = {{ terraform_aws_ebs_volume_size }}
diff --git a/scripts/aws_api.py b/scripts/aws_api.py
index e23acaa9..b22da559 100755
--- a/scripts/aws_api.py
+++ b/scripts/aws_api.py
@@ -956,7 +956,7 @@ if TERRAFORM_AWS_GPU_AMI_DEEP_LEARNING
 config TERRAFORM_AWS_GPU_AMI_NAME
 	string
 	output yaml
-	default "Deep Learning AMI GPU TensorFlow*"
+	default "Deep Learning OSS Nvidia Driver AMI GPU PyTorch*Ubuntu 22.04*"
 	help
 	  AMI name pattern for AWS Deep Learning AMI.
 
@@ -1061,7 +1061,7 @@ if TERRAFORM_AWS_GPU_AMI_DEEP_LEARNING
 config TERRAFORM_AWS_GPU_AMI_NAME
 	string
 	output yaml
-	default "Deep Learning AMI GPU TensorFlow*"
+	default "Deep Learning OSS Nvidia Driver AMI GPU PyTorch*Ubuntu 22.04*"
 
 config TERRAFORM_AWS_GPU_AMI_OWNER
 	string
diff --git a/scripts/dynamic-cloud-kconfig.Makefile b/scripts/dynamic-cloud-kconfig.Makefile
index fffa5446..c2d187bf 100644
--- a/scripts/dynamic-cloud-kconfig.Makefile
+++ b/scripts/dynamic-cloud-kconfig.Makefile
@@ -45,6 +45,7 @@ dynamic_aws_kconfig_touch:
 	$(Q)touch $(AWS_KCONFIG_DIR)/Kconfig.gpu-amis.generated
 	$(Q)touch $(AWS_KCONFIG_DIR)/Kconfig.compute.static
 	$(Q)touch $(AWS_KCONFIG_DIR)/Kconfig.location.static
+	$(Q)touch $(AWS_KCONFIG_DIR)/Kconfig.gpu-amis.static
 	$(Q)for family in $(AWS_INSTANCE_TYPE_FAMILIES); do \
 		touch $(AWS_INSTANCE_TYPES_DIR)/Kconfig.$$family.generated; \
 		touch $(AWS_INSTANCE_TYPES_DIR)/Kconfig.$$family.static; \
@@ -117,6 +118,11 @@ cloud-update:
 		sed -i 's/Kconfig\.\([^.]*\)\.generated/Kconfig.\1.static/g' $(AWS_KCONFIG_DIR)/Kconfig.location.static; \
 		echo "  Created $(AWS_KCONFIG_DIR)/Kconfig.location.static"; \
 	fi
+	$(Q)if [ -f $(AWS_KCONFIG_DIR)/Kconfig.gpu-amis.generated ]; then \
+		cp $(AWS_KCONFIG_DIR)/Kconfig.gpu-amis.generated $(AWS_KCONFIG_DIR)/Kconfig.gpu-amis.static; \
+		sed -i 's/Kconfig\.\([^.]*\)\.generated/Kconfig.\1.static/g' $(AWS_KCONFIG_DIR)/Kconfig.gpu-amis.static; \
+		echo "  Created $(AWS_KCONFIG_DIR)/Kconfig.gpu-amis.static"; \
+	fi
 	# AWS instance type families
 	$(Q)for file in $(AWS_INSTANCE_TYPES_DIR)/Kconfig.*.generated; do \
 		if [ -f "$$file" ]; then \
diff --git a/terraform/aws/kconfigs/Kconfig.compute b/terraform/aws/kconfigs/Kconfig.compute
index 12083d1a..6b5ff900 100644
--- a/terraform/aws/kconfigs/Kconfig.compute
+++ b/terraform/aws/kconfigs/Kconfig.compute
@@ -80,3 +80,8 @@ source "terraform/aws/kconfigs/distros/Kconfig.oracle"
 source "terraform/aws/kconfigs/distros/Kconfig.rhel"
 source "terraform/aws/kconfigs/distros/Kconfig.sles"
 source "terraform/aws/kconfigs/distros/Kconfig.custom"
+
+# Include GPU AMI configuration (static copy of the generated file, see cloud-update)
+if TERRAFORM_AWS_USE_DYNAMIC_CONFIG
+source "terraform/aws/kconfigs/Kconfig.gpu-amis.static"
+endif
-- 
2.50.1