From mboxrd@z Thu Jan 1 00:00:00 1970
From: Luis Chamberlain
To: Chuck Lever, Daniel Gomez, kdevops@lists.linux.dev
Cc: Luis Chamberlain
Subject: [PATCH v3 2/3] aws: enable GPU AMI support for GPU instances
Date: Mon, 8 Sep 2025 17:56:42 -0700
Message-ID: <20250909005644.798127-3-mcgrof@kernel.org>
In-Reply-To: <20250909005644.798127-1-mcgrof@kernel.org>
References: <20250909005644.798127-1-mcgrof@kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit

Add support for using GPU-optimized Amazon Machine Images (AMIs) when
deploying GPU instances on AWS. This enables automatic selection of
Deep Learning AMIs with pre-installed NVIDIA drivers, the CUDA toolkit,
and ML frameworks for GPU-accelerated workloads.
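For illustration, with CONFIG_TERRAFORM_AWS_USE_GPU_AMI=y the
terraform.tfvars.j2 change in this patch renders the AMI search
variables from the GPU settings; a rendered fragment might look like
this (the instance type value is illustrative):

```
aws_name_search   = "Deep Learning OSS Nvidia Driver AMI GPU PyTorch*Ubuntu 22.04*"
aws_ami_owner     = "amazon"
aws_instance_type = "g6e.2xlarge"
```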
Key changes:

- Add CONFIG_TERRAFORM_AWS_USE_GPU_AMI to enable GPU AMI selection
- Support GPU AMI name/owner configuration via Kconfig
- Default to Deep Learning OSS Nvidia Driver AMI with PyTorch for Ubuntu 22.04
- Update terraform.tfvars template to conditionally use GPU AMI settings
- Add aws-gpu-g6e-ai defconfig for G6E.2xlarge with GPU AMI
- Generate GPU AMI Kconfig options dynamically via AWS API
- Provide fallback GPU AMI defaults when AWS CLI unavailable

The aws-gpu-g6e-ai defconfig demonstrates usage with:

- G6E.2xlarge instance (8 vCPUs, 32GB RAM, NVIDIA L40S GPU)
- Deep Learning AMI with NVIDIA drivers 535+, CUDA 12.x, PyTorch
- 200GB storage for datasets and models

Generated-by: Claude AI
Signed-off-by: Luis Chamberlain
---
 defconfigs/aws-gpu-g6e-ai                     | 40 +++++++++++++++++++
 .../templates/aws/terraform.tfvars.j2         |  5 +++
 scripts/aws_api.py                            |  4 +-
 scripts/dynamic-cloud-kconfig.Makefile        |  5 +++
 4 files changed, 52 insertions(+), 2 deletions(-)
 create mode 100644 defconfigs/aws-gpu-g6e-ai

diff --git a/defconfigs/aws-gpu-g6e-ai b/defconfigs/aws-gpu-g6e-ai
new file mode 100644
index 00000000..a168bbc3
--- /dev/null
+++ b/defconfigs/aws-gpu-g6e-ai
@@ -0,0 +1,40 @@
+# AWS G6e.2xlarge GPU instance with Deep Learning AMI for AI/ML workloads
+# This configuration sets up an AWS G6e.2xlarge instance with NVIDIA L40S GPU
+# optimized for machine learning, AI inference, and GPU-accelerated workloads
+
+# Cloud provider configuration
+CONFIG_KDEVOPS_ENABLE_TERRAFORM=y
+CONFIG_TERRAFORM=y
+CONFIG_TERRAFORM_AWS=y
+
+
+# AWS Instance configuration - G6E family with NVIDIA L40S GPU
+# G6E.2XLARGE specifications:
+# - 8 vCPUs (3rd Gen AMD EPYC processors)
+# - 32 GB system RAM
+# - 1x NVIDIA L40S Tensor Core GPU
+# - 48 GB GPU memory
+# - Up to 15 Gbps network performance
+# - Up to 10 Gbps EBS bandwidth
+CONFIG_TERRAFORM_AWS_INSTANCE_TYPE_G6E=y
+CONFIG_TERRAFORM_AWS_INSTANCE_G6E_2XLARGE=y
+
+# AWS Region - US East (N. Virginia) - primary availability for G6E
+CONFIG_TERRAFORM_AWS_REGION_US_EAST_1=y
+
+# GPU-optimized Deep Learning AMI
+# Includes: NVIDIA drivers 535+, CUDA 12.x, cuDNN, TensorFlow, PyTorch, MXNet
+CONFIG_TERRAFORM_AWS_USE_GPU_AMI=y
+CONFIG_TERRAFORM_AWS_GPU_AMI_DEEP_LEARNING=y
+CONFIG_TERRAFORM_AWS_GPU_AMI_NAME="Deep Learning OSS Nvidia Driver AMI GPU PyTorch*Ubuntu 22.04*"
+CONFIG_TERRAFORM_AWS_GPU_AMI_OWNER="amazon"
+
+# Storage configuration optimized for ML workloads
+# 200 GB for datasets, models, and experiment artifacts
+CONFIG_TERRAFORM_AWS_DATA_VOLUME_SIZE=200
+
+# Note: After provisioning, the instance will have:
+# - Jupyter notebook server ready for ML experiments
+# - Pre-installed deep learning frameworks
+# - NVIDIA GPU drivers and CUDA toolkit
+# - Docker with NVIDIA Container Toolkit for containerized ML workloads
diff --git a/playbooks/roles/gen_tfvars/templates/aws/terraform.tfvars.j2 b/playbooks/roles/gen_tfvars/templates/aws/terraform.tfvars.j2
index d880254b..f8f4c842 100644
--- a/playbooks/roles/gen_tfvars/templates/aws/terraform.tfvars.j2
+++ b/playbooks/roles/gen_tfvars/templates/aws/terraform.tfvars.j2
@@ -1,8 +1,13 @@
 aws_profile = "{{ terraform_aws_profile }}"
 aws_region = "{{ terraform_aws_region }}"
 aws_availability_zone = "{{ terraform_aws_av_zone }}"
+{% if terraform_aws_use_gpu_ami is defined and terraform_aws_use_gpu_ami %}
+aws_name_search = "{{ terraform_aws_gpu_ami_name }}"
+aws_ami_owner = "{{ terraform_aws_gpu_ami_owner }}"
+{% else %}
 aws_name_search = "{{ terraform_aws_ns }}"
 aws_ami_owner = "{{ terraform_aws_ami_owner }}"
+{% endif %}
 aws_instance_type = "{{ terraform_aws_instance_type }}"
 aws_ebs_volumes_per_instance = "{{ terraform_aws_ebs_volumes_per_instance }}"
 aws_ebs_volume_size = {{ terraform_aws_ebs_volume_size }}
diff --git a/scripts/aws_api.py b/scripts/aws_api.py
index a9180b31..fe66b3e0 100755
--- a/scripts/aws_api.py
+++ b/scripts/aws_api.py
@@ -956,7 +956,7 @@ if TERRAFORM_AWS_GPU_AMI_DEEP_LEARNING
 config TERRAFORM_AWS_GPU_AMI_NAME
 	string
 	output yaml
-	default "Deep Learning AMI GPU TensorFlow*"
+	default "Deep Learning OSS Nvidia Driver AMI GPU PyTorch*Ubuntu 22.04*"
 	help
 	  AMI name pattern for AWS Deep Learning AMI.
@@ -1061,7 +1061,7 @@ if TERRAFORM_AWS_GPU_AMI_DEEP_LEARNING
 config TERRAFORM_AWS_GPU_AMI_NAME
 	string
 	output yaml
-	default "Deep Learning AMI GPU TensorFlow*"
+	default "Deep Learning OSS Nvidia Driver AMI GPU PyTorch*Ubuntu 22.04*"
 
 config TERRAFORM_AWS_GPU_AMI_OWNER
 	string
diff --git a/scripts/dynamic-cloud-kconfig.Makefile b/scripts/dynamic-cloud-kconfig.Makefile
index 9c9d718e..dbcda506 100644
--- a/scripts/dynamic-cloud-kconfig.Makefile
+++ b/scripts/dynamic-cloud-kconfig.Makefile
@@ -110,6 +110,11 @@ cloud-update:
 		sed -i 's/Kconfig\.\([^.]*\)\.generated/Kconfig.\1.static/g' $(AWS_KCONFIG_DIR)/Kconfig.location.static; \
 		echo " Created $(AWS_KCONFIG_DIR)/Kconfig.location.static"; \
 	fi
+	$(Q)if [ -f $(AWS_KCONFIG_DIR)/Kconfig.gpu-amis.generated ]; then \
+		cp $(AWS_KCONFIG_DIR)/Kconfig.gpu-amis.generated $(AWS_KCONFIG_DIR)/Kconfig.gpu-amis.static; \
+		sed -i 's/Kconfig\.\([^.]*\)\.generated/Kconfig.\1.static/g' $(AWS_KCONFIG_DIR)/Kconfig.gpu-amis.static; \
+		echo " Created $(AWS_KCONFIG_DIR)/Kconfig.gpu-amis.static"; \
+	fi
 	# AWS instance type families
 	$(Q)for file in $(AWS_INSTANCE_TYPES_DIR)/Kconfig.*.generated; do \
 		if [ -f "$$file" ]; then \
-- 
2.50.1
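[Editor's aside: the Jinja2 conditional this patch adds to terraform.tfvars.j2
reduces to a small selection rule. A minimal Python sketch of that rule, using
hypothetical dict keys that mirror the Ansible variable names; the non-GPU
fallback values below are purely illustrative:]

```python
def select_ami_search(cfg: dict) -> tuple:
    """Mirror the terraform.tfvars.j2 conditional: prefer the GPU AMI
    name/owner when terraform_aws_use_gpu_ami is set and truthy, else
    fall back to the default AMI search pattern and owner."""
    if cfg.get("terraform_aws_use_gpu_ami"):
        return (cfg["terraform_aws_gpu_ami_name"],
                cfg["terraform_aws_gpu_ami_owner"])
    return (cfg["terraform_aws_ns"], cfg["terraform_aws_ami_owner"])


# GPU AMI enabled, as in the aws-gpu-g6e-ai defconfig
gpu_cfg = {
    "terraform_aws_use_gpu_ami": True,
    "terraform_aws_gpu_ami_name":
        "Deep Learning OSS Nvidia Driver AMI GPU PyTorch*Ubuntu 22.04*",
    "terraform_aws_gpu_ami_owner": "amazon",
    "terraform_aws_ns": "some-default-ami-*",   # illustrative
    "terraform_aws_ami_owner": "123456789012",  # illustrative
}
name, owner = select_ami_search(gpu_cfg)
print(owner)  # amazon
```

[The `is defined` guard in the template serves the same purpose as
`dict.get()` here: hosts without the new variable keep the old behavior.]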