From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from bombadil.infradead.org (bombadil.infradead.org [198.137.202.133]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 0FD3272605 for ; Sun, 7 Sep 2025 04:23:26 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.137.202.133 ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1757219008; cv=none; b=fvT/z7flM5CDjNJpN5Ulz4bvKuU61Xef8M3oXBkeBtbE5Ho3gNpi8IvNn+VgT6CubWBpm+BHJrtVEKMZczMTwdCHQFo6OWF+vL0tdJmx9sa879a/t+BrAsASqfHnBvSktPINB9HrWkFpk3qHgk+td/Lg1yVTQCti3fbSN/BosfU= ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1757219008; c=relaxed/simple; bh=2wtCJELlzuqLP3fryliQ0KnFgY2N9TJiL1kGCKtNyew=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=IpiYNYR7huynlvZbYe1jHOgB/fAuygW0pfVVsQJtzEp4WimPIIZpBGBBtGyqwq1fVshqoXetDq09S/Z3PG5sXS3nLjI7iuhJfMnCWFOBM27j8ZDrcYTb0poJUYBol4Jqd88UZ/CXYct0BwLURP0b6aGY8isM4SKOdqtOdc6QxSQ= ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=fail (p=quarantine dis=none) header.from=kernel.org; spf=none smtp.mailfrom=infradead.org; dkim=pass (2048-bit key) header.d=infradead.org header.i=@infradead.org header.b=TBbJzQxY; arc=none smtp.client-ip=198.137.202.133 Authentication-Results: smtp.subspace.kernel.org; dmarc=fail (p=quarantine dis=none) header.from=kernel.org Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=infradead.org Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=infradead.org header.i=@infradead.org header.b="TBbJzQxY" DKIM-Signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=infradead.org; s=bombadil.20210309; h=Sender:Content-Transfer-Encoding: MIME-Version:References:In-Reply-To:Message-ID:Date:Subject:Cc:To:From: Reply-To:Content-Type:Content-ID:Content-Description; 
bh=Z7PkhE7yZF9wzjUxQFi2C58THLu/cB779am8azAyMTY=; b=TBbJzQxY1cm3FMiUC+cj0EGsL0 W9xGbzc6xKh2p1Lcp9lfwDEHU0W9IJiREfoDsoTK1S5LuxYDjZv7l+Vm3ltRmom4dm0E/1Bbh97j3 uoGRjWnEArL1TzKwfSWHxzbXHflp4T/gI/On2M22q71GS1WRETVYu/8YKHIj7bicQNHXd+1lfqVu+ 5l/6b3Ntykpw7T2FhZyBGlbed1XIqA4myNbTq0SAIfWJBUfdqxnUkGhYYXiue/gDugfbL5KSxtRGg xJYdXzi8pxGdAjP8s7poGdtysQ84/G00xB+wl8EfCQmDBrJXfjUiTS0nInLjWyT+AGqSrK6bOvAR7 Pb2cAQWw==; Received: from mcgrof by bombadil.infradead.org with local (Exim 4.98.2 #2 (Red Hat Linux)) id 1uv6w2-00000009Lq5-1klM; Sun, 07 Sep 2025 04:23:26 +0000 From: Luis Chamberlain To: Chuck Lever , Daniel Gomez , kdevops@lists.linux.dev Cc: Luis Chamberlain Subject: [PATCH 2/2] aws: enable GPU AMI support for GPU instances Date: Sat, 6 Sep 2025 21:23:23 -0700 Message-ID: <20250907042325.2228868-3-mcgrof@kernel.org> X-Mailer: git-send-email 2.49.0 In-Reply-To: <20250907042325.2228868-1-mcgrof@kernel.org> References: <20250907042325.2228868-1-mcgrof@kernel.org> Precedence: bulk X-Mailing-List: kdevops@lists.linux.dev List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Sender: Luis Chamberlain Add support for GPU-optimized AMIs when using GPU instance types. This includes: - AWS Deep Learning AMI with pre-installed NVIDIA drivers, CUDA, and ML frameworks - NVIDIA Deep Learning AMI option for NGC containers - Custom GPU AMI support for specialized images - Automatic detection of GPU instance types - Conditional display of GPU AMI options only for GPU instances - Update terraform.tfvars template to use GPU AMI when configured - Add defconfig for AWS G6e.2xlarge GPU instance with Deep Learning AMI The system automatically detects when you select a GPU instance family (like G6E) and provides appropriate GPU-optimized AMI options including the AWS Deep Learning AMI with all necessary drivers and frameworks pre-installed. 
Generated-by: Claude AI Signed-off-by: Luis Chamberlain --- defconfigs/aws-gpu-g6e-ai | 42 +++++++++++++++++++ .../templates/aws/terraform.tfvars.j2 | 5 +++ scripts/aws_api.py | 4 +- scripts/dynamic-cloud-kconfig.Makefile | 6 +++ terraform/aws/kconfigs/Kconfig.compute | 5 +++ 5 files changed, 60 insertions(+), 2 deletions(-) create mode 100644 defconfigs/aws-gpu-g6e-ai diff --git a/defconfigs/aws-gpu-g6e-ai b/defconfigs/aws-gpu-g6e-ai new file mode 100644 index 00000000..028b0c5e --- /dev/null +++ b/defconfigs/aws-gpu-g6e-ai @@ -0,0 +1,42 @@ +# AWS G6e.2xlarge GPU instance with Deep Learning AMI for AI/ML workloads +# This configuration sets up an AWS G6e.2xlarge instance with NVIDIA L40S GPU +# optimized for machine learning, AI inference, and GPU-accelerated workloads + +# Cloud provider configuration +CONFIG_KDEVOPS_ENABLE_TERRAFORM=y +CONFIG_TERRAFORM=y +CONFIG_TERRAFORM_AWS=y + +# AWS Dynamic configuration (required for G6E instance family and GPU AMIs) +CONFIG_TERRAFORM_AWS_USE_DYNAMIC_CONFIG=y + +# AWS Instance configuration - G6E family with NVIDIA L40S GPU +# G6E.2XLARGE specifications: +# - 8 vCPUs (3rd Gen AMD EPYC processors) +# - 32 GB system RAM +# - 1x NVIDIA L40S Tensor Core GPU +# - 48 GB GPU memory +# - Up to 15 Gbps network performance +# - Up to 10 Gbps EBS bandwidth +CONFIG_TERRAFORM_AWS_INSTANCE_TYPE_G6E=y +CONFIG_TERRAFORM_AWS_INSTANCE_G6E_2XLARGE=y + +# AWS Region - US East (N. 
Virginia) - primary availability for G6E +CONFIG_TERRAFORM_AWS_REGION_US_EAST_1=y + +# GPU-optimized Deep Learning AMI +# Includes: NVIDIA drivers 535+, CUDA 12.x, cuDNN, TensorFlow, PyTorch, MXNet +CONFIG_TERRAFORM_AWS_USE_GPU_AMI=y +CONFIG_TERRAFORM_AWS_GPU_AMI_DEEP_LEARNING=y +CONFIG_TERRAFORM_AWS_GPU_AMI_NAME="Deep Learning OSS Nvidia Driver AMI GPU PyTorch*Ubuntu 22.04*" +CONFIG_TERRAFORM_AWS_GPU_AMI_OWNER="amazon" + +# Storage configuration optimized for ML workloads +# 200 GB for datasets, models, and experiment artifacts +CONFIG_TERRAFORM_AWS_DATA_VOLUME_SIZE=200 + +# Note: After provisioning, the instance will have: +# - Jupyter notebook server ready for ML experiments +# - Pre-installed deep learning frameworks +# - NVIDIA GPU drivers and CUDA toolkit +# - Docker with NVIDIA Container Toolkit for containerized ML workloads diff --git a/playbooks/roles/gen_tfvars/templates/aws/terraform.tfvars.j2 b/playbooks/roles/gen_tfvars/templates/aws/terraform.tfvars.j2 index d880254b..f8f4c842 100644 --- a/playbooks/roles/gen_tfvars/templates/aws/terraform.tfvars.j2 +++ b/playbooks/roles/gen_tfvars/templates/aws/terraform.tfvars.j2 @@ -1,8 +1,13 @@ aws_profile = "{{ terraform_aws_profile }}" aws_region = "{{ terraform_aws_region }}" aws_availability_zone = "{{ terraform_aws_av_zone }}" +{% if terraform_aws_use_gpu_ami is defined and terraform_aws_use_gpu_ami %} +aws_name_search = "{{ terraform_aws_gpu_ami_name }}" +aws_ami_owner = "{{ terraform_aws_gpu_ami_owner }}" +{% else %} aws_name_search = "{{ terraform_aws_ns }}" aws_ami_owner = "{{ terraform_aws_ami_owner }}" +{% endif %} aws_instance_type = "{{ terraform_aws_instance_type }}" aws_ebs_volumes_per_instance = "{{ terraform_aws_ebs_volumes_per_instance }}" aws_ebs_volume_size = {{ terraform_aws_ebs_volume_size }} diff --git a/scripts/aws_api.py b/scripts/aws_api.py index e23acaa9..b22da559 100755 --- a/scripts/aws_api.py +++ b/scripts/aws_api.py @@ -956,7 +956,7 @@ if TERRAFORM_AWS_GPU_AMI_DEEP_LEARNING config 
TERRAFORM_AWS_GPU_AMI_NAME string output yaml - default "Deep Learning AMI GPU TensorFlow*" + default "Deep Learning OSS Nvidia Driver AMI GPU PyTorch*Ubuntu 22.04*" help AMI name pattern for AWS Deep Learning AMI. @@ -1061,7 +1061,7 @@ if TERRAFORM_AWS_GPU_AMI_DEEP_LEARNING config TERRAFORM_AWS_GPU_AMI_NAME string output yaml - default "Deep Learning AMI GPU TensorFlow*" + default "Deep Learning OSS Nvidia Driver AMI GPU PyTorch*Ubuntu 22.04*" config TERRAFORM_AWS_GPU_AMI_OWNER string diff --git a/scripts/dynamic-cloud-kconfig.Makefile b/scripts/dynamic-cloud-kconfig.Makefile index fffa5446..c2d187bf 100644 --- a/scripts/dynamic-cloud-kconfig.Makefile +++ b/scripts/dynamic-cloud-kconfig.Makefile @@ -45,6 +45,7 @@ dynamic_aws_kconfig_touch: $(Q)touch $(AWS_KCONFIG_DIR)/Kconfig.gpu-amis.generated $(Q)touch $(AWS_KCONFIG_DIR)/Kconfig.compute.static $(Q)touch $(AWS_KCONFIG_DIR)/Kconfig.location.static + $(Q)touch $(AWS_KCONFIG_DIR)/Kconfig.gpu-amis.static $(Q)for family in $(AWS_INSTANCE_TYPE_FAMILIES); do \ touch $(AWS_INSTANCE_TYPES_DIR)/Kconfig.$$family.generated; \ touch $(AWS_INSTANCE_TYPES_DIR)/Kconfig.$$family.static; \ @@ -117,6 +118,11 @@ cloud-update: sed -i 's/Kconfig\.\([^.]*\)\.generated/Kconfig.\1.static/g' $(AWS_KCONFIG_DIR)/Kconfig.location.static; \ echo " Created $(AWS_KCONFIG_DIR)/Kconfig.location.static"; \ fi + $(Q)if [ -f $(AWS_KCONFIG_DIR)/Kconfig.gpu-amis.generated ]; then \ + cp $(AWS_KCONFIG_DIR)/Kconfig.gpu-amis.generated $(AWS_KCONFIG_DIR)/Kconfig.gpu-amis.static; \ + sed -i 's/Kconfig\.\([^.]*\)\.generated/Kconfig.\1.static/g' $(AWS_KCONFIG_DIR)/Kconfig.gpu-amis.static; \ + echo " Created $(AWS_KCONFIG_DIR)/Kconfig.gpu-amis.static"; \ + fi # AWS instance type families $(Q)for file in $(AWS_INSTANCE_TYPES_DIR)/Kconfig.*.generated; do \ if [ -f "$$file" ]; then \ diff --git a/terraform/aws/kconfigs/Kconfig.compute b/terraform/aws/kconfigs/Kconfig.compute index 12083d1a..6b5ff900 100644 --- a/terraform/aws/kconfigs/Kconfig.compute +++ 
b/terraform/aws/kconfigs/Kconfig.compute @@ -80,3 +80,8 @@ source "terraform/aws/kconfigs/distros/Kconfig.oracle" source "terraform/aws/kconfigs/distros/Kconfig.rhel" source "terraform/aws/kconfigs/distros/Kconfig.sles" source "terraform/aws/kconfigs/distros/Kconfig.custom" + +# Include GPU AMI configuration when dynamic config is enabled (Kconfig.gpu-amis.static is created by the cloud-update target from Kconfig.gpu-amis.generated) +if TERRAFORM_AWS_USE_DYNAMIC_CONFIG +source "terraform/aws/kconfigs/Kconfig.gpu-amis.static" +endif -- 2.50.1