yahoo-eng-team team mailing list archive
-
yahoo-eng-team team
-
Mailing list archive
-
Message #96454
[Bug 2125445] [NEW] Nova/Placement ignores the flavor’s trait constraints when scheduling SR-IOV vGPU devices.
Public bug reported:
Nova/Placement ignores the flavor’s trait constraints when scheduling
SR-IOV vGPU devices.
Environment
Deployment : OpenStack Epoxy 2025.1 (Kolla-Ansible)
Hypervisor node : Ubuntu 24.04, NVIDIA vGPU driver 570.148
Hardware : 10xRTX 6000 Ada cards in SR-IOV mode
nova.conf on compute :
[pci]
report_in_placement = true
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:4f:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:4f:00.5", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:4f:00.6", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:4f:00.7", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:4f:01.0", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:4f:01.1", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:52:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:53:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:56:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:57:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:ce:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d1:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d2:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d5:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d5:00.5", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d5:00.6", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d5:00.7", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d5:01.0", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d5:01.1", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d6:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_24Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_24Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d6:00.5", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_24Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_24Q", "managed":"no" }
alias = { "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "device_type":"type-VF", "name":"rtx6000-ada-48q" }
alias = { "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "device_type":"type-VF", "name":"rtx6000-ada-8q" }
alias = { "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_24Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_24Q", "device_type":"type-VF", "name":"rtx6000-ada-24q" }
[filter_scheduler]
pci_in_placement = true
enabled_filters = ComputeFilter, ComputeCapabilitiesFilter, ImagePropertiesFilter, ServerGroupAntiAffinityFilter, ServerGroupAffinityFilter, PciPassthroughFilter
available_filters = nova.scheduler.filters.all_filters
[scheduler]
allocation_candidate_request_method = post
max_placement_results = 128
[DEFAULT]
# allowing resize to same host, otherwise resize fails; it takes into account what's currently running on the host (including the resized instance's active resource usage)
allow_resize_to_same_host = true
# added because of the long time it takes to attach volumes
block_device_allocate_retries = 700
[libvirt]
volume_use_multipath = True
[compute]
# added because of the long time it takes to attach volumes
volume_attach_retry_count = 70
# added because of the long time it takes to attach volumes
volume_attach_retry_interval = 7
nova-api.conf:
[pci]
alias = { "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "device_type":"type-VF", "name":"rtx6000-ada-48q" }
alias = { "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "device_type":"type-VF", "name":"rtx6000-ada-8q" }
alias = { "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_24Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_24Q", "device_type":"type-VF", "name":"rtx6000-ada-24q" }
placement.conf:
[api]
placement_log_debug = true
[placement]
max_allocation_candidates = 1024
allocation_candidates_generation_strategy = breadth-first
Flavour:
openstack flavor create g1.8q --private \
--ram 4096 --vcpu 4 --disk 0 \
--property "pci_passthrough:alias"="rtx6000-ada-8q:1"
openstack flavor set --project admin g1.8q
Each PF is enabled for VFs, and the current_vgpu_type of each VF is set
on every boot to the corresponding profile shown above.
Steps to reproduce:
1.) I loaded a base QEMU Ubuntu image with vGPU guest drivers pre-installed
2.) Running nvidia-smi shows that three rtx6000-ada-8q devices are allocated instead of one
Is this a config issue? Am I missing something?
** Affects: nova
Importance: Undecided
Status: New
--
You received this bug notification because you are a member of Yahoo!
Engineering Team, which is subscribed to OpenStack Compute (nova).
https://bugs.launchpad.net/bugs/2125445
Title:
Nova/Placement ignores the flavor’s trait constraints when scheduling
SR-IOV vGPU devices.
Status in OpenStack Compute (nova):
New
Bug description:
Nova/Placement ignores the flavor’s trait constraints when scheduling
SR-IOV vGPU devices.
Environment
Deployment : OpenStack Epoxy 2025.1 (Kolla-Ansible)
Hypervisor node : Ubuntu 24.04, NVIDIA vGPU driver 570.148
Hardware : 10xRTX 6000 Ada cards in SR-IOV mode
nova.conf on compute :
[pci]
report_in_placement = true
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:4f:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:4f:00.5", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:4f:00.6", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:4f:00.7", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:4f:01.0", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:4f:01.1", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:52:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:53:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:56:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:57:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:ce:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d1:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d2:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d5:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d5:00.5", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d5:00.6", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d5:00.7", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d5:01.0", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d5:01.1", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d6:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_24Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_24Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d6:00.5", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_24Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_24Q", "managed":"no" }
alias = { "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "device_type":"type-VF", "name":"rtx6000-ada-48q" }
alias = { "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "device_type":"type-VF", "name":"rtx6000-ada-8q" }
alias = { "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_24Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_24Q", "device_type":"type-VF", "name":"rtx6000-ada-24q" }
[filter_scheduler]
pci_in_placement = true
enabled_filters = ComputeFilter, ComputeCapabilitiesFilter, ImagePropertiesFilter, ServerGroupAntiAffinityFilter, ServerGroupAffinityFilter, PciPassthroughFilter
available_filters = nova.scheduler.filters.all_filters
[scheduler]
allocation_candidate_request_method = post
max_placement_results = 128
[DEFAULT]
# allowing resize to same host, otherwise resize fails; it takes into account what's currently running on the host (including the resized instance's active resource usage)
allow_resize_to_same_host = true
# added because of the long time it takes to attach volumes
block_device_allocate_retries = 700
[libvirt]
volume_use_multipath = True
[compute]
# added because of the long time it takes to attach volumes
volume_attach_retry_count = 70
# added because of the long time it takes to attach volumes
volume_attach_retry_interval = 7
nova-api.conf:
[pci]
alias = { "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "device_type":"type-VF", "name":"rtx6000-ada-48q" }
alias = { "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "device_type":"type-VF", "name":"rtx6000-ada-8q" }
alias = { "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_24Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_24Q", "device_type":"type-VF", "name":"rtx6000-ada-24q" }
placement.conf:
[api]
placement_log_debug = true
[placement]
max_allocation_candidates = 1024
allocation_candidates_generation_strategy = breadth-first
Flavour:
openstack flavor create g1.8q --private \
--ram 4096 --vcpu 4 --disk 0 \
--property "pci_passthrough:alias"="rtx6000-ada-8q:1"
openstack flavor set --project admin g1.8q
Each PF is enabled for VFs, and the current_vgpu_type of each VF is set
on every boot to the corresponding profile shown above.
Steps to reproduce:
1.) I loaded a base QEMU Ubuntu image with vGPU guest drivers pre-installed
2.) Running nvidia-smi shows that three rtx6000-ada-8q devices are allocated instead of one
Is this a config issue? Am I missing something?
To manage notifications about this bug go to:
https://bugs.launchpad.net/nova/+bug/2125445/+subscriptions
Follow ups