← Back to team overview

yahoo-eng-team team mailing list archive

[Bug 2125445] [NEW] Nova/Placement ignores the flavor’s trait constraints when scheduling SR-IOV vGPU devices.

 

Public bug reported:

Nova/Placement ignores the flavor’s trait constraints when scheduling
SR-IOV vGPU devices.

Environment

Deployment : OpenStack Epoxy 2025.1 (Kolla-Ansible)
Hypervisor node : Ubuntu 24.04, NVIDIA vGPU driver 570.148
Hardware : 10xRTX 6000 Ada cards in SR-IOV mode
nova.conf on compute :
[pci]
report_in_placement = true
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:4f:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:4f:00.5", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:4f:00.6", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:4f:00.7", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:4f:01.0", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:4f:01.1", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:52:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:53:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:56:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:57:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:ce:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d1:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d2:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d5:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d5:00.5", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d5:00.6", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d5:00.7", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d5:01.0", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d5:01.1", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d6:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_24Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_24Q", "managed":"no" }
device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d6:00.5", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_24Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_24Q", "managed":"no" }
alias = { "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "device_type":"type-VF", "name":"rtx6000-ada-48q" }
alias = { "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "device_type":"type-VF", "name":"rtx6000-ada-8q" }
alias = { "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_24Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_24Q", "device_type":"type-VF", "name":"rtx6000-ada-24q" }


[filter_scheduler]
pci_in_placement = true
enabled_filters = ComputeFilter, ComputeCapabilitiesFilter, ImagePropertiesFilter, ServerGroupAntiAffinityFilter, ServerGroupAffinityFilter, PciPassthroughFilter
available_filters = nova.scheduler.filters.all_filters

[scheduler]
allocation_candidate_request_method = post
max_placement_results = 128

[DEFAULT]
# allow resize to the same host, otherwise resize fails: it takes into account what's currently running on the host (including the resized instance's active resource usage)
allow_resize_to_same_host = true
# added because of the long time it takes to attach volumes
block_device_allocate_retries = 700 

[libvirt]
volume_use_multipath = True

[compute]
# added because of the long time it takes to attach volumes
volume_attach_retry_count = 70 
# added because of the long time it takes to attach volumes
volume_attach_retry_interval = 7 

nova-api.conf:
[pci]
alias = { "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "device_type":"type-VF", "name":"rtx6000-ada-48q" }
alias = { "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "device_type":"type-VF", "name":"rtx6000-ada-8q" }
alias = { "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_24Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_24Q", "device_type":"type-VF", "name":"rtx6000-ada-24q" }

placement.conf:
[api]
placement_log_debug = true

[placement]
max_allocation_candidates = 1024
allocation_candidates_generation_strategy = breadth-first

Flavour:
openstack flavor create g1.8q --private \
  --ram 4096 --vcpu 4 --disk 0 \
  --property "pci_passthrough:alias"="rtx6000-ada-8q:1"
openstack flavor set --project admin g1.8q

Each PF is enabled for VFs, and the current_vgpu_type of each VF is set
to the corresponding profile shown above at every boot.


Steps to reproduce:

1.) I loaded a base QEMU Ubuntu image with vGPU guest drivers pre-installed
2.) Running nvidia-smi shows that three rtx6000-ada-8q devices are allocated instead of one

Is this a config issue? Am I missing something?

** Affects: nova
     Importance: Undecided
         Status: New

-- 
You received this bug notification because you are a member of Yahoo!
Engineering Team, which is subscribed to OpenStack Compute (nova).
https://bugs.launchpad.net/bugs/2125445

Title:
  Nova/Placement ignores the flavor’s trait constraints when scheduling
  SR-IOV vGPU devices.

Status in OpenStack Compute (nova):
  New

Bug description:
  Nova/Placement ignores the flavor’s trait constraints when scheduling
  SR-IOV vGPU devices.

  Environment

  Deployment : OpenStack Epoxy 2025.1 (Kolla-Ansible)
  Hypervisor node : Ubuntu 24.04, NVIDIA vGPU driver 570.148
  Hardware : 10xRTX 6000 Ada cards in SR-IOV mode
  nova.conf on compute :
  [pci]
  report_in_placement = true
  device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:4f:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
  device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:4f:00.5", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
  device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:4f:00.6", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
  device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:4f:00.7", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
  device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:4f:01.0", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
  device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:4f:01.1", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
  device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:52:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "managed":"no" }
  device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:53:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "managed":"no" }
  device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:56:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "managed":"no" }
  device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:57:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "managed":"no" }
  device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:ce:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "managed":"no" }
  device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d1:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "managed":"no" }
  device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d2:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "managed":"no" }
  device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d5:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
  device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d5:00.5", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
  device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d5:00.6", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
  device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d5:00.7", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
  device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d5:01.0", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
  device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d5:01.1", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "managed":"no" }
  device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d6:00.4", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_24Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_24Q", "managed":"no" }
  device_spec = { "vendor_id":"10de", "product_id":"26b1", "address":"0000:d6:00.5", "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_24Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_24Q", "managed":"no" }
  alias = { "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "device_type":"type-VF", "name":"rtx6000-ada-48q" }
  alias = { "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "device_type":"type-VF", "name":"rtx6000-ada-8q" }
  alias = { "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_24Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_24Q", "device_type":"type-VF", "name":"rtx6000-ada-24q" }

  
  [filter_scheduler]
  pci_in_placement = true
  enabled_filters = ComputeFilter, ComputeCapabilitiesFilter, ImagePropertiesFilter, ServerGroupAntiAffinityFilter, ServerGroupAffinityFilter, PciPassthroughFilter
  available_filters = nova.scheduler.filters.all_filters

  [scheduler]
  allocation_candidate_request_method = post
  max_placement_results = 128

  [DEFAULT]
  # allow resize to the same host, otherwise resize fails: it takes into account what's currently running on the host (including the resized instance's active resource usage)
  allow_resize_to_same_host = true
  # added because of the long time it takes to attach volumes
  block_device_allocate_retries = 700 

  [libvirt]
  volume_use_multipath = True

  [compute]
  # added because of the long time it takes to attach volumes
  volume_attach_retry_count = 70 
  # added because of the long time it takes to attach volumes
  volume_attach_retry_interval = 7 

  nova-api.conf:
  [pci]
  alias = { "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_48Q", "device_type":"type-VF", "name":"rtx6000-ada-48q" }
  alias = { "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_8Q", "device_type":"type-VF", "name":"rtx6000-ada-8q" }
  alias = { "resource_class":"CUSTOM_NVIDIA_RTX6000_ADA_24Q", "traits":"CUSTOM_NVIDIA_RTX6000_ADA_24Q", "device_type":"type-VF", "name":"rtx6000-ada-24q" }

  placement.conf:
  [api]
  placement_log_debug = true

  [placement]
  max_allocation_candidates = 1024
  allocation_candidates_generation_strategy = breadth-first

  Flavour:
  openstack flavor create g1.8q --private \
    --ram 4096 --vcpu 4 --disk 0 \
    --property "pci_passthrough:alias"="rtx6000-ada-8q:1"
  openstack flavor set --project admin g1.8q

  Each PF is enabled for VFs, and the current_vgpu_type of each VF is
  set to the corresponding profile shown above at every boot.

  
  Steps to reproduce:

  1.) I loaded a base QEMU Ubuntu image with vGPU guest drivers pre-installed
  2.) Running nvidia-smi shows that three rtx6000-ada-8q devices are allocated instead of one

  Is this a config issue? Am I missing something?

To manage notifications about this bug go to:
https://bugs.launchpad.net/nova/+bug/2125445/+subscriptions



Follow ups