From a473b5e1f49afbc972f62b2b77adf10e99fa6100 Mon Sep 17 00:00:00 2001
From: OpenStack Release Bot <infra-root@openstack.org>
Date: Fri, 11 Mar 2022 09:54:33 +0000
Subject: [PATCH 01/93] [stable-only] Update .gitreview for stable/yoga

Change-Id: I73f6abe17a2bc9d740bb6b83c9f97412ea9a7040
---
 .gitreview | 1 +
 1 file changed, 1 insertion(+)
diff --git a/.gitreview b/.gitreview
index c2b7eef7078..b111c5d6a83 100644
--- a/.gitreview
+++ b/.gitreview
@@ -2,3 +2,4 @@
 host=review.opendev.org
 port=29418
 project=openstack/nova.git
+defaultbranch=stable/yoga

From 1bb0697f1cd2c93bf81d36af3b05b08295f2ca31 Mon Sep 17 00:00:00 2001
From: OpenStack Release Bot <infra-root@openstack.org>
Date: Fri, 11 Mar 2022 09:54:38 +0000
Subject: [PATCH 02/93] [stable-only] Update TOX_CONSTRAINTS_FILE for
 stable/yoga

Update the URL to the upper-constraints file to point to the redirect
rule on releases.openstack.org so that anyone working on this branch
will switch to the correct upper-constraints list automatically when
the requirements repository branches.

Until the requirements repository has a stable/yoga branch, tests will
continue to use the upper-constraints list on master.

Change-Id: I266e16e645c5676ba4cdf7572fbd472db1426b87
---
 tox.ini | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tox.ini b/tox.ini
index 2875f3374ad..dd382514f3c 100644
--- a/tox.ini
+++ b/tox.ini
@@ -26,7 +26,7 @@ setenv =
 # TODO(stephenfin): Remove once we bump our upper-constraint to SQLAlchemy 2.0
   SQLALCHEMY_WARN_20=1
 deps =
-  -c{env:TOX_CONSTRAINTS_FILE:https://releases.openstack.org/constraints/upper/master}
+  -c{env:TOX_CONSTRAINTS_FILE:https://releases.openstack.org/constraints/upper/yoga}
   -r{toxinidir}/requirements.txt
   -r{toxinidir}/test-requirements.txt
 extras =
@@ -227,7 +227,7 @@ description =
 # Note that we don't use {[testenv]deps} for deps here because we don't want
 # to install (test-)requirements.txt for docs.
 deps =
-  -c{env:TOX_CONSTRAINTS_FILE:https://releases.openstack.org/constraints/upper/master}
+  -c{env:TOX_CONSTRAINTS_FILE:https://releases.openstack.org/constraints/upper/yoga}
   -r{toxinidir}/doc/requirements.txt
 extras =
 commands =

From a9f444a997cd0a26aae3dac821299adf0e944121 Mon Sep 17 00:00:00 2001
From: Balazs Gibizer <balazs.gibizer@est.tech>
Date: Tue, 22 Feb 2022 12:24:37 +0100
Subject: [PATCH 03/93] Fix eventlet.tpool import

Currently nova.utils.tpool_execute() only works by chance, and as the
bug report shows, there are environments where it fails.

The nova.utils.tpool_execute() call tries to use eventlet.tpool.execute
but the utils module imports only eventlet, not the tpool module.
In devstack it works by chance as the wsgi init actually imports
eventlet.tpool indirectly via:

Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "/opt/stack/nova/nova/api/openstack/compute/__init__.py", line 21, in <module>
    from nova.api.openstack.compute.routes import APIRouterV21 # noqa
  File "/opt/stack/nova/nova/api/openstack/compute/routes.py", line 20, in <module>
    from nova.api.openstack.compute import admin_actions
  File "/opt/stack/nova/nova/api/openstack/compute/admin_actions.py", line 17, in <module>
    from nova.api.openstack import common
  File "/opt/stack/nova/nova/api/openstack/common.py", line 27, in <module>
    from nova.compute import task_states
  File "/opt/stack/nova/nova/compute/task_states.py", line 26, in <module>
    from nova.objects import fields
  File "/opt/stack/nova/nova/objects/fields.py", line 24, in <module>
    from nova.network import model as network_model
  File "/opt/stack/nova/nova/network/model.py", line 23, in <module>
    from nova import utils
  File "/opt/stack/nova/nova/utils.py", line 39, in <module>
    from oslo_concurrency import processutils
  File "/usr/local/lib/python3.8/dist-packages/oslo_concurrency/processutils.py", line 57, in <module>
    from eventlet import tpool

This was broken since I8dbc579e0037969aab4f2bb500fccfbde4190726. This
patch adds the correct import statement.

Change-Id: Ic46345ceeb445164aea6ae9b35c457c6150765f6
Closes-Bug: #1915400
(cherry picked from commit b2d28f890872747d099a262e4a208e146b882f3f)
---
 nova/utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/nova/utils.py b/nova/utils.py
index ec5e6c92480..664056a09fd 100644
--- a/nova/utils.py
+++ b/nova/utils.py
@@ -29,6 +29,7 @@
 import tempfile
 
 import eventlet
+from eventlet import tpool
 from keystoneauth1 import loading as ks_loading
 import netaddr
 from openstack import connection
@@ -685,7 +686,7 @@ def context_wrapper(*args, **kwargs):
 
 def tpool_execute(func, *args, **kwargs):
     """Run func in a native thread"""
-    eventlet.tpool.execute(func, *args, **kwargs)
+    tpool.execute(func, *args, **kwargs)
 
 
 def is_none_string(val):

From 15b72717f2f3bd79791b913f1b294a19ced47ca7 Mon Sep 17 00:00:00 2001
From: Elod Illes <elod.illes@est.tech>
Date: Thu, 14 Apr 2022 20:35:11 +0200
Subject: [PATCH 04/93] [stable-only] Drop lower-constraints job

During the PTG the TC discussed the topic and decided to drop the job
completely. Since the latest job configuration broke all stable gate
for nova (older than yoga) this is needed there to unblock our gates.
For dropping the job on master let's wait for the resolution as the
gate is not broken there, hence the patch is stable-only.

Change-Id: I514f6b337ffefef90a0ce9ab0b4afd083caa277e
---
 .zuul.yaml            |   1 -
 lower-constraints.txt | 166 ------------------------------------------
 tox.ini               |   7 --
 3 files changed, 174 deletions(-)
 delete mode 100644 lower-constraints.txt

diff --git a/.zuul.yaml b/.zuul.yaml
index 4c1f0c442c6..dcae6117a52 100644
--- a/.zuul.yaml
+++ b/.zuul.yaml
@@ -622,7 +622,6 @@
       - check-requirements
       - integrated-gate-compute
       - openstack-cover-jobs
-      - openstack-lower-constraints-jobs
       - openstack-python3-yoga-jobs
       - openstack-python3-yoga-jobs-arm64
       - periodic-stable-jobs
diff --git a/lower-constraints.txt b/lower-constraints.txt
deleted file mode 100644
index 93e757994e2..00000000000
--- a/lower-constraints.txt
+++ /dev/null
@@ -1,166 +0,0 @@
-alembic==1.5.0
-amqp==2.5.0
-appdirs==1.4.3
-asn1crypto==0.24.0
-attrs==17.4.0
-automaton==1.14.0
-bandit==1.1.0
-cachetools==2.0.1
-castellan==0.16.0
-cffi==1.14.0
-cliff==2.11.0
-cmd2==0.8.1
-colorama==0.3.9
-coverage==4.0
-cryptography==2.7
-cursive==0.2.1
-dataclasses==0.7
-ddt==1.2.1
-debtcollector==1.19.0
-decorator==4.1.0
-deprecation==2.0
-dogpile.cache==0.6.5
-enum-compat==0.0.2
-eventlet==0.30.1
-extras==1.0.0
-fasteners==0.14.1
-fixtures==3.0.0
-future==0.16.0
-futurist==1.8.0
-gabbi==1.35.0
-gitdb2==2.0.3
-GitPython==2.1.8
-greenlet==0.4.15
-idna==2.6
-iso8601==0.1.11
-Jinja2==2.10
-jmespath==0.9.3
-jsonpatch==1.21
-jsonpath-rw==1.4.0
-jsonpath-rw-ext==1.1.3
-jsonpointer==2.0
-jsonschema==3.2.0
-keystoneauth1==3.16.0
-keystonemiddleware==4.20.0
-kombu==4.6.1
-linecache2==1.0.0
-lxml==4.5.0
-Mako==1.0.7
-MarkupSafe==1.1.1
-microversion-parse==0.2.1
-mock==3.0.0
-msgpack==0.6.0
-msgpack-python==0.5.6
-munch==2.2.0
-mypy==0.761
-netaddr==0.7.18
-netifaces==0.10.4
-networkx==2.1.0
-numpy==1.19.0
-openstacksdk==0.35.0
-os-brick==5.2
-os-client-config==1.29.0
-os-resource-classes==1.1.0
-os-service-types==1.7.0
-os-traits==2.7.0
-os-vif==1.15.2
-os-win==5.5.0
-osc-lib==1.10.0
-oslo.cache==1.26.0
-oslo.concurrency==4.5.0
-oslo.config==8.6.0
-oslo.context==3.4.0
-oslo.db==10.0.0
-oslo.i18n==5.1.0
-oslo.log==4.6.1
-oslo.limit==1.5.0
-oslo.messaging==10.3.0
-oslo.middleware==3.31.0
-oslo.policy==3.7.0
-oslo.privsep==2.6.2
-oslo.reports==1.18.0
-oslo.rootwrap==5.8.0
-oslo.serialization==4.2.0
-oslo.service==2.8.0
-oslo.upgradecheck==1.3.0
-oslo.utils==4.12.1
-oslo.versionedobjects==1.35.0
-oslo.vmware==3.6.0
-oslotest==3.8.0
-osprofiler==1.4.0
-ovs==2.10.0
-ovsdbapp==0.15.0
-packaging==20.4
-paramiko==2.7.1
-Paste==2.0.2
-PasteDeploy==1.5.0
-pbr==5.8.0
-pluggy==0.6.0
-ply==3.11
-prettytable==0.7.1
-psutil==3.2.2
-psycopg2-binary==2.8
-py==1.5.2
-pyasn1==0.4.2
-pyasn1-modules==0.2.1
-pycadf==2.7.0
-pycparser==2.18
-pyinotify==0.9.6
-pyroute2==0.5.4
-PyJWT==1.7.0
-PyMySQL==0.8.0
-pyOpenSSL==17.5.0
-pyparsing==2.2.0
-pyperclip==1.6.0
-pypowervm==1.1.15
-pytest==3.4.2
-python-barbicanclient==4.5.2
-python-cinderclient==3.3.0
-python-dateutil==2.7.0
-python-editor==1.0.3
-python-glanceclient==2.8.0
-python-ironicclient==3.0.0
-python-keystoneclient==3.15.0
-python-mimeparse==1.6.0
-python-neutronclient==7.1.0
-python-subunit==1.4.0
-pytz==2018.3
-PyYAML==5.1
-repoze.lru==0.7
-requests==2.25.1
-requests-mock==1.2.0
-requestsexceptions==1.4.0
-retrying==1.3.3
-rfc3986==1.2.0
-Routes==2.3.1
-simplejson==3.13.2
-six==1.15.0
-smmap2==2.0.3
-sortedcontainers==2.1.0
-SQLAlchemy==1.4.13
-sqlalchemy-migrate==0.13.0
-sqlparse==0.2.4
-statsd==3.2.2
-stestr==2.0.0
-stevedore==1.20.0
-suds-jurko==0.6
-taskflow==3.8.0
-Tempita==0.5.2
-tenacity==6.3.1
-testrepository==0.0.20
-testresources==2.0.0
-testscenarios==0.4
-testtools==2.5.0
-tooz==1.58.0
-traceback2==1.4.0
-types-paramiko==0.1.3
-unittest2==1.1.0
-urllib3==1.22
-vine==1.1.4
-voluptuous==0.11.1
-warlock==1.3.1
-WebOb==1.8.2
-websockify==0.9.0
-wrapt==1.10.11
-wsgi-intercept==1.7.0
-zVMCloudConnector==1.3.0
diff --git a/tox.ini b/tox.ini
index dd382514f3c..300747a6ede 100644
--- a/tox.ini
+++ b/tox.ini
@@ -389,10 +389,3 @@ deps = bindep
 extras =
 commands =
   bindep test
-
-[testenv:lower-constraints]
-usedevelop = False
-deps =
-  -c{toxinidir}/lower-constraints.txt
-  -r{toxinidir}/test-requirements.txt
-  -r{toxinidir}/requirements.txt

From 3402aa7a53a48e6abc0fb8a2620cd449fd3f75fe Mon Sep 17 00:00:00 2001
From: "Erlon R. Cruz" <erlon@canonical.com>
Date: Thu, 10 Mar 2022 15:50:54 -0300
Subject: [PATCH 05/93] Adds regression test for bug LP#1944619

Related-bug: #1944619
Closes-bug: #1964472
Change-Id: Ie7e5377aea23a4fbd7ad91f245d17def6d0fb927
(cherry picked from commit 2ddb8bf53fdf9a17c09afc4987ab6efe8ba97696)
---
 .../regressions/test_bug_1944619.py           | 82 +++++++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100644 nova/tests/functional/regressions/test_bug_1944619.py

diff --git a/nova/tests/functional/regressions/test_bug_1944619.py b/nova/tests/functional/regressions/test_bug_1944619.py
new file mode 100644
index 00000000000..3274ff5a158
--- /dev/null
+++ b/nova/tests/functional/regressions/test_bug_1944619.py
@@ -0,0 +1,82 @@
+# Copyright 2021, Canonical, Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import mock
+
+from nova import exception as nova_exceptions
+from nova.tests.fixtures import libvirt as fakelibvirt
+from nova.tests.functional import integrated_helpers
+from nova.tests.functional.libvirt import base
+
+
+class TestRollbackWithHWOffloadedOVS(
+    base.LibvirtMigrationMixin,
+    base.ServersTestBase,
+    integrated_helpers.InstanceHelperMixin
+):
+    """Regression test for bug LP#1944619
+
+    Assert the behaviour observed in bug LP#1944619 caused by the live
+    migration cleanup code being used to cleanup pre-live migration failures.
+    When SRIOV devices are in use on a VM, that will cause the source host to
+    try to re-attach a VIF not actually de-attached causing a failure.
+
+    The exception mocked in pre_live_migration reproduce an arbitrary error
+    that might cause the pre-live migration process to fail and
+    rollback_live_migration_at_source reproduce the device re-attach failure.
+    """
+
+    api_major_version = 'v2.1'
+    microversion = 'latest'
+    ADMIN_API = True
+
+    def setUp(self):
+        super().setUp()
+
+        self.start_compute(
+            hostname='src',
+            host_info=fakelibvirt.HostInfo(
+                cpu_nodes=1, cpu_sockets=1, cpu_cores=4, cpu_threads=1))
+        self.start_compute(
+            hostname='dest',
+            host_info=fakelibvirt.HostInfo(
+                cpu_nodes=1, cpu_sockets=1, cpu_cores=4, cpu_threads=1))
+
+        self.src = self.computes['src']
+        self.dest = self.computes['dest']
+
+    def test_rollback_pre_live_migration(self):
+        self.server = self._create_server(host='src', networks='none')
+
+        lib_path = "nova.virt.libvirt.driver.LibvirtDriver"
+        funtion_path = "pre_live_migration"
+        mock_lib_path_prelive = "%s.%s" % (lib_path, funtion_path)
+        with mock.patch(mock_lib_path_prelive,
+                        side_effect=nova_exceptions.DestinationDiskExists(
+                            path='/var/non/existent')) as mlpp:
+            funtion_path = "rollback_live_migration_at_source"
+            mock_lib_path_rollback = "%s.%s" % (lib_path, funtion_path)
+            with mock.patch(mock_lib_path_rollback) as mlpr:
+                # Live migrate the instance to another host
+                self._live_migrate(self.server,
+                                   migration_expected_state='failed',
+                                   server_expected_state='MIGRATING')
+        # FIXME(erlon): In the current behavior,
+        # rollback_live_migration_at_source is called if an error happens
+        # during the  pre_live_migration phase on the destination and therefore
+        # triggers the observed bug. rollback_live_migration_at_source should
+        # *not* be called for when errors happen during pre_live_migration
+        # phase.
+        mlpr.assert_called_once()
+        mlpp.assert_called_once()

From 29b94aa34ad954e617c2a0d6df0809765dced188 Mon Sep 17 00:00:00 2001
From: "Erlon R. Cruz" <erlon@canonical.com>
Date: Tue, 7 Dec 2021 17:39:58 -0300
Subject: [PATCH 06/93] Fix pre_live_migration rollback

During the pre live migration process, Nova performs most of the
tasks related to the creation and operation of the VM in the destination
host. That is done without interrupting any of the hardware in the source
host. If the pre_live_migration fails, those same operations should be
rolled back.

Currently nova is sharing the _rollback_live_migration for both
live and pre_live migration rollbacks, and that is causing the source
host to try to re-attach network interfaces on the source host where
they weren't actually de-attached.

This patch fixes that by adding a conditional to allow nova to do
different paths for migration and pre_live_migration rollbacks.

Closes-bug: #1944619
Change-Id: I784190ac356695dd508e0ad8ec31d8eaa3ebee56
(cherry picked from commit 63ffba7496182f6f6f49a380f3c639fc3ded9772)
---
 nova/compute/manager.py                          | 16 ++++++++++++----
 .../functional/regressions/test_bug_1944619.py   |  8 +-------
 nova/tests/unit/compute/test_compute_mgr.py      |  6 ++++--
 .../bug-1944619-fix-live-migration-rollback.yaml | 10 ++++++++++
 4 files changed, 27 insertions(+), 13 deletions(-)
 create mode 100644 releasenotes/notes/bug-1944619-fix-live-migration-rollback.yaml

diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 44352909a2a..4df1c4112c3 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -8413,7 +8413,8 @@ def _cleanup_pre_live_migration(self, context, dest, instance,
         migrate_data.migration = migration
         self._rollback_live_migration(context, instance, dest,
                                       migrate_data=migrate_data,
-                                      source_bdms=source_bdms)
+                                      source_bdms=source_bdms,
+                                      pre_live_migration=True)
 
     def _do_pre_live_migration_from_source(self, context, dest, instance,
                                            block_migration, migration,
@@ -9167,7 +9168,8 @@ def _rollback_volume_bdms(self, context, bdms, original_bdms, instance):
     def _rollback_live_migration(self, context, instance,
                                  dest, migrate_data=None,
                                  migration_status='failed',
-                                 source_bdms=None):
+                                 source_bdms=None,
+                                 pre_live_migration=False):
         """Recovers Instance/volume state from migrating -> running.
 
         :param context: security context
@@ -9217,8 +9219,14 @@ def _rollback_live_migration(self, context, instance,
         #                  for nova-network)
         # NOTE(mriedem): This is a no-op for neutron.
         self.network_api.setup_networks_on_host(context, instance, self.host)
-        self.driver.rollback_live_migration_at_source(context, instance,
-                                                      migrate_data)
+
+        # NOTE(erlon): We should make sure that rollback_live_migration_at_src
+        # is not called in the pre_live_migration rollback as that will trigger
+        # the src host to re-attach interfaces which were not detached
+        # previously.
+        if not pre_live_migration:
+            self.driver.rollback_live_migration_at_source(context, instance,
+                                                          migrate_data)
 
         # NOTE(lyarwood): Fetch the current list of BDMs, disconnect any
         # connected volumes from the dest and delete any volume attachments
diff --git a/nova/tests/functional/regressions/test_bug_1944619.py b/nova/tests/functional/regressions/test_bug_1944619.py
index 3274ff5a158..82b7475dca8 100644
--- a/nova/tests/functional/regressions/test_bug_1944619.py
+++ b/nova/tests/functional/regressions/test_bug_1944619.py
@@ -72,11 +72,5 @@ def test_rollback_pre_live_migration(self):
                 self._live_migrate(self.server,
                                    migration_expected_state='failed',
                                    server_expected_state='MIGRATING')
-        # FIXME(erlon): In the current behavior,
-        # rollback_live_migration_at_source is called if an error happens
-        # during the  pre_live_migration phase on the destination and therefore
-        # triggers the observed bug. rollback_live_migration_at_source should
-        # *not* be called for when errors happen during pre_live_migration
-        # phase.
-        mlpr.assert_called_once()
+        mlpr.assert_not_called()
         mlpp.assert_called_once()
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index cd1a9369c4a..760ea79e877 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -9539,7 +9539,8 @@ def test_live_migration_wait_vif_plugged_vif_plug_error(
         self.assertEqual('error', self.migration.status)
         mock_rollback_live_mig.assert_called_once_with(
             self.context, self.instance, 'dest-host',
-            migrate_data=migrate_data, source_bdms=source_bdms)
+            migrate_data=migrate_data, source_bdms=source_bdms,
+            pre_live_migration=True)
 
     @mock.patch('nova.compute.rpcapi.ComputeAPI.pre_live_migration')
     @mock.patch('nova.compute.manager.ComputeManager._rollback_live_migration')
@@ -9574,7 +9575,8 @@ def test_live_migration_wait_vif_plugged_timeout_error(
         self.assertEqual('error', self.migration.status)
         mock_rollback_live_mig.assert_called_once_with(
             self.context, self.instance, 'dest-host',
-            migrate_data=migrate_data, source_bdms=source_bdms)
+            migrate_data=migrate_data, source_bdms=source_bdms,
+            pre_live_migration=True)
 
     @mock.patch('nova.compute.rpcapi.ComputeAPI.pre_live_migration')
     @mock.patch('nova.compute.manager.ComputeManager._rollback_live_migration')
diff --git a/releasenotes/notes/bug-1944619-fix-live-migration-rollback.yaml b/releasenotes/notes/bug-1944619-fix-live-migration-rollback.yaml
new file mode 100644
index 00000000000..b6c68ed49f2
--- /dev/null
+++ b/releasenotes/notes/bug-1944619-fix-live-migration-rollback.yaml
@@ -0,0 +1,10 @@
+---
+fixes:
+  - |
+    Instances with hardware offloaded ovs ports no longer lose connectivity
+    after failed live migrations. The driver.rollback_live_migration_at_source
+    function is no longer called during pre_live_migration rollback
+    which previously resulted in connectivity loss following a failed live
+    migration. See `Bug 1944619`_ for more details.
+
+    .. _Bug 1944619: https://bugs.launchpad.net/nova/+bug/1944619

From 1ac0d6984a43cddbb5a2f1a2f7bc115fd83517c9 Mon Sep 17 00:00:00 2001
From: Elod Illes <elod.illes@est.tech>
Date: Thu, 28 Apr 2022 17:17:47 +0200
Subject: [PATCH 07/93] [CI] Install dependencies for docs target

When tox 'docs' target is called, first it installs the dependencies
(listed in 'deps') in 'installdeps' phase, then it installs nova (with
its requirements) in 'develop-inst' phase. In the latter case 'deps' is
not used so that the constraints defined in 'deps' are not used.
This could lead to failures on stable branches when new packages are
released that break the build. To avoid this, the simplest solution is
to pre-install requirements, i.e. add requirements.txt to 'docs' tox
target.

Conflicts:
  tox.ini

NOTE(elod.illes): conflict is due to branch specific upper constraints
file link.

Change-Id: I4471d4488d336d5af0c23028724c4ce79d6a2031
(cherry picked from commit 494e8d7db6f8a3d1a952f657acab353787f57e04)
---
 tox.ini | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tox.ini b/tox.ini
index 300747a6ede..92edcd7b469 100644
--- a/tox.ini
+++ b/tox.ini
@@ -228,6 +228,7 @@ description =
 # to install (test-)requirements.txt for docs.
 deps =
   -c{env:TOX_CONSTRAINTS_FILE:https://releases.openstack.org/constraints/upper/yoga}
+  -r{toxinidir}/requirements.txt
   -r{toxinidir}/doc/requirements.txt
 extras =
 commands =

From 2be1570c96697bdb917a086ccdc5a05d3d21e9db Mon Sep 17 00:00:00 2001
From: Artom Lifshitz <alifshit@redhat.com>
Date: Fri, 22 Apr 2022 14:21:55 -0400
Subject: [PATCH 08/93] Reproduce live migration rollback w/o multi port
 bindings error

When the libvirt driver does live migration rollback of an instance
with network interfaces, it unconditionally refers to
migrate_data.vifs. These will only be set when Neutron has the
multiple port bindings extension. We don't handle the case of the
extension not being present, and currently the rollback will fail with
a "NotImplementedError: Cannot load 'vifs' in the base class" error.

Related-bug: 1969980
Change-Id: Ieef773453ed9f3ced564c1a352fbefbcc6a653ec
(cherry picked from commit 5181bae923bdae2b536affc87d39ddf2c5f7835d)
---
 .../regressions/test_bug_1888395.py           | 45 ++++++++++++++++---
 1 file changed, 38 insertions(+), 7 deletions(-)

diff --git a/nova/tests/functional/regressions/test_bug_1888395.py b/nova/tests/functional/regressions/test_bug_1888395.py
index e582ad3e851..8f2e2a0eeb4 100644
--- a/nova/tests/functional/regressions/test_bug_1888395.py
+++ b/nova/tests/functional/regressions/test_bug_1888395.py
@@ -23,14 +23,8 @@
 from nova.tests.functional.libvirt import base as libvirt_base
 
 
-class TestLiveMigrationWithoutMultiplePortBindings(
+class TestLiveMigrationWithoutMultiplePortBindingsBase(
         libvirt_base.ServersTestBase):
-    """Regression test for bug 1888395.
-
-    This regression test asserts that Live migration works when
-    neutron does not support the binding-extended api extension
-    and the legacy single port binding workflow is used.
-    """
 
     ADMIN_API = True
     microversion = 'latest'
@@ -72,6 +66,16 @@ def setUp(self):
             'nova.tests.fixtures.libvirt.Domain.migrateToURI3',
             self._migrate_stub))
 
+
+class TestLiveMigrationWithoutMultiplePortBindings(
+        TestLiveMigrationWithoutMultiplePortBindingsBase):
+    """Regression test for bug 1888395.
+
+    This regression test asserts that Live migration works when
+    neutron does not support the binding-extended api extension
+    and the legacy single port binding workflow is used.
+    """
+
     def _migrate_stub(self, domain, destination, params, flags):
         """Stub out migrateToURI3."""
 
@@ -124,3 +128,30 @@ def test_live_migrate(self):
             server, {'OS-EXT-SRV-ATTR:host': 'end_host', 'status': 'ACTIVE'})
         msg = "NotImplementedError: Cannot load 'vif_type' in the base class"
         self.assertNotIn(msg, self.stdlog.logger.output)
+
+
+class TestLiveMigrationRollbackWithoutMultiplePortBindings(
+        TestLiveMigrationWithoutMultiplePortBindingsBase):
+
+    def _migrate_stub(self, domain, destination, params, flags):
+        source = self.computes['start_host']
+        conn = source.driver._host.get_connection()
+        dom = conn.lookupByUUIDString(self.server['id'])
+        dom.fail_job()
+
+    def test_live_migration_rollback(self):
+        self.server = self._create_server(
+            host='start_host',
+            networks=[{'port': self.neutron.port_1['id']}])
+
+        self.assertFalse(
+            self.neutron_api.has_port_binding_extension(self.ctxt))
+        # FIXME(artom) Until bug 1969980 is fixed, this will fail with a
+        # NotImplementedError.
+        self._live_migrate(self.server, migration_expected_state='error',
+                           server_expected_state='ERROR')
+        server = self.api.get_server(self.server['id'])
+        self.assertIn(
+            "NotImplementedError: Cannot load 'vifs' in the base class",
+            server['fault']['details']
+        )

From 5f086d437e8e467ab8c90605c904470c37098227 Mon Sep 17 00:00:00 2001
From: Artom Lifshitz <alifshit@redhat.com>
Date: Mon, 25 Apr 2022 10:20:14 -0400
Subject: [PATCH 09/93] Fix LM rollback w/o multi port bindings extension

Previously, the libvirt driver's live migration rollback code would
unconditionally refer to migrate_data.vifs. This field would only be
set if the Neutron multiple port bindings extension was in use. When
it is not in use, the reference would fail with a NotImplementedError.
This patch wraps the migrate_data.vifs reference in a conditional that
checks if the vifs field is actually set. This is the only way to do
it, as in the libvirt driver we do not have access to the network
API's has_port_binding_extension() helper.

Closes-bug: 1969980
Change-Id: I48ca6a77de38e3afaa44630e6ae1fd41d2031ba9
(cherry picked from commit aa1b0a7ccb1fcf3644784125bfff6950993e1697)
---
 .../functional/regressions/test_bug_1888395.py      | 13 ++++---------
 nova/virt/libvirt/driver.py                         | 11 +++++++----
 2 files changed, 11 insertions(+), 13 deletions(-)

diff --git a/nova/tests/functional/regressions/test_bug_1888395.py b/nova/tests/functional/regressions/test_bug_1888395.py
index 8f2e2a0eeb4..c50b78e2f66 100644
--- a/nova/tests/functional/regressions/test_bug_1888395.py
+++ b/nova/tests/functional/regressions/test_bug_1888395.py
@@ -146,12 +146,7 @@ def test_live_migration_rollback(self):
 
         self.assertFalse(
             self.neutron_api.has_port_binding_extension(self.ctxt))
-        # FIXME(artom) Until bug 1969980 is fixed, this will fail with a
-        # NotImplementedError.
-        self._live_migrate(self.server, migration_expected_state='error',
-                           server_expected_state='ERROR')
-        server = self.api.get_server(self.server['id'])
-        self.assertIn(
-            "NotImplementedError: Cannot load 'vifs' in the base class",
-            server['fault']['details']
-        )
+        # NOTE(artom) The live migration will still fail (we fail it in
+        # _migrate_stub()), but the server should correctly rollback to ACTIVE.
+        self._live_migrate(self.server, migration_expected_state='failed',
+                           server_expected_state='ACTIVE')
diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py
index 94e7b1945aa..d56ae3fd8cd 100644
--- a/nova/virt/libvirt/driver.py
+++ b/nova/virt/libvirt/driver.py
@@ -10418,10 +10418,13 @@ def rollback_live_migration_at_source(self, context, instance,
         :param instance:  the instance being migrated
         :param migrate_date: a LibvirtLiveMigrateData object
         """
-        network_info = network_model.NetworkInfo(
-            [vif.source_vif for vif in migrate_data.vifs
-                            if "source_vif" in vif and vif.source_vif])
-        self._reattach_instance_vifs(context, instance, network_info)
+        # NOTE(artom) migrate_data.vifs might not be set if our Neutron doesn't
+        # have the multiple port bindings extension.
+        if 'vifs' in migrate_data and migrate_data.vifs:
+            network_info = network_model.NetworkInfo(
+                [vif.source_vif for vif in migrate_data.vifs
+                                if "source_vif" in vif and vif.source_vif])
+            self._reattach_instance_vifs(context, instance, network_info)
 
     def rollback_live_migration_at_destination(self, context, instance,
                                                network_info,

From 60548e804219d91d8c68ab3d74dd0ae956cd33f3 Mon Sep 17 00:00:00 2001
From: Andrew Bonney <andrew.bonney@bbc.co.uk>
Date: Tue, 26 Apr 2022 11:35:38 +0100
Subject: [PATCH 10/93] Fix segment-aware scheduling permissions error

Resolves a bug encountered when setting the Nova scheduler to
be aware of Neutron routed provider network segments, by using
'query_placement_for_routed_network_aggregates'.

Non-admin users attempting to access the 'segment_id' attribute
of a subnet caused a traceback, resulting in instance creation
failure.

This patch ensures the Neutron client is initialised with an
administrative context no matter what the requesting user's
permissions are.

Change-Id: Ic0f25e4d2395560fc2b68f3b469e266ac59abaa2
Closes-Bug: #1970383
(cherry picked from commit ee32934f34afd8e6df467361e9d71788cd36f6ee)
---
 nova/network/neutron.py                        |  4 ++--
 nova/tests/unit/network/test_neutron.py        | 18 ++++++++++++++++--
 ...cheduling-permissions-92ba907b10a9eb1c.yaml |  7 +++++++
 3 files changed, 25 insertions(+), 4 deletions(-)
 create mode 100644 releasenotes/notes/bug-1970383-segment-scheduling-permissions-92ba907b10a9eb1c.yaml

diff --git a/nova/network/neutron.py b/nova/network/neutron.py
index 3ee9774d24c..7d6b6a8af90 100644
--- a/nova/network/neutron.py
+++ b/nova/network/neutron.py
@@ -3874,7 +3874,7 @@ def get_segment_ids_for_network(
             either Segment extension isn't enabled in Neutron or if the network
             isn't configured for routing.
         """
-        client = get_client(context)
+        client = get_client(context, admin=True)
 
         if not self.has_segment_extension(client=client):
             return []
@@ -3905,7 +3905,7 @@ def get_segment_id_for_subnet(
             extension isn't enabled in Neutron or the provided subnet doesn't
             have segments (if the related network isn't configured for routing)
         """
-        client = get_client(context)
+        client = get_client(context, admin=True)
 
         if not self.has_segment_extension(client=client):
             return None
diff --git a/nova/tests/unit/network/test_neutron.py b/nova/tests/unit/network/test_neutron.py
index dbcfa80c273..8d6e0638998 100644
--- a/nova/tests/unit/network/test_neutron.py
+++ b/nova/tests/unit/network/test_neutron.py
@@ -7026,13 +7026,17 @@ def test_get_requested_resource_for_instance_with_multiple_ports_extended(
             req_lvl_params.same_subtree,
         )
 
-    def test_get_segment_ids_for_network_no_segment_ext(self):
+    @mock.patch.object(neutronapi, 'get_client')
+    def test_get_segment_ids_for_network_no_segment_ext(self, mock_client):
+        mocked_client = mock.create_autospec(client.Client)
+        mock_client.return_value = mocked_client
         with mock.patch.object(
             self.api, 'has_segment_extension', return_value=False,
         ):
             self.assertEqual(
                 [], self.api.get_segment_ids_for_network(self.context,
                                                          uuids.network_id))
+            mock_client.assert_called_once_with(self.context, admin=True)
 
     @mock.patch.object(neutronapi, 'get_client')
     def test_get_segment_ids_for_network_passes(self, mock_client):
@@ -7046,6 +7050,7 @@ def test_get_segment_ids_for_network_passes(self, mock_client):
             res = self.api.get_segment_ids_for_network(
                 self.context, uuids.network_id)
         self.assertEqual([uuids.segment_id], res)
+        mock_client.assert_called_once_with(self.context, admin=True)
         mocked_client.list_subnets.assert_called_once_with(
             network_id=uuids.network_id, fields='segment_id')
 
@@ -7061,6 +7066,7 @@ def test_get_segment_ids_for_network_with_no_segments(self, mock_client):
             res = self.api.get_segment_ids_for_network(
                 self.context, uuids.network_id)
         self.assertEqual([], res)
+        mock_client.assert_called_once_with(self.context, admin=True)
         mocked_client.list_subnets.assert_called_once_with(
             network_id=uuids.network_id, fields='segment_id')
 
@@ -7076,14 +7082,19 @@ def test_get_segment_ids_for_network_fails(self, mock_client):
             self.assertRaises(exception.InvalidRoutedNetworkConfiguration,
                               self.api.get_segment_ids_for_network,
                               self.context, uuids.network_id)
+            mock_client.assert_called_once_with(self.context, admin=True)
 
-    def test_get_segment_id_for_subnet_no_segment_ext(self):
+    @mock.patch.object(neutronapi, 'get_client')
+    def test_get_segment_id_for_subnet_no_segment_ext(self, mock_client):
+        mocked_client = mock.create_autospec(client.Client)
+        mock_client.return_value = mocked_client
         with mock.patch.object(
             self.api, 'has_segment_extension', return_value=False,
         ):
             self.assertIsNone(
                 self.api.get_segment_id_for_subnet(self.context,
                                                    uuids.subnet_id))
+            mock_client.assert_called_once_with(self.context, admin=True)
 
     @mock.patch.object(neutronapi, 'get_client')
     def test_get_segment_id_for_subnet_passes(self, mock_client):
@@ -7097,6 +7108,7 @@ def test_get_segment_id_for_subnet_passes(self, mock_client):
             res = self.api.get_segment_id_for_subnet(
                 self.context, uuids.subnet_id)
         self.assertEqual(uuids.segment_id, res)
+        mock_client.assert_called_once_with(self.context, admin=True)
         mocked_client.show_subnet.assert_called_once_with(uuids.subnet_id)
 
     @mock.patch.object(neutronapi, 'get_client')
@@ -7111,6 +7123,7 @@ def test_get_segment_id_for_subnet_with_no_segment(self, mock_client):
             self.assertIsNone(
                 self.api.get_segment_id_for_subnet(self.context,
                                                    uuids.subnet_id))
+            mock_client.assert_called_once_with(self.context, admin=True)
 
     @mock.patch.object(neutronapi, 'get_client')
     def test_get_segment_id_for_subnet_fails(self, mock_client):
@@ -7124,6 +7137,7 @@ def test_get_segment_id_for_subnet_fails(self, mock_client):
             self.assertRaises(exception.InvalidRoutedNetworkConfiguration,
                               self.api.get_segment_id_for_subnet,
                               self.context, uuids.subnet_id)
+            mock_client.assert_called_once_with(self.context, admin=True)
 
     @mock.patch.object(neutronapi.LOG, 'debug')
     def test_get_port_pci_dev(self, mock_debug):
diff --git a/releasenotes/notes/bug-1970383-segment-scheduling-permissions-92ba907b10a9eb1c.yaml b/releasenotes/notes/bug-1970383-segment-scheduling-permissions-92ba907b10a9eb1c.yaml
new file mode 100644
index 00000000000..88495079e75
--- /dev/null
+++ b/releasenotes/notes/bug-1970383-segment-scheduling-permissions-92ba907b10a9eb1c.yaml
@@ -0,0 +1,7 @@
+---
+fixes:
+  - |
+    `Bug #1970383 <https://bugs.launchpad.net/nova/+bug/1970383>`_: Fixes a
+    permissions error when using the
+    'query_placement_for_routed_network_aggregates' scheduler variable, which
+    caused a traceback on instance creation for non-admin users.

From f04cfd42359d555e746df0e2c48d989d0c3244f1 Mon Sep 17 00:00:00 2001
From: Balazs Gibizer <gibi@redhat.com>
Date: Thu, 28 Apr 2022 15:43:13 +0200
Subject: [PATCH 11/93] Isolate PCI tracker unit tests

During the testing of If9ab424cc7375a1f0d41b03f01c4a823216b3eb8 we noticed
that the unit test cases of PciTracker._set_hvdevs are changing and
leaking global state leading to unstable tests.

To reproduce on master, duplicate the
test_set_hvdev_remove_tree_maintained_with_allocations test case and run
PciDevTrackerTestCase serially. The duplicated test case will fail with

  File "/nova/nova/objects/pci_device.py", line 238, in _from_db_object
  setattr(pci_device, key, db_dev[key])
  KeyError: 'id'

This is caused by the fact that the test data is defined on module
level, both _create_tracker and _set_hvdevs modify the devices
passed to them, and some tests mix passing db dicts to _set_hvdevs,
which expects pci dicts from the hypervisor.

This patch fixes multiple related issues:
* always deepcopy what _create_tracker takes as that list is later
  returned to the PciTracker via a mock and the tracker might modify
  what it got

* ensure that _create_tracker takes db dicts (with id field) while
  _set_hvdevs takes pci dicts in the hypervisor format (without id
  field)

* always deepcopy what is passed to _set_hvdevs as the PciTracker
  modifies what it gets.

* normalize when the deepcopy happens to give a safe pattern for future
  test cases

Change-Id: I20fb4ea96d5dfabfc4be3b5ecec0e4e6c5b3a318
(cherry picked from commit c58376db75917444831934963fa75b4b57f08818)
---
 nova/tests/unit/pci/test_manager.py | 63 ++++++++++++++---------------
 1 file changed, 30 insertions(+), 33 deletions(-)

diff --git a/nova/tests/unit/pci/test_manager.py b/nova/tests/unit/pci/test_manager.py
index 39d0b116bbb..1f3478a070b 100644
--- a/nova/tests/unit/pci/test_manager.py
+++ b/nova/tests/unit/pci/test_manager.py
@@ -42,6 +42,8 @@
                   product_id='p1', vendor_id='v1')
 fake_pci_2 = dict(fake_pci, address='0000:00:00.3')
 
+fake_pci_devs = [fake_pci, fake_pci_1, fake_pci_2]
+
 fake_pci_3 = dict(fake_pci, address='0000:00:01.1',
                   dev_type=fields.PciDeviceType.SRIOV_PF,
                   vendor_id='v2', product_id='p2', numa_node=None)
@@ -53,6 +55,7 @@
                   dev_type=fields.PciDeviceType.SRIOV_VF,
                   parent_addr='0000:00:01.1',
                   vendor_id='v2', product_id='p2', numa_node=None)
+fake_pci_devs_tree = [fake_pci_3, fake_pci_4, fake_pci_5]
 
 fake_db_dev = {
     'created_at': None,
@@ -142,14 +145,14 @@ def _create_pci_requests_object(self, requests,
                 requests=pci_reqs)
 
     def _create_tracker(self, fake_devs):
-        self.fake_devs = fake_devs
+        self.fake_devs = copy.deepcopy(fake_devs)
         self.tracker = manager.PciDevTracker(
             self.fake_context, objects.ComputeNode(id=1, numa_topology=None))
 
     def setUp(self):
         super(PciDevTrackerTestCase, self).setUp()
         self.fake_context = context.get_admin_context()
-        self.fake_devs = fake_db_devs[:]
+        self.fake_devs = copy.deepcopy(fake_db_devs)
         self.stub_out('nova.db.main.api.pci_device_get_all_by_node',
             self._fake_get_pci_devices)
         # The fake_pci_whitelist must be called before creating the fake
@@ -157,7 +160,7 @@ def setUp(self):
         patcher = pci_fakes.fake_pci_whitelist()
         self.addCleanup(patcher.stop)
         self._create_fake_instance()
-        self._create_tracker(fake_db_devs[:])
+        self._create_tracker(fake_db_devs)
 
     def test_pcidev_tracker_create(self):
         self.assertEqual(len(self.tracker.pci_devs), 3)
@@ -266,9 +269,8 @@ def test_update_devices_from_hypervisor_resources_32bit_domain(
 
     def test_set_hvdev_new_dev(self):
         fake_pci_3 = dict(fake_pci, address='0000:00:00.4', vendor_id='v2')
-        fake_pci_devs = [copy.deepcopy(fake_pci), copy.deepcopy(fake_pci_1),
-                         copy.deepcopy(fake_pci_2), copy.deepcopy(fake_pci_3)]
-        self.tracker._set_hvdevs(fake_pci_devs)
+        fake_pci_devs = [fake_pci, fake_pci_1, fake_pci_2, fake_pci_3]
+        self.tracker._set_hvdevs(copy.deepcopy(fake_pci_devs))
         self.assertEqual(len(self.tracker.pci_devs), 4)
         self.assertEqual(set([dev.address for
                               dev in self.tracker.pci_devs]),
@@ -284,11 +286,8 @@ def test_set_hvdev_new_dev_tree_maintained(self):
         self._create_tracker(fake_db_devs_tree)
 
         fake_new_device = dict(fake_pci_5, id=12, address='0000:00:02.3')
-        fake_pci_devs = [copy.deepcopy(fake_pci_3),
-                         copy.deepcopy(fake_pci_4),
-                         copy.deepcopy(fake_pci_5),
-                         copy.deepcopy(fake_new_device)]
-        self.tracker._set_hvdevs(fake_pci_devs)
+        fake_pci_devs = [fake_pci_3, fake_pci_4, fake_pci_5, fake_new_device]
+        self.tracker._set_hvdevs(copy.deepcopy(fake_pci_devs))
         self.assertEqual(len(self.tracker.pci_devs), 4)
 
         pf = [dev for dev in self.tracker.pci_devs
@@ -304,15 +303,14 @@ def test_set_hvdev_new_dev_tree_maintained(self):
 
     def test_set_hvdev_changed(self):
         fake_pci_v2 = dict(fake_pci, address='0000:00:00.2', vendor_id='v1')
-        fake_pci_devs = [copy.deepcopy(fake_pci), copy.deepcopy(fake_pci_2),
-                         copy.deepcopy(fake_pci_v2)]
-        self.tracker._set_hvdevs(fake_pci_devs)
+        fake_pci_devs = [fake_pci, fake_pci_2, fake_pci_v2]
+        self.tracker._set_hvdevs(copy.deepcopy(fake_pci_devs))
         self.assertEqual(set([dev.vendor_id for
                              dev in self.tracker.pci_devs]),
                          set(['v', 'v1']))
 
     def test_set_hvdev_remove(self):
-        self.tracker._set_hvdevs([fake_pci])
+        self.tracker._set_hvdevs(copy.deepcopy([fake_pci]))
         self.assertEqual(
             len([dev for dev in self.tracker.pci_devs
                  if dev.status == fields.PciDeviceStatus.REMOVED]),
@@ -324,8 +322,8 @@ def test_set_hvdev_remove_tree_maintained(self):
         # from previous scans)
         self._create_tracker(fake_db_devs_tree)
 
-        fake_pci_devs = [copy.deepcopy(fake_pci_3), copy.deepcopy(fake_pci_4)]
-        self.tracker._set_hvdevs(fake_pci_devs)
+        fake_pci_devs = [fake_pci_3, fake_pci_4]
+        self.tracker._set_hvdevs(copy.deepcopy(fake_pci_devs))
         self.assertEqual(
             2,
             len([dev for dev in self.tracker.pci_devs
@@ -344,8 +342,9 @@ def test_set_hvdev_remove_tree_maintained_with_allocations(self):
         # Make sure the device tree is properly maintained when there are
         # devices removed from the system that are allocated to vms.
 
-        all_devs = fake_db_devs_tree[:]
-        self._create_tracker(all_devs)
+        all_db_devs = fake_db_devs_tree
+        all_pci_devs = fake_pci_devs_tree
+        self._create_tracker(all_db_devs)
         # we start with 3 devices
         self.assertEqual(
             3,
@@ -358,7 +357,7 @@ def test_set_hvdev_remove_tree_maintained_with_allocations(self):
         claimed_dev = self.tracker.claim_instance(
             mock.sentinel.context, pci_requests_obj, None)[0]
 
-        self.tracker._set_hvdevs(all_devs)
+        self.tracker._set_hvdevs(copy.deepcopy(all_pci_devs))
         # and assert that no devices were removed
         self.assertEqual(
             0,
@@ -366,10 +365,10 @@ def test_set_hvdev_remove_tree_maintained_with_allocations(self):
                  if dev.status == fields.PciDeviceStatus.REMOVED]))
         # we then try to remove the allocated device from the set reported
         # by the driver.
-        fake_pci_devs = [dev for dev in all_devs
+        fake_pci_devs = [dev for dev in all_pci_devs
                          if dev['address'] != claimed_dev.address]
         with mock.patch("nova.pci.manager.LOG.warning") as log:
-            self.tracker._set_hvdevs(fake_pci_devs)
+            self.tracker._set_hvdevs(copy.deepcopy(fake_pci_devs))
             log.assert_called_once()
             args = log.call_args_list[0][0]  # args of first call
             self.assertIn('Unable to remove device with', args[0])
@@ -380,7 +379,7 @@ def test_set_hvdev_remove_tree_maintained_with_allocations(self):
                  if dev.status == fields.PciDeviceStatus.REMOVED]))
         # free the device that was allocated and update tracker again
         self.tracker._free_device(claimed_dev)
-        self.tracker._set_hvdevs(fake_pci_devs)
+        self.tracker._set_hvdevs(copy.deepcopy(fake_pci_devs))
         # and assert that one device is removed from the tracker
         self.assertEqual(
             1,
@@ -393,9 +392,8 @@ def test_set_hvdev_changed_stal(self):
         self.tracker.claim_instance(mock.sentinel.context,
                                     pci_requests_obj, None)
         fake_pci_3 = dict(fake_pci, address='0000:00:00.2', vendor_id='v2')
-        fake_pci_devs = [copy.deepcopy(fake_pci), copy.deepcopy(fake_pci_2),
-                         copy.deepcopy(fake_pci_3)]
-        self.tracker._set_hvdevs(fake_pci_devs)
+        fake_pci_devs = [fake_pci, fake_pci_2, fake_pci_3]
+        self.tracker._set_hvdevs(copy.deepcopy(fake_pci_devs))
         self.assertEqual(len(self.tracker.stale), 1)
         self.assertEqual(self.tracker.stale['0000:00:00.2']['vendor_id'], 'v2')
 
@@ -424,13 +422,13 @@ def test_update_pci_for_instance_fail(self):
         self.assertIsNone(devs)
 
     def test_pci_claim_instance_with_numa(self):
-        fake_db_dev_3 = dict(fake_db_dev_1, id=4, address='0000:00:00.4')
-        fake_devs_numa = copy.deepcopy(fake_db_devs)
-        fake_devs_numa.append(fake_db_dev_3)
+        fake_pci_3 = dict(fake_pci_1, address='0000:00:00.4')
+        fake_devs_numa = copy.deepcopy(fake_pci_devs)
+        fake_devs_numa.append(fake_pci_3)
         self.tracker = manager.PciDevTracker(
             mock.sentinel.context,
             objects.ComputeNode(id=1, numa_topology=None))
-        self.tracker._set_hvdevs(fake_devs_numa)
+        self.tracker._set_hvdevs(copy.deepcopy(fake_devs_numa))
         pci_requests = copy.deepcopy(fake_pci_requests)[:1]
         pci_requests[0]['count'] = 2
         pci_requests_obj = self._create_pci_requests_object(pci_requests)
@@ -477,9 +475,8 @@ def test_save(self, migrate_mock):
                 'nova.db.main.api.pci_device_update',
                 self._fake_pci_device_update)
         fake_pci_v3 = dict(fake_pci, address='0000:00:00.2', vendor_id='v3')
-        fake_pci_devs = [copy.deepcopy(fake_pci), copy.deepcopy(fake_pci_2),
-                         copy.deepcopy(fake_pci_v3)]
-        self.tracker._set_hvdevs(fake_pci_devs)
+        fake_pci_devs = [fake_pci, fake_pci_2, fake_pci_v3]
+        self.tracker._set_hvdevs(copy.deepcopy(fake_pci_devs))
         self.update_called = 0
         self.tracker.save(self.fake_context)
         self.assertEqual(self.update_called, 3)

From d7bca631fea348fb56cc46c5680643bacd82513e Mon Sep 17 00:00:00 2001
From: Balazs Gibizer <gibi@redhat.com>
Date: Tue, 19 Apr 2022 17:50:34 +0200
Subject: [PATCH 12/93] Remove unavailable but not reported PCI devices at
 startup

We saw in the field that the pci_devices table can end up in
inconsistent state after a compute node HW failure and re-deployment.
There could be dependent devices where the parent PF is in available
state while the children VFs are in unavailable state. (Before the HW
fault the PF was allocated, hence the VFs were marked unavailable.)

In this state this PF is still schedulable but during the
PCI claim the handling of dependent devices in the PCI tracker will fail
with the error: "Attempt to consume PCI device XXX from empty pool".

The reason of the failure is that when the PF is claimed, all the
children VFs are marked unavailable. But if the VF is already
unavailable such step fails.

One way the deployer might try to recover from this state is to remove
the VFs from the hypervisor and restart the compute agent. The compute
startup already has logic to delete PCI devices that are unused and
not reported by the hypervisor. However, this logic only removed devices
in 'available' state and ignored devices in 'unavailable' state.

If a device is unused and the hypervisor is not reporting the device any
more then it is safe to delete that device from the PCI tracker. So this
patch extends the logic to allow deleting 'unavailable' devices. There
is a small window when a dependent PCI device is in 'unclaimable' state.
From a cleanup perspective this is an analogous state, so it is also
added to the cleanup logic.

Related-Bug: #1969496
Change-Id: If9ab424cc7375a1f0d41b03f01c4a823216b3eb8
(cherry picked from commit 284ea72e96604bdf16d1c5c4db47247334841b2f)
---
 nova/objects/pci_device.py                 | 23 +++++-
 nova/pci/manager.py                        | 16 +++--
 nova/tests/unit/objects/test_pci_device.py | 10 +++
 nova/tests/unit/pci/test_manager.py        | 81 ++++++++++++++++++++++
 4 files changed, 123 insertions(+), 7 deletions(-)

diff --git a/nova/objects/pci_device.py b/nova/objects/pci_device.py
index 275d5da3564..b675641a06d 100644
--- a/nova/objects/pci_device.py
+++ b/nova/objects/pci_device.py
@@ -447,11 +447,30 @@ def allocate(self, instance):
             instance.pci_devices.objects.append(copy.copy(self))
 
     def remove(self):
-        if self.status != fields.PciDeviceStatus.AVAILABLE:
+        # We allow removal of a device is if it is unused. It can be unused
+        # either by being in available state or being in a state that shows
+        # that the parent or child device blocks the consumption of this device
+        expected_states = [
+            fields.PciDeviceStatus.AVAILABLE,
+            fields.PciDeviceStatus.UNAVAILABLE,
+            fields.PciDeviceStatus.UNCLAIMABLE,
+        ]
+        if self.status not in expected_states:
             raise exception.PciDeviceInvalidStatus(
                 compute_node_id=self.compute_node_id,
                 address=self.address, status=self.status,
-                hopestatus=[fields.PciDeviceStatus.AVAILABLE])
+                hopestatus=expected_states)
+        # Just to be on the safe side, do not allow removal of device that has
+        # an owner even if the state of the device suggests that it is not
+        # owned.
+        if 'instance_uuid' in self and self.instance_uuid is not None:
+            raise exception.PciDeviceInvalidOwner(
+                compute_node_id=self.compute_node_id,
+                address=self.address,
+                owner=self.instance_uuid,
+                hopeowner=None,
+            )
+
         self.status = fields.PciDeviceStatus.REMOVED
         self.instance_uuid = None
         self.request_id = None
diff --git a/nova/pci/manager.py b/nova/pci/manager.py
index fc6a8417246..78cfb05dbbd 100644
--- a/nova/pci/manager.py
+++ b/nova/pci/manager.py
@@ -217,10 +217,13 @@ def _set_hvdevs(self, devices: ty.List[ty.Dict[str, ty.Any]]) -> None:
                 # from the pci whitelist.
                 try:
                     existed.remove()
-                except exception.PciDeviceInvalidStatus as e:
-                    LOG.warning("Unable to remove device with %(status)s "
-                                "ownership %(instance_uuid)s because of "
-                                "%(pci_exception)s. "
+                except (
+                        exception.PciDeviceInvalidStatus,
+                        exception.PciDeviceInvalidOwner,
+                ) as e:
+                    LOG.warning("Unable to remove device with status "
+                                "'%(status)s' and ownership %(instance_uuid)s "
+                                "because of %(pci_exception)s. "
                                 "Check your [pci]passthrough_whitelist "
                                 "configuration to make sure this allocated "
                                 "device is whitelisted. If you have removed "
@@ -250,7 +253,10 @@ def _set_hvdevs(self, devices: ty.List[ty.Dict[str, ty.Any]]) -> None:
                 else:
                     # Note(yjiang5): no need to update stats if an assigned
                     # device is hot removed.
-                    self.stats.remove_device(existed)
+                    # NOTE(gibi): only remove the device from the pools if it
+                    # is not already removed
+                    if existed in self.stats.get_free_devs():
+                        self.stats.remove_device(existed)
             else:
                 # Update tracked devices.
                 new_value: ty.Dict[str, ty.Any]
diff --git a/nova/tests/unit/objects/test_pci_device.py b/nova/tests/unit/objects/test_pci_device.py
index 4087b898009..91ec566c32b 100644
--- a/nova/tests/unit/objects/test_pci_device.py
+++ b/nova/tests/unit/objects/test_pci_device.py
@@ -467,6 +467,16 @@ def test_remove_device_fail(self):
         devobj.claim(self.inst.uuid)
         self.assertRaises(exception.PciDeviceInvalidStatus, devobj.remove)
 
+    def test_remove_device_fail_owned_with_unavailable_state(self):
+        # This test creates an PCI device in an invalid state. This should
+        # not happen in any known scenario. But we want to be save not to allow
+        # removing a device that has an owner. See bug 1969496 for more details
+        self._create_fake_instance()
+        devobj = pci_device.PciDevice.create(None, dev_dict)
+        devobj.claim(self.inst.uuid)
+        devobj.status = fields.PciDeviceStatus.UNAVAILABLE
+        self.assertRaises(exception.PciDeviceInvalidOwner, devobj.remove)
+
 
 class TestPciDeviceObject(test_objects._LocalTest,
                           _TestPciDeviceObject):
diff --git a/nova/tests/unit/pci/test_manager.py b/nova/tests/unit/pci/test_manager.py
index 1f3478a070b..4dd15ed204a 100644
--- a/nova/tests/unit/pci/test_manager.py
+++ b/nova/tests/unit/pci/test_manager.py
@@ -397,6 +397,87 @@ def test_set_hvdev_changed_stal(self):
         self.assertEqual(len(self.tracker.stale), 1)
         self.assertEqual(self.tracker.stale['0000:00:00.2']['vendor_id'], 'v2')
 
+    def _get_device_by_address(self, address):
+        devs = [dev for dev in self.tracker.pci_devs if dev.address == address]
+        if len(devs) == 1:
+            return devs[0]
+        if devs:
+            raise ValueError('ambiguous address', devs)
+        else:
+            raise ValueError('device not found', address)
+
+    def test_set_hvdevs_unavailable_vf_removed(self):
+        # We start with a PF parent and two VF children
+        self._create_tracker([fake_db_dev_3, fake_db_dev_4, fake_db_dev_5])
+        pci_requests_obj = self._create_pci_requests_object(
+            [
+                {
+                    'count': 1,
+                    'spec': [{'dev_type': fields.PciDeviceType.SRIOV_PF}]
+                }
+            ],
+            instance_uuid=uuidsentinel.instance1,
+        )
+        # then claim and allocate the PF that makes the VFs unavailable
+        self.tracker.claim_instance(
+            mock.sentinel.context, pci_requests_obj, None)
+        self.tracker.allocate_instance({'uuid': uuidsentinel.instance1})
+
+        dev3_pf = self._get_device_by_address(fake_db_dev_3['address'])
+        self.assertEqual('allocated', dev3_pf.status)
+        self.assertEqual(uuidsentinel.instance1, dev3_pf.instance_uuid)
+        dev4_vf = self._get_device_by_address(fake_db_dev_4['address'])
+        self.assertEqual('unavailable', dev4_vf.status)
+        dev5_vf = self._get_device_by_address(fake_db_dev_5['address'])
+        self.assertEqual('unavailable', dev5_vf.status)
+
+        # now simulate that one VF (dev_5) is removed from the hypervisor and
+        # the compute is restarted. As the VF is not claimed or allocated we
+        # are free to remove it from the tracker.
+        self.tracker._set_hvdevs(copy.deepcopy([fake_pci_3, fake_pci_4]))
+
+        dev3_pf = self._get_device_by_address(fake_db_dev_3['address'])
+        self.assertEqual('allocated', dev3_pf.status)
+        self.assertEqual(uuidsentinel.instance1, dev3_pf.instance_uuid)
+        dev4_vf = self._get_device_by_address(fake_db_dev_4['address'])
+        self.assertEqual('unavailable', dev4_vf.status)
+        dev5_vf = self._get_device_by_address(fake_db_dev_5['address'])
+        self.assertEqual('removed', dev5_vf.status)
+
+    def test_set_hvdevs_unavailable_pf_removed(self):
+        # We start with one PF parent and one child VF
+        self._create_tracker([fake_db_dev_3, fake_db_dev_4])
+        pci_requests_obj = self._create_pci_requests_object(
+            [
+                {
+                    'count': 1,
+                    'spec': [{'dev_type': fields.PciDeviceType.SRIOV_VF}]
+                }
+            ],
+            instance_uuid=uuidsentinel.instance1,
+        )
+        # Then we claim and allocate the VF that makes the PF unavailable
+        self.tracker.claim_instance(
+            mock.sentinel.context, pci_requests_obj, None)
+        self.tracker.allocate_instance({'uuid': uuidsentinel.instance1})
+
+        dev3_pf = self._get_device_by_address(fake_db_dev_3['address'])
+        self.assertEqual('unavailable', dev3_pf.status)
+        dev4_vf = self._get_device_by_address(fake_db_dev_4['address'])
+        self.assertEqual('allocated', dev4_vf.status)
+        self.assertEqual(uuidsentinel.instance1, dev4_vf.instance_uuid)
+
+        # now simulate that the parent PF is removed from the hypervisor and
+        # the compute is restarted. As the PF is not claimed or allocated we
+        # are free to remove it from the tracker.
+        self.tracker._set_hvdevs(copy.deepcopy([fake_pci_4]))
+
+        dev3_pf = self._get_device_by_address(fake_db_dev_3['address'])
+        self.assertEqual('removed', dev3_pf.status)
+        dev4_vf = self._get_device_by_address(fake_db_dev_4['address'])
+        self.assertEqual('allocated', dev4_vf.status)
+        self.assertEqual(uuidsentinel.instance1, dev4_vf.instance_uuid)
+
     def test_update_pci_for_instance_active(self):
         pci_requests_obj = self._create_pci_requests_object(fake_pci_requests)
         self.tracker.claim_instance(mock.sentinel.context,

From 23c48b670668e9cae886b5113c776077780f5581 Mon Sep 17 00:00:00 2001
From: Balazs Gibizer <gibi@redhat.com>
Date: Tue, 19 Apr 2022 17:27:31 +0200
Subject: [PATCH 13/93] Simulate bug 1969496

As If9ab424cc7375a1f0d41b03f01c4a823216b3eb8 stated there is a way for
the pci_device table to become inconsistent. Parent PF can be in
'available' state while children VFs are still in 'unavailable' state.
In this situation the PF is schedulable but the PCI claim will fail
when it tries to mark the dependent VFs unavailable.

This patch adds a test case that shows the error.

Related-Bug: #1969496

Change-Id: I7b432d7a32aeb1ab765d1f731691c7841a8f1440
(cherry picked from commit 9ee5d2c66255f83cc8a66f1b5648fa13e1d73f47)
---
 nova/tests/unit/pci/test_manager.py | 56 +++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/nova/tests/unit/pci/test_manager.py b/nova/tests/unit/pci/test_manager.py
index 4dd15ed204a..dc2ac263b96 100644
--- a/nova/tests/unit/pci/test_manager.py
+++ b/nova/tests/unit/pci/test_manager.py
@@ -21,6 +21,7 @@
 
 from nova.compute import vm_states
 from nova import context
+from nova import exception
 from nova import objects
 from nova.objects import fields
 from nova.pci import manager
@@ -478,6 +479,61 @@ def test_set_hvdevs_unavailable_pf_removed(self):
         self.assertEqual('allocated', dev4_vf.status)
         self.assertEqual(uuidsentinel.instance1, dev4_vf.instance_uuid)
 
+    def test_claim_available_pf_while_child_vf_is_unavailable(self):
+        # NOTE(gibi): this is bug 1969496. The state created here is
+        # inconsistent and should not happen. But it did happen in some cases
+        # where we were not able to track down the way how it happened.
+
+        # We start with a PF parent and a VF child. The PF is available and
+        # the VF is unavailable.
+        pf = copy.deepcopy(fake_db_dev_3)
+        vf = copy.deepcopy(fake_db_dev_4)
+        vf['status'] = fields.PciDeviceStatus.UNAVAILABLE
+        self._create_tracker([pf, vf])
+
+        pf_dev = self._get_device_by_address(pf['address'])
+        self.assertEqual('available', pf_dev.status)
+        vf_dev = self._get_device_by_address(vf['address'])
+        self.assertEqual('unavailable', vf_dev.status)
+
+        pci_requests_obj = self._create_pci_requests_object(
+            [
+                {
+                    'count': 1,
+                    'spec': [{'dev_type': fields.PciDeviceType.SRIOV_PF}]
+                }
+            ],
+            instance_uuid=uuidsentinel.instance1,
+        )
+        # now try to claim and allocate the PF. It should work as it is
+        # available
+        # This is bug 1969496 as the claim fails with exception
+        ex = self.assertRaises(
+            exception.PciDevicePoolEmpty,
+            self.tracker.claim_instance,
+            mock.sentinel.context,
+            pci_requests_obj,
+            None
+        )
+        self.assertIn(
+            'Attempt to consume PCI device 1:0000:00:02.1 from empty pool',
+            str(ex)
+        )
+        pf_dev = self._get_device_by_address(pf['address'])
+        self.assertEqual('available', pf_dev.status)
+        vf_dev = self._get_device_by_address(vf['address'])
+        self.assertEqual('unavailable', vf_dev.status)
+
+        # This should work when the bug is fixed
+        # self.tracker.claim_instance(
+        #     mock.sentinel.context, pci_requests_obj, None)
+        # self.tracker.allocate_instance({'uuid': uuidsentinel.instance1})
+
+        # pf_dev = self._get_device_by_address(pf['address'])
+        # self.assertEqual('allocated', pf_dev.status)
+        # vf_dev = self._get_device_by_address(vf['address'])
+        # self.assertEqual('unavailable', vf_dev.status)
+
     def test_update_pci_for_instance_active(self):
         pci_requests_obj = self._create_pci_requests_object(fake_pci_requests)
         self.tracker.claim_instance(mock.sentinel.context,

From 4ca4b2e6bc84ca2c209653b46d7428c6c7cbd270 Mon Sep 17 00:00:00 2001
From: Balazs Gibizer <gibi@redhat.com>
Date: Tue, 19 Apr 2022 18:36:50 +0200
Subject: [PATCH 14/93] Allow claiming PCI PF if child VF is unavailable

As If9ab424cc7375a1f0d41b03f01c4a823216b3eb8 stated there is a way for
the pci_device table to become inconsistent. Parent PF can be in
'available' state while children VFs are still in 'unavailable' state.
In this situation the PF is schedulable but the PCI claim will fail
when it tries to mark the dependent VFs unavailable.

This patch changes the PCI claim logic to allow claiming the parent PF
in the inconsistent situation as we assume that it is safe to do so.
This claim also fixes the inconsistency so that when the parent PF is
freed the children VFs become available again.

Closes-Bug: #1969496
Change-Id: I575ce06bcc913add7db0849f85728371da2032fc
(cherry picked from commit 3af2ecc13fa9334de8418accaed4fffefefb41da)
---
 nova/objects/pci_device.py          |  38 +++++++-
 nova/pci/stats.py                   |   6 +-
 nova/tests/unit/pci/test_manager.py | 140 ++++++++++++++++++++++++----
 3 files changed, 160 insertions(+), 24 deletions(-)

diff --git a/nova/objects/pci_device.py b/nova/objects/pci_device.py
index b675641a06d..b0d5b75826b 100644
--- a/nova/objects/pci_device.py
+++ b/nova/objects/pci_device.py
@@ -346,10 +346,40 @@ def claim(self, instance_uuid):
             # Update PF status to CLAIMED if all of it dependants are free
             # and set their status to UNCLAIMABLE
             vfs_list = self.child_devices
-            if not all([vf.is_available() for vf in vfs_list]):
-                raise exception.PciDeviceVFInvalidStatus(
-                    compute_node_id=self.compute_node_id,
-                    address=self.address)
+            non_free_dependants = [
+                vf for vf in vfs_list if not vf.is_available()]
+            if non_free_dependants:
+                # NOTE(gibi): There should not be any dependent devices that
+                # are UNCLAIMABLE or UNAVAILABLE as the parent is AVAILABLE,
+                # but we got reports in bug 1969496 that this inconsistency
+                # can happen. So check if the only non-free devices are in
+                # state UNCLAIMABLE or UNAVAILABLE then we log a warning but
+                # allow to claim the parent.
+                actual_statuses = {
+                    child.status for child in non_free_dependants}
+                allowed_non_free_statues = {
+                    fields.PciDeviceStatus.UNCLAIMABLE,
+                    fields.PciDeviceStatus.UNAVAILABLE,
+                }
+                if actual_statuses - allowed_non_free_statues == set():
+                    LOG.warning(
+                        "Some child device of parent %s is in an inconsistent "
+                        "state. If you can reproduce this warning then please "
+                        "report a bug at "
+                        "https://bugs.launchpad.net/nova/+filebug with "
+                        "reproduction steps. Inconsistent children with "
+                        "state: %s",
+                        self.address,
+                        ",".join(
+                            "%s - %s" % (child.address, child.status)
+                            for child in non_free_dependants
+                        ),
+                    )
+
+                else:
+                    raise exception.PciDeviceVFInvalidStatus(
+                        compute_node_id=self.compute_node_id,
+                        address=self.address)
             self._bulk_update_status(vfs_list,
                                            fields.PciDeviceStatus.UNCLAIMABLE)
 
diff --git a/nova/pci/stats.py b/nova/pci/stats.py
index c8dda84d4bf..6a53c43c787 100644
--- a/nova/pci/stats.py
+++ b/nova/pci/stats.py
@@ -279,8 +279,12 @@ def _handle_device_dependents(self, pci_dev: 'objects.PciDevice') -> None:
         if pci_dev.dev_type == fields.PciDeviceType.SRIOV_PF:
             vfs_list = pci_dev.child_devices
             if vfs_list:
+                free_devs = self.get_free_devs()
                 for vf in vfs_list:
-                    self.remove_device(vf)
+                    # NOTE(gibi): do not try to remove a device that is
+                    # already removed
+                    if vf in free_devs:
+                        self.remove_device(vf)
         elif pci_dev.dev_type in (
             fields.PciDeviceType.SRIOV_VF,
             fields.PciDeviceType.VDPA,
diff --git a/nova/tests/unit/pci/test_manager.py b/nova/tests/unit/pci/test_manager.py
index dc2ac263b96..e9e4a455902 100644
--- a/nova/tests/unit/pci/test_manager.py
+++ b/nova/tests/unit/pci/test_manager.py
@@ -507,32 +507,134 @@ def test_claim_available_pf_while_child_vf_is_unavailable(self):
         )
         # now try to claim and allocate the PF. It should work as it is
         # available
-        # This is bug 1969496 as the claim fails with exception
-        ex = self.assertRaises(
-            exception.PciDevicePoolEmpty,
-            self.tracker.claim_instance,
-            mock.sentinel.context,
-            pci_requests_obj,
-            None
-        )
+        self.tracker.claim_instance(
+            mock.sentinel.context, pci_requests_obj, None)
+        self.tracker.allocate_instance({'uuid': uuidsentinel.instance1})
+
+        pf_dev = self._get_device_by_address(pf['address'])
+        self.assertEqual('allocated', pf_dev.status)
+        vf_dev = self._get_device_by_address(vf['address'])
+        self.assertEqual('unavailable', vf_dev.status)
+
         self.assertIn(
-            'Attempt to consume PCI device 1:0000:00:02.1 from empty pool',
-            str(ex)
+            'Some child device of parent 0000:00:01.1 is in an inconsistent '
+            'state. If you can reproduce this warning then please report a '
+            'bug at https://bugs.launchpad.net/nova/+filebug with '
+            'reproduction steps. Inconsistent children with state: '
+            '0000:00:02.1 - unavailable',
+            self.stdlog.logger.output
         )
+
+        # Ensure that the claim actually fixes the inconsistency so when the
+        # parent is freed the children become available too.
+        self.tracker.free_instance(
+            mock.sentinel.context, {'uuid': uuidsentinel.instance1})
+
         pf_dev = self._get_device_by_address(pf['address'])
         self.assertEqual('available', pf_dev.status)
         vf_dev = self._get_device_by_address(vf['address'])
-        self.assertEqual('unavailable', vf_dev.status)
+        self.assertEqual('available', vf_dev.status)
+
+    def test_claim_available_pf_while_children_vfs_are_in_mixed_state(self):
+        # We start with a PF parent and two VF children. The PF is available
+        # and one of the VF is unavailable while the other is available.
+        pf = copy.deepcopy(fake_db_dev_3)
+        vf1 = copy.deepcopy(fake_db_dev_4)
+        vf1['status'] = fields.PciDeviceStatus.UNAVAILABLE
+        vf2 = copy.deepcopy(fake_db_dev_5)
+        vf2['status'] = fields.PciDeviceStatus.AVAILABLE
+        self._create_tracker([pf, vf1, vf2])
+
+        pf_dev = self._get_device_by_address(pf['address'])
+        self.assertEqual('available', pf_dev.status)
+        vf1_dev = self._get_device_by_address(vf1['address'])
+        self.assertEqual('unavailable', vf1_dev.status)
+        vf2_dev = self._get_device_by_address(vf2['address'])
+        self.assertEqual('available', vf2_dev.status)
+
+        pci_requests_obj = self._create_pci_requests_object(
+            [
+                {
+                    'count': 1,
+                    'spec': [{'dev_type': fields.PciDeviceType.SRIOV_PF}]
+                }
+            ],
+            instance_uuid=uuidsentinel.instance1,
+        )
+        # now try to claim and allocate the PF. It should work as it is
+        # available
+        self.tracker.claim_instance(
+            mock.sentinel.context, pci_requests_obj, None)
+        self.tracker.allocate_instance({'uuid': uuidsentinel.instance1})
+
+        pf_dev = self._get_device_by_address(pf['address'])
+        self.assertEqual('allocated', pf_dev.status)
+        vf1_dev = self._get_device_by_address(vf1['address'])
+        self.assertEqual('unavailable', vf1_dev.status)
+        vf2_dev = self._get_device_by_address(vf2['address'])
+        self.assertEqual('unavailable', vf2_dev.status)
+
+        self.assertIn(
+            'Some child device of parent 0000:00:01.1 is in an inconsistent '
+            'state. If you can reproduce this warning then please report a '
+            'bug at https://bugs.launchpad.net/nova/+filebug with '
+            'reproduction steps. Inconsistent children with state: '
+            '0000:00:02.1 - unavailable',
+            self.stdlog.logger.output
+        )
+
+        # Ensure that the claim actually fixes the inconsistency so when the
+        # parent is freed the children become available too.
+        self.tracker.free_instance(
+            mock.sentinel.context, {'uuid': uuidsentinel.instance1})
+
+        pf_dev = self._get_device_by_address(pf['address'])
+        self.assertEqual('available', pf_dev.status)
+        vf1_dev = self._get_device_by_address(vf1['address'])
+        self.assertEqual('available', vf1_dev.status)
+        vf2_dev = self._get_device_by_address(vf2['address'])
+        self.assertEqual('available', vf2_dev.status)
+
+    def test_claim_available_pf_while_a_child_is_used(self):
+        pf = copy.deepcopy(fake_db_dev_3)
+        vf1 = copy.deepcopy(fake_db_dev_4)
+        vf1['status'] = fields.PciDeviceStatus.UNAVAILABLE
+        vf2 = copy.deepcopy(fake_db_dev_5)
+        vf2['status'] = fields.PciDeviceStatus.CLAIMED
+        self._create_tracker([pf, vf1, vf2])
+
+        pf_dev = self._get_device_by_address(pf['address'])
+        self.assertEqual('available', pf_dev.status)
+        vf1_dev = self._get_device_by_address(vf1['address'])
+        self.assertEqual('unavailable', vf1_dev.status)
+        vf2_dev = self._get_device_by_address(vf2['address'])
+        self.assertEqual('claimed', vf2_dev.status)
 
-        # This should work when the bug is fixed
-        # self.tracker.claim_instance(
-        #     mock.sentinel.context, pci_requests_obj, None)
-        # self.tracker.allocate_instance({'uuid': uuidsentinel.instance1})
+        pci_requests_obj = self._create_pci_requests_object(
+            [
+                {
+                    'count': 1,
+                    'spec': [{'dev_type': fields.PciDeviceType.SRIOV_PF}]
+                }
+            ],
+            instance_uuid=uuidsentinel.instance1,
+        )
+        # now try to claim and allocate the PF. The claim should fail as one
+        # of the children is used.
+        self.assertRaises(
+            exception.PciDeviceVFInvalidStatus,
+            self.tracker.claim_instance,
+            mock.sentinel.context,
+            pci_requests_obj,
+            None,
+        )
 
-        # pf_dev = self._get_device_by_address(pf['address'])
-        # self.assertEqual('allocated', pf_dev.status)
-        # vf_dev = self._get_device_by_address(vf['address'])
-        # self.assertEqual('unavailable', vf_dev.status)
+        pf_dev = self._get_device_by_address(pf['address'])
+        self.assertEqual('available', pf_dev.status)
+        vf1_dev = self._get_device_by_address(vf1['address'])
+        self.assertEqual('unavailable', vf1_dev.status)
+        vf2_dev = self._get_device_by_address(vf2['address'])
+        self.assertEqual('claimed', vf2_dev.status)
 
     def test_update_pci_for_instance_active(self):
         pci_requests_obj = self._create_pci_requests_object(fake_pci_requests)

From 875668827896a44db8dd5083bd6148625c6bddea Mon Sep 17 00:00:00 2001
From: melanie witt <melwittt@gmail.com>
Date: Sat, 26 Feb 2022 19:51:18 +0000
Subject: [PATCH 15/93] Retry in CellDatabases fixture when global DB state
 changes

There is a NOTE in the CellDatabases code about an unlikely but
possible race that can occur between taking the writer lock to set
the last DB context manager and taking the reader lock to call
target_cell(). When the race is detected, a RuntimeError is raised.

We can handle the race by retrying setting the last DB context manager
when the race is detected, as described in the NOTE.

Closes-Bug: #1959677

Change-Id: I5c0607ce5910dce581ab9360cc7fc69ba9673f35
(cherry picked from commit 1c8122a25f50b40934af127d7717b55794ff38b5)
---
 nova/tests/fixtures/nova.py | 66 ++++++++++++++++++++++++++-----------
 1 file changed, 46 insertions(+), 20 deletions(-)

diff --git a/nova/tests/fixtures/nova.py b/nova/tests/fixtures/nova.py
index ef873f6654a..810c6f62dde 100644
--- a/nova/tests/fixtures/nova.py
+++ b/nova/tests/fixtures/nova.py
@@ -22,6 +22,7 @@
 import functools
 import logging as std_logging
 import os
+import time
 import warnings
 
 import eventlet
@@ -451,6 +452,13 @@ def _wrap_target_cell(self, context, cell_mapping):
         #     yield to do the actual work. We can do schedulable things
         #     here and not exclude other threads from making progress.
         #     If an exception is raised, we capture that and save it.
+        #     Note that it is possible that another thread has changed the
+        #     global state (step #2) after we released the writer lock but
+        #     before we acquired the reader lock. If this happens, we will
+        #     detect the global state change and retry step #2 a limited number
+        #     of times. If we happen to race repeatedly with another thread and
+        #     exceed our retry limit, we will give up and raise a RuntimeError,
+        #     which will fail the test.
         #  4. If we changed state in #2, we need to change it back. So we grab
         #     a writer lock again and do that.
         #  5. Finally, if an exception was raised in #3 while state was
@@ -469,29 +477,47 @@ def _wrap_target_cell(self, context, cell_mapping):
 
         raised_exc = None
 
-        with self._cell_lock.write_lock():
-            if cell_mapping is not None:
-                # This assumes the next local DB access is the same cell that
-                # was targeted last time.
-                self._last_ctxt_mgr = desired
+        def set_last_ctxt_mgr():
+            with self._cell_lock.write_lock():
+                if cell_mapping is not None:
+                    # This assumes the next local DB access is the same cell
+                    # that was targeted last time.
+                    self._last_ctxt_mgr = desired
 
-        with self._cell_lock.read_lock():
-            if self._last_ctxt_mgr != desired:
-                # NOTE(danms): This is unlikely to happen, but it's possible
-                # another waiting writer changed the state between us letting
-                # it go and re-acquiring as a reader. If lockutils supported
-                # upgrading and downgrading locks, this wouldn't be a problem.
-                # Regardless, assert that it is still as we left it here
-                # so we don't hit the wrong cell. If this becomes a problem,
-                # we just need to retry the write section above until we land
-                # here with the cell we want.
-                raise RuntimeError('Global DB state changed underneath us')
+        # Set last context manager to the desired cell's context manager.
+        set_last_ctxt_mgr()
 
+        # Retry setting the last context manager if we detect that a writer
+        # changed global DB state before we take the read lock.
+        for retry_time in range(0, 3):
             try:
-                with self._real_target_cell(context, cell_mapping) as ccontext:
-                    yield ccontext
-            except Exception as exc:
-                raised_exc = exc
+                with self._cell_lock.read_lock():
+                    if self._last_ctxt_mgr != desired:
+                        # NOTE(danms): This is unlikely to happen, but it's
+                        # possible another waiting writer changed the state
+                        # between us letting it go and re-acquiring as a
+                        # reader. If lockutils supported upgrading and
+                        # downgrading locks, this wouldn't be a problem.
+                        # Regardless, assert that it is still as we left it
+                        # here so we don't hit the wrong cell. If this becomes
+                        # a problem, we just need to retry the write section
+                        # above until we land here with the cell we want.
+                        raise RuntimeError(
+                            'Global DB state changed underneath us')
+                    try:
+                        with self._real_target_cell(
+                            context, cell_mapping
+                        ) as ccontext:
+                            yield ccontext
+                    except Exception as exc:
+                        raised_exc = exc
+                    # Leave the retry loop after calling target_cell
+                    break
+            except RuntimeError:
+                # Give other threads a chance to make progress, increasing the
+                # wait time between attempts.
+                time.sleep(retry_time)
+                set_last_ctxt_mgr()
 
         with self._cell_lock.write_lock():
             # Once we have returned from the context, we need

From 6f32b118640ee466b58155b5ecd50e041b4a2e7e Mon Sep 17 00:00:00 2001
From: Stephen Finucane <sfinucan@redhat.com>
Date: Thu, 19 May 2022 12:04:08 +0100
Subject: [PATCH 16/93] neutron: Unbind remaining ports after PortNotFound

Just because we encountered a PortNotFound error when unbinding a port
doesn't mean we should stop unbinding the remaining ports. If this error
is encountered, simply continue with the other ports.

While we're here, we clean up some other tests related to '_unbind_port'
since they're clearly duplicates.

Change-Id: Id04e0df12829df4d8929e03a8b76b5cbe0549059
Signed-off-by: Stephen Finucane <sfinucan@redhat.com>
Closes-Bug: #1974173
(cherry picked from commit 9e0dcb52ab308a63c6a18e47d1850cc3ade4d807)
---
 nova/network/neutron.py                 |  3 +-
 nova/tests/unit/network/test_neutron.py | 88 ++++++++++++++++++-------
 2 files changed, 68 insertions(+), 23 deletions(-)

diff --git a/nova/network/neutron.py b/nova/network/neutron.py
index 3ee9774d24c..427fdedecda 100644
--- a/nova/network/neutron.py
+++ b/nova/network/neutron.py
@@ -636,6 +636,7 @@ def _unbind_ports(self, context, ports,
             # in case the caller forgot to filter the list.
             if port_id is None:
                 continue
+
             port_req_body: ty.Dict[str, ty.Any] = {
                 'port': {
                     'device_id': '',
@@ -650,7 +651,7 @@ def _unbind_ports(self, context, ports,
             except exception.PortNotFound:
                 LOG.debug('Unable to show port %s as it no longer '
                           'exists.', port_id)
-                return
+                continue
             except Exception:
                 # NOTE: In case we can't retrieve the binding:profile or
                 # network info assume that they are empty
diff --git a/nova/tests/unit/network/test_neutron.py b/nova/tests/unit/network/test_neutron.py
index dbcfa80c273..f18a5487238 100644
--- a/nova/tests/unit/network/test_neutron.py
+++ b/nova/tests/unit/network/test_neutron.py
@@ -5248,7 +5248,8 @@ def test_get_preexisting_port_ids(self, mock_get_nw_info):
         self.assertEqual(['2', '3'], result, "Invalid preexisting ports")
 
     @mock.patch('nova.network.neutron.API._show_port')
-    def _test_unbind_ports_get_client(self, mock_neutron, mock_show):
+    @mock.patch('nova.network.neutron.get_client')
+    def test_unbind_ports_get_client(self, mock_neutron, mock_show):
         mock_ctx = mock.Mock(is_admin=False)
         ports = ["1", "2", "3"]
 
@@ -5264,25 +5265,18 @@ def _test_unbind_ports_get_client(self, mock_neutron, mock_show):
         self.assertEqual(1, mock_neutron.call_count)
         mock_neutron.assert_has_calls(get_client_calls, True)
 
-    @mock.patch('nova.network.neutron.get_client')
-    def test_unbind_ports_get_client_binding_extension(self,
-                                                       mock_neutron):
-        self._test_unbind_ports_get_client(mock_neutron)
-
-    @mock.patch('nova.network.neutron.get_client')
-    def test_unbind_ports_get_client(self, mock_neutron):
-        self._test_unbind_ports_get_client(mock_neutron)
-
     @mock.patch('nova.network.neutron.API.has_dns_extension',
                 new=mock.Mock(return_value=False))
     @mock.patch('nova.network.neutron.API._show_port')
-    def _test_unbind_ports(self, mock_neutron, mock_show):
+    @mock.patch('nova.network.neutron.get_client')
+    def test_unbind_ports(self, mock_neutron, mock_show):
         mock_client = mock.Mock()
         mock_update_port = mock.Mock()
         mock_client.update_port = mock_update_port
         mock_ctx = mock.Mock(is_admin=False)
         ports = ["1", "2", "3"]
         mock_show.side_effect = [{"id": "1"}, {"id": "2"}, {"id": "3"}]
+
         api = neutronapi.API()
         api._unbind_ports(mock_ctx, ports, mock_neutron, mock_client)
 
@@ -5296,14 +5290,6 @@ def _test_unbind_ports(self, mock_neutron, mock_show):
         self.assertEqual(3, mock_update_port.call_count)
         mock_update_port.assert_has_calls(update_port_calls)
 
-    @mock.patch('nova.network.neutron.get_client')
-    def test_unbind_ports_binding_ext(self, mock_neutron):
-        self._test_unbind_ports(mock_neutron)
-
-    @mock.patch('nova.network.neutron.get_client')
-    def test_unbind_ports(self, mock_neutron):
-        self._test_unbind_ports(mock_neutron)
-
     def test_unbind_ports_no_port_ids(self):
         # Tests that None entries in the ports list are filtered out.
         mock_client = mock.Mock()
@@ -6068,7 +6054,6 @@ def test_get_instance_id_by_floating_address_port_not_found(self,
     def test_unbind_ports_port_show_portnotfound(self, mock_log, mock_show):
         api = neutronapi.API()
         neutron_client = mock.Mock()
-        mock_show.return_value = {'id': uuids.port}
         api._unbind_ports(self.context, [uuids.port_id],
                           neutron_client, neutron_client)
         mock_show.assert_called_once_with(
@@ -6077,6 +6062,63 @@ def test_unbind_ports_port_show_portnotfound(self, mock_log, mock_show):
             neutron_client=mock.ANY)
         mock_log.assert_not_called()
 
+    @mock.patch(
+        'nova.network.neutron.API.has_dns_extension',
+        new=mock.Mock(return_value=False),
+    )
+    @mock.patch('nova.network.neutron.API._show_port')
+    @mock.patch.object(neutronapi, 'LOG')
+    def test_unbind_ports_port_show_portnotfound_multiple_ports(
+        self, mock_log, mock_show,
+    ):
+        """Ensure we continue unbinding ports even when one isn't found."""
+        mock_show.side_effect = [
+            exception.PortNotFound(port_id=uuids.port_a),
+            {'id': uuids.port_b},
+        ]
+        api = neutronapi.API()
+        neutron_client = mock.Mock()
+
+        api._unbind_ports(
+            self.context,
+            [uuids.port_a, uuids.port_b],
+            neutron_client,
+            neutron_client,
+        )
+
+        mock_show.assert_has_calls(
+            [
+                mock.call(
+                    self.context,
+                    uuids.port_a,
+                    fields=['binding:profile', 'network_id'],
+                    neutron_client=neutron_client,
+                ),
+                mock.call(
+                    self.context,
+                    uuids.port_b,
+                    fields=['binding:profile', 'network_id'],
+                    neutron_client=neutron_client,
+                ),
+            ]
+        )
+        # Only the port that exists should be updated
+        neutron_client.update_port.assert_called_once_with(
+            uuids.port_b,
+            {
+                'port': {
+                    'device_id': '',
+                    'device_owner': '',
+                    'binding:profile': {},
+                    'binding:host_id': None,
+                }
+            }
+        )
+        mock_log.exception.assert_not_called()
+        mock_log.debug.assert_called_with(
+            'Unable to show port %s as it no longer exists.', uuids.port_a,
+        )
+
     @mock.patch('nova.network.neutron.API.has_dns_extension',
                 new=mock.Mock(return_value=False))
     @mock.patch('nova.network.neutron.API._show_port',
@@ -6100,7 +6142,7 @@ def test_unbind_ports_port_show_unexpected_error(self,
                 new=mock.Mock(return_value=False))
     @mock.patch('nova.network.neutron.API._show_port')
     @mock.patch.object(neutronapi.LOG, 'exception')
-    def test_unbind_ports_portnotfound(self, mock_log, mock_show):
+    def test_unbind_ports_port_update_portnotfound(self, mock_log, mock_show):
         api = neutronapi.API()
         neutron_client = mock.Mock()
         neutron_client.update_port = mock.Mock(
@@ -6118,7 +6160,9 @@ def test_unbind_ports_portnotfound(self, mock_log, mock_show):
                 new=mock.Mock(return_value=False))
     @mock.patch('nova.network.neutron.API._show_port')
     @mock.patch.object(neutronapi.LOG, 'exception')
-    def test_unbind_ports_unexpected_error(self, mock_log, mock_show):
+    def test_unbind_ports_port_update_unexpected_error(
+        self, mock_log, mock_show,
+    ):
         api = neutronapi.API()
         neutron_client = mock.Mock()
         neutron_client.update_port = mock.Mock(

From 8a1b4975f71f9ce1446db689afb092d6e0a670a7 Mon Sep 17 00:00:00 2001
From: Balazs Gibizer <gibi@redhat.com>
Date: Wed, 25 May 2022 12:02:09 +0200
Subject: [PATCH 17/93] Accept both 1 and Y as AMD SEV KVM kernel param value

The libvirt virt driver checks the AMD KVM kernel module parameter SEV
to see if that feature is enabled. However it seems that the
/sys/module/kvm_amd/parameters/sev file can either contain "1\n" or
"Y\n" to indicate that the feature is enabled. Nova only checked for
"1\n" so far making the feature disabled on compute nodes with "Y\n"
value. Now the logic is extended to accept both.

Closes-Bug: #1975686
Change-Id: I737e1d73242430b6756178eb0bf9bd6ec5c94160
(cherry picked from commit ab51a5dd25b8d4c66562148b43b1022eb5ceed7e)
---
 nova/tests/unit/virt/libvirt/test_host.py | 33 ++++++++++++++---------
 nova/virt/libvirt/host.py                 |  7 ++---
 2 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/nova/tests/unit/virt/libvirt/test_host.py b/nova/tests/unit/virt/libvirt/test_host.py
index d71d13ab372..1eb26572aa6 100644
--- a/nova/tests/unit/virt/libvirt/test_host.py
+++ b/nova/tests/unit/virt/libvirt/test_host.py
@@ -16,6 +16,7 @@
 
 import os
 
+import ddt
 import eventlet
 from eventlet import greenthread
 from eventlet import tpool
@@ -1928,6 +1929,7 @@ def setUp(self):
         self.host = host.Host("qemu:///system")
 
 
+@ddt.ddt
 class TestLibvirtSEVUnsupported(TestLibvirtSEV):
     @mock.patch.object(os.path, 'exists', return_value=False)
     def test_kernel_parameter_missing(self, fake_exists):
@@ -1935,19 +1937,26 @@ def test_kernel_parameter_missing(self, fake_exists):
         fake_exists.assert_called_once_with(
             '/sys/module/kvm_amd/parameters/sev')
 
+    @ddt.data(
+        ('0\n', False),
+        ('N\n', False),
+        ('1\n', True),
+        ('Y\n', True),
+    )
+    @ddt.unpack
     @mock.patch.object(os.path, 'exists', return_value=True)
-    @mock.patch('builtins.open', mock.mock_open(read_data="0\n"))
-    def test_kernel_parameter_zero(self, fake_exists):
-        self.assertFalse(self.host._kernel_supports_amd_sev())
-        fake_exists.assert_called_once_with(
-            '/sys/module/kvm_amd/parameters/sev')
-
-    @mock.patch.object(os.path, 'exists', return_value=True)
-    @mock.patch('builtins.open', mock.mock_open(read_data="1\n"))
-    def test_kernel_parameter_one(self, fake_exists):
-        self.assertTrue(self.host._kernel_supports_amd_sev())
-        fake_exists.assert_called_once_with(
-            '/sys/module/kvm_amd/parameters/sev')
+    def test_kernel_parameter(
+        self, sev_param_value, expected_support, mock_exists
+    ):
+        with mock.patch(
+            'builtins.open', mock.mock_open(read_data=sev_param_value)
+        ):
+            self.assertIs(
+                expected_support,
+                self.host._kernel_supports_amd_sev()
+            )
+            mock_exists.assert_called_once_with(
+                '/sys/module/kvm_amd/parameters/sev')
 
     @mock.patch.object(os.path, 'exists', return_value=True)
     @mock.patch('builtins.open', mock.mock_open(read_data="1\n"))
diff --git a/nova/virt/libvirt/host.py b/nova/virt/libvirt/host.py
index cdf47008de4..63f1a9c6e41 100644
--- a/nova/virt/libvirt/host.py
+++ b/nova/virt/libvirt/host.py
@@ -46,6 +46,7 @@
 from oslo_serialization import jsonutils
 from oslo_utils import excutils
 from oslo_utils import importutils
+from oslo_utils import strutils
 from oslo_utils import units
 from oslo_utils import versionutils
 
@@ -1656,9 +1657,9 @@ def _kernel_supports_amd_sev(self) -> bool:
             return False
 
         with open(SEV_KERNEL_PARAM_FILE) as f:
-            contents = f.read()
-            LOG.debug("%s contains [%s]", SEV_KERNEL_PARAM_FILE, contents)
-            return contents == "1\n"
+            content = f.read()
+            LOG.debug("%s contains [%s]", SEV_KERNEL_PARAM_FILE, content)
+            return strutils.bool_from_string(content)
 
     @property
     def supports_amd_sev(self) -> bool:

From dfa05d62da441eeaf144e312cf33ba0e029452f0 Mon Sep 17 00:00:00 2001
From: Rajesh Tailor <ratailor@redhat.com>
Date: Fri, 27 May 2022 09:51:11 +0530
Subject: [PATCH 18/93] Add missing condition

Change [1] added new fields 'src|dst_supports_numa_live_migration'
to LibvirtLiveMigrateData object, but missed if condition for
dst_supports_numa_live_migration field in obj_make_compatible
method.

This change adds the if condition and fixes the typo in the unit test
that kept this from being caught earlier.

Closes-Bug: #1975891
Change-Id: Ice5a2c7aca77f47ea6328a10d835854d9aff408e
(cherry picked from commit 3aa77a3999a7dcabbd4c0141d4c56b07a4624128)
---
 nova/objects/migrate_data.py                 | 3 +++
 nova/tests/unit/objects/test_migrate_data.py | 4 ++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/nova/objects/migrate_data.py b/nova/objects/migrate_data.py
index 06f30342e54..cf0e4bf9a31 100644
--- a/nova/objects/migrate_data.py
+++ b/nova/objects/migrate_data.py
@@ -279,6 +279,9 @@ def obj_make_compatible(self, primitive, target_version):
         if (target_version < (1, 10) and
                 'src_supports_numa_live_migration' in primitive):
             del primitive['src_supports_numa_live_migration']
+        if (target_version < (1, 10) and
+                'dst_supports_numa_live_migration' in primitive):
+            del primitive['dst_supports_numa_live_migration']
         if target_version < (1, 10) and 'dst_numa_info' in primitive:
             del primitive['dst_numa_info']
         if target_version < (1, 9) and 'vifs' in primitive:
diff --git a/nova/tests/unit/objects/test_migrate_data.py b/nova/tests/unit/objects/test_migrate_data.py
index bc04c5bd13a..7f587c69064 100644
--- a/nova/tests/unit/objects/test_migrate_data.py
+++ b/nova/tests/unit/objects/test_migrate_data.py
@@ -94,8 +94,8 @@ def test_obj_make_compatible(self):
             target_connect_addr='127.0.0.1',
             dst_wants_file_backed_memory=False,
             file_backed_memory_discard=False,
-            src_supports_numa_live_migraton=True,
-            dst_supports_numa_live_migraton=True,
+            src_supports_numa_live_migration=True,
+            dst_supports_numa_live_migration=True,
             dst_numa_info=migrate_data.LibvirtLiveMigrateNUMAInfo())
         manifest = ovo_base.obj_tree_get_versions(obj.obj_name())
 

From 277f88e3872ea41bce02d09c4537946a74d74533 Mon Sep 17 00:00:00 2001
From: Kashyap Chamarthy <kchamart@redhat.com>
Date: Thu, 28 Jan 2021 16:35:10 +0100
Subject: [PATCH 19/93] libvirt: Add a workaround to skip compareCPU() on
 destination

Nova's use of libvirt's compareCPU() API served its purpose
over the years, but its design limitations break live migration in
subtle ways.  For example, the compareCPU() API compares against the
host physical CPUID.  Some of the features from this CPUID are not
exposed by KVM, and then there are some features that KVM emulates that
are not in the host CPUID.  The latter can cause bogus live migration
failures.

With QEMU >=2.9 and libvirt >= 4.4.0, libvirt will do the right thing in
terms of CPU compatibility checks on the destination host during live
migration.  Nova satisfies these minimum version requirements by a good
margin.  So, provide a workaround to skip the CPU comparison check on
the destination host before migrating a guest, and let libvirt handle it
correctly.  This workaround will be removed once Nova replaces the older
libvirt APIs with their newer and improved counterparts[1][2].

                - - -

Note that Nova's libvirt driver calls compareCPU() in another method,
_check_cpu_compatibility(); I did not remove its usage yet.  As it needs
more careful combing of the code, and then:

  - where possible, remove the usage of compareCPU() altogether, and
    rely on libvirt doing the right thing under the hood; or

  - where Nova _must_ do the CPU comparison checks, switch to the better
    libvirt CPU APIs -- baselineHypervisorCPU() and
    compareHypervisorCPU() -- that are described here[1].  This is work
    in progress[2].

[1] https://opendev.org/openstack/nova-specs/commit/70811da221035044e27
[2] https://review.opendev.org/q/topic:bp%252Fcpu-selection-with-hypervisor-consideration

Change-Id: I444991584118a969e9ea04d352821b07ec0ba88d
Closes-Bug: #1913716
Signed-off-by: Kashyap Chamarthy <kchamart@redhat.com>
Signed-off-by: Balazs Gibizer <bgibizer@redhat.com>
(cherry picked from commit 267a40663cd8d0b94bbc5ebda4ece55a45753b64)
---
 nova/conf/workarounds.py                      |  8 +++++++
 nova/tests/unit/virt/libvirt/test_driver.py   | 19 +++++++++++++++
 nova/virt/libvirt/driver.py                   | 19 ++++++++-------
 ...-compare-cpu-on-dest-6ae419ddd61fd0f8.yaml | 24 +++++++++++++++++++
 4 files changed, 61 insertions(+), 9 deletions(-)
 create mode 100644 releasenotes/notes/skip-compare-cpu-on-dest-6ae419ddd61fd0f8.yaml

diff --git a/nova/conf/workarounds.py b/nova/conf/workarounds.py
index 7419f073b49..6c52eae8e5d 100644
--- a/nova/conf/workarounds.py
+++ b/nova/conf/workarounds.py
@@ -401,6 +401,14 @@
 Related options:
 
 * :oslo.config:option:`quota.driver`
+"""),
+    cfg.BoolOpt('skip_cpu_compare_on_dest',
+               default=False,
+               help="""
+With the libvirt driver, during live migration, skip comparing guest CPU
+with the destination host. When using QEMU >= 2.9 and libvirt >=
+4.4.0, libvirt will do the correct thing with respect to checking CPU
+compatibility on the destination host during live migration.
 """),
 ]
 
diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py
index fd3d322b198..5632fcba868 100644
--- a/nova/tests/unit/virt/libvirt/test_driver.py
+++ b/nova/tests/unit/virt/libvirt/test_driver.py
@@ -10915,6 +10915,25 @@ def test_check_can_live_migrate_guest_cpu_none_model(
                           'serial_listen_addr': None},
                          result.obj_to_primitive()['nova_object.data'])
 
+    @mock.patch(
+        'nova.network.neutron.API.has_port_binding_extension',
+        new=mock.Mock(return_value=False))
+    @mock.patch.object(libvirt_driver.LibvirtDriver,
+                       '_create_shared_storage_test_file',
+                       return_value='fake')
+    @mock.patch.object(libvirt_driver.LibvirtDriver, '_compare_cpu')
+    def test_check_can_live_migrate_guest_cpu_none_model_skip_compare(
+            self, mock_cpu, mock_test_file):
+        self.flags(group='workarounds', skip_cpu_compare_on_dest=True)
+        instance_ref = objects.Instance(**self.test_instance)
+        instance_ref.vcpu_model = test_vcpu_model.fake_vcpumodel
+        instance_ref.vcpu_model.model = None
+        drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
+        compute_info = {'cpu_info': 'asdf', 'disk_available_least': 1}
+        drvr.check_can_live_migrate_destination(
+            self.context, instance_ref, compute_info, compute_info)
+        mock_cpu.assert_not_called()
+
     @mock.patch(
         'nova.network.neutron.API.has_port_binding_extension',
         new=mock.Mock(return_value=False))
diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py
index 94e7b1945aa..7c0abcb1506 100644
--- a/nova/virt/libvirt/driver.py
+++ b/nova/virt/libvirt/driver.py
@@ -9338,15 +9338,16 @@ def check_can_live_migrate_destination(self, context, instance,
         disk_available_mb = (
             (disk_available_gb * units.Ki) - CONF.reserved_host_disk_mb)
 
-        # Compare CPU
-        try:
-            if not instance.vcpu_model or not instance.vcpu_model.model:
-                source_cpu_info = src_compute_info['cpu_info']
-                self._compare_cpu(None, source_cpu_info, instance)
-            else:
-                self._compare_cpu(instance.vcpu_model, None, instance)
-        except exception.InvalidCPUInfo as e:
-            raise exception.MigrationPreCheckError(reason=e)
+        if not CONF.workarounds.skip_cpu_compare_on_dest:
+            # Compare CPU
+            try:
+                if not instance.vcpu_model or not instance.vcpu_model.model:
+                    source_cpu_info = src_compute_info['cpu_info']
+                    self._compare_cpu(None, source_cpu_info, instance)
+                else:
+                    self._compare_cpu(instance.vcpu_model, None, instance)
+            except exception.InvalidCPUInfo as e:
+                raise exception.MigrationPreCheckError(reason=e)
 
         # Create file on storage, to be checked on source host
         filename = self._create_shared_storage_test_file(instance)
diff --git a/releasenotes/notes/skip-compare-cpu-on-dest-6ae419ddd61fd0f8.yaml b/releasenotes/notes/skip-compare-cpu-on-dest-6ae419ddd61fd0f8.yaml
new file mode 100644
index 00000000000..e7cd4041b16
--- /dev/null
+++ b/releasenotes/notes/skip-compare-cpu-on-dest-6ae419ddd61fd0f8.yaml
@@ -0,0 +1,24 @@
+---
+issues:
+  - |
+    Nova's use of libvirt's compareCPU() API served its purpose over the
+    years, but its design limitations break live migration in subtle
+    ways. For example, the compareCPU() API compares against the host
+    physical CPUID. Some of the features from this CPUID are not
+    exposed by KVM, and then there are some features that KVM emulates
+    that are not in the host CPUID. The latter can cause bogus live
+    migration failures.
+
+    With QEMU >=2.9 and libvirt >= 4.4.0, libvirt will do the right
+    thing in terms of CPU compatibility checks on the destination host
+    during live migration. Nova satisfies these minimum version
+    requirements by a good margin. So, this workaround provides a way to
+    skip the CPU comparison check on the destination host before
+    migrating a guest, and let libvirt handle it correctly.
+
+    This workaround will be deprecated and removed once Nova replaces
+    the older libvirt APIs with their newer counterparts. The work is
+    being tracked via this `blueprint
+    cpu-selection-with-hypervisor-consideration`_.
+
+    .. _blueprint cpu-selection-with-hypervisor-consideration: https://blueprints.launchpad.net/nova/+spec/cpu-selection-with-hypervisor-consideration

From 0f6c0cd2e43f60999901de8e4b167b27a393d950 Mon Sep 17 00:00:00 2001
From: Rajesh Tailor <ratailor@redhat.com>
Date: Mon, 30 May 2022 17:24:07 +0530
Subject: [PATCH 20/93] Fix typos in help messages

This change fixes typos in conf parameter help messages
and in error log message.

Change-Id: Iedc268072d77771b208603e663b0ce9b94215eb8
(cherry picked from commit aa1e7a6933df221e72a1371d286a63a9a08ce90a)
---
 nova/compute/resource_tracker.py                 | 2 +-
 nova/conf/compute.py                             | 2 +-
 nova/conf/hyperv.py                              | 2 +-
 nova/conf/libvirt.py                             | 4 ++--
 nova/conf/neutron.py                             | 2 +-
 nova/conf/quota.py                               | 2 +-
 nova/conf/scheduler.py                           | 4 ++--
 nova/tests/unit/compute/test_resource_tracker.py | 2 +-
 8 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py
index 8497bbcd899..0b801f7ddf9 100644
--- a/nova/compute/resource_tracker.py
+++ b/nova/compute/resource_tracker.py
@@ -1856,7 +1856,7 @@ def _merge_provider_configs(self, provider_configs, provider_tree):
                         raise ValueError(_(
                             "Provider config '%(source_file_name)s' attempts "
                             "to define a trait that is owned by the "
-                            "virt driver or specified via the placment api. "
+                            "virt driver or specified via the placement api. "
                             "Invalid traits '%(invalid)s' must be removed "
                             "from '%(source_file_name)s'.") % {
                                 'source_file_name': source_file_name,
diff --git a/nova/conf/compute.py b/nova/conf/compute.py
index 263d7775869..5abe7694f80 100644
--- a/nova/conf/compute.py
+++ b/nova/conf/compute.py
@@ -200,7 +200,7 @@
 top-level key called ``interfaces``. This key will contain a list of
 dictionaries, one for each interface.
 
-Refer to the cloudinit documentaion for more information:
+Refer to the cloudinit documentation for more information:
 
   https://cloudinit.readthedocs.io/en/latest/topics/datasources.html
 
diff --git a/nova/conf/hyperv.py b/nova/conf/hyperv.py
index caa7a8702b8..cce3cdc3e2d 100644
--- a/nova/conf/hyperv.py
+++ b/nova/conf/hyperv.py
@@ -320,7 +320,7 @@
     cfg.ListOpt('iscsi_initiator_list',
                 default=[],
                 help="""
-List of iSCSI initiators that will be used for estabilishing iSCSI sessions.
+List of iSCSI initiators that will be used for establishing iSCSI sessions.
 
 If none are specified, the Microsoft iSCSI initiator service will choose the
 initiator.
diff --git a/nova/conf/libvirt.py b/nova/conf/libvirt.py
index 7d9c837ba56..4ea37b8fe97 100644
--- a/nova/conf/libvirt.py
+++ b/nova/conf/libvirt.py
@@ -453,7 +453,7 @@
 
 Prerequisite: TLS environment is configured correctly on all relevant
 Compute nodes.  This means, Certificate Authority (CA), server, client
-certificates, their corresponding keys, and their file permisssions are
+certificates, their corresponding keys, and their file permissions are
 in place, and are validated.
 
 Notes:
@@ -705,7 +705,7 @@
 returns random numbers when read) is accepted.  The recommended source
 of entropy is ``/dev/urandom`` -- it is non-blocking, therefore
 relatively fast; and avoids the limitations of ``/dev/random``, which is
-a legacy interface.  For more details (and comparision between different
+a legacy interface.  For more details (and comparison between different
 RNG sources), refer to the "Usage" section in the Linux kernel API
 documentation for ``[u]random``:
 http://man7.org/linux/man-pages/man4/urandom.4.html and
diff --git a/nova/conf/neutron.py b/nova/conf/neutron.py
index dc391a268e8..e6774ced55a 100644
--- a/nova/conf/neutron.py
+++ b/nova/conf/neutron.py
@@ -46,7 +46,7 @@
 
 Specifies the name of floating IP pool used for allocating floating IPs. This
 option is only used if Neutron does not specify the floating IP pool name in
-port binding reponses.
+port binding responses.
 """),
     cfg.IntOpt('extension_sync_interval',
          default=600,
diff --git a/nova/conf/quota.py b/nova/conf/quota.py
index 0d51129d503..e5b4b8dc738 100644
--- a/nova/conf/quota.py
+++ b/nova/conf/quota.py
@@ -147,7 +147,7 @@
         deprecated_group='DEFAULT',
         deprecated_name='quota_server_groups',
         help="""
-The maxiumum number of server groups per project.
+The maximum number of server groups per project.
 
 Server groups are used to control the affinity and anti-affinity scheduling
 policy for a group of servers or instances. Reducing the quota will not affect
diff --git a/nova/conf/scheduler.py b/nova/conf/scheduler.py
index 8b3b6169873..03e78fe7017 100644
--- a/nova/conf/scheduler.py
+++ b/nova/conf/scheduler.py
@@ -780,7 +780,7 @@
 
 Possible values:
 
-* An integer or float value, where the value corresponds to the multipler
+* An integer or float value, where the value corresponds to the multiplier
   ratio for this weigher.
 
 Related options:
@@ -857,7 +857,7 @@
 
 Possible values:
 
-* An integer or float value, where the value corresponds to the multipler
+* An integer or float value, where the value corresponds to the multiplier
   ratio for this weigher.
 
 Related options:
diff --git a/nova/tests/unit/compute/test_resource_tracker.py b/nova/tests/unit/compute/test_resource_tracker.py
index 36236d58ded..caa12cb754c 100644
--- a/nova/tests/unit/compute/test_resource_tracker.py
+++ b/nova/tests/unit/compute/test_resource_tracker.py
@@ -4069,7 +4069,7 @@ def test_merge_provider_configs_additional_traits_exception(self):
 
         expected = ("Provider config 'test_provider_config.yaml' attempts to "
                     "define a trait that is owned by the virt driver or "
-                    "specified via the placment api. Invalid traits '" +
+                    "specified via the placement api. Invalid traits '" +
                     ex_trait + "' must be removed from "
                     "'test_provider_config.yaml'.")
 

From b94ffb1123b1a6cf0a8675e0d6f1072e9625f570 Mon Sep 17 00:00:00 2001
From: Takashi Kajinami <tkajinam@redhat.com>
Date: Mon, 13 Jun 2022 14:48:24 +0900
Subject: [PATCH 21/93] Retry attachment delete API call for 504 Gateway
 Timeout

When cinder-api runs behind a load balancer (e.g. haproxy), the load
balancer can return 504 Gateway Timeout when cinder-api does not
respond within the timeout. This change ensures nova retries deleting
a volume attachment in that case.

Also this change makes nova ignore 404 in the API call. This is
required because cinder might continue deleting the attachment even if
the load balancer returns 504. This also helps us in the situation
where the volume attachment was accidentally removed by users.

Closes-Bug: #1978444
Change-Id: I593011d9f4c43cdae7a3d53b556c6e2a2b939989
(cherry picked from commit 8f4b740ca5292556f8e953a30f2a11ed4fbc2945)
---
 nova/tests/unit/volume/test_cinder.py         | 40 +++++++++++++++++--
 nova/volume/cinder.py                         | 18 +++++----
 .../notes/bug-1978444-db46df5f3d5ea19e.yaml   |  7 ++++
 3 files changed, 54 insertions(+), 11 deletions(-)
 create mode 100644 releasenotes/notes/bug-1978444-db46df5f3d5ea19e.yaml

diff --git a/nova/tests/unit/volume/test_cinder.py b/nova/tests/unit/volume/test_cinder.py
index 0c170c05e49..f4ee7383d45 100644
--- a/nova/tests/unit/volume/test_cinder.py
+++ b/nova/tests/unit/volume/test_cinder.py
@@ -520,16 +520,15 @@ def test_attachment_delete(self, mock_cinderclient):
     @mock.patch('nova.volume.cinder.cinderclient')
     def test_attachment_delete_failed(self, mock_cinderclient, mock_log):
         mock_cinderclient.return_value.attachments.delete.side_effect = (
-                cinder_exception.NotFound(404, '404'))
+                cinder_exception.BadRequest(400, '400'))
 
         attachment_id = uuids.attachment
-        ex = self.assertRaises(exception.VolumeAttachmentNotFound,
+        ex = self.assertRaises(exception.InvalidInput,
                                self.api.attachment_delete,
                                self.ctx,
                                attachment_id)
 
-        self.assertEqual(404, ex.code)
-        self.assertIn(attachment_id, str(ex))
+        self.assertEqual(400, ex.code)
 
     @mock.patch('nova.volume.cinder.cinderclient',
                 side_effect=exception.CinderAPIVersionNotAvailable(
@@ -545,6 +544,16 @@ def test_attachment_delete_unsupported_api_version(self,
         mock_cinderclient.assert_called_once_with(self.ctx, '3.44',
                                                   skip_version_check=True)
 
+    @mock.patch('nova.volume.cinder.cinderclient')
+    def test_attachment_delete_not_found(self, mock_cinderclient):
+        mock_cinderclient.return_value.attachments.delete.side_effect = (
+            cinder_exception.ClientException(404))
+
+        attachment_id = uuids.attachment
+        self.api.attachment_delete(self.ctx, attachment_id)
+
+        self.assertEqual(1, mock_cinderclient.call_count)
+
     @mock.patch('nova.volume.cinder.cinderclient')
     def test_attachment_delete_internal_server_error(self, mock_cinderclient):
         mock_cinderclient.return_value.attachments.delete.side_effect = (
@@ -568,6 +577,29 @@ def test_attachment_delete_internal_server_error_do_not_raise(
 
         self.assertEqual(2, mock_cinderclient.call_count)
 
+    @mock.patch('nova.volume.cinder.cinderclient')
+    def test_attachment_delete_gateway_timeout(self, mock_cinderclient):
+        mock_cinderclient.return_value.attachments.delete.side_effect = (
+            cinder_exception.ClientException(504))
+
+        self.assertRaises(cinder_exception.ClientException,
+                          self.api.attachment_delete,
+                          self.ctx, uuids.attachment_id)
+
+        self.assertEqual(5, mock_cinderclient.call_count)
+
+    @mock.patch('nova.volume.cinder.cinderclient')
+    def test_attachment_delete_gateway_timeout_do_not_raise(
+                                                      self, mock_cinderclient):
+        # generate exception, and then have a normal return on the next retry
+        mock_cinderclient.return_value.attachments.delete.side_effect = [
+            cinder_exception.ClientException(504), None]
+
+        attachment_id = uuids.attachment
+        self.api.attachment_delete(self.ctx, attachment_id)
+
+        self.assertEqual(2, mock_cinderclient.call_count)
+
     @mock.patch('nova.volume.cinder.cinderclient')
     def test_attachment_delete_bad_request_exception(self, mock_cinderclient):
         mock_cinderclient.return_value.attachments.delete.side_effect = (
diff --git a/nova/volume/cinder.py b/nova/volume/cinder.py
index bf1e455bba4..01efcfec19b 100644
--- a/nova/volume/cinder.py
+++ b/nova/volume/cinder.py
@@ -888,19 +888,23 @@ def attachment_update(self, context, attachment_id, connector,
     @retrying.retry(stop_max_attempt_number=5,
                     retry_on_exception=lambda e:
                     (isinstance(e, cinder_exception.ClientException) and
-                     e.code == 500))
+                     e.code in (500, 504)))
     def attachment_delete(self, context, attachment_id):
         try:
             cinderclient(
                 context, '3.44', skip_version_check=True).attachments.delete(
                     attachment_id)
         except cinder_exception.ClientException as ex:
-            with excutils.save_and_reraise_exception():
-                LOG.error('Delete attachment failed for attachment '
-                          '%(id)s. Error: %(msg)s Code: %(code)s',
-                          {'id': attachment_id,
-                           'msg': str(ex),
-                           'code': getattr(ex, 'code', None)})
+            if ex.code == 404:
+                LOG.warning('Attachment %(id)s does not exist. Ignoring.',
+                            {'id': attachment_id})
+            else:
+                with excutils.save_and_reraise_exception():
+                    LOG.error('Delete attachment failed for attachment '
+                              '%(id)s. Error: %(msg)s Code: %(code)s',
+                              {'id': attachment_id,
+                               'msg': str(ex),
+                               'code': getattr(ex, 'code', None)})
 
     @translate_attachment_exception
     def attachment_complete(self, context, attachment_id):
diff --git a/releasenotes/notes/bug-1978444-db46df5f3d5ea19e.yaml b/releasenotes/notes/bug-1978444-db46df5f3d5ea19e.yaml
new file mode 100644
index 00000000000..6c198040745
--- /dev/null
+++ b/releasenotes/notes/bug-1978444-db46df5f3d5ea19e.yaml
@@ -0,0 +1,7 @@
+---
+fixes:
+  - |
+    `Bug #1978444 <https://bugs.launchpad.net/nova/+bug/1978444>`_: Now nova
+    retries deleting a volume attachment in case Cinder API returns
+    ``504 Gateway Timeout``. Also, ``404 Not Found`` is now ignored and
+    leaves only a warning message.

From 6bd0bf00fca6ac6460d70c855eded3898cfe2401 Mon Sep 17 00:00:00 2001
From: Amit Uniyal <auniyal@redhat.com>
Date: Wed, 6 Jul 2022 18:20:02 +0000
Subject: [PATCH 22/93] add regression test case for bug 1978983

This change adds a reproducer test for evacuating
a VM in the powering-off state.

Related-Bug: #1978983
Change-Id: I5540df6c7497956219c06cff6f15b51c2c8bc299
(cherry picked from commit 5904c7f993ac737d68456fc05adf0aaa7a6f3018)
---
 nova/tests/functional/integrated_helpers.py   |  6 +-
 .../regressions/test_bug_1978983.py           | 78 +++++++++++++++++++
 2 files changed, 82 insertions(+), 2 deletions(-)
 create mode 100644 nova/tests/functional/regressions/test_bug_1978983.py

diff --git a/nova/tests/functional/integrated_helpers.py b/nova/tests/functional/integrated_helpers.py
index 70918bc5f59..028ef53d7ea 100644
--- a/nova/tests/functional/integrated_helpers.py
+++ b/nova/tests/functional/integrated_helpers.py
@@ -606,9 +606,11 @@ def _start_server(self, server):
         self.api.post_server_action(server['id'], {'os-start': None})
         return self._wait_for_state_change(server, 'ACTIVE')
 
-    def _stop_server(self, server):
+    def _stop_server(self, server, wait_for_stop=True):
         self.api.post_server_action(server['id'], {'os-stop': None})
-        return self._wait_for_state_change(server, 'SHUTOFF')
+        if wait_for_stop:
+            return self._wait_for_state_change(server, 'SHUTOFF')
+        return server
 
 
 class PlacementHelperMixin:
diff --git a/nova/tests/functional/regressions/test_bug_1978983.py b/nova/tests/functional/regressions/test_bug_1978983.py
new file mode 100644
index 00000000000..75260abf371
--- /dev/null
+++ b/nova/tests/functional/regressions/test_bug_1978983.py
@@ -0,0 +1,78 @@
+# Copyright 2022 Red Hat, Inc.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+
+from nova import test
+from nova.tests import fixtures as nova_fixtures
+from nova.tests.functional.api import client
+from nova.tests.functional import fixtures as func_fixtures
+from nova.tests.functional import integrated_helpers
+
+
+class EvacuateServerWithTaskState(
+    test.TestCase, integrated_helpers.InstanceHelperMixin,
+):
+    """Regression test for bug 1978983
+    If instance task state is powering-off or not None
+    instance should be allowed to evacuate.
+    """
+
+    def setUp(self):
+        super().setUp()
+        # Stub out external dependencies.
+        self.useFixture(nova_fixtures.NeutronFixture(self))
+        self.useFixture(nova_fixtures.GlanceFixture(self))
+        self.useFixture(func_fixtures.PlacementFixture())
+        self.useFixture(nova_fixtures.HostNameWeigherFixture())
+
+        # Start nova controller services.
+        self.start_service('conductor')
+        self.start_service('scheduler')
+
+        api_fixture = self.useFixture(nova_fixtures.OSAPIFixture(
+            api_version='v2.1'))
+        self.api = api_fixture.admin_api
+
+        self.src = self._start_compute(host='host1')
+        self.dest = self._start_compute(host='host2')
+
+    def test_evacuate_instance(self):
+        """Evacuating a server
+        """
+        server = self._create_server(networks=[])
+
+        self.api.microversion = 'latest'
+        server = self._wait_for_state_change(server, 'ACTIVE')
+        self.assertEqual('host1', server['OS-EXT-SRV-ATTR:host'])
+
+        # stop host1 compute service
+        self.src.stop()
+
+        # poweroff instance
+        self._stop_server(server, wait_for_stop=False)
+        server = self._wait_for_server_parameter(
+            server, {'OS-EXT-STS:task_state': 'powering-off'})
+
+        # FIXME(auniyal): As compute service is down in source node
+        # instance is stuck at powering-off, evacuation fails with
+        # msg: Cannot 'evacuate' instance <instance-id> while it is in
+        # task_state powering-off (HTTP 409)
+
+        ex = self.assertRaises(
+            client.OpenStackApiException,
+            self._evacuate_server,
+            server,
+            expected_host=self.dest.host)
+        self.assertEqual(409, ex.response.status_code)

From 6d61fccb8455367aaa37ae7bddf3b8befd3c3d88 Mon Sep 17 00:00:00 2001
From: Amit Uniyal <auniyal@redhat.com>
Date: Wed, 6 Jul 2022 18:20:02 +0000
Subject: [PATCH 23/93] For evacuation, ignore if task_state is not None

Ignore the instance task state and continue with VM evacuation.

Closes-Bug: #1978983
Change-Id: I5540df6c7497956219c06cff6f15b51c2c8bc29d
(cherry picked from commit db919aa15f24c0d74f3c5c0e8341fad3f2392e57)
---
 doc/source/admin/evacuate.rst                 | 14 +++++++++++
 nova/compute/api.py                           |  4 ++--
 .../regressions/test_bug_1978983.py           | 23 +++++++------------
 ...state-for-evacuation-e000f141d0153638.yaml | 11 +++++++++
 4 files changed, 35 insertions(+), 17 deletions(-)
 create mode 100644 releasenotes/notes/ignore-instance-task-state-for-evacuation-e000f141d0153638.yaml

diff --git a/doc/source/admin/evacuate.rst b/doc/source/admin/evacuate.rst
index ef9eccd9312..18796d9c237 100644
--- a/doc/source/admin/evacuate.rst
+++ b/doc/source/admin/evacuate.rst
@@ -97,3 +97,17 @@ instances up and running.
    using a pattern you might want to use the ``--strict`` flag which got introduced
    in version 10.2.0 to make sure nova matches the ``FAILED_HOST``
    exactly.
+
+.. note::
+   .. code-block:: bash
+
+      +------+--------+--------------+
+      | Name | Status | Task State   |
+      +------+--------+--------------+
+      | vm_1 | ACTIVE | powering-off |
+      +------------------------------+
+
+   If the instance task state is not None, evacuation will be possible. However,
+   depending on the ongoing operation, there may be clean up required in other
+   services which the instance was using, such as neutron, cinder, glance, or
+   the storage backend.
\ No newline at end of file
diff --git a/nova/compute/api.py b/nova/compute/api.py
index 43a0f66a100..9a2cbd3325a 100644
--- a/nova/compute/api.py
+++ b/nova/compute/api.py
@@ -5474,7 +5474,7 @@ def live_migrate_abort(self, context, instance, migration_id,
     @reject_vtpm_instances(instance_actions.EVACUATE)
     @block_accelerators(until_service=SUPPORT_ACCELERATOR_SERVICE_FOR_REBUILD)
     @check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.STOPPED,
-                                    vm_states.ERROR])
+                                    vm_states.ERROR], task_state=None)
     def evacuate(self, context, instance, host, on_shared_storage,
                  admin_password=None, force=None):
         """Running evacuate to target host.
@@ -5501,7 +5501,7 @@ def evacuate(self, context, instance, host, on_shared_storage,
             context, instance.uuid)
 
         instance.task_state = task_states.REBUILDING
-        instance.save(expected_task_state=[None])
+        instance.save(expected_task_state=None)
         self._record_action_start(context, instance, instance_actions.EVACUATE)
 
         # NOTE(danms): Create this as a tombstone for the source compute
diff --git a/nova/tests/functional/regressions/test_bug_1978983.py b/nova/tests/functional/regressions/test_bug_1978983.py
index 75260abf371..51465900da0 100644
--- a/nova/tests/functional/regressions/test_bug_1978983.py
+++ b/nova/tests/functional/regressions/test_bug_1978983.py
@@ -13,10 +13,8 @@
 # License for the specific language governing permissions and limitations
 # under the License.
 
-
 from nova import test
 from nova.tests import fixtures as nova_fixtures
-from nova.tests.functional.api import client
 from nova.tests.functional import fixtures as func_fixtures
 from nova.tests.functional import integrated_helpers
 
@@ -44,6 +42,7 @@ def setUp(self):
         api_fixture = self.useFixture(nova_fixtures.OSAPIFixture(
             api_version='v2.1'))
         self.api = api_fixture.admin_api
+        self.api.microversion = 'latest'
 
         self.src = self._start_compute(host='host1')
         self.dest = self._start_compute(host='host2')
@@ -53,26 +52,20 @@ def test_evacuate_instance(self):
         """
         server = self._create_server(networks=[])
 
-        self.api.microversion = 'latest'
         server = self._wait_for_state_change(server, 'ACTIVE')
-        self.assertEqual('host1', server['OS-EXT-SRV-ATTR:host'])
+        self.assertEqual(self.src.host, server['OS-EXT-SRV-ATTR:host'])
 
         # stop host1 compute service
         self.src.stop()
+        self.api.put_service_force_down(self.src.service_ref.uuid, True)
 
         # poweroff instance
         self._stop_server(server, wait_for_stop=False)
         server = self._wait_for_server_parameter(
             server, {'OS-EXT-STS:task_state': 'powering-off'})
 
-        # FIXME(auniyal): As compute service is down in source node
-        # instance is stuck at powering-off, evacuation fails with
-        # msg: Cannot 'evacuate' instance <instance-id> while it is in
-        # task_state powering-off (HTTP 409)
-
-        ex = self.assertRaises(
-            client.OpenStackApiException,
-            self._evacuate_server,
-            server,
-            expected_host=self.dest.host)
-        self.assertEqual(409, ex.response.status_code)
+        # evacuate instance
+        server = self._evacuate_server(
+            server, expected_host=self.dest.host
+        )
+        self.assertEqual(self.dest.host, server['OS-EXT-SRV-ATTR:host'])
diff --git a/releasenotes/notes/ignore-instance-task-state-for-evacuation-e000f141d0153638.yaml b/releasenotes/notes/ignore-instance-task-state-for-evacuation-e000f141d0153638.yaml
new file mode 100644
index 00000000000..46ebf0bd2d0
--- /dev/null
+++ b/releasenotes/notes/ignore-instance-task-state-for-evacuation-e000f141d0153638.yaml
@@ -0,0 +1,11 @@
+---
+fixes:
+  - |
+    If compute service is down in source node and user try to stop
+    instance, instance gets stuck at powering-off, hence evacuation fails with
+    msg: Cannot 'evacuate' instance <instance-id> while it is in
+    task_state powering-off.
+    It is now possible for evacuation to ignore the vm task state.
+    For more details see: `bug 1978983`_
+
+    .. _`bug 1978983`: https://bugs.launchpad.net/nova/+bug/1978983
\ No newline at end of file

From bdc32226e2d9ce4a073306f8f6267fbe305e64e4 Mon Sep 17 00:00:00 2001
From: Balazs Gibizer <gibi@redhat.com>
Date: Sat, 6 Aug 2022 14:48:52 +0200
Subject: [PATCH 24/93] Reproducer for bug 1983753

Related-Bug: #1983753
Change-Id: Ic6566272b8c5af57cd1c73c73b3cba6cd265bebe
(cherry picked from commit 6d602c6b734c2e360fa319ba22c2fced02ad3d29)
---
 .../regressions/test_bug_1983753.py           | 193 ++++++++++++++++++
 1 file changed, 193 insertions(+)
 create mode 100644 nova/tests/functional/regressions/test_bug_1983753.py

diff --git a/nova/tests/functional/regressions/test_bug_1983753.py b/nova/tests/functional/regressions/test_bug_1983753.py
new file mode 100644
index 00000000000..a11ea8dc63e
--- /dev/null
+++ b/nova/tests/functional/regressions/test_bug_1983753.py
@@ -0,0 +1,193 @@
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+import fixtures
+
+from oslo_serialization import jsonutils
+
+from nova.tests.fixtures import libvirt as fakelibvirt
+from nova.tests.functional.api import client
+from nova.tests.functional.libvirt import test_pci_sriov_servers
+
+
+class TestPciResize(test_pci_sriov_servers._PCIServersTestBase):
+    # these tests use multiple different configs so the whitelist is set by
+    # each testcase individually
+    PCI_PASSTHROUGH_WHITELIST = []
+    PCI_ALIAS = [
+        jsonutils.dumps(x)
+        for x in [
+            {
+                "vendor_id": fakelibvirt.PCI_VEND_ID,
+                "product_id": fakelibvirt.PCI_PROD_ID,
+                "name": "a-pci-dev",
+            },
+            {
+                "vendor_id": fakelibvirt.PCI_VEND_ID,
+                "product_id": fakelibvirt.PF_PROD_ID,
+                "device_type": "type-PF",
+                "name": "a-pf",
+            },
+            {
+                "vendor_id": fakelibvirt.PCI_VEND_ID,
+                "product_id": fakelibvirt.VF_PROD_ID,
+                "device_type": "type-VF",
+                "name": "a-vf",
+            },
+        ]
+    ]
+
+    def setUp(self):
+        super().setUp()
+        self.useFixture(
+            fixtures.MockPatch(
+                'nova.virt.libvirt.driver.LibvirtDriver.'
+                'migrate_disk_and_power_off',
+                return_value='{}'
+            )
+        )
+        # These tests should not depend on the host's sysfs
+        self.useFixture(
+            fixtures.MockPatch('nova.pci.utils.is_physical_function'))
+        self.useFixture(
+            fixtures.MockPatch(
+                'nova.pci.utils.get_function_by_ifname',
+                return_value=(None, False)
+            )
+        )
+
+    def _test_resize_from_two_devs_to_one_dev(self, num_pci_on_dest):
+        # The fake libvirt will emulate on the host:
+        # * two type-PCI in slot 0, 1
+        compute1_pci_info = fakelibvirt.HostPCIDevicesInfo(num_pci=2)
+        # the config matches the PCI dev
+        compute1_device_spec = [
+            jsonutils.dumps(x)
+            for x in [
+                {
+                    "vendor_id": fakelibvirt.PCI_VEND_ID,
+                    "product_id": fakelibvirt.PCI_PROD_ID,
+                },
+            ]
+        ]
+        self.flags(group='pci', passthrough_whitelist=compute1_device_spec)
+        self.start_compute(hostname="compute1", pci_info=compute1_pci_info)
+        self.assertPCIDeviceCounts("compute1", total=2, free=2)
+
+        # create a server that requests two PCI devs
+        extra_spec = {"pci_passthrough:alias": "a-pci-dev:2"}
+        flavor_id = self._create_flavor(extra_spec=extra_spec)
+        server = self._create_server(flavor_id=flavor_id, networks=[])
+        self.assertPCIDeviceCounts("compute1", total=2, free=0)
+
+        # start another compute with a different amount of PCI dev available
+        compute2_pci_info = fakelibvirt.HostPCIDevicesInfo(
+            num_pci=num_pci_on_dest)
+        # the config matches the PCI dev
+        compute2_device_spec = [
+            jsonutils.dumps(x)
+            for x in [
+                {
+                    "vendor_id": fakelibvirt.PCI_VEND_ID,
+                    "product_id": fakelibvirt.PCI_PROD_ID,
+                },
+            ]
+        ]
+        self.flags(group='pci', passthrough_whitelist=compute2_device_spec)
+        self.start_compute(hostname="compute2", pci_info=compute2_pci_info)
+        self.assertPCIDeviceCounts(
+            "compute2", total=num_pci_on_dest, free=num_pci_on_dest)
+
+        # resize the server to request only one PCI dev instead of the current
+        # two. This should fit to compute2 having at least one dev
+        extra_spec = {"pci_passthrough:alias": "a-pci-dev:1"}
+        flavor_id = self._create_flavor(extra_spec=extra_spec)
+        self._resize_server(server, flavor_id=flavor_id)
+        self._confirm_resize(server)
+        self.assertPCIDeviceCounts("compute1", total=2, free=2)
+        self.assertPCIDeviceCounts(
+            "compute2", total=num_pci_on_dest, free=num_pci_on_dest - 1)
+
+    def test_resize_from_two_devs_to_one_dev_dest_has_two_devs(self):
+        # this works
+        self._test_resize_from_two_devs_to_one_dev(num_pci_on_dest=2)
+
+    def test_resize_from_two_devs_to_one_dev_dest_has_one_dev(self):
+        # This is bug 1983753 as nova uses the old InstancePciRequest during
+        # the scheduling and therefore tries to find a compute with two PCI
+        # devs even though the flavor only requests one.
+        ex = self.assertRaises(
+            client.OpenStackApiException,
+            self._test_resize_from_two_devs_to_one_dev,
+            num_pci_on_dest=1
+        )
+        self.assertIn('nova.exception.NoValidHost', str(ex))
+
+    def test_resize_from_vf_to_pf(self):
+        # The fake libvirt will emulate on the host:
+        # * one type-PF in slot 0 with one VF
+        compute1_pci_info = fakelibvirt.HostPCIDevicesInfo(
+            num_pfs=1, num_vfs=1)
+        # the config matches only the VF
+        compute1_device_spec = [
+            jsonutils.dumps(x)
+            for x in [
+                {
+                    "vendor_id": fakelibvirt.PCI_VEND_ID,
+                    "product_id": fakelibvirt.VF_PROD_ID,
+                },
+            ]
+        ]
+        self.flags(group='pci', passthrough_whitelist=compute1_device_spec)
+        self.start_compute(hostname="compute1", pci_info=compute1_pci_info)
+        self.assertPCIDeviceCounts("compute1", total=1, free=1)
+
+        # create a server that requests one Vf
+        extra_spec = {"pci_passthrough:alias": "a-vf:1"}
+        flavor_id = self._create_flavor(extra_spec=extra_spec)
+        server = self._create_server(flavor_id=flavor_id, networks=[])
+        self.assertPCIDeviceCounts("compute1", total=1, free=0)
+
+        # start another compute with a single PF dev available
+        # The fake libvirt will emulate on the host:
+        # * one type-PF in slot 0 with 1 VF
+        compute2_pci_info = fakelibvirt.HostPCIDevicesInfo(
+            num_pfs=1, num_vfs=1)
+        # the config matches the PF dev but not the VF
+        compute2_device_spec = [
+            jsonutils.dumps(x)
+            for x in [
+                {
+                    "vendor_id": fakelibvirt.PCI_VEND_ID,
+                    "product_id": fakelibvirt.PF_PROD_ID,
+                },
+            ]
+        ]
+        self.flags(group='pci', passthrough_whitelist=compute2_device_spec)
+        self.start_compute(hostname="compute2", pci_info=compute2_pci_info)
+        self.assertPCIDeviceCounts("compute2", total=1, free=1)
+
+        # resize the server to request on PF dev instead of the current VF
+        # dev. This should fit to compute2 having exactly one PF dev.
+        extra_spec = {"pci_passthrough:alias": "a-pf:1"}
+        flavor_id = self._create_flavor(extra_spec=extra_spec)
+        # This is bug 1983753 as nova uses the old InstancePciRequest during
+        # the scheduling and therefore tries to find a compute with a VF dev
+        # even though the flavor only requests a PF dev.
+        ex = self.assertRaises(
+            client.OpenStackApiException,
+            self._resize_server,
+            server,
+            flavor_id=flavor_id,
+        )
+        self.assertIn('nova.exception.NoValidHost', str(ex))

From b6c7ee07ad2a6503b8aad0921cca70e61e60259d Mon Sep 17 00:00:00 2001
From: Balazs Gibizer <gibi@redhat.com>
Date: Sat, 6 Aug 2022 16:09:54 +0200
Subject: [PATCH 25/93] Update RequestSpec.pci_request for resize

Nova uses the RequestSpec.pci_request in the PciPassthroughFilter to
decide if the PCI devices, requested via the pci_alias in the flavor
extra_spec, are available on a potential target host. During resize the
new flavor might contain different pci_alias request than the old flavor
of the instance. In this case Nova should use the pci_alias from the new
flavor to schedule the destination host of the resize. However this
logic was missing and Nova used the old pci_request value based on the
old flavor. This patch adds the missing logic.

Closes-Bug: #1983753
Closes-Bug: #1941005
Change-Id: I73c9ae27e9c42ee211a53bed3d849650b65f08be
(cherry picked from commit a93092e0d5c1483f9ad48276708ee35c54ce44fe)
---
 nova/compute/api.py                           | 13 ++++++++++
 .../libvirt/test_pci_sriov_servers.py         | 15 ++++++-----
 .../regressions/test_bug_1983753.py           | 26 ++++---------------
 nova/tests/unit/compute/test_api.py           |  3 ++-
 4 files changed, 29 insertions(+), 28 deletions(-)

diff --git a/nova/compute/api.py b/nova/compute/api.py
index 9a2cbd3325a..8758411d234 100644
--- a/nova/compute/api.py
+++ b/nova/compute/api.py
@@ -4223,6 +4223,19 @@ def resize(self, context, instance, flavor_id=None, clean_shutdown=True,
         if not same_flavor:
             request_spec.numa_topology = hardware.numa_get_constraints(
                 new_flavor, instance.image_meta)
+            # if the flavor is changed then we need to recalculate the
+            # pci_requests as well because the new flavor might request
+            # different pci_aliases
+            new_pci_requests = pci_request.get_pci_requests_from_flavor(
+                new_flavor)
+            new_pci_requests.instance_uuid = instance.uuid
+            # The neutron based InstancePCIRequest cannot change during resize,
+            # so we just need to copy them from the old request
+            for request in request_spec.pci_requests.requests or []:
+                if request.source == objects.InstancePCIRequest.NEUTRON_PORT:
+                    new_pci_requests.requests.append(request)
+            request_spec.pci_requests = new_pci_requests
+
             # TODO(huaqiang): Remove in Wallaby
             # check nova-compute nodes have been updated to Victoria to resize
             # instance to a new mixed instance from a dedicated or shared
diff --git a/nova/tests/functional/libvirt/test_pci_sriov_servers.py b/nova/tests/functional/libvirt/test_pci_sriov_servers.py
index a5e52555e05..c98a7534d16 100644
--- a/nova/tests/functional/libvirt/test_pci_sriov_servers.py
+++ b/nova/tests/functional/libvirt/test_pci_sriov_servers.py
@@ -1116,13 +1116,16 @@ def test_resize_pci_to_vanilla(self):
 
         # Resize it to a flavor without PCI devices. We expect this to work, as
         # test_compute1 is available.
-        # FIXME(artom) This is bug 1941005.
         flavor_id = self._create_flavor()
-        ex = self.assertRaises(client.OpenStackApiException,
-                               self._resize_server, server, flavor_id)
-        self.assertEqual(500, ex.response.status_code)
-        self.assertIn('NoValidHost', str(ex))
-        # self._confirm_resize(server)
+        with mock.patch(
+            'nova.virt.libvirt.driver.LibvirtDriver'
+            '.migrate_disk_and_power_off',
+            return_value='{}',
+        ):
+            self._resize_server(server, flavor_id)
+        self._confirm_resize(server)
+        self.assertPCIDeviceCounts('test_compute0', total=1, free=1)
+        self.assertPCIDeviceCounts('test_compute1', total=0, free=0)
 
     def _confirm_resize(self, server, host='host1'):
         # NOTE(sbauza): Unfortunately, _cleanup_resize() in libvirt checks the
diff --git a/nova/tests/functional/regressions/test_bug_1983753.py b/nova/tests/functional/regressions/test_bug_1983753.py
index a11ea8dc63e..78499335ec9 100644
--- a/nova/tests/functional/regressions/test_bug_1983753.py
+++ b/nova/tests/functional/regressions/test_bug_1983753.py
@@ -16,7 +16,6 @@
 from oslo_serialization import jsonutils
 
 from nova.tests.fixtures import libvirt as fakelibvirt
-from nova.tests.functional.api import client
 from nova.tests.functional.libvirt import test_pci_sriov_servers
 
 
@@ -119,19 +118,10 @@ def _test_resize_from_two_devs_to_one_dev(self, num_pci_on_dest):
             "compute2", total=num_pci_on_dest, free=num_pci_on_dest - 1)
 
     def test_resize_from_two_devs_to_one_dev_dest_has_two_devs(self):
-        # this works
         self._test_resize_from_two_devs_to_one_dev(num_pci_on_dest=2)
 
     def test_resize_from_two_devs_to_one_dev_dest_has_one_dev(self):
-        # This is bug 1983753 as nova uses the old InstancePciRequest during
-        # the scheduling and therefore tries to find a compute with two PCI
-        # devs even though the flavor only requests one.
-        ex = self.assertRaises(
-            client.OpenStackApiException,
-            self._test_resize_from_two_devs_to_one_dev,
-            num_pci_on_dest=1
-        )
-        self.assertIn('nova.exception.NoValidHost', str(ex))
+        self._test_resize_from_two_devs_to_one_dev(num_pci_on_dest=1)
 
     def test_resize_from_vf_to_pf(self):
         # The fake libvirt will emulate on the host:
@@ -181,13 +171,7 @@ def test_resize_from_vf_to_pf(self):
         # dev. This should fit to compute2 having exactly one PF dev.
         extra_spec = {"pci_passthrough:alias": "a-pf:1"}
         flavor_id = self._create_flavor(extra_spec=extra_spec)
-        # This is bug 1983753 as nova uses the old InstancePciRequest during
-        # the scheduling and therefore tries to find a compute with a VF dev
-        # even though the flavor only requests a PF dev.
-        ex = self.assertRaises(
-            client.OpenStackApiException,
-            self._resize_server,
-            server,
-            flavor_id=flavor_id,
-        )
-        self.assertIn('nova.exception.NoValidHost', str(ex))
+        self._resize_server(server, flavor_id=flavor_id)
+        self._confirm_resize(server)
+        self.assertPCIDeviceCounts("compute1", total=1, free=1)
+        self.assertPCIDeviceCounts("compute2", total=1, free=0)
diff --git a/nova/tests/unit/compute/test_api.py b/nova/tests/unit/compute/test_api.py
index 9e85ef633d3..eb5b0700d43 100644
--- a/nova/tests/unit/compute/test_api.py
+++ b/nova/tests/unit/compute/test_api.py
@@ -2073,7 +2073,8 @@ def _check_state(expected_task_state=None):
                 filter_properties = {'ignore_hosts': [fake_inst['host']]}
 
             if request_spec:
-                fake_spec = objects.RequestSpec()
+                fake_spec = objects.RequestSpec(
+                    pci_requests=objects.InstancePCIRequests(requests=[]))
                 if requested_destination:
                     cell1 = objects.CellMapping(uuid=uuids.cell1, name='cell1')
                     fake_spec.requested_destination = objects.Destination(

From 19bac6e9c362ee692f4be92041bb3f6c3b9b6c23 Mon Sep 17 00:00:00 2001
From: Balazs Gibizer <gibi@redhat.com>
Date: Tue, 16 Aug 2022 13:19:03 +0200
Subject: [PATCH 26/93] Add reno for fixing bug 1941005

Related-Bug: #1941005
Related-Bug: #1983753
Change-Id: I16ed1143ead3779c87698aa29bac005678db2993
(cherry picked from commit 82cdfa23c7a0e269ab038e241bb7428b7f1391aa)
---
 ...requestspec-pci_request-for-resize-a3c6b0a979db723f.yaml | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 releasenotes/notes/bug-1983753-update-requestspec-pci_request-for-resize-a3c6b0a979db723f.yaml

diff --git a/releasenotes/notes/bug-1983753-update-requestspec-pci_request-for-resize-a3c6b0a979db723f.yaml b/releasenotes/notes/bug-1983753-update-requestspec-pci_request-for-resize-a3c6b0a979db723f.yaml
new file mode 100644
index 00000000000..89edd12b3d9
--- /dev/null
+++ b/releasenotes/notes/bug-1983753-update-requestspec-pci_request-for-resize-a3c6b0a979db723f.yaml
@@ -0,0 +1,6 @@
+---
+fixes:
+  - |
+    `Bug #1941005 <https://bugs.launchpad.net/nova/+bug/1941005>`_ is fixed.
+    During resize Nova now uses the PCI requests from the new flavor to select
+    the destination host.

From 273831716780090677215ba70168bb74a2dae814 Mon Sep 17 00:00:00 2001
From: Jay Faulkner <jay@jvf.cc>
Date: Wed, 17 Aug 2022 11:31:39 -0700
Subject: [PATCH 27/93] nova-live-migration tests not needed for Ironic

Ironic does not support live migration, so we will skip these tests
if the only changed files are in the Ironic virt driver to ensure we
don't waste resources or time trying to run unneeded tests.

Change-Id: Ieb5ac3bb93af6a950acff4d76d0276096a6a24dd
(cherry picked from commit c7b865c79b42457c3a9cf987924737fd8675f53e)
---
 .zuul.yaml | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/.zuul.yaml b/.zuul.yaml
index dcae6117a52..3866c9af9d1 100644
--- a/.zuul.yaml
+++ b/.zuul.yaml
@@ -90,7 +90,7 @@
     description: |
       Run tempest live migration tests against local qcow2 ephemeral storage
       and shared LVM/iSCSI cinder volumes.
-    irrelevant-files: &nova-base-irrelevant-files
+    irrelevant-files:
       - ^api-.*$
       - ^(test-|)requirements.txt$
       - ^.*\.rst$
@@ -101,6 +101,7 @@
       - ^nova/policies/.*$
       - ^nova/tests/.*$
       - ^nova/test.py$
+      - ^nova/virt/ironic/.*$
       - ^releasenotes/.*$
       - ^setup.cfg$
       - ^tools/.*$
@@ -130,7 +131,21 @@
       the "iptables_hybrid" securitygroup firewall driver, aka "hybrid plug".
       The external events interactions between Nova and Neutron in these
       situations has historically been fragile. This job exercises them.
-    irrelevant-files: *nova-base-irrelevant-files
+    irrelevant-files: &nova-base-irrelevant-files
+      - ^api-.*$
+      - ^(test-|)requirements.txt$
+      - ^.*\.rst$
+      - ^.git.*$
+      - ^doc/.*$
+      - ^nova/hacking/.*$
+      - ^nova/locale/.*$
+      - ^nova/policies/.*$
+      - ^nova/tests/.*$
+      - ^nova/test.py$
+      - ^releasenotes/.*$
+      - ^setup.cfg$
+      - ^tools/.*$
+      - ^tox.ini$
     vars:
       tox_envlist: all
       tempest_test_regex: (^tempest\..*compute\..*(migration|resize|reboot).*)

From 37129b4b4423ef8ce932ae506f0e9a6ae771ec0c Mon Sep 17 00:00:00 2001
From: Pierre Riteau <pierre@stackhpc.com>
Date: Fri, 16 Sep 2022 10:39:42 +0200
Subject: [PATCH 28/93] Remove mentions of removed scheduler filters

Change-Id: I1348cca8cbd8b1142dab8507c8aa1b9baf01e73c
(cherry picked from commit 4fb4f6832c156907b786571f214984894703bf16)
(cherry picked from commit c3489ed5cc21a9fa968949e04f1c7762f09b5606)
---
 doc/source/contributor/development-environment.rst |  2 +-
 nova/conf/compute.py                               | 12 +++---------
 2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/doc/source/contributor/development-environment.rst b/doc/source/contributor/development-environment.rst
index 32b8f8334e0..3e19ef1ca23 100644
--- a/doc/source/contributor/development-environment.rst
+++ b/doc/source/contributor/development-environment.rst
@@ -197,7 +197,7 @@ Using fake computes for tests
 The number of instances supported by fake computes is not limited by physical
 constraints. It allows you to perform stress tests on a deployment with few
 resources (typically a laptop). Take care to avoid using scheduler filters
-that will limit the number of instances per compute, such as ``AggregateCoreFilter``.
+that will limit the number of instances per compute, such as ``NumInstancesFilter``.
 
 Fake computes can also be used in multi hypervisor-type deployments in order to
 take advantage of fake and "real" computes during tests:
diff --git a/nova/conf/compute.py b/nova/conf/compute.py
index 5abe7694f80..5a60b438954 100644
--- a/nova/conf/compute.py
+++ b/nova/conf/compute.py
@@ -426,9 +426,7 @@
 Virtual CPU to physical CPU allocation ratio.
 
 This option is used to influence the hosts selected by the Placement API by
-configuring the allocation ratio for ``VCPU`` inventory. In addition, the
-``AggregateCoreFilter`` (deprecated) will fall back to this configuration value
-if no per-aggregate setting is found.
+configuring the allocation ratio for ``VCPU`` inventory.
 
 .. note::
 
@@ -459,9 +457,7 @@
 Virtual RAM to physical RAM allocation ratio.
 
 This option is used to influence the hosts selected by the Placement API by
-configuring the allocation ratio for ``MEMORY_MB`` inventory. In addition, the
-``AggregateRamFilter`` (deprecated) will fall back to this configuration value
-if no per-aggregate setting is found.
+configuring the allocation ratio for ``MEMORY_MB`` inventory.
 
 .. note::
 
@@ -487,9 +483,7 @@
 Virtual disk to physical disk allocation ratio.
 
 This option is used to influence the hosts selected by the Placement API by
-configuring the allocation ratio for ``DISK_GB`` inventory. In addition, the
-``AggregateDiskFilter`` (deprecated) will fall back to this configuration value
-if no per-aggregate setting is found.
+configuring the allocation ratio for ``DISK_GB`` inventory.
 
 When configured, a ratio greater than 1.0 will result in over-subscription of
 the available physical disk, which can be useful for more efficiently packing

From 77273f067d96a4ec401c3b36f2922d63c4ad7103 Mon Sep 17 00:00:00 2001
From: Dan Smith <dansmith@redhat.com>
Date: Thu, 11 Aug 2022 09:50:30 -0700
Subject: [PATCH 29/93] Unify placement client singleton implementations

We have many places where we implement singleton behavior for the
placement client. This unifies them into a single place and
implementation. Not only does this DRY things up, but may cause us
to initialize it fewer times and also allows for emitting a common
set of error messages about expected failures for better
troubleshooting.

Change-Id: Iab8a791f64323f996e1d6e6d5a7e7a7c34eb4fb3
Related-Bug: #1846820
(cherry picked from commit c178d9360665c219cbcc71c9f37b9e6e3055a5e5)
---
 nova/api/openstack/compute/services.py        |  7 +--
 nova/cmd/manage.py                            |  4 +-
 nova/compute/api.py                           | 10 +---
 nova/compute/manager.py                       |  5 ++
 nova/compute/resource_tracker.py              |  2 +-
 nova/conductor/manager.py                     |  2 +-
 nova/conductor/tasks/migrate.py               |  4 +-
 nova/limit/placement.py                       |  6 +--
 nova/quota.py                                 |  7 +--
 nova/scheduler/client/report.py               | 46 +++++++++++++++++++
 nova/scheduler/manager.py                     |  2 +-
 nova/scheduler/request_filter.py              |  2 +-
 nova/test.py                                  |  4 ++
 nova/tests/unit/compute/test_api.py           | 11 ++---
 nova/tests/unit/compute/test_compute.py       | 11 ++---
 .../unit/scheduler/client/test_report.py      | 36 +++++++++++++++
 16 files changed, 115 insertions(+), 44 deletions(-)

diff --git a/nova/api/openstack/compute/services.py b/nova/api/openstack/compute/services.py
index 6deb84a7f1a..e9d51d4d0c8 100644
--- a/nova/api/openstack/compute/services.py
+++ b/nova/api/openstack/compute/services.py
@@ -48,13 +48,10 @@ def __init__(self):
         self.actions = {"enable": self._enable,
                         "disable": self._disable,
                         "disable-log-reason": self._disable_log_reason}
-        self._placementclient = None  # Lazy-load on first access.
 
     @property
     def placementclient(self):
-        if self._placementclient is None:
-            self._placementclient = report.SchedulerReportClient()
-        return self._placementclient
+        return report.report_client_singleton()
 
     def _get_services(self, req):
         # The API services are filtered out since they are not RPC services
@@ -328,7 +325,7 @@ def delete(self, req, id):
                             "Failed to delete compute node resource provider "
                             "for compute node %s: %s",
                             compute_node.uuid, str(e))
-                # remove the host_mapping of this host.
+                # Remove the host_mapping of this host.
                 try:
                     hm = objects.HostMapping.get_by_host(context, service.host)
                     hm.destroy()
diff --git a/nova/cmd/manage.py b/nova/cmd/manage.py
index f704a42698e..7067facde70 100644
--- a/nova/cmd/manage.py
+++ b/nova/cmd/manage.py
@@ -2217,7 +2217,7 @@ def heal_allocations(self, max_count=None, verbose=False, dry_run=False,
                 output(_('No cells to process.'))
                 return 4
 
-        placement = report.SchedulerReportClient()
+        placement = report.report_client_singleton()
 
         neutron = None
         if heal_port_allocations:
@@ -2718,7 +2718,7 @@ def audit(self, verbose=False, provider_uuid=None, delete=False):
         if verbose:
             output = lambda msg: print(msg)
 
-        placement = report.SchedulerReportClient()
+        placement = report.report_client_singleton()
         # Resets two in-memory dicts for knowing instances per compute node
         self.cn_uuid_mapping = collections.defaultdict(tuple)
         self.instances_mapping = collections.defaultdict(list)
diff --git a/nova/compute/api.py b/nova/compute/api.py
index 9a2cbd3325a..ddea854fe2c 100644
--- a/nova/compute/api.py
+++ b/nova/compute/api.py
@@ -384,7 +384,6 @@ def __init__(self, image_api=None, network_api=None, volume_api=None):
         self.image_api = image_api or glance.API()
         self.network_api = network_api or neutron.API()
         self.volume_api = volume_api or cinder.API()
-        self._placementclient = None  # Lazy-load on first access.
         self.compute_rpcapi = compute_rpcapi.ComputeAPI()
         self.compute_task_api = conductor.ComputeTaskAPI()
         self.servicegroup_api = servicegroup.API()
@@ -2573,9 +2572,7 @@ def _local_cleanup_bdm_volumes(self, bdms, instance, context):
 
     @property
     def placementclient(self):
-        if self._placementclient is None:
-            self._placementclient = report.SchedulerReportClient()
-        return self._placementclient
+        return report.report_client_singleton()
 
     def _local_delete(self, context, instance, bdms, delete_type, cb):
         if instance.vm_state == vm_states.SHELVED_OFFLOADED:
@@ -6309,13 +6306,10 @@ class AggregateAPI:
     def __init__(self):
         self.compute_rpcapi = compute_rpcapi.ComputeAPI()
         self.query_client = query.SchedulerQueryClient()
-        self._placement_client = None  # Lazy-load on first access.
 
     @property
     def placement_client(self):
-        if self._placement_client is None:
-            self._placement_client = report.SchedulerReportClient()
-        return self._placement_client
+        return report.report_client_singleton()
 
     @wrap_exception()
     def create_aggregate(self, context, aggregate_name, availability_zone):
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 4df1c4112c3..fa9425f50cc 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -623,6 +623,11 @@ def __init__(self, compute_driver=None, *args, **kwargs):
         # We want the ComputeManager, ResourceTracker and ComputeVirtAPI all
         # using the same instance of SchedulerReportClient which has the
         # ProviderTree cache for this compute service.
+        # NOTE(danms): We do not use the global placement client
+        # singleton here, because the above-mentioned stack of objects
+        # maintain local state in the client. Thus, keeping our own
+        # private object for that stack avoids any potential conflict
+        # with other users in our process outside of the above.
         self.reportclient = report.SchedulerReportClient()
         self.virtapi = ComputeVirtAPI(self)
         self.network_api = neutron.API()
diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py
index 0b801f7ddf9..058777d1ed0 100644
--- a/nova/compute/resource_tracker.py
+++ b/nova/compute/resource_tracker.py
@@ -103,7 +103,7 @@ def __init__(self, host, driver, reportclient=None):
         monitor_handler = monitors.MonitorHandler(self)
         self.monitors = monitor_handler.monitors
         self.old_resources = collections.defaultdict(objects.ComputeNode)
-        self.reportclient = reportclient or report.SchedulerReportClient()
+        self.reportclient = reportclient or report.report_client_singleton()
         self.ram_allocation_ratio = CONF.ram_allocation_ratio
         self.cpu_allocation_ratio = CONF.cpu_allocation_ratio
         self.disk_allocation_ratio = CONF.disk_allocation_ratio
diff --git a/nova/conductor/manager.py b/nova/conductor/manager.py
index 99e5514136e..aaec1c99b77 100644
--- a/nova/conductor/manager.py
+++ b/nova/conductor/manager.py
@@ -243,7 +243,7 @@ def __init__(self):
         self.network_api = neutron.API()
         self.servicegroup_api = servicegroup.API()
         self.query_client = query.SchedulerQueryClient()
-        self.report_client = report.SchedulerReportClient()
+        self.report_client = report.report_client_singleton()
         self.notifier = rpc.get_notifier('compute')
         # Help us to record host in EventReporter
         self.host = CONF.host
diff --git a/nova/conductor/tasks/migrate.py b/nova/conductor/tasks/migrate.py
index 6ff6206f659..8838d0240a6 100644
--- a/nova/conductor/tasks/migrate.py
+++ b/nova/conductor/tasks/migrate.py
@@ -54,7 +54,7 @@ def replace_allocation_with_migration(context, instance, migration):
         # and do any rollback required
         raise
 
-    reportclient = report.SchedulerReportClient()
+    reportclient = report.report_client_singleton()
 
     orig_alloc = reportclient.get_allocs_for_consumer(
         context, instance.uuid)['allocations']
@@ -94,7 +94,7 @@ def replace_allocation_with_migration(context, instance, migration):
 def revert_allocation_for_migration(context, source_cn, instance, migration):
     """Revert an allocation made for a migration back to the instance."""
 
-    reportclient = report.SchedulerReportClient()
+    reportclient = report.report_client_singleton()
 
     # FIXME(gibi): This method is flawed in that it does not handle allocations
     # against sharing providers in any special way. This leads to duplicate
diff --git a/nova/limit/placement.py b/nova/limit/placement.py
index 497986c4ab8..eedf7d69e19 100644
--- a/nova/limit/placement.py
+++ b/nova/limit/placement.py
@@ -43,10 +43,8 @@
 def _get_placement_usages(
     context: 'nova.context.RequestContext', project_id: str
 ) -> ty.Dict[str, int]:
-    global PLACEMENT_CLIENT
-    if not PLACEMENT_CLIENT:
-        PLACEMENT_CLIENT = report.SchedulerReportClient()
-    return PLACEMENT_CLIENT.get_usages_counts_for_limits(context, project_id)
+    return report.report_client_singleton().get_usages_counts_for_limits(
+        context, project_id)
 
 
 def _get_usage(
diff --git a/nova/quota.py b/nova/quota.py
index b9dd7630127..eafad4cd23d 100644
--- a/nova/quota.py
+++ b/nova/quota.py
@@ -1348,11 +1348,8 @@ def _instances_cores_ram_count_legacy(context, project_id, user_id=None):
 
 
 def _cores_ram_count_placement(context, project_id, user_id=None):
-    global PLACEMENT_CLIENT
-    if not PLACEMENT_CLIENT:
-        PLACEMENT_CLIENT = report.SchedulerReportClient()
-    return PLACEMENT_CLIENT.get_usages_counts_for_quota(context, project_id,
-                                                        user_id=user_id)
+    return report.report_client_singleton().get_usages_counts_for_quota(
+        context, project_id, user_id=user_id)
 
 
 def _instances_cores_ram_count_api_db_placement(context, project_id,
diff --git a/nova/scheduler/client/report.py b/nova/scheduler/client/report.py
index e4d0c8e3db6..ff86527cf55 100644
--- a/nova/scheduler/client/report.py
+++ b/nova/scheduler/client/report.py
@@ -52,6 +52,7 @@
 NESTED_PROVIDER_API_VERSION = '1.14'
 POST_ALLOCATIONS_API_VERSION = '1.13'
 GET_USAGES_VERSION = '1.9'
+PLACEMENTCLIENT = None
 
 AggInfo = collections.namedtuple('AggInfo', ['aggregates', 'generation'])
 TraitInfo = collections.namedtuple('TraitInfo', ['traits', 'generation'])
@@ -67,6 +68,51 @@ def warn_limit(self, msg):
         LOG.warning(msg)
 
 
+def report_client_singleton():
+    """Return a reference to the global placement client singleton.
+
+    This initializes the placement client once and returns a reference
+    to that singleton on subsequent calls. Errors are raised
+    (particularly ks_exc.*) but context-specific error messages are
+    logged for consistency.
+    """
+    # NOTE(danms): The report client maintains internal state in the
+    # form of the provider tree, which will be shared across all users
+    # of this global client. That is not a problem now, but in the
+    # future it may be beneficial to fix that. One idea would be to
+    # change the behavior of the client such that the static-config
+    # pieces of the actual keystone client are separate from the
+    # internal state, so that we can return a new object here with a
+    # context-specific local state object, but with the client bits
+    # shared.
+    global PLACEMENTCLIENT
+    if PLACEMENTCLIENT is None:
+        try:
+            PLACEMENTCLIENT = SchedulerReportClient()
+        except ks_exc.EndpointNotFound:
+            LOG.error('The placement API endpoint was not found.')
+            raise
+        except ks_exc.MissingAuthPlugin:
+            LOG.error('No authentication information found for placement API.')
+            raise
+        except ks_exc.Unauthorized:
+            LOG.error('Placement service credentials do not work.')
+            raise
+        except ks_exc.DiscoveryFailure:
+            LOG.error('Discovering suitable URL for placement API failed.')
+            raise
+        except (ks_exc.ConnectFailure,
+                ks_exc.RequestTimeout,
+                ks_exc.GatewayTimeout):
+            LOG.error('Placement API service is not responding.')
+            raise
+        except Exception:
+            LOG.error('Failed to initialize placement client '
+                      '(is keystone available?)')
+            raise
+    return PLACEMENTCLIENT
+
+
 def safe_connect(f):
     @functools.wraps(f)
     def wrapper(self, *a, **k):
diff --git a/nova/scheduler/manager.py b/nova/scheduler/manager.py
index 03df615f6a6..10b330653de 100644
--- a/nova/scheduler/manager.py
+++ b/nova/scheduler/manager.py
@@ -66,7 +66,7 @@ def __init__(self, *args, **kwargs):
         self.host_manager = host_manager.HostManager()
         self.servicegroup_api = servicegroup.API()
         self.notifier = rpc.get_notifier('scheduler')
-        self.placement_client = report.SchedulerReportClient()
+        self.placement_client = report.report_client_singleton()
 
         super().__init__(service_name='scheduler', *args, **kwargs)
 
diff --git a/nova/scheduler/request_filter.py b/nova/scheduler/request_filter.py
index bd237b06cac..3f96b7a8806 100644
--- a/nova/scheduler/request_filter.py
+++ b/nova/scheduler/request_filter.py
@@ -311,7 +311,7 @@ def routed_networks_filter(
 
     # Get the clients we need
     network_api = neutron.API()
-    report_api = report.SchedulerReportClient()
+    report_api = report.report_client_singleton()
 
     for requested_network in requested_networks:
         network_id = None
diff --git a/nova/test.py b/nova/test.py
index a6449c01f03..35fef9cdd7f 100644
--- a/nova/test.py
+++ b/nova/test.py
@@ -61,6 +61,7 @@
 from nova import objects
 from nova.objects import base as objects_base
 from nova import quota
+from nova.scheduler.client import report
 from nova.tests import fixtures as nova_fixtures
 from nova.tests.unit import matchers
 from nova import utils
@@ -290,6 +291,9 @@ def setUp(self):
         # instead of only once initialized for test worker
         wsgi_app.init_global_data.reset()
 
+        # Reset the placement client singleton
+        report.PLACEMENTCLIENT = None
+
     def _setup_cells(self):
         """Setup a normal cellsv2 environment.
 
diff --git a/nova/tests/unit/compute/test_api.py b/nova/tests/unit/compute/test_api.py
index 9e85ef633d3..87a6a152cdb 100644
--- a/nova/tests/unit/compute/test_api.py
+++ b/nova/tests/unit/compute/test_api.py
@@ -7740,16 +7740,13 @@ def test_compute_api_host(self):
         self.assertTrue(hasattr(self.compute_api, 'host'))
         self.assertEqual(CONF.host, self.compute_api.host)
 
-    @mock.patch('nova.scheduler.client.report.SchedulerReportClient')
+    @mock.patch('nova.scheduler.client.report.report_client_singleton')
     def test_placement_client_init(self, mock_report_client):
         """Tests to make sure that the construction of the placement client
-        only happens once per API class instance.
+        uses the singleton helper, and happens only when needed.
         """
-        self.assertIsNone(self.compute_api._placementclient)
-        # Access the property twice to make sure SchedulerReportClient is
-        # only loaded once.
-        for x in range(2):
-            self.compute_api.placementclient
+        self.assertFalse(mock_report_client.called)
+        self.compute_api.placementclient
         mock_report_client.assert_called_once_with()
 
     def test_validate_host_for_cold_migrate_same_host_fails(self):
diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py
index d8f443843f3..df2bd328a3e 100644
--- a/nova/tests/unit/compute/test_compute.py
+++ b/nova/tests/unit/compute/test_compute.py
@@ -13046,16 +13046,13 @@ def test_aggregate_list_with_hosts(self, mock_add_host,
         hosts = aggregate.hosts if 'hosts' in aggregate else None
         self.assertIn(values[0][1][0], hosts)
 
-    @mock.patch('nova.scheduler.client.report.SchedulerReportClient')
+    @mock.patch('nova.scheduler.client.report.report_client_singleton')
     def test_placement_client_init(self, mock_report_client):
         """Tests to make sure that the construction of the placement client
-        only happens once per AggregateAPI class instance.
+        uses the singleton helper, and happens only when needed.
         """
-        self.assertIsNone(self.api._placement_client)
-        # Access the property twice to make sure SchedulerReportClient is
-        # only loaded once.
-        for x in range(2):
-            self.api.placement_client
+        self.assertFalse(mock_report_client.called)
+        self.api.placement_client
         mock_report_client.assert_called_once_with()
 
 
diff --git a/nova/tests/unit/scheduler/client/test_report.py b/nova/tests/unit/scheduler/client/test_report.py
index 0650c62096f..485f187d9ef 100644
--- a/nova/tests/unit/scheduler/client/test_report.py
+++ b/nova/tests/unit/scheduler/client/test_report.py
@@ -10,6 +10,7 @@
 #    License for the specific language governing permissions and limitations
 #    under the License.
 import copy
+import ddt
 import time
 from urllib import parse
 
@@ -150,6 +151,41 @@ def test_failed_discovery(self, req):
         self.assertTrue(req.called)
 
 
+@ddt.ddt
+class TestSingleton(test.NoDBTestCase):
+    def test_singleton(self):
+        # Make sure we start with a clean slate
+        self.assertIsNone(report.PLACEMENTCLIENT)
+
+        # Make sure the first call creates the singleton, sets it
+        # globally, and returns it
+        client = report.report_client_singleton()
+        self.assertEqual(client, report.PLACEMENTCLIENT)
+
+        # Make sure that a subsequent call returns the same thing
+        # again and that the global is unchanged
+        self.assertEqual(client, report.report_client_singleton())
+        self.assertEqual(client, report.PLACEMENTCLIENT)
+
+    @ddt.data(ks_exc.EndpointNotFound,
+              ks_exc.MissingAuthPlugin,
+              ks_exc.Unauthorized,
+              ks_exc.DiscoveryFailure,
+              ks_exc.ConnectFailure,
+              ks_exc.RequestTimeout,
+              ks_exc.GatewayTimeout,
+              test.TestingException)
+    def test_errors(self, exc):
+        self._test_error(exc)
+
+    @mock.patch.object(report, 'LOG')
+    def _test_error(self, exc, mock_log):
+        with mock.patch.object(report.SchedulerReportClient, '_create_client',
+                               side_effect=exc):
+            self.assertRaises(exc, report.report_client_singleton)
+        mock_log.error.assert_called_once()
+
+
 class TestConstructor(test.NoDBTestCase):
     def setUp(self):
         super(TestConstructor, self).setUp()

From 19346082058d51c78bb157ca5e1304d15691dd9a Mon Sep 17 00:00:00 2001
From: Dan Smith <dansmith@redhat.com>
Date: Thu, 11 Aug 2022 10:18:25 -0700
Subject: [PATCH 30/93] Avoid n-cond startup abort for keystone failures

Conductor creates a placement client for the potential case where
it needs to make a call for certain operations. A transient network
or keystone failure will currently cause it to abort startup, which
means it is not available for other unrelated activities, such as
DB proxying for compute.

This makes conductor test the placement client on startup, but only
abort startup on errors that are highly likely to be permanent
configuration errors, and only warn about things like being unable
to contact keystone/placement during initialization. If a non-fatal
error is encountered at startup, later operations needing the
placement client will retry initialization.

Conflicts:
    nova/tests/unit/conductor/test_conductor.py

NOTE(melwitt): The conflict is because change
Id5b04cf2f6ca24af8e366d23f15cf0e5cac8e1cc
(Use unittest.mock instead of third party mock) is not in Yoga.

Closes-Bug: #1846820
Change-Id: Idb7fcbce0c9562e7b9bd3e80f2a6d4b9bc286830
(cherry picked from commit 232684b44022f1bc4d72b07045900780de456e63)
---
 nova/conductor/manager.py                     | 34 +++++++++++++++++-
 nova/tests/unit/conductor/test_conductor.py   | 35 +++++++++++++++++++
 .../unit/scheduler/client/test_report.py      | 19 ++++++++++
 3 files changed, 87 insertions(+), 1 deletion(-)

diff --git a/nova/conductor/manager.py b/nova/conductor/manager.py
index aaec1c99b77..53067bbef7c 100644
--- a/nova/conductor/manager.py
+++ b/nova/conductor/manager.py
@@ -21,6 +21,7 @@
 import functools
 import sys
 
+from keystoneauth1 import exceptions as ks_exc
 from oslo_config import cfg
 from oslo_db import exception as db_exc
 from oslo_limit import exception as limit_exceptions
@@ -243,11 +244,42 @@ def __init__(self):
         self.network_api = neutron.API()
         self.servicegroup_api = servicegroup.API()
         self.query_client = query.SchedulerQueryClient()
-        self.report_client = report.report_client_singleton()
         self.notifier = rpc.get_notifier('compute')
         # Help us to record host in EventReporter
         self.host = CONF.host
 
+        try:
+            # Test our placement client during initialization
+            self.report_client
+        except (ks_exc.EndpointNotFound,
+                ks_exc.DiscoveryFailure,
+                ks_exc.RequestTimeout,
+                ks_exc.GatewayTimeout,
+                ks_exc.ConnectFailure) as e:
+            # Non-fatal, likely transient (although not definitely);
+            # continue startup but log the warning so that when things
+            # fail later, it will be clear why we can not do certain
+            # things.
+            LOG.warning('Unable to initialize placement client (%s); '
+                        'Continuing with startup, but some operations '
+                        'will not be possible.', e)
+        except (ks_exc.MissingAuthPlugin,
+                ks_exc.Unauthorized) as e:
+            # This is almost definitely fatal mis-configuration. The
+            # Unauthorized error might be transient, but it is
+            # probably reasonable to consider it fatal.
+            LOG.error('Fatal error initializing placement client; '
+                      'config is incorrect or incomplete: %s', e)
+            raise
+        except Exception as e:
+            # Unknown/unexpected errors here are fatal
+            LOG.error('Fatal error initializing placement client: %s', e)
+            raise
+
+    @property
+    def report_client(self):
+        return report.report_client_singleton()
+
     def reset(self):
         LOG.info('Reloading compute RPC API')
         compute_rpcapi.LAST_VERSION = None
diff --git a/nova/tests/unit/conductor/test_conductor.py b/nova/tests/unit/conductor/test_conductor.py
index 15aa960aad9..8c954db9a7c 100644
--- a/nova/tests/unit/conductor/test_conductor.py
+++ b/nova/tests/unit/conductor/test_conductor.py
@@ -17,6 +17,8 @@
 
 import copy
 
+import ddt
+from keystoneauth1 import exceptions as ks_exc
 import mock
 from oslo_db import exception as db_exc
 from oslo_limit import exception as limit_exceptions
@@ -52,6 +54,7 @@
 from nova.objects import fields
 from nova.objects import request_spec
 from nova.scheduler.client import query
+from nova.scheduler.client import report
 from nova.scheduler import utils as scheduler_utils
 from nova import test
 from nova.tests import fixtures
@@ -4869,3 +4872,35 @@ def _test(mock_cache):
             logtext)
         self.assertIn('host3\' because it is not up', logtext)
         self.assertIn('image1 failed 1 times', logtext)
+
+
+@ddt.ddt
+class TestConductorTaskManager(test.NoDBTestCase):
+    def test_placement_client_startup(self):
+        self.assertIsNone(report.PLACEMENTCLIENT)
+        conductor_manager.ComputeTaskManager()
+        self.assertIsNotNone(report.PLACEMENTCLIENT)
+
+    @ddt.data(ks_exc.MissingAuthPlugin,
+              ks_exc.Unauthorized,
+              test.TestingException)
+    def test_placement_client_startup_fatals(self, exc):
+        self.assertRaises(exc,
+                          self._test_placement_client_startup_exception, exc)
+
+    @ddt.data(ks_exc.EndpointNotFound,
+              ks_exc.DiscoveryFailure,
+              ks_exc.RequestTimeout,
+              ks_exc.GatewayTimeout,
+              ks_exc.ConnectFailure)
+    def test_placement_client_startup_non_fatal(self, exc):
+        self._test_placement_client_startup_exception(exc)
+
+    @mock.patch.object(report, 'LOG')
+    def _test_placement_client_startup_exception(self, exc, mock_log):
+        with mock.patch.object(report.SchedulerReportClient, '_create_client',
+                               side_effect=exc):
+            try:
+                conductor_manager.ComputeTaskManager()
+            finally:
+                mock_log.error.assert_called_once()
diff --git a/nova/tests/unit/scheduler/client/test_report.py b/nova/tests/unit/scheduler/client/test_report.py
index 485f187d9ef..9b2f5c3a0aa 100644
--- a/nova/tests/unit/scheduler/client/test_report.py
+++ b/nova/tests/unit/scheduler/client/test_report.py
@@ -185,6 +185,25 @@ def _test_error(self, exc, mock_log):
             self.assertRaises(exc, report.report_client_singleton)
         mock_log.error.assert_called_once()
 
+    def test_error_then_success(self):
+        # Simulate an error
+        self._test_error(ks_exc.ConnectFailure)
+
+        # Ensure we did not set the global client
+        self.assertIsNone(report.PLACEMENTCLIENT)
+
+        # Call again, with no error
+        client = report.report_client_singleton()
+
+        # Make sure we got a client and that it was set as the global
+        # one
+        self.assertIsNotNone(client)
+        self.assertEqual(client, report.PLACEMENTCLIENT)
+
+        # Make sure we keep getting the same one
+        client2 = report.report_client_singleton()
+        self.assertEqual(client, client2)
+
 
 class TestConstructor(test.NoDBTestCase):
     def setUp(self):

From 4316234e63b76e4f9877ec6e924b5c54ea761bbb Mon Sep 17 00:00:00 2001
From: Brett Milford <brett.milford@canonical.com>
Date: Thu, 4 Aug 2022 16:52:33 +1000
Subject: [PATCH 31/93] Handle "no RAM info was set" migration case

This handles the case where the live migration monitoring thread may
race and call jobStats() after the migration has completed resulting in
the following error:

    libvirt.libvirtError: internal error: migration was active, but no
    RAM info was set

Closes-Bug: #1982284
Change-Id: I77fdfa9cffbd44b2889f49f266b2582bcc6a4267
(cherry picked from commit 9fea934c71d3c2fa7fdd80c67d94e18466c5cf9a)
(cherry picked from commit 00396fa9396324780c09161ed57a86b7e458c26f)
---
 nova/tests/unit/virt/libvirt/test_guest.py    | 22 +++++++++++++++++++
 nova/virt/libvirt/guest.py                    |  7 ++++++
 ...-no-ram-info-was-set-99784934ed80fd72.yaml | 11 ++++++++++
 3 files changed, 40 insertions(+)
 create mode 100644 releasenotes/notes/bug-1982284-libvirt-handle-no-ram-info-was-set-99784934ed80fd72.yaml

diff --git a/nova/tests/unit/virt/libvirt/test_guest.py b/nova/tests/unit/virt/libvirt/test_guest.py
index 70d438d816a..47e9ba4b623 100644
--- a/nova/tests/unit/virt/libvirt/test_guest.py
+++ b/nova/tests/unit/virt/libvirt/test_guest.py
@@ -1040,3 +1040,25 @@ def test_job_info_operation_invalid(self, mock_stats, mock_info):
 
         mock_stats.assert_called_once_with()
         mock_info.assert_called_once_with()
+
+    @mock.patch.object(fakelibvirt.virDomain, "jobInfo")
+    @mock.patch.object(fakelibvirt.virDomain, "jobStats")
+    def test_job_stats_no_ram(self, mock_stats, mock_info):
+        mock_stats.side_effect = fakelibvirt.make_libvirtError(
+            fakelibvirt.libvirtError,
+            "internal error: migration was active, but no RAM info was set",
+            error_code=fakelibvirt.VIR_ERR_INTERNAL_ERROR,
+            error_message="migration was active, but no RAM info was set")
+
+        info = self.guest.get_job_info()
+
+        self.assertIsInstance(info, libvirt_guest.JobInfo)
+        self.assertEqual(fakelibvirt.VIR_DOMAIN_JOB_NONE, info.type)
+        self.assertEqual(0, info.time_elapsed)
+        self.assertEqual(0, info.time_remaining)
+        self.assertEqual(0, info.memory_total)
+        self.assertEqual(0, info.memory_processed)
+        self.assertEqual(0, info.memory_remaining)
+
+        mock_stats.assert_called_once_with()
+        self.assertFalse(mock_info.called)
diff --git a/nova/virt/libvirt/guest.py b/nova/virt/libvirt/guest.py
index 53080e41f0b..68bd4ca5b07 100644
--- a/nova/virt/libvirt/guest.py
+++ b/nova/virt/libvirt/guest.py
@@ -655,6 +655,7 @@ def get_job_info(self):
                 stats = self._domain.jobStats()
                 return JobInfo(**stats)
             except libvirt.libvirtError as ex:
+                errmsg = ex.get_error_message()
                 if ex.get_error_code() == libvirt.VIR_ERR_NO_SUPPORT:
                     # Remote libvirt doesn't support new API
                     LOG.debug("Missing remote virDomainGetJobStats: %s", ex)
@@ -667,6 +668,12 @@ def get_job_info(self):
                     # away completclsely
                     LOG.debug("Domain has shutdown/gone away: %s", ex)
                     return JobInfo(type=libvirt.VIR_DOMAIN_JOB_COMPLETED)
+                elif (ex.get_error_code() == libvirt.VIR_ERR_INTERNAL_ERROR and
+                      errmsg and "migration was active, "
+                      "but no RAM info was set" in errmsg):
+                    LOG.debug("Migration is active or completed but "
+                              "virDomainGetJobStats is missing ram: %s", ex)
+                    return JobInfo(type=libvirt.VIR_DOMAIN_JOB_NONE)
                 else:
                     LOG.debug("Failed to get job stats: %s", ex)
                     raise
diff --git a/releasenotes/notes/bug-1982284-libvirt-handle-no-ram-info-was-set-99784934ed80fd72.yaml b/releasenotes/notes/bug-1982284-libvirt-handle-no-ram-info-was-set-99784934ed80fd72.yaml
new file mode 100644
index 00000000000..943aa99a436
--- /dev/null
+++ b/releasenotes/notes/bug-1982284-libvirt-handle-no-ram-info-was-set-99784934ed80fd72.yaml
@@ -0,0 +1,11 @@
+---
+other:
+  - |
+    A workaround has been added to the libvirt driver to catch and pass
+    migrations that were previously failing with the error:
+
+    ``libvirt.libvirtError: internal error: migration was active, but no RAM info was set``
+
+    See `bug 1982284`_ for more details.
+
+    .. _bug 1982284: https://bugs.launchpad.net/nova/+bug/1982284

From 71e5a1dbcc22aeaa798d3d06ce392cf73364b8db Mon Sep 17 00:00:00 2001
From: Amit Uniyal <auniyal@redhat.com>
Date: Thu, 25 Aug 2022 05:08:44 +0000
Subject: [PATCH 32/93] Adds a reproducer for post live migration fail

Adds a regression test (reproducer) for a post live migration
failure at the destination; a possible cause is a failure to get
the instance network info or block device info.

Changes:
_live_migrate in integrated_helpers now returns the server.

Related-Bug: #1628606
Change-Id: I48dbe0aae8a3943fdde69cda1bd663d70ea0eb19
(cherry picked from commit a20baeca1f5ebb0dfe9607335a6986e9ed0e1725)
(cherry picked from commit 74a618a8118642c9fd32c4e0d502d12ac826affe)
---
 nova/tests/functional/integrated_helpers.py   |  3 +-
 .../regressions/test_bug_1628606.py           | 61 +++++++++++++++++++
 2 files changed, 63 insertions(+), 1 deletion(-)
 create mode 100644 nova/tests/functional/regressions/test_bug_1628606.py

diff --git a/nova/tests/functional/integrated_helpers.py b/nova/tests/functional/integrated_helpers.py
index 028ef53d7ea..bd6244546c8 100644
--- a/nova/tests/functional/integrated_helpers.py
+++ b/nova/tests/functional/integrated_helpers.py
@@ -540,8 +540,9 @@ def _live_migrate(
         self.api.post_server_action(
             server['id'],
             {'os-migrateLive': {'host': None, 'block_migration': 'auto'}})
-        self._wait_for_state_change(server, server_expected_state)
+        server = self._wait_for_state_change(server, server_expected_state)
         self._wait_for_migration_status(server, [migration_expected_state])
+        return server
 
     _live_migrate_server = _live_migrate
 
diff --git a/nova/tests/functional/regressions/test_bug_1628606.py b/nova/tests/functional/regressions/test_bug_1628606.py
new file mode 100644
index 00000000000..995c552cf9a
--- /dev/null
+++ b/nova/tests/functional/regressions/test_bug_1628606.py
@@ -0,0 +1,61 @@
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+from nova import test
+from nova.tests import fixtures as nova_fixtures
+from nova.tests.functional.api import client
+from nova.tests.functional import fixtures as func_fixtures
+from nova.tests.functional import integrated_helpers
+from unittest import mock
+
+
+class PostLiveMigrationFail(
+    test.TestCase, integrated_helpers.InstanceHelperMixin):
+    """Regression test for bug 1628606
+    """
+
+    def setUp(self):
+        super().setUp()
+        self.useFixture(nova_fixtures.NeutronFixture(self))
+        self.glance = self.useFixture(nova_fixtures.GlanceFixture(self))
+        self.useFixture(func_fixtures.PlacementFixture())
+        self.useFixture(nova_fixtures.HostNameWeigherFixture())
+
+        self.start_service('conductor')
+        self.start_service('scheduler')
+
+        api_fixture = self.useFixture(nova_fixtures.OSAPIFixture(
+            api_version='v2.1'))
+
+        self.api = api_fixture.admin_api
+        self.api.microversion = 'latest'
+
+        self.src = self._start_compute(host='host1')
+        self.dest = self._start_compute(host='host2')
+
+    @mock.patch(
+        'nova.compute.manager.ComputeManager'
+        '._post_live_migration_remove_source_vol_connections')
+    def test_post_live_migration(self, mock_migration):
+        server = self._create_server(networks=[])
+        self.assertEqual(self.src.host, server['OS-EXT-SRV-ATTR:host'])
+
+        error = client.OpenStackApiException(
+            "Failed to remove source vol connection post live migration")
+        mock_migration.side_effect = error
+
+        server = self._live_migrate(
+            server, migration_expected_state='error',
+            server_expected_state='ERROR')
+        # FIXME(amit): this should point to the dest as after migration
+        # but does not because of bug 1628606
+        self.assertEqual(self.src.host, server['OS-EXT-SRV-ATTR:host'])

From 17ae907569e45cc0f5c7da9511bb668a877b7b2e Mon Sep 17 00:00:00 2001
From: Sean Mooney <work@seanmooney.info>
Date: Thu, 13 May 2021 12:48:21 +0100
Subject: [PATCH 33/93] [compute] always set instance.host in
 post_livemigration

This change adds a new _post_live_migration_update_host
function that wraps _post_live_migration and just ensures
that if we exit due to an exception instance.host is set
to the destination host.

When we are in _post_live_migration the guest has already
started running on the destination host and we cannot revert.
Sometimes admins or users will hard reboot the instance expecting
that to fix everything when the VM enters the error state after
the failed migration. Previously this would end up recreating the
instance on the source node, leading to possible data corruption if
the instance used shared storage.

Change-Id: Ibc4bc7edf1c8d1e841c72c9188a0a62836e9f153
Partial-Bug: #1628606
(cherry picked from commit 8449b7caefa4a5c0728e11380a088525f15ad6f5)
(cherry picked from commit 643b0c7d35752b214eee19b8d7298a19a8493f6b)
---
 nova/compute/manager.py                       | 43 +++++++++++++++++--
 .../regressions/test_bug_1628606.py           |  5 +--
 nova/tests/unit/compute/test_compute_mgr.py   | 21 +++++++++
 3 files changed, 63 insertions(+), 6 deletions(-)

diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 4df1c4112c3..1257ed9140f 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -8572,8 +8572,9 @@ def _do_live_migration(self, context, dest, instance, block_migration,
         # host attachment. We fetch BDMs before that to retain connection_info
         # and attachment_id relating to the source host for post migration
         # cleanup.
-        post_live_migration = functools.partial(self._post_live_migration,
-                                                source_bdms=source_bdms)
+        post_live_migration = functools.partial(
+            self._post_live_migration_update_host, source_bdms=source_bdms
+            )
         rollback_live_migration = functools.partial(
             self._rollback_live_migration, source_bdms=source_bdms)
 
@@ -8845,6 +8846,42 @@ def _post_live_migration_remove_source_vol_connections(
                                   bdm.attachment_id, self.host,
                                   str(e), instance=instance)
 
+    # TODO(sean-k-mooney): add typing
+    def _post_live_migration_update_host(
+        self, ctxt, instance, dest, block_migration=False,
+        migrate_data=None, source_bdms=None
+    ):
+        try:
+            self._post_live_migration(
+                ctxt, instance, dest, block_migration, migrate_data,
+                source_bdms)
+        except Exception:
+            # Restore the instance object
+            node_name = None
+            try:
+                # get node name of compute, where instance will be
+                # running after migration, that is destination host
+                compute_node = self._get_compute_info(ctxt, dest)
+                node_name = compute_node.hypervisor_hostname
+            except exception.ComputeHostNotFound:
+                LOG.exception('Failed to get compute_info for %s', dest)
+
+            # we can never rollback from post live migration and we can only
+            # get here if the instance is running on the dest so we ensure
+            # the instance.host is set correctly and reraise the original
+            # exception unmodified.
+            if instance.host != dest:
+                # apply saves the new fields while drop actually removes the
+                # migration context from the instance, so migration persists.
+                instance.apply_migration_context()
+                instance.drop_migration_context()
+                instance.host = dest
+                instance.task_state = None
+                instance.node = node_name
+                instance.progress = 0
+                instance.save()
+            raise
+
     @wrap_exception()
     @wrap_instance_fault
     def _post_live_migration(self, ctxt, instance, dest,
@@ -8856,7 +8893,7 @@ def _post_live_migration(self, ctxt, instance, dest,
         and mainly updating database record.
 
         :param ctxt: security context
-        :param instance: instance dict
+        :param instance: instance object
         :param dest: destination host
         :param block_migration: if true, prepare for block migration
         :param migrate_data: if not None, it is a dict which has data
diff --git a/nova/tests/functional/regressions/test_bug_1628606.py b/nova/tests/functional/regressions/test_bug_1628606.py
index 995c552cf9a..0fccd78ccec 100644
--- a/nova/tests/functional/regressions/test_bug_1628606.py
+++ b/nova/tests/functional/regressions/test_bug_1628606.py
@@ -56,6 +56,5 @@ def test_post_live_migration(self, mock_migration):
         server = self._live_migrate(
             server, migration_expected_state='error',
             server_expected_state='ERROR')
-        # FIXME(amit): this should point to the dest as after migration
-        # but does not because of bug 1628606
-        self.assertEqual(self.src.host, server['OS-EXT-SRV-ATTR:host'])
+
+        self.assertEqual(self.dest.host, server['OS-EXT-SRV-ATTR:host'])
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 760ea79e877..1390520d3d1 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -9958,6 +9958,27 @@ def test_post_live_migration_new_allocations(self):
                                                 self.instance,
                                                 migration)
 
+    def test_post_live_migration_update_host(self):
+        @mock.patch.object(self.compute, '_get_compute_info')
+        def _test_post_live_migration(_get_compute_info):
+            dest_host = 'dest'
+            cn = objects.ComputeNode(hypervisor_hostname=dest_host)
+            _get_compute_info.return_value = cn
+            instance = fake_instance.fake_instance_obj(self.context,
+                                                        node='src',
+                                                        uuid=uuids.instance)
+            with mock.patch.object(self.compute, "_post_live_migration"
+                    ) as plm, mock.patch.object(instance, "save") as save:
+                error = ValueError("some failure")
+                plm.side_effect = error
+                self.assertRaises(
+                    ValueError, self.compute._post_live_migration_update_host,
+                    self.context, instance, dest_host)
+                save.assert_called_once()
+                self.assertEqual(instance.host, dest_host)
+
+        _test_post_live_migration()
+
     def test_post_live_migration_cinder_pre_344_api(self):
         # Because live migration has
         # succeeded,_post_live_migration_remove_source_vol_connections()

From 683cbd06336bba37d033c292c4c0ea83e06d9c07 Mon Sep 17 00:00:00 2001
From: Balazs Gibizer <balazs.gibizer@est.tech>
Date: Sun, 20 Mar 2022 09:47:10 +0100
Subject: [PATCH 34/93] refactor: remove duplicated logic

Remove _update_port_pci_binding_profile and replace its usage with
_get_pci_device_profile.

Change-Id: I34dae6fdb746205f0baa4997e69eec55634bec4d
(cherry picked from commit 8d2776fb34339b311c713992a39507452c4ae42f)
---
 nova/network/neutron.py | 24 +++---------------------
 1 file changed, 3 insertions(+), 21 deletions(-)

diff --git a/nova/network/neutron.py b/nova/network/neutron.py
index 2021bdb58f8..1e7d33f73eb 100644
--- a/nova/network/neutron.py
+++ b/nova/network/neutron.py
@@ -3694,25 +3694,6 @@ def _get_port_pci_dev(self, context, instance, port):
             return None
         return device
 
-    def _update_port_pci_binding_profile(self, pci_dev, binding_profile):
-        """Update the binding profile dict with new PCI device data.
-
-        :param pci_dev: The PciDevice object to update the profile with.
-        :param binding_profile: The dict to update.
-        """
-        binding_profile.update({'pci_slot': pci_dev.address})
-        if binding_profile.get('card_serial_number'):
-            binding_profile.update({
-                'card_serial_number': pci_dev.card_serial_number})
-        if binding_profile.get('pf_mac_address'):
-            binding_profile.update({
-                'pf_mac_address': pci_utils.get_mac_by_pci_address(
-                    pci_dev.parent_addr)})
-        if binding_profile.get('vf_num'):
-            binding_profile.update({
-                'vf_num': pci_utils.get_vf_num_by_pci_address(
-                    pci_dev.address)})
-
     def _update_port_binding_for_instance(
             self, context, instance, host, migration=None,
             provider_mappings=None):
@@ -3781,8 +3762,9 @@ def _update_port_binding_for_instance(
                 else:
                     pci_dev = self._get_port_pci_dev(context, instance, p)
                     if pci_dev:
-                        self._update_port_pci_binding_profile(pci_dev,
-                                                              binding_profile)
+                        binding_profile.update(
+                            self._get_pci_device_profile(pci_dev)
+                        )
                         updates[constants.BINDING_PROFILE] = binding_profile
 
             # NOTE(gibi): during live migration the conductor already sets the

From 813377077bd0173bdf128823e46b5df7c0a575b9 Mon Sep 17 00:00:00 2001
From: Balazs Gibizer <balazs.gibizer@est.tech>
Date: Tue, 15 Feb 2022 14:38:41 +0100
Subject: [PATCH 35/93] Record SRIOV PF MAC in the binding profile

Today Nova updates the mac_address of a direct-physical port to reflect
the MAC address of the physical device the port is bound to. But this
can only be done before the port is bound. However during migration Nova
does not update the MAC when the port is bound to a different physical
device on the destination host.

This patch extends the libvirt virt driver to provide the MAC address of
the PF in the pci_info returned to the resource tracker. This
information will be then persisted in the extra_info field of the
PciDevice object.

Then the port update logic during migration, resize, live
migration, evacuation and unshelve is also extended to record the MAC of
the physical device in the port binding profile according to the device on
the destination host.

The related neutron change Ib0638f5db69cb92daf6932890cb89e83cf84f295
uses this info from the binding profile to update the mac_address field
of the port when the binding is activated.

Closes-Bug: #1942329

Conflicts:
    nova/objects/pci_device.py
    nova/virt/libvirt/host.py

Change-Id: Iad5e70b43a65c076134e1874cb8e75d1ba214fde
(cherry picked from commit cd03bbc1c33e33872594cf002f0e7011ab8ea047)
---
 nova/compute/manager.py                       |   3 +
 nova/network/neutron.py                       |  33 +-
 nova/objects/pci_device.py                    |  19 +
 nova/tests/fixtures/libvirt.py                |  34 +-
 nova/tests/functional/libvirt/base.py         |  24 +-
 .../libvirt/test_pci_sriov_servers.py         | 337 ++++++++++++++++--
 nova/tests/unit/compute/test_compute.py       |   6 +-
 nova/tests/unit/compute/test_compute_mgr.py   | 108 ++++--
 nova/tests/unit/network/test_neutron.py       | 255 +++++++++++--
 nova/tests/unit/virt/libvirt/test_driver.py   |   5 +-
 nova/tests/unit/virt/libvirt/test_host.py     |  26 +-
 nova/virt/fake.py                             |  35 ++
 nova/virt/libvirt/host.py                     |  15 +
 .../notes/bug-1942329-22b08fa4b322881d.yaml   |   9 +
 14 files changed, 807 insertions(+), 102 deletions(-)
 create mode 100644 releasenotes/notes/bug-1942329-22b08fa4b322881d.yaml

diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 1257ed9140f..0762098328a 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -10968,6 +10968,9 @@ def _update_migrate_vifs_profile_with_pci(self,
                 profile['vf_num'] = pci_utils.get_vf_num_by_pci_address(
                     pci_dev.address)
 
+            if pci_dev.mac_address:
+                profile['device_mac_address'] = pci_dev.mac_address
+
             mig_vif.profile = profile
             LOG.debug("Updating migrate VIF profile for port %(port_id)s:"
                       "%(profile)s", {'port_id': port_id,
diff --git a/nova/network/neutron.py b/nova/network/neutron.py
index 1e7d33f73eb..3c9da4e9370 100644
--- a/nova/network/neutron.py
+++ b/nova/network/neutron.py
@@ -684,7 +684,8 @@ def _unbind_ports(self, context, ports,
             for profile_key in ('pci_vendor_info', 'pci_slot',
                                 constants.ALLOCATION, 'arq_uuid',
                                 'physical_network', 'card_serial_number',
-                                'vf_num', 'pf_mac_address'):
+                                'vf_num', 'pf_mac_address',
+                                'device_mac_address'):
                 if profile_key in port_profile:
                     del port_profile[profile_key]
             port_req_body['port'][constants.BINDING_PROFILE] = port_profile
@@ -1307,6 +1308,10 @@ def _update_ports_for_instance(self, context, instance, neutron,
                     network=network, neutron=neutron,
                     bind_host_id=bind_host_id,
                     port_arq=port_arq)
+                # NOTE(gibi): Remove this once we are sure that the fix for
+                # bug 1942329 is always present in the deployed neutron. The
+                # _populate_neutron_extension_values() call above already
+                # populated this MAC to the binding profile instead.
                 self._populate_pci_mac_address(instance,
                     request.pci_request_id, port_req_body)
 
@@ -1622,6 +1627,18 @@ def _get_pci_device_profile(self, pci_dev):
             if pci_dev.dev_type == obj_fields.PciDeviceType.SRIOV_VF:
                 dev_profile.update(
                     self._get_vf_pci_device_profile(pci_dev))
+
+            if pci_dev.dev_type == obj_fields.PciDeviceType.SRIOV_PF:
+                # In general the MAC address information flows from the neutron
+                # port to the device in the backend. Except for direct-physical
+                # ports. In that case the MAC address flows from the physical
+                # device, the PF, to the neutron port. So when such a port is
+                # being bound to a host the port's MAC address needs to be
+                # updated. Nova needs to put the new MAC into the binding
+                # profile.
+                if pci_dev.mac_address:
+                    dev_profile['device_mac_address'] = pci_dev.mac_address
+
             return dev_profile
 
         raise exception.PciDeviceNotFound(node_id=pci_dev.compute_node_id,
@@ -3664,11 +3681,10 @@ def _get_pci_mapping_for_migration(self, instance, migration):
                   migration.get('status') == 'reverted')
         return instance.migration_context.get_pci_mapping_for_migration(revert)
 
-    def _get_port_pci_dev(self, context, instance, port):
+    def _get_port_pci_dev(self, instance, port):
         """Find the PCI device corresponding to the port.
         Assumes the port is an SRIOV one.
 
-        :param context: The request context.
         :param instance: The instance to which the port is attached.
         :param port: The Neutron port, as obtained from the Neutron API
             JSON form.
@@ -3756,15 +3772,14 @@ def _update_port_binding_for_instance(
                             raise exception.PortUpdateFailed(port_id=p['id'],
                                 reason=_("Unable to correlate PCI slot %s") %
                                          pci_slot)
-                # NOTE(artom) If migration is None, this is an unshevle, and we
-                # need to figure out the pci_slot from the InstancePCIRequest
-                # and PciDevice objects.
+                # NOTE(artom) If migration is None, this is an unshelve, and we
+                # need to figure out the pci related binding information from
+                # the InstancePCIRequest and PciDevice objects.
                 else:
-                    pci_dev = self._get_port_pci_dev(context, instance, p)
+                    pci_dev = self._get_port_pci_dev(instance, p)
                     if pci_dev:
                         binding_profile.update(
-                            self._get_pci_device_profile(pci_dev)
-                        )
+                            self._get_pci_device_profile(pci_dev))
                         updates[constants.BINDING_PROFILE] = binding_profile
 
             # NOTE(gibi): during live migration the conductor already sets the
diff --git a/nova/objects/pci_device.py b/nova/objects/pci_device.py
index b0d5b75826b..f30555849c3 100644
--- a/nova/objects/pci_device.py
+++ b/nova/objects/pci_device.py
@@ -148,6 +148,12 @@ def obj_make_compatible(self, primitive, target_version):
                     reason='dev_type=%s not supported in version %s' % (
                         dev_type, target_version))
 
+    def __repr__(self):
+        return (
+            f'PciDevice(address={self.address}, '
+            f'compute_node_id={self.compute_node_id})'
+        )
+
     def update_device(self, dev_dict):
         """Sync the content from device dictionary to device object.
 
@@ -175,6 +181,9 @@ def update_device(self, dev_dict):
                 # NOTE(ralonsoh): list of parameters currently added to
                 # "extra_info" dict:
                 #     - "capabilities": dict of (strings/list of strings)
+                #     - "parent_ifname": the netdev name of the parent (PF)
+                #        device of a VF
+                #     - "mac_address": the MAC address of the PF
                 extra_info = self.extra_info
                 data = v if isinstance(v, str) else jsonutils.dumps(v)
                 extra_info.update({k: data})
@@ -566,6 +575,13 @@ def card_serial_number(self):
         caps = jsonutils.loads(caps_json)
         return caps.get('vpd', {}).get('card_serial_number')
 
+    @property
+    def mac_address(self):
+        """The MAC address of the PF physical device or None if the device is
+        not a PF or if the MAC is not available.
+        """
+        return self.extra_info.get('mac_address')
+
 
 @base.NovaObjectRegistry.register
 class PciDeviceList(base.ObjectListBase, base.NovaObject):
@@ -605,3 +621,6 @@ def get_by_parent_address(cls, context, node_id, parent_addr):
                                                            parent_addr)
         return base.obj_make_list(context, cls(context), objects.PciDevice,
                                   db_dev_list)
+
+    def __repr__(self):
+        return f"PciDeviceList(objects={[repr(obj) for obj in self.objects]})"
diff --git a/nova/tests/fixtures/libvirt.py b/nova/tests/fixtures/libvirt.py
index 891e9572005..f6d5d656a2e 100644
--- a/nova/tests/fixtures/libvirt.py
+++ b/nova/tests/fixtures/libvirt.py
@@ -309,7 +309,7 @@ def __init__(
         self, dev_type, bus, slot, function, iommu_group, numa_node, *,
         vf_ratio=None, multiple_gpu_types=False, generic_types=False,
         parent=None, vend_id=None, vend_name=None, prod_id=None,
-        prod_name=None, driver_name=None, vpd_fields=None
+        prod_name=None, driver_name=None, vpd_fields=None, mac_address=None,
     ):
         """Populate pci devices
 
@@ -331,6 +331,8 @@ def __init__(
         :param prod_id: (str) The product ID.
         :param prod_name: (str) The product name.
         :param driver_name: (str) The driver name.
+        :param mac_address: (str) The MAC of the device.
+            Used in case of SRIOV PFs
         """
 
         self.dev_type = dev_type
@@ -349,6 +351,7 @@ def __init__(
         self.prod_id = prod_id
         self.prod_name = prod_name
         self.driver_name = driver_name
+        self.mac_address = mac_address
 
         self.vpd_fields = vpd_fields
 
@@ -364,7 +367,9 @@ def generate_xml(self, skip_capability=False):
             assert not self.vf_ratio, 'vf_ratio does not apply for PCI devices'
 
         if self.dev_type in ('PF', 'VF'):
-            assert self.vf_ratio, 'require vf_ratio for PFs and VFs'
+            assert (
+                self.vf_ratio is not None
+            ), 'require vf_ratio for PFs and VFs'
 
         if self.dev_type == 'VF':
             assert self.parent, 'require parent for VFs'
@@ -497,6 +502,10 @@ def format_vpd_cap(self):
     def XMLDesc(self, flags):
         return self.pci_device
 
+    @property
+    def address(self):
+        return "0000:%02x:%02x.%1x" % (self.bus, self.slot, self.function)
+
 
 # TODO(stephenfin): Remove all of these HostFooDevicesInfo objects in favour of
 # a unified devices object
@@ -609,7 +618,7 @@ def add_device(
         self, dev_type, bus, slot, function, iommu_group, numa_node,
         vf_ratio=None, multiple_gpu_types=False, generic_types=False,
         parent=None, vend_id=None, vend_name=None, prod_id=None,
-        prod_name=None, driver_name=None, vpd_fields=None,
+        prod_name=None, driver_name=None, vpd_fields=None, mac_address=None,
     ):
         pci_dev_name = _get_libvirt_nodedev_name(bus, slot, function)
 
@@ -632,6 +641,7 @@ def add_device(
             prod_name=prod_name,
             driver_name=driver_name,
             vpd_fields=vpd_fields,
+            mac_address=mac_address,
         )
         self.devices[pci_dev_name] = dev
         return dev
@@ -651,6 +661,13 @@ def get_all_mdev_capable_devices(self):
         return [dev for dev in self.devices
                 if self.devices[dev].is_capable_of_mdevs]
 
+    def get_pci_address_mac_mapping(self):
+        return {
+            device.address: device.mac_address
+            for dev_addr, device in self.devices.items()
+            if device.mac_address
+        }
+
 
 class FakeMdevDevice(object):
     template = """
@@ -2182,6 +2199,15 @@ class LibvirtFixture(fixtures.Fixture):
 
     def __init__(self, stub_os_vif=True):
         self.stub_os_vif = stub_os_vif
+        self.pci_address_to_mac_map = collections.defaultdict(
+            lambda: '52:54:00:1e:59:c6')
+
+    def update_sriov_mac_address_mapping(self, pci_address_to_mac_map):
+        self.pci_address_to_mac_map.update(pci_address_to_mac_map)
+
+    def fake_get_mac_by_pci_address(self, pci_addr, pf_interface=False):
+        res = self.pci_address_to_mac_map[pci_addr]
+        return res
 
     def setUp(self):
         super().setUp()
@@ -2205,7 +2231,7 @@ def setUp(self):
 
         self.useFixture(fixtures.MockPatch(
             'nova.pci.utils.get_mac_by_pci_address',
-            return_value='52:54:00:1e:59:c6'))
+            new=self.fake_get_mac_by_pci_address))
 
         # libvirt calls out to sysfs to get the vfs ID during macvtap plug
         self.useFixture(fixtures.MockPatch(
diff --git a/nova/tests/functional/libvirt/base.py b/nova/tests/functional/libvirt/base.py
index 3d8aec8106b..c325c0b0407 100644
--- a/nova/tests/functional/libvirt/base.py
+++ b/nova/tests/functional/libvirt/base.py
@@ -42,7 +42,7 @@ def setUp(self):
         super(ServersTestBase, self).setUp()
 
         self.useFixture(nova_fixtures.LibvirtImageBackendFixture())
-        self.useFixture(nova_fixtures.LibvirtFixture())
+        self.libvirt = self.useFixture(nova_fixtures.LibvirtFixture())
         self.useFixture(nova_fixtures.OSBrickFixture())
 
         self.useFixture(fixtures.MockPatch(
@@ -134,6 +134,12 @@ def _start_compute(hostname, host_info):
                 host_info, pci_info, mdev_info, vdpa_info, libvirt_version,
                 qemu_version, hostname,
             )
+            # If the compute is configured with PCI devices then we need to
+            # make sure that the stubs around sysfs have the MAC address
+            # information for the PCI PF devices
+            if pci_info:
+                self.libvirt.update_sriov_mac_address_mapping(
+                    pci_info.get_pci_address_mac_mapping())
             # This is fun. Firstly we need to do a global'ish mock so we can
             # actually start the service.
             with mock.patch('nova.virt.libvirt.host.Host.get_connection',
@@ -392,6 +398,22 @@ class LibvirtNeutronFixture(nova_fixtures.NeutronFixture):
         'binding:vnic_type': 'remote-managed',
     }
 
+    network_4_port_pf = {
+        'id': 'c6f51315-9202-416f-9e2f-eb78b3ac36d9',
+        'network_id': network_4['id'],
+        'status': 'ACTIVE',
+        'mac_address': 'b5:bc:2e:e7:51:01',
+        'fixed_ips': [
+            {
+                'ip_address': '192.168.4.8',
+                'subnet_id': subnet_4['id']
+            }
+        ],
+        'binding:vif_details': {'vlan': 42},
+        'binding:vif_type': 'hostdev_physical',
+        'binding:vnic_type': 'direct-physical',
+    }
+
     def __init__(self, test):
         super(LibvirtNeutronFixture, self).__init__(test)
         self._networks = {
diff --git a/nova/tests/functional/libvirt/test_pci_sriov_servers.py b/nova/tests/functional/libvirt/test_pci_sriov_servers.py
index a5e52555e05..c9d277f498d 100644
--- a/nova/tests/functional/libvirt/test_pci_sriov_servers.py
+++ b/nova/tests/functional/libvirt/test_pci_sriov_servers.py
@@ -366,31 +366,66 @@ def _test_move_operation_with_neutron(self, move_operation,
                                           expect_fail=False):
         # The purpose here is to force an observable PCI slot update when
         # moving from source to dest. This is accomplished by having a single
-        # PCI device on the source, 2 PCI devices on the test, and relying on
-        # the fact that our fake HostPCIDevicesInfo creates predictable PCI
-        # addresses. The PCI device on source and the first PCI device on dest
-        # will have identical PCI addresses. By sticking a "placeholder"
-        # instance on that first PCI device on the dest, the incoming instance
-        # from source will be forced to consume the second dest PCI device,
-        # with a different PCI address.
+        # PCI VF device on the source, 2 PCI VF devices on the dest, and
+        # relying on the fact that our fake HostPCIDevicesInfo creates
+        # predictable PCI addresses. The PCI VF device on source and the first
+        # PCI VF device on dest will have identical PCI addresses. By sticking
+        # a "placeholder" instance on that first PCI VF device on the dest, the
+        # incoming instance from source will be forced to consume the second
+        # dest PCI VF device, with a different PCI address.
+        # We want to test server operations with SRIOV VFs and SRIOV PFs so
+        # the config of each compute host also has one extra PCI PF device
+        # without any VF children. But the two computes have different PCI PF
+        # addresses and MACs so that the test can observe the slot update as
+        # well as the MAC update during migration and after revert.
+        source_pci_info = fakelibvirt.HostPCIDevicesInfo(num_pfs=1, num_vfs=1)
+        # add an extra PF without VF to be used by direct-physical ports
+        source_pci_info.add_device(
+            dev_type='PF',
+            bus=0x82,  # the HostPCIDevicesInfo use the 0x81 by default
+            slot=0x0,
+            function=0,
+            iommu_group=42,
+            numa_node=0,
+            vf_ratio=0,
+            mac_address='b4:96:91:34:f4:aa',
+        )
         self.start_compute(
             hostname='source',
-            pci_info=fakelibvirt.HostPCIDevicesInfo(
-                num_pfs=1, num_vfs=1))
+            pci_info=source_pci_info)
+
+        dest_pci_info = fakelibvirt.HostPCIDevicesInfo(num_pfs=1, num_vfs=2)
+        # add an extra PF without VF to be used by direct-physical ports
+        dest_pci_info.add_device(
+            dev_type='PF',
+            bus=0x82,  # the HostPCIDevicesInfo use the 0x81 by default
+            slot=0x6,  # make it different from the source host
+            function=0,
+            iommu_group=42,
+            numa_node=0,
+            vf_ratio=0,
+            mac_address='b4:96:91:34:f4:bb',
+        )
         self.start_compute(
             hostname='dest',
-            pci_info=fakelibvirt.HostPCIDevicesInfo(
-                num_pfs=1, num_vfs=2))
+            pci_info=dest_pci_info)
 
         source_port = self.neutron.create_port(
             {'port': self.neutron.network_4_port_1})
+        source_pf_port = self.neutron.create_port(
+            {'port': self.neutron.network_4_port_pf})
         dest_port1 = self.neutron.create_port(
             {'port': self.neutron.network_4_port_2})
         dest_port2 = self.neutron.create_port(
             {'port': self.neutron.network_4_port_3})
 
         source_server = self._create_server(
-            networks=[{'port': source_port['port']['id']}], host='source')
+            networks=[
+                {'port': source_port['port']['id']},
+                {'port': source_pf_port['port']['id']}
+            ],
+            host='source',
+        )
         dest_server1 = self._create_server(
             networks=[{'port': dest_port1['port']['id']}], host='dest')
         dest_server2 = self._create_server(
@@ -398,6 +433,7 @@ def _test_move_operation_with_neutron(self, move_operation,
 
         # Refresh the ports.
         source_port = self.neutron.show_port(source_port['port']['id'])
+        source_pf_port = self.neutron.show_port(source_pf_port['port']['id'])
         dest_port1 = self.neutron.show_port(dest_port1['port']['id'])
         dest_port2 = self.neutron.show_port(dest_port2['port']['id'])
 
@@ -413,11 +449,24 @@ def _test_move_operation_with_neutron(self, move_operation,
             same_slot_port = dest_port2
             self._delete_server(dest_server1)
 
-        # Before moving, explictly assert that the servers on source and dest
+        # Before moving, explicitly assert that the servers on source and dest
         # have the same pci_slot in their port's binding profile
         self.assertEqual(source_port['port']['binding:profile']['pci_slot'],
                          same_slot_port['port']['binding:profile']['pci_slot'])
 
+        # Assert that the direct-physical port got the pci_slot information
+        # according to the source host PF PCI device.
+        self.assertEqual(
+            '0000:82:00.0',  # which is in sync with the source host pci_info
+            source_pf_port['port']['binding:profile']['pci_slot']
+        )
+        # Assert that the direct-physical port is updated with the MAC address
+        # of the PF device from the source host
+        self.assertEqual(
+            'b4:96:91:34:f4:aa',
+            source_pf_port['port']['binding:profile']['device_mac_address']
+        )
+
         # Before moving, assert that the servers on source and dest have the
         # same PCI source address in their XML for their SRIOV nic.
         source_conn = self.computes['source'].driver._host.get_connection()
@@ -434,14 +483,28 @@ def _test_move_operation_with_neutron(self, move_operation,
         move_operation(source_server)
 
         # Refresh the ports again, keeping in mind the source_port is now bound
-        # on the dest after unshelving.
+        # on the dest after the move.
         source_port = self.neutron.show_port(source_port['port']['id'])
         same_slot_port = self.neutron.show_port(same_slot_port['port']['id'])
+        source_pf_port = self.neutron.show_port(source_pf_port['port']['id'])
 
         self.assertNotEqual(
             source_port['port']['binding:profile']['pci_slot'],
             same_slot_port['port']['binding:profile']['pci_slot'])
 
+        # Assert that the direct-physical port got the pci_slot information
+        # according to the dest host PF PCI device.
+        self.assertEqual(
+            '0000:82:06.0',  # which is in sync with the dest host pci_info
+            source_pf_port['port']['binding:profile']['pci_slot']
+        )
+        # Assert that the direct-physical port is updated with the MAC address
+        # of the PF device from the dest host
+        self.assertEqual(
+            'b4:96:91:34:f4:bb',
+            source_pf_port['port']['binding:profile']['device_mac_address']
+        )
+
         conn = self.computes['dest'].driver._host.get_connection()
         vms = [vm._def for vm in conn._vms.values()]
         self.assertEqual(2, len(vms))
@@ -469,6 +532,169 @@ def move_operation(source_server):
             self._confirm_resize(source_server)
         self._test_move_operation_with_neutron(move_operation)
 
+    def test_cold_migrate_and_rever_server_with_neutron(self):
+        # The purpose here is to force an observable PCI slot update when
+        # moving from source to dest and then from dest to source after the
+        # revert. This is accomplished by having a single
+        # PCI VF device on the source, 2 PCI VF devices on the dest, and
+        # relying on the fact that our fake HostPCIDevicesInfo creates
+        # predictable PCI addresses. The PCI VF device on source and the first
+        # PCI VF device on dest will have identical PCI addresses. By sticking
+        # a "placeholder" instance on that first PCI VF device on the dest, the
+        # incoming instance from source will be forced to consume the second
+        # dest PCI VF device, with a different PCI address.
+        # We want to test server operations with SRIOV VFs and SRIOV PFs so
+        # the config of each compute host also has one extra PCI PF device
+        # without any VF children. But the two computes have different PCI PF
+        # addresses and MACs so that the test can observe the slot update as
+        # well as the MAC update during migration and after revert.
+        source_pci_info = fakelibvirt.HostPCIDevicesInfo(num_pfs=1, num_vfs=1)
+        # add an extra PF without VF to be used by direct-physical ports
+        source_pci_info.add_device(
+            dev_type='PF',
+            bus=0x82,  # the HostPCIDevicesInfo use the 0x81 by default
+            slot=0x0,
+            function=0,
+            iommu_group=42,
+            numa_node=0,
+            vf_ratio=0,
+            mac_address='b4:96:91:34:f4:aa',
+        )
+        self.start_compute(
+            hostname='source',
+            pci_info=source_pci_info)
+        dest_pci_info = fakelibvirt.HostPCIDevicesInfo(num_pfs=1, num_vfs=2)
+        # add an extra PF without VF to be used by direct-physical ports
+        dest_pci_info.add_device(
+            dev_type='PF',
+            bus=0x82,  # the HostPCIDevicesInfo use the 0x81 by default
+            slot=0x6,  # make it different from the source host
+            function=0,
+            iommu_group=42,
+            numa_node=0,
+            vf_ratio=0,
+            mac_address='b4:96:91:34:f4:bb',
+        )
+        self.start_compute(
+            hostname='dest',
+            pci_info=dest_pci_info)
+        source_port = self.neutron.create_port(
+            {'port': self.neutron.network_4_port_1})
+        source_pf_port = self.neutron.create_port(
+            {'port': self.neutron.network_4_port_pf})
+        dest_port1 = self.neutron.create_port(
+            {'port': self.neutron.network_4_port_2})
+        dest_port2 = self.neutron.create_port(
+            {'port': self.neutron.network_4_port_3})
+        source_server = self._create_server(
+            networks=[
+                {'port': source_port['port']['id']},
+                {'port': source_pf_port['port']['id']}
+            ],
+            host='source',
+        )
+        dest_server1 = self._create_server(
+            networks=[{'port': dest_port1['port']['id']}], host='dest')
+        dest_server2 = self._create_server(
+            networks=[{'port': dest_port2['port']['id']}], host='dest')
+        # Refresh the ports.
+        source_port = self.neutron.show_port(source_port['port']['id'])
+        source_pf_port = self.neutron.show_port(source_pf_port['port']['id'])
+        dest_port1 = self.neutron.show_port(dest_port1['port']['id'])
+        dest_port2 = self.neutron.show_port(dest_port2['port']['id'])
+        # Find the server on the dest compute that's using the same pci_slot as
+        # the server on the source compute, and delete the other one to make
+        # room for the incoming server from the source.
+        source_pci_slot = source_port['port']['binding:profile']['pci_slot']
+        dest_pci_slot1 = dest_port1['port']['binding:profile']['pci_slot']
+        if dest_pci_slot1 == source_pci_slot:
+            same_slot_port = dest_port1
+            self._delete_server(dest_server2)
+        else:
+            same_slot_port = dest_port2
+            self._delete_server(dest_server1)
+        # Before moving, explicitly assert that the servers on source and dest
+        # have the same pci_slot in their port's binding profile
+        self.assertEqual(source_port['port']['binding:profile']['pci_slot'],
+                         same_slot_port['port']['binding:profile']['pci_slot'])
+        # Assert that the direct-physical port got the pci_slot information
+        # according to the source host PF PCI device.
+        self.assertEqual(
+            '0000:82:00.0',  # which is in sync with the source host pci_info
+            source_pf_port['port']['binding:profile']['pci_slot']
+        )
+        # Assert that the direct-physical port is updated with the MAC address
+        # of the PF device from the source host
+        self.assertEqual(
+            'b4:96:91:34:f4:aa',
+            source_pf_port['port']['binding:profile']['device_mac_address']
+        )
+        # Before moving, assert that the servers on source and dest have the
+        # same PCI source address in their XML for their SRIOV nic.
+        source_conn = self.computes['source'].driver._host.get_connection()
+        dest_conn = self.computes['dest'].driver._host.get_connection()
+        source_vms = [vm._def for vm in source_conn._vms.values()]
+        dest_vms = [vm._def for vm in dest_conn._vms.values()]
+        self.assertEqual(1, len(source_vms))
+        self.assertEqual(1, len(dest_vms))
+        self.assertEqual(1, len(source_vms[0]['devices']['nics']))
+        self.assertEqual(1, len(dest_vms[0]['devices']['nics']))
+        self.assertEqual(source_vms[0]['devices']['nics'][0]['source'],
+                         dest_vms[0]['devices']['nics'][0]['source'])
+
+        # TODO(stephenfin): The mock of 'migrate_disk_and_power_off' should
+        # probably be less...dumb
+        with mock.patch('nova.virt.libvirt.driver.LibvirtDriver'
+                        '.migrate_disk_and_power_off', return_value='{}'):
+            self._migrate_server(source_server)
+
+        # Refresh the ports again, keeping in mind the ports are now bound
+        # on the dest after migrating.
+        source_port = self.neutron.show_port(source_port['port']['id'])
+        same_slot_port = self.neutron.show_port(same_slot_port['port']['id'])
+        source_pf_port = self.neutron.show_port(source_pf_port['port']['id'])
+        self.assertNotEqual(
+            source_port['port']['binding:profile']['pci_slot'],
+            same_slot_port['port']['binding:profile']['pci_slot'])
+        # Assert that the direct-physical port got the pci_slot information
+        # according to the dest host PF PCI device.
+        self.assertEqual(
+            '0000:82:06.0',  # which is in sync with the dest host pci_info
+            source_pf_port['port']['binding:profile']['pci_slot']
+        )
+        # Assert that the direct-physical port is updated with the MAC address
+        # of the PF device from the dest host
+        self.assertEqual(
+            'b4:96:91:34:f4:bb',
+            source_pf_port['port']['binding:profile']['device_mac_address']
+        )
+        conn = self.computes['dest'].driver._host.get_connection()
+        vms = [vm._def for vm in conn._vms.values()]
+        self.assertEqual(2, len(vms))
+        for vm in vms:
+            self.assertEqual(1, len(vm['devices']['nics']))
+        self.assertNotEqual(vms[0]['devices']['nics'][0]['source'],
+                            vms[1]['devices']['nics'][0]['source'])
+
+        self._revert_resize(source_server)
+
+        # Refresh the ports again, keeping in mind the ports are now bound
+        # on the source as the migration is reverted
+        source_pf_port = self.neutron.show_port(source_pf_port['port']['id'])
+
+        # Assert that the direct-physical port got the pci_slot information
+        # according to the source host PF PCI device.
+        self.assertEqual(
+            '0000:82:00.0',  # which is in sync with the source host pci_info
+            source_pf_port['port']['binding:profile']['pci_slot']
+        )
+        # Assert that the direct-physical port is updated with the MAC address
+        # of the PF device from the source host
+        self.assertEqual(
+            'b4:96:91:34:f4:aa',
+            source_pf_port['port']['binding:profile']['device_mac_address']
+        )
+
     def test_evacuate_server_with_neutron(self):
         def move_operation(source_server):
             # Down the source compute to enable the evacuation
@@ -486,17 +712,44 @@ def test_live_migrate_server_with_neutron(self):
         """
 
         # start two compute services with differing PCI device inventory
-        self.start_compute(
-            hostname='test_compute0',
-            pci_info=fakelibvirt.HostPCIDevicesInfo(
-                num_pfs=2, num_vfs=8, numa_node=0))
-        self.start_compute(
-            hostname='test_compute1',
-            pci_info=fakelibvirt.HostPCIDevicesInfo(
-                num_pfs=1, num_vfs=2, numa_node=1))
+        source_pci_info = fakelibvirt.HostPCIDevicesInfo(
+            num_pfs=2, num_vfs=8, numa_node=0)
+        # add an extra PF without VF to be used by direct-physical ports
+        source_pci_info.add_device(
+            dev_type='PF',
+            bus=0x82,  # HostPCIDevicesInfo uses bus 0x81 by default
+            slot=0x0,
+            function=0,
+            iommu_group=42,
+            numa_node=0,
+            vf_ratio=0,
+            mac_address='b4:96:91:34:f4:aa',
+        )
+        self.start_compute(hostname='test_compute0', pci_info=source_pci_info)
 
-        # create the port
-        self.neutron.create_port({'port': self.neutron.network_4_port_1})
+        dest_pci_info = fakelibvirt.HostPCIDevicesInfo(
+            num_pfs=1, num_vfs=2, numa_node=1)
+        # add an extra PF without VF to be used by direct-physical ports
+        dest_pci_info.add_device(
+            dev_type='PF',
+            bus=0x82,  # HostPCIDevicesInfo uses bus 0x81 by default
+            slot=0x6,  # make it different from the source host
+            function=0,
+            iommu_group=42,
+            # numa node needs to be aligned with the other pci devices in this
+            # host as the instance needs to fit into a single host numa node
+            numa_node=1,
+            vf_ratio=0,
+            mac_address='b4:96:91:34:f4:bb',
+        )
+
+        self.start_compute(hostname='test_compute1', pci_info=dest_pci_info)
+
+        # create the ports
+        port = self.neutron.create_port(
+            {'port': self.neutron.network_4_port_1})['port']
+        pf_port = self.neutron.create_port(
+            {'port': self.neutron.network_4_port_pf})['port']
 
         # create a server using the VF via neutron
         extra_spec = {'hw:cpu_policy': 'dedicated'}
@@ -504,7 +757,8 @@ def test_live_migrate_server_with_neutron(self):
         server = self._create_server(
             flavor_id=flavor_id,
             networks=[
-                {'port': base.LibvirtNeutronFixture.network_4_port_1['id']},
+                {'port': port['id']},
+                {'port': pf_port['id']},
             ],
             host='test_compute0',
         )
@@ -512,8 +766,8 @@ def test_live_migrate_server_with_neutron(self):
         # our source host should have marked two PCI devices as used, the VF
         # and the parent PF, while the future destination is currently unused
         self.assertEqual('test_compute0', server['OS-EXT-SRV-ATTR:host'])
-        self.assertPCIDeviceCounts('test_compute0', total=10, free=8)
-        self.assertPCIDeviceCounts('test_compute1', total=3, free=3)
+        self.assertPCIDeviceCounts('test_compute0', total=11, free=8)
+        self.assertPCIDeviceCounts('test_compute1', total=4, free=4)
 
         # the instance should be on host NUMA node 0, since that's where our
         # PCI devices are
@@ -544,13 +798,26 @@ def test_live_migrate_server_with_neutron(self):
             port['binding:profile'],
         )
 
+        # ensure the binding details sent to "neutron" are correct
+        pf_port = self.neutron.show_port(pf_port['id'],)['port']
+        self.assertIn('binding:profile', pf_port)
+        self.assertEqual(
+            {
+                'pci_vendor_info': '8086:1528',
+                'pci_slot': '0000:82:00.0',
+                'physical_network': 'physnet4',
+                'device_mac_address': 'b4:96:91:34:f4:aa',
+            },
+            pf_port['binding:profile'],
+        )
+
         # now live migrate that server
         self._live_migrate(server, 'completed')
 
         # we should now have transitioned our usage to the destination, freeing
         # up the source in the process
-        self.assertPCIDeviceCounts('test_compute0', total=10, free=10)
-        self.assertPCIDeviceCounts('test_compute1', total=3, free=1)
+        self.assertPCIDeviceCounts('test_compute0', total=11, free=11)
+        self.assertPCIDeviceCounts('test_compute1', total=4, free=1)
 
         # the instance should now be on host NUMA node 1, since that's where
         # our PCI devices are for this second host
@@ -577,6 +844,18 @@ def test_live_migrate_server_with_neutron(self):
             },
             port['binding:profile'],
         )
+        # ensure the binding details sent to "neutron" are correct
+        pf_port = self.neutron.show_port(pf_port['id'],)['port']
+        self.assertIn('binding:profile', pf_port)
+        self.assertEqual(
+            {
+                'pci_vendor_info': '8086:1528',
+                'pci_slot': '0000:82:06.0',
+                'physical_network': 'physnet4',
+                'device_mac_address': 'b4:96:91:34:f4:bb',
+            },
+            pf_port['binding:profile'],
+        )
 
     def test_get_server_diagnostics_server_with_VF(self):
         """Ensure server disagnostics include info on VF-type PCI devices."""
diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py
index d8f443843f3..e98ea6ab062 100644
--- a/nova/tests/unit/compute/test_compute.py
+++ b/nova/tests/unit/compute/test_compute.py
@@ -5714,13 +5714,15 @@ def _test_resize_with_pci(self, method, expected_pci_addr):
             objects=[objects.PciDevice(vendor_id='1377',
                                        product_id='0047',
                                        address='0000:0a:00.1',
-                                       request_id=uuids.req1)])
+                                       request_id=uuids.req1,
+                                       compute_node_id=1)])
 
         new_pci_devices = objects.PciDeviceList(
             objects=[objects.PciDevice(vendor_id='1377',
                                        product_id='0047',
                                        address='0000:0b:00.1',
-                                       request_id=uuids.req2)])
+                                       request_id=uuids.req2,
+                                       compute_node_id=2)])
 
         if expected_pci_addr == old_pci_devices[0].address:
             expected_pci_device = old_pci_devices[0]
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 1390520d3d1..974c669bc76 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -8983,10 +8983,12 @@ def driver_confirm_resize(*args, **kwargs):
             self._mock_rt()
             old_devs = objects.PciDeviceList(
                 objects=[objects.PciDevice(
+                    compute_node_id=1,
                     address='0000:04:00.2',
                     request_id=uuids.pcidev1)])
             new_devs = objects.PciDeviceList(
                 objects=[objects.PciDevice(
+                    compute_node_id=2,
                     address='0000:05:00.3',
                     request_id=uuids.pcidev1)])
             self.instance.migration_context = objects.MigrationContext(
@@ -10978,40 +10980,94 @@ def get_pci_req_side_effect(context, instance, vif):
         _test()
 
     def test__update_migrate_vifs_profile_with_pci(self):
-        # Define two migrate vifs with only one pci that is required
-        # to be updated. Make sure method under test updated the correct one
+        # Define three migrate vifs with two pci devs that are required
+        # to be updated, one VF and one PF.
+        # Make sure method under test updated the correct devs with the correct
+        # values.
         nw_vifs = network_model.NetworkInfo(
-            [network_model.VIF(
-                id=uuids.port0,
-                vnic_type='direct',
-                type=network_model.VIF_TYPE_HW_VEB,
-                profile={'pci_slot': '0000:04:00.3',
-                         'pci_vendor_info': '15b3:1018',
-                         'physical_network': 'default'}),
-            network_model.VIF(
-                id=uuids.port1,
-                vnic_type='normal',
-                type=network_model.VIF_TYPE_OVS,
-                profile={'some': 'attribute'})])
-        pci_dev = objects.PciDevice(request_id=uuids.pci_req,
-                                    address='0000:05:00.4',
-                                    vendor_id='15b3',
-                                    product_id='1018')
-        port_id_to_pci_dev = {uuids.port0: pci_dev}
-        mig_vifs = migrate_data_obj.VIFMigrateData.\
-            create_skeleton_migrate_vifs(nw_vifs)
-        self.compute._update_migrate_vifs_profile_with_pci(mig_vifs,
-                                                           port_id_to_pci_dev)
+            [
+                network_model.VIF(
+                    id=uuids.port0,
+                    vnic_type='direct',
+                    type=network_model.VIF_TYPE_HW_VEB,
+                    profile={
+                        'pci_slot': '0000:04:00.3',
+                        'pci_vendor_info': '15b3:1018',
+                        'physical_network': 'default',
+                    },
+                ),
+                network_model.VIF(
+                    id=uuids.port1,
+                    vnic_type='normal',
+                    type=network_model.VIF_TYPE_OVS,
+                    profile={'some': 'attribute'},
+                ),
+                network_model.VIF(
+                    id=uuids.port2,
+                    vnic_type='direct-physical',
+                    type=network_model.VIF_TYPE_HOSTDEV,
+                    profile={
+                        'pci_slot': '0000:01:00',
+                        'pci_vendor_info': '8086:154d',
+                        'physical_network': 'physnet2',
+                    },
+                ),
+            ]
+        )
+
+        pci_vf_dev = objects.PciDevice(
+            request_id=uuids.pci_req,
+            address='0000:05:00.4',
+            parent_addr='0000:05:00',
+            vendor_id='15b3',
+            product_id='1018',
+            compute_node_id=13,
+            dev_type=fields.PciDeviceType.SRIOV_VF,
+        )
+        pci_pf_dev = objects.PciDevice(
+            request_id=uuids.pci_req2,
+            address='0000:01:00',
+            parent_addr='0000:02:00',
+            vendor_id='8086',
+            product_id='154d',
+            compute_node_id=13,
+            dev_type=fields.PciDeviceType.SRIOV_PF,
+            extra_info={'mac_address': 'b4:96:91:34:f4:36'},
+        )
+        port_id_to_pci_dev = {
+            uuids.port0: pci_vf_dev,
+            uuids.port2: pci_pf_dev,
+        }
+        mig_vifs = (
+            migrate_data_obj.VIFMigrateData.create_skeleton_migrate_vifs(
+                nw_vifs)
+        )
+
+        self.compute._update_migrate_vifs_profile_with_pci(
+            mig_vifs, port_id_to_pci_dev)
+
         # Make sure method under test updated the correct one.
-        changed_mig_vif = mig_vifs[0]
+        changed_vf_mig_vif = mig_vifs[0]
         unchanged_mig_vif = mig_vifs[1]
+        changed_pf_mig_vif = mig_vifs[2]
         # Migrate vifs profile was updated with pci_dev.address
         # for port ID uuids.port0.
-        self.assertEqual(changed_mig_vif.profile['pci_slot'],
-                         pci_dev.address)
+        self.assertEqual(changed_vf_mig_vif.profile['pci_slot'],
+                         pci_vf_dev.address)
+        # MAC is not added as this is a VF
+        self.assertNotIn('device_mac_address', changed_vf_mig_vif.profile)
         # Migrate vifs profile was unchanged for port ID uuids.port1.
         # i.e 'profile' attribute does not exist.
         self.assertNotIn('profile', unchanged_mig_vif)
+        # Migrate vifs profile was updated with pci_pf_dev.address
+        # for port ID uuids.port2.
+        self.assertEqual(changed_pf_mig_vif.profile['pci_slot'],
+                         pci_pf_dev.address)
+        # MAC is updated as this is a PF
+        self.assertEqual(
+            'b4:96:91:34:f4:36',
+            changed_pf_mig_vif.profile['device_mac_address']
+        )
 
     def test_get_updated_nw_info_with_pci_mapping(self):
         old_dev = objects.PciDevice(address='0000:04:00.2')
diff --git a/nova/tests/unit/network/test_neutron.py b/nova/tests/unit/network/test_neutron.py
index 40137cef39a..03e65bb6081 100644
--- a/nova/tests/unit/network/test_neutron.py
+++ b/nova/tests/unit/network/test_neutron.py
@@ -4805,6 +4805,174 @@ def test_update_port_bindings_for_instance_with_live_migration(
                 constants.BINDING_HOST_ID],
             'new-host')
 
+    @mock.patch(
+        'nova.network.neutron.API.has_extended_resource_request_extension',
+        new=mock.Mock(return_value=False),
+    )
+    @mock.patch.object(pci_whitelist.Whitelist, 'get_devspec')
+    @mock.patch.object(neutronapi, 'get_client', return_value=mock.Mock())
+    def test_update_port_bindings_for_instance_with_sriov_pf(
+        self, get_client_mock, get_pci_device_devspec_mock
+    ):
+        devspec = mock.Mock()
+        devspec.get_tags.return_value = {'physical_network': 'physnet1'}
+        get_pci_device_devspec_mock.return_value = devspec
+
+        instance = fake_instance.fake_instance_obj(self.context)
+        instance.migration_context = objects.MigrationContext()
+        instance.migration_context.old_pci_devices = objects.PciDeviceList(
+            objects=[
+                objects.PciDevice(
+                    vendor_id='8086',
+                    product_id='154d',
+                    address='0000:0a:01',
+                    compute_node_id=1,
+                    request_id=uuids.pci_req,
+                    dev_type=obj_fields.PciDeviceType.SRIOV_PF,
+                    extra_info={'mac_address': 'b4:96:91:34:f4:36'},
+                )
+            ]
+        )
+        instance.pci_devices = instance.migration_context.old_pci_devices
+        instance.migration_context.new_pci_devices = objects.PciDeviceList(
+            objects=[
+                objects.PciDevice(
+                    vendor_id='8086',
+                    product_id='154d',
+                    address='0000:0a:02',
+                    compute_node_id=2,
+                    request_id=uuids.pci_req,
+                    dev_type=obj_fields.PciDeviceType.SRIOV_PF,
+                    extra_info={'mac_address': 'b4:96:91:34:f4:dd'},
+                )
+            ]
+        )
+        instance.pci_devices = instance.migration_context.new_pci_devices
+
+        fake_ports = {
+            'ports': [
+                {
+                    'id': uuids.port,
+                    'binding:vnic_type': 'direct-physical',
+                    constants.BINDING_HOST_ID: 'fake-host-old',
+                    constants.BINDING_PROFILE: {
+                        'pci_slot': '0000:0a:01',
+                        'physical_network': 'old_phys_net',
+                        'pci_vendor_info': 'old_pci_vendor_info',
+                    },
+                },
+            ]
+        }
+
+        migration = objects.Migration(
+            status='confirmed', migration_type='migration')
+        list_ports_mock = mock.Mock(return_value=fake_ports)
+        get_client_mock.return_value.list_ports = list_ports_mock
+
+        update_port_mock = mock.Mock()
+        get_client_mock.return_value.update_port = update_port_mock
+
+        self.api._update_port_binding_for_instance(
+            self.context, instance, instance.host, migration)
+
+        # Assert that update_port is called with the binding:profile
+        # corresponding to the PCI device specified including MAC address.
+        update_port_mock.assert_called_once_with(
+            uuids.port,
+            {
+                'port': {
+                    constants.BINDING_HOST_ID: 'fake-host',
+                    'device_owner': 'compute:%s' % instance.availability_zone,
+                    constants.BINDING_PROFILE: {
+                        'pci_slot': '0000:0a:02',
+                        'physical_network': 'physnet1',
+                        'pci_vendor_info': '8086:154d',
+                        'device_mac_address': 'b4:96:91:34:f4:dd',
+                    },
+                }
+            },
+        )
+
+    @mock.patch(
+        'nova.network.neutron.API.has_extended_resource_request_extension',
+        new=mock.Mock(return_value=False),
+    )
+    @mock.patch.object(pci_whitelist.Whitelist, 'get_devspec')
+    @mock.patch.object(neutronapi, 'get_client', return_value=mock.Mock())
+    def test_update_port_bindings_for_instance_with_sriov_pf_no_migration(
+        self, get_client_mock, get_pci_device_devspec_mock
+    ):
+        devspec = mock.Mock()
+        devspec.get_tags.return_value = {'physical_network': 'physnet1'}
+        get_pci_device_devspec_mock.return_value = devspec
+
+        instance = fake_instance.fake_instance_obj(self.context)
+        instance.pci_requests = objects.InstancePCIRequests(
+            instance_uuid=instance.uuid,
+            requests=[
+                objects.InstancePCIRequest(
+                    requester_id=uuids.port,
+                    request_id=uuids.pci_req,
+                )
+            ],
+        )
+        instance.pci_devices = objects.PciDeviceList(
+            objects=[
+                objects.PciDevice(
+                    vendor_id='8086',
+                    product_id='154d',
+                    address='0000:0a:02',
+                    compute_node_id=2,
+                    request_id=uuids.pci_req,
+                    dev_type=obj_fields.PciDeviceType.SRIOV_PF,
+                    extra_info={'mac_address': 'b4:96:91:34:f4:36'},
+                )
+            ]
+        )
+
+        fake_ports = {
+            'ports': [
+                {
+                    'id': uuids.port,
+                    'binding:vnic_type': 'direct-physical',
+                    constants.BINDING_HOST_ID: 'fake-host-old',
+                    constants.BINDING_PROFILE: {
+                        'pci_slot': '0000:0a:01',
+                        'physical_network': 'old_phys_net',
+                        'pci_vendor_info': 'old_pci_vendor_info',
+                        'device_mac_address': 'b4:96:91:34:f4:dd'
+                    },
+                },
+            ]
+        }
+
+        list_ports_mock = mock.Mock(return_value=fake_ports)
+        get_client_mock.return_value.list_ports = list_ports_mock
+
+        update_port_mock = mock.Mock()
+        get_client_mock.return_value.update_port = update_port_mock
+
+        self.api._update_port_binding_for_instance(
+            self.context, instance, instance.host)
+
+        # Assert that update_port is called with the binding:profile
+        # corresponding to the PCI device specified including MAC address.
+        update_port_mock.assert_called_once_with(
+            uuids.port,
+            {
+                'port': {
+                    constants.BINDING_HOST_ID: 'fake-host',
+                    'device_owner': 'compute:%s' % instance.availability_zone,
+                    constants.BINDING_PROFILE: {
+                        'pci_slot': '0000:0a:02',
+                        'physical_network': 'physnet1',
+                        'pci_vendor_info': '8086:154d',
+                        'device_mac_address': 'b4:96:91:34:f4:36',
+                    },
+                }
+            },
+        )
+
     @mock.patch(
         'nova.network.neutron.API.has_extended_resource_request_extension',
         new=mock.Mock(return_value=False),
@@ -7190,23 +7358,21 @@ def test_get_port_pci_dev(self, mock_debug):
                                              request_id=uuids.pci_request_id)
         bad_request = objects.InstancePCIRequest(
             requester_id=uuids.wrong_port_id)
-        device = objects.PciDevice(request_id=uuids.pci_request_id,
-                                   address='fake-pci-address')
+        device = objects.PciDevice(request_id=uuids.pci_request_id)
         bad_device = objects.PciDevice(request_id=uuids.wrong_request_id)
         # Test the happy path
         instance = objects.Instance(
             pci_requests=objects.InstancePCIRequests(requests=[request]),
             pci_devices=objects.PciDeviceList(objects=[device]))
         self.assertEqual(
-            'fake-pci-address',
-            self.api._get_port_pci_dev(
-                self.context, instance, fake_port).address)
+            device,
+            self.api._get_port_pci_dev(instance, fake_port))
         # Test not finding the request
         instance = objects.Instance(
             pci_requests=objects.InstancePCIRequests(
                 requests=[objects.InstancePCIRequest(bad_request)]))
         self.assertIsNone(
-            self.api._get_port_pci_dev(self.context, instance, fake_port))
+            self.api._get_port_pci_dev(instance, fake_port))
         mock_debug.assert_called_with('No PCI request found for port %s',
                                       uuids.fake_port_id, instance=instance)
         mock_debug.reset_mock()
@@ -7215,7 +7381,7 @@ def test_get_port_pci_dev(self, mock_debug):
             pci_requests=objects.InstancePCIRequests(requests=[request]),
             pci_devices=objects.PciDeviceList(objects=[bad_device]))
         self.assertIsNone(
-            self.api._get_port_pci_dev(self.context, instance, fake_port))
+            self.api._get_port_pci_dev(instance, fake_port))
         mock_debug.assert_called_with('No PCI device found for request %s',
                                       uuids.pci_request_id, instance=instance)
 
@@ -7740,6 +7906,45 @@ def test_populate_neutron_extension_values_binding_sriov_with_cap(
                          port_req_body['port'][
                              constants.BINDING_PROFILE])
 
+    @mock.patch.object(pci_whitelist.Whitelist, 'get_devspec')
+    @mock.patch.object(pci_manager, 'get_instance_pci_devs')
+    def test_populate_neutron_extension_values_binding_sriov_pf(
+        self, mock_get_instance_pci_devs, mock_get_devspec
+    ):
+        host_id = 'my_host_id'
+        instance = {'host': host_id}
+        port_req_body = {'port': {}}
+
+        pci_dev = objects.PciDevice(
+            request_id=uuids.pci_req,
+            address='0000:01:00',
+            parent_addr='0000:02:00',
+            vendor_id='8086',
+            product_id='154d',
+            dev_type=obj_fields.PciDeviceType.SRIOV_PF,
+            extra_info={'mac_address': 'b4:96:91:34:f4:36'}
+        )
+
+        expected_profile = {
+            'pci_vendor_info': '8086:154d',
+            'pci_slot': '0000:01:00',
+            'physical_network': 'physnet1',
+            'device_mac_address': 'b4:96:91:34:f4:36',
+        }
+
+        mock_get_instance_pci_devs.return_value = [pci_dev]
+        devspec = mock.Mock()
+        devspec.get_tags.return_value = {'physical_network': 'physnet1'}
+        mock_get_devspec.return_value = devspec
+
+        self.api._populate_neutron_binding_profile(
+            instance, uuids.pci_req, port_req_body, None)
+
+        self.assertEqual(
+            expected_profile,
+            port_req_body['port'][constants.BINDING_PROFILE]
+        )
+
     @mock.patch.object(
         pci_utils, 'get_vf_num_by_pci_address',
         new=mock.MagicMock(side_effect=(lambda vf_a: 1
@@ -7867,21 +8072,29 @@ def test__get_pci_device_profile_pf(self, mock_get_pci_device_devspec):
         devspec.get_tags.return_value = {'physical_network': 'physnet1'}
         mock_get_pci_device_devspec.return_value = devspec
 
-        pci_dev = {'vendor_id': 'a2d6',
-                   'product_id': '15b3',
-                   'address': '0000:0a:00.0',
-                   'card_serial_number': 'MT2113X00000',
-                   'dev_type': obj_fields.PciDeviceType.SRIOV_PF,
-                  }
-        PciDevice = collections.namedtuple('PciDevice',
-                               ['vendor_id', 'product_id', 'address',
-                                'card_serial_number', 'dev_type'])
-        mydev = PciDevice(**pci_dev)
+        pci_dev = objects.PciDevice(
+            request_id=uuids.pci_req,
+            address='0000:0a:00.0',
+            parent_addr='0000:02:00',
+            vendor_id='a2d6',
+            product_id='15b3',
+            dev_type=obj_fields.PciDeviceType.SRIOV_PF,
+            extra_info={
+                'capabilities': jsonutils.dumps(
+                    {'card_serial_number': 'MT2113X00000'}),
+                'mac_address': 'b4:96:91:34:f4:36',
+            },
 
-        self.assertEqual({'pci_slot': '0000:0a:00.0',
-                          'pci_vendor_info': 'a2d6:15b3',
-                          'physical_network': 'physnet1'},
-                         self.api._get_pci_device_profile(mydev))
+        )
+        self.assertEqual(
+            {
+                'pci_slot': '0000:0a:00.0',
+                'pci_vendor_info': 'a2d6:15b3',
+                'physical_network': 'physnet1',
+                'device_mac_address': 'b4:96:91:34:f4:36',
+            },
+            self.api._get_pci_device_profile(pci_dev),
+        )
 
     @mock.patch.object(pci_whitelist.Whitelist, 'get_devspec')
     @mock.patch.object(pci_manager, 'get_instance_pci_devs')
diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py
index 5632fcba868..f49d9be5e56 100644
--- a/nova/tests/unit/virt/libvirt/test_driver.py
+++ b/nova/tests/unit/virt/libvirt/test_driver.py
@@ -17614,7 +17614,10 @@ def test_get_pci_passthrough_devices(self, mock_list, mock_get_ifname):
                 "vendor_id": '8086',
                 "dev_type": fields.PciDeviceType.SRIOV_PF,
                 "phys_function": None,
-                "numa_node": None},
+                "numa_node": None,
+                # value defined in the LibvirtFixture
+                "mac_address": "52:54:00:1e:59:c6",
+            },
             {
                 "dev_id": "pci_0000_04_10_7",
                 "domain": 0,
diff --git a/nova/tests/unit/virt/libvirt/test_host.py b/nova/tests/unit/virt/libvirt/test_host.py
index d71d13ab372..741d834fb5e 100644
--- a/nova/tests/unit/virt/libvirt/test_host.py
+++ b/nova/tests/unit/virt/libvirt/test_host.py
@@ -1156,9 +1156,9 @@ def test_get_pcidev_info(self, mock_get_ifname):
             dev for dev in node_devs.values() if dev.name() in devs]
 
         name = "pci_0000_04_00_3"
-        actual_vf = self.host._get_pcidev_info(
+        actual_pf = self.host._get_pcidev_info(
             name, node_devs[name], net_devs, [], [])
-        expect_vf = {
+        expect_pf = {
             "dev_id": "pci_0000_04_00_3",
             "address": "0000:04:00.3",
             "product_id": '1521',
@@ -1166,8 +1166,10 @@ def test_get_pcidev_info(self, mock_get_ifname):
             "vendor_id": '8086',
             "label": 'label_8086_1521',
             "dev_type": obj_fields.PciDeviceType.SRIOV_PF,
+            # value defined in the LibvirtFixture
+            "mac_address": "52:54:00:1e:59:c6",
             }
-        self.assertEqual(expect_vf, actual_vf)
+        self.assertEqual(expect_pf, actual_pf)
 
         name = "pci_0000_04_10_7"
         actual_vf = self.host._get_pcidev_info(
@@ -1222,9 +1224,9 @@ def test_get_pcidev_info(self, mock_get_ifname):
         self.assertEqual(expect_vf, actual_vf)
 
         name = "pci_0000_03_00_0"
-        actual_vf = self.host._get_pcidev_info(
+        actual_pf = self.host._get_pcidev_info(
             name, node_devs[name], net_devs, [], [])
-        expect_vf = {
+        expect_pf = {
             "dev_id": "pci_0000_03_00_0",
             "address": "0000:03:00.0",
             "product_id": '1013',
@@ -1232,13 +1234,15 @@ def test_get_pcidev_info(self, mock_get_ifname):
             "vendor_id": '15b3',
             "label": 'label_15b3_1013',
             "dev_type": obj_fields.PciDeviceType.SRIOV_PF,
+            # value defined in the LibvirtFixture
+            "mac_address": "52:54:00:1e:59:c6",
             }
-        self.assertEqual(expect_vf, actual_vf)
+        self.assertEqual(expect_pf, actual_pf)
 
         name = "pci_0000_03_00_1"
-        actual_vf = self.host._get_pcidev_info(
+        actual_pf = self.host._get_pcidev_info(
             name, node_devs[name], net_devs, [], [])
-        expect_vf = {
+        expect_pf = {
             "dev_id": "pci_0000_03_00_1",
             "address": "0000:03:00.1",
             "product_id": '1013',
@@ -1246,8 +1250,10 @@ def test_get_pcidev_info(self, mock_get_ifname):
             "vendor_id": '15b3',
             "label": 'label_15b3_1013',
             "dev_type": obj_fields.PciDeviceType.SRIOV_PF,
+            # value defined in the LibvirtFixture
+            "mac_address": "52:54:00:1e:59:c6",
             }
-        self.assertEqual(expect_vf, actual_vf)
+        self.assertEqual(expect_pf, actual_pf)
 
         # Parent PF with a VPD cap.
         name = "pci_0000_82_00_0"
@@ -1264,6 +1270,8 @@ def test_get_pcidev_info(self, mock_get_ifname):
             "capabilities": {
                 # Should be obtained from the parent PF in this case.
                 "vpd": {"card_serial_number": "MT2113X00000"}},
+            # value defined in the LibvirtFixture
+            "mac_address": "52:54:00:1e:59:c6",
         }
         self.assertEqual(expect_pf, actual_pf)
 
diff --git a/nova/virt/fake.py b/nova/virt/fake.py
index 5aab8ce3007..02fc1f07bcb 100644
--- a/nova/virt/fake.py
+++ b/nova/virt/fake.py
@@ -891,6 +891,36 @@ class FakeLiveMigrateDriverWithNestedCustomResources(
 
 
 class FakeDriverWithPciResources(SmallFakeDriver):
+    """NOTE: this driver provides symmetric compute nodes. Each compute will
+    have the same resources with the same addresses. It is dangerous as using
+    this driver can hide issues when in an asymmetric environment nova fails to
+    update entities according to the host specific addresses (e.g. pci_slot of
+    the neutron port bindings).
+
+    The current non virt driver specific functional test environment has many
+    shortcomings making it really hard to simulate host specific virt drivers.
+
+    1) The virt driver is instantiated by the service logic from the name of
+    the driver class. This makes passing input to the driver instance from the
+    test at init time pretty impossible. This could be solved with some
+    fixtures around nova.virt.driver.load_compute_driver()
+
+    2) The compute service accesses the hypervisor not only via the virt
+    interface but also reads the sysfs of the host. So simply providing a fake
+    virt driver instance is not enough to isolate simulated compute services
+    that are running on the same host. Also these low level sysfs reads do not
+    have host specific information in the call params. So simply mocking the
+    low level call does not give a way to provide host specific return values.
+
+    3) CONF is global, and it is read dynamically by the driver. So
+    providing host specific CONF to driver instances without race conditions
+    between the drivers is extremely hard, especially if periodic tasks are
+    enabled.
+
+    The libvirt based functional test env under nova.tests.functional.libvirt
+    has better support to create asymmetric environments. So please consider
+    using that if possible instead.
+    """
 
     PCI_ADDR_PF1 = '0000:01:00.0'
     PCI_ADDR_PF1_VF1 = '0000:01:00.1'
@@ -955,6 +985,11 @@ def setUp(self):
             ],
                              group='pci')
 
+            # These mocks should be removed after bug
+            # https://bugs.launchpad.net/nova/+bug/1961587 has been fixed and
+            # every SRIOV device related information is transferred through the
+            # virt driver and the PciDevice object instead of queried with
+            # sysfs calls by the network.neutron.API code.
             self.useFixture(fixtures.MockPatch(
                 'nova.pci.utils.get_mac_by_pci_address',
                 return_value='52:54:00:1e:59:c6'))
diff --git a/nova/virt/libvirt/host.py b/nova/virt/libvirt/host.py
index cdf47008de4..ce418d27fd8 100644
--- a/nova/virt/libvirt/host.py
+++ b/nova/virt/libvirt/host.py
@@ -1267,6 +1267,20 @@ def _get_vpd_card_serial_number(
             return None
         return vpd_cap.card_serial_number
 
+    def _get_pf_details(self, device: dict, pci_address: str) -> dict:
+        if device.get('dev_type') != fields.PciDeviceType.SRIOV_PF:
+            return {}
+
+        try:
+            return {
+                'mac_address': pci_utils.get_mac_by_pci_address(pci_address)
+            }
+        except exception.PciDeviceNotFoundById:
+            LOG.debug(
+                'Cannot get MAC address of the PF %s. It is probably attached '
+                'to a guest already', pci_address)
+            return {}
+
     def _get_pcidev_info(
         self,
         devname: str,
@@ -1426,6 +1440,7 @@ def _get_vpd_details(
             _get_device_type(cfgdev, address, dev, net_devs, vdpa_devs))
         device.update(_get_device_capabilities(device, dev, net_devs))
         device.update(_get_vpd_details(device, dev, pci_devs))
+        device.update(self._get_pf_details(device, address))
         return device
 
     def get_vdpa_nodedev_by_address(
diff --git a/releasenotes/notes/bug-1942329-22b08fa4b322881d.yaml b/releasenotes/notes/bug-1942329-22b08fa4b322881d.yaml
new file mode 100644
index 00000000000..496508ca13a
--- /dev/null
+++ b/releasenotes/notes/bug-1942329-22b08fa4b322881d.yaml
@@ -0,0 +1,9 @@
+---
+fixes:
+  - |
+    As a fix for `bug 1942329 <https://bugs.launchpad.net/neutron/+bug/1942329>`_
+    nova now updates the MAC address of the ``direct-physical`` ports during
+    move operations to reflect the MAC address of the physical device on the
+    destination host. Those servers that were created before this fix need to be
+    moved or the port needs to be detached and then re-attached to synchronize
+    the MAC address.

From b40bd1bf52c87c31e18caf85d79dd03da6c7cffc Mon Sep 17 00:00:00 2001
From: Balazs Gibizer <gibi@redhat.com>
Date: Thu, 28 Jul 2022 19:50:29 +0200
Subject: [PATCH 36/93] Remove double mocking

In py310 unittest.mock does not allow mocking the same function twice, as
the second mocking will fail to autospec the Mock object created by the
first mocking.

This patch manually fixes the double mocking.

Fixed cases:
1) one of the mocks was totally unnecessary so it was removed
2) the second mock specialized the behavior of the first generic mock.
   In this case the second mock is replaced with the configuration of
   the first mock
3) a test case with two test steps mocked the same function for each
   step with overlapping mocks. Here the overlap was removed so that
   the two mocks exist independently

The get_connection injection in the libvirt functional test needed a
further tweak (yeah I know it has many already) to act like a single
mock (basically case #2) instead of a temporary re-mocking. Still, the
global nature of the get_connection mocking warrants the special set /
reset logic there.

Conflicts:
    nova/tests/functional/regressions/test_bug_1781286.py
    nova/tests/unit/api/openstack/compute/test_shelve.py

Change-Id: I3998d0d49583806ac1c3ae64f1b1fe343cefd20d
(cherry picked from commit f8cf050a1380ae844e0184ed45f4a04fde3b07a9)
---
 nova/test.py                                  |  15 +-
 nova/tests/functional/libvirt/base.py         |  18 +-
 nova/tests/functional/libvirt/test_vtpm.py    |   2 +-
 .../regressions/test_bug_1781286.py           |  30 +-
 .../api/openstack/compute/test_hypervisors.py | 394 ++++++++----------
 .../unit/api/openstack/compute/test_limits.py | 164 ++++----
 .../openstack/compute/test_server_actions.py  |  38 +-
 nova/tests/unit/compute/test_compute.py       |  95 ++---
 nova/tests/unit/compute/test_compute_mgr.py   |  19 +-
 .../unit/compute/test_resource_tracker.py     |  12 +-
 nova/tests/unit/db/main/test_api.py           |  18 +-
 nova/tests/unit/pci/test_stats.py             |  23 +-
 nova/tests/unit/test_metadata.py              |  23 +-
 nova/tests/unit/test_test.py                  |  15 -
 nova/tests/unit/virt/hyperv/test_vmops.py     |  17 +-
 nova/tests/unit/virt/libvirt/test_driver.py   |  41 +-
 .../unit/virt/libvirt/volume/test_lightos.py  |  21 +-
 .../unit/virt/libvirt/volume/test_nvme.py     |  17 +-
 nova/tests/unit/virt/test_block_device.py     | 239 +++++------
 nova/tests/unit/virt/vmwareapi/test_images.py |   8 +-
 20 files changed, 540 insertions(+), 669 deletions(-)

diff --git a/nova/test.py b/nova/test.py
index a6449c01f03..364268e096b 100644
--- a/nova/test.py
+++ b/nova/test.py
@@ -355,7 +355,7 @@ def stub_out(self, old, new):
         self.useFixture(fixtures.MonkeyPatch(old, new))
 
     @staticmethod
-    def patch_exists(patched_path, result):
+    def patch_exists(patched_path, result, other=None):
         """Provide a static method version of patch_exists(), which if you
         haven't already imported nova.test can be slightly easier to
         use as a context manager within a test method via:
@@ -364,7 +364,7 @@ def test_something(self):
                 with self.patch_exists(path, True):
                     ...
         """
-        return patch_exists(patched_path, result)
+        return patch_exists(patched_path, result, other)
 
     @staticmethod
     def patch_open(patched_path, read_data):
@@ -848,10 +848,12 @@ def __repr__(self):
 
 
 @contextlib.contextmanager
-def patch_exists(patched_path, result):
+def patch_exists(patched_path, result, other=None):
     """Selectively patch os.path.exists() so that if it's called with
     patched_path, return result.  Calls with any other path are passed
-    through to the real os.path.exists() function.
+    through to the real os.path.exists() function if other is not provided.
+    If other is provided then that will be the result of the call on paths
+    other than patched_path.
 
     Either import and use as a decorator / context manager, or use the
     nova.TestCase.patch_exists() static method as a context manager.
@@ -885,7 +887,10 @@ def test_my_code(self, mock_exists):
     def fake_exists(path):
         if path == patched_path:
             return result
-        return real_exists(path)
+        elif other is not None:
+            return other
+        else:
+            return real_exists(path)
 
     with mock.patch.object(os.path, "exists") as mock_exists:
         mock_exists.side_effect = fake_exists
diff --git a/nova/tests/functional/libvirt/base.py b/nova/tests/functional/libvirt/base.py
index c325c0b0407..1553e8e59e0 100644
--- a/nova/tests/functional/libvirt/base.py
+++ b/nova/tests/functional/libvirt/base.py
@@ -142,15 +142,15 @@ def _start_compute(hostname, host_info):
                     pci_info.get_pci_address_mac_mapping())
             # This is fun. Firstly we need to do a global'ish mock so we can
             # actually start the service.
-            with mock.patch('nova.virt.libvirt.host.Host.get_connection',
-                            return_value=fake_connection):
-                compute = self.start_service('compute', host=hostname)
-                # Once that's done, we need to tweak the compute "service" to
-                # make sure it returns unique objects. We do this inside the
-                # mock context to avoid a small window between the end of the
-                # context and the tweaking where get_connection would revert to
-                # being an autospec mock.
-                compute.driver._host.get_connection = lambda: fake_connection
+            orig_con = self.mock_conn.return_value
+            self.mock_conn.return_value = fake_connection
+            compute = self.start_service('compute', host=hostname)
+            # Once that's done, we need to tweak the compute "service" to
+            # make sure it returns unique objects.
+            compute.driver._host.get_connection = lambda: fake_connection
+            # Then we revert the local mock tweaking so the next compute can
+            # get its own
+            self.mock_conn.return_value = orig_con
             return compute
 
         # ensure we haven't already registered services with these hostnames
diff --git a/nova/tests/functional/libvirt/test_vtpm.py b/nova/tests/functional/libvirt/test_vtpm.py
index c07c38f02d9..4e9c705052e 100644
--- a/nova/tests/functional/libvirt/test_vtpm.py
+++ b/nova/tests/functional/libvirt/test_vtpm.py
@@ -128,7 +128,7 @@ def setUp(self):
         # the presence of users on the host, none of which makes sense here
         _p = mock.patch(
             'nova.virt.libvirt.driver.LibvirtDriver._check_vtpm_support')
-        self.mock_conn = _p.start()
+        _p.start()
         self.addCleanup(_p.stop)
 
         self.key_mgr = crypto._get_key_manager()
diff --git a/nova/tests/functional/regressions/test_bug_1781286.py b/nova/tests/functional/regressions/test_bug_1781286.py
index 7b2d603092d..bb47eb0ea8a 100644
--- a/nova/tests/functional/regressions/test_bug_1781286.py
+++ b/nova/tests/functional/regressions/test_bug_1781286.py
@@ -10,7 +10,6 @@
 # License for the specific language governing permissions and limitations
 # under the License.
 
-import fixtures
 import mock
 from oslo_db import exception as oslo_db_exc
 
@@ -67,11 +66,11 @@ def test_server_create_reschedule_blocked_az_up_call(self):
         def wrap_bari(*args, **kwargs):
             # Poison the AZ query to blow up as if the cell conductor does not
             # have access to the API DB.
-            self.useFixture(
-                fixtures.MockPatch(
-                    'nova.objects.AggregateList.get_by_host',
-                    side_effect=oslo_db_exc.CantStartEngineError))
-            return original_bari(*args, **kwargs)
+            with mock.patch(
+                'nova.objects.AggregateList.get_by_host',
+                side_effect=oslo_db_exc.CantStartEngineError
+            ):
+                return original_bari(*args, **kwargs)
 
         self.stub_out('nova.compute.manager.ComputeManager.'
                       'build_and_run_instance', wrap_bari)
@@ -81,10 +80,6 @@ def wrap_bari(*args, **kwargs):
         # compute service we have to wait for the notification that the build
         # is complete and then stop the mock so we can use the API again.
         self.notifier.wait_for_versioned_notifications('instance.create.end')
-        # Note that we use stopall here because we actually called
-        # build_and_run_instance twice so we have more than one instance of
-        # the mock that needs to be stopped.
-        mock.patch.stopall()
         server = self._wait_for_state_change(server, 'ACTIVE')
         # We should have rescheduled and the instance AZ should be set from the
         # Selection object. Since neither compute host is in an AZ, the server
@@ -128,19 +123,20 @@ def test_migrate_reschedule_blocked_az_up_call(self):
         self.rescheduled = None
 
         def wrap_prep_resize(_self, *args, **kwargs):
-            # Poison the AZ query to blow up as if the cell conductor does not
-            # have access to the API DB.
-            self.agg_mock = self.useFixture(
-                fixtures.MockPatch(
-                    'nova.objects.AggregateList.get_by_host',
-                    side_effect=oslo_db_exc.CantStartEngineError)).mock
             if self.rescheduled is None:
                 # Track the first host that we rescheduled from.
                 self.rescheduled = _self.host
                 # Trigger a reschedule.
                 raise exception.ComputeResourcesUnavailable(
                     reason='test_migrate_reschedule_blocked_az_up_call')
-            return original_prep_resize(_self, *args, **kwargs)
+            # Poison the AZ query to blow up as if the cell conductor does not
+            # have access to the API DB.
+            with mock.patch(
+                'nova.objects.AggregateList.get_by_host',
+                side_effect=oslo_db_exc.CantStartEngineError,
+            ) as agg_mock:
+                self.agg_mock = agg_mock
+                return original_prep_resize(_self, *args, **kwargs)
 
         self.stub_out('nova.compute.manager.ComputeManager._prep_resize',
                       wrap_prep_resize)
diff --git a/nova/tests/unit/api/openstack/compute/test_hypervisors.py b/nova/tests/unit/api/openstack/compute/test_hypervisors.py
index facc5389be3..6545031a0ba 100644
--- a/nova/tests/unit/api/openstack/compute/test_hypervisors.py
+++ b/nova/tests/unit/api/openstack/compute/test_hypervisors.py
@@ -368,25 +368,23 @@ def fake_service_get_by_compute_host(context, host):
                 return TEST_SERVICES[0]
             raise exception.ComputeHostNotFound(host=host)
 
-        @mock.patch.object(self.controller.host_api, 'compute_node_get_all',
-                           return_value=compute_nodes)
-        @mock.patch.object(self.controller.host_api,
-                           'service_get_by_compute_host',
-                           fake_service_get_by_compute_host)
-        def _test(self, compute_node_get_all):
-            req = self._get_request(True)
-            result = self.controller.index(req)
-            self.assertEqual(1, len(result['hypervisors']))
-            expected = {
-                'id': compute_nodes[0].uuid if self.expect_uuid_for_id
-                                            else compute_nodes[0].id,
-                'hypervisor_hostname': compute_nodes[0].hypervisor_hostname,
-                'state': 'up',
-                'status': 'enabled',
-            }
-            self.assertDictEqual(expected, result['hypervisors'][0])
+        m_get = self.controller.host_api.compute_node_get_all
+        m_get.side_effect = None
+        m_get.return_value = compute_nodes
+        self.controller.host_api.service_get_by_compute_host.side_effect = (
+                fake_service_get_by_compute_host)
 
-        _test(self)
+        req = self._get_request(True)
+        result = self.controller.index(req)
+        self.assertEqual(1, len(result['hypervisors']))
+        expected = {
+            'id': compute_nodes[0].uuid if self.expect_uuid_for_id
+                                        else compute_nodes[0].id,
+            'hypervisor_hostname': compute_nodes[0].hypervisor_hostname,
+            'state': 'up',
+            'status': 'enabled',
+        }
+        self.assertDictEqual(expected, result['hypervisors'][0])
 
     def test_index_compute_host_not_mapped(self):
         """Tests that we don't fail index if a host is not mapped."""
@@ -402,25 +400,22 @@ def fake_service_get_by_compute_host(context, host):
                 return TEST_SERVICES[0]
             raise exception.HostMappingNotFound(name=host)
 
-        @mock.patch.object(self.controller.host_api, 'compute_node_get_all',
-                           return_value=compute_nodes)
-        @mock.patch.object(self.controller.host_api,
-                           'service_get_by_compute_host',
-                           fake_service_get_by_compute_host)
-        def _test(self, compute_node_get_all):
-            req = self._get_request(True)
-            result = self.controller.index(req)
-            self.assertEqual(1, len(result['hypervisors']))
-            expected = {
-                'id': compute_nodes[0].uuid if self.expect_uuid_for_id
-                                            else compute_nodes[0].id,
-                'hypervisor_hostname': compute_nodes[0].hypervisor_hostname,
-                'state': 'up',
-                'status': 'enabled',
-            }
-            self.assertDictEqual(expected, result['hypervisors'][0])
+        self.controller.host_api.compute_node_get_all.return_value = (
+            compute_nodes)
+        self.controller.host_api.service_get_by_compute_host = (
+            fake_service_get_by_compute_host)
 
-        _test(self)
+        req = self._get_request(True)
+        result = self.controller.index(req)
+        self.assertEqual(1, len(result['hypervisors']))
+        expected = {
+            'id': compute_nodes[0].uuid if self.expect_uuid_for_id
+                                        else compute_nodes[0].id,
+            'hypervisor_hostname': compute_nodes[0].hypervisor_hostname,
+            'state': 'up',
+            'status': 'enabled',
+        }
+        self.assertDictEqual(expected, result['hypervisors'][0])
 
     def test_detail(self):
         req = self._get_request(True)
@@ -444,32 +439,30 @@ def fake_service_get_by_compute_host(context, host):
                 return TEST_SERVICES[0]
             raise exception.ComputeHostNotFound(host=host)
 
-        @mock.patch.object(self.controller.host_api, 'compute_node_get_all',
-                           return_value=compute_nodes)
-        @mock.patch.object(self.controller.host_api,
-                           'service_get_by_compute_host',
-                           fake_service_get_by_compute_host)
-        def _test(self, compute_node_get_all):
-            req = self._get_request(True)
-            result = self.controller.detail(req)
-            self.assertEqual(1, len(result['hypervisors']))
-            expected = {
-                'id': compute_nodes[0].id,
-                'hypervisor_hostname': compute_nodes[0].hypervisor_hostname,
-                'state': 'up',
-                'status': 'enabled',
-            }
-            # we don't care about all of the details, just make sure we get
-            # the subset we care about and there are more keys than what index
-            # would return
-            hypervisor = result['hypervisors'][0]
-            self.assertTrue(
-                set(expected.keys()).issubset(set(hypervisor.keys())))
-            self.assertGreater(len(hypervisor.keys()), len(expected.keys()))
-            self.assertEqual(compute_nodes[0].hypervisor_hostname,
-                             hypervisor['hypervisor_hostname'])
-
-        _test(self)
+        m_get = self.controller.host_api.compute_node_get_all
+        m_get.side_effect = None
+        m_get.return_value = compute_nodes
+        self.controller.host_api.service_get_by_compute_host.side_effect = (
+            fake_service_get_by_compute_host)
+
+        req = self._get_request(True)
+        result = self.controller.detail(req)
+        self.assertEqual(1, len(result['hypervisors']))
+        expected = {
+            'id': compute_nodes[0].id,
+            'hypervisor_hostname': compute_nodes[0].hypervisor_hostname,
+            'state': 'up',
+            'status': 'enabled',
+        }
+        # we don't care about all of the details, just make sure we get
+        # the subset we care about and there are more keys than what index
+        # would return
+        hypervisor = result['hypervisors'][0]
+        self.assertTrue(
+            set(expected.keys()).issubset(set(hypervisor.keys())))
+        self.assertGreater(len(hypervisor.keys()), len(expected.keys()))
+        self.assertEqual(compute_nodes[0].hypervisor_hostname,
+                         hypervisor['hypervisor_hostname'])
 
     def test_detail_compute_host_not_mapped(self):
         """Tests that if a service is deleted but the compute node is not we
@@ -487,32 +480,28 @@ def fake_service_get_by_compute_host(context, host):
                 return TEST_SERVICES[0]
             raise exception.HostMappingNotFound(name=host)
 
-        @mock.patch.object(self.controller.host_api, 'compute_node_get_all',
-                           return_value=compute_nodes)
-        @mock.patch.object(self.controller.host_api,
-                           'service_get_by_compute_host',
-                           fake_service_get_by_compute_host)
-        def _test(self, compute_node_get_all):
-            req = self._get_request(True)
-            result = self.controller.detail(req)
-            self.assertEqual(1, len(result['hypervisors']))
-            expected = {
-                'id': compute_nodes[0].id,
-                'hypervisor_hostname': compute_nodes[0].hypervisor_hostname,
-                'state': 'up',
-                'status': 'enabled',
-            }
-            # we don't care about all of the details, just make sure we get
-            # the subset we care about and there are more keys than what index
-            # would return
-            hypervisor = result['hypervisors'][0]
-            self.assertTrue(
-                set(expected.keys()).issubset(set(hypervisor.keys())))
-            self.assertGreater(len(hypervisor.keys()), len(expected.keys()))
-            self.assertEqual(compute_nodes[0].hypervisor_hostname,
-                             hypervisor['hypervisor_hostname'])
-
-        _test(self)
+        self.controller.host_api.service_get_by_compute_host.side_effect = (
+                fake_service_get_by_compute_host)
+        self.controller.host_api.compute_node_get_all.return_value = (
+                compute_nodes)
+        req = self._get_request(True)
+        result = self.controller.detail(req)
+        self.assertEqual(1, len(result['hypervisors']))
+        expected = {
+            'id': compute_nodes[0].id,
+            'hypervisor_hostname': compute_nodes[0].hypervisor_hostname,
+            'state': 'up',
+            'status': 'enabled',
+        }
+        # we don't care about all of the details, just make sure we get
+        # the subset we care about and there are more keys than what index
+        # would return
+        hypervisor = result['hypervisors'][0]
+        self.assertTrue(
+            set(expected.keys()).issubset(set(hypervisor.keys())))
+        self.assertGreater(len(hypervisor.keys()), len(expected.keys()))
+        self.assertEqual(compute_nodes[0].hypervisor_hostname,
+                         hypervisor['hypervisor_hostname'])
 
     def test_show(self):
         req = self._get_request(True)
@@ -525,21 +514,16 @@ def test_show_compute_host_not_mapped(self):
         """Tests that if a service is deleted but the compute node is not we
         don't fail when listing hypervisors.
         """
-
-        @mock.patch.object(self.controller.host_api, 'compute_node_get',
-                           return_value=self.TEST_HYPERS_OBJ[0])
-        @mock.patch.object(self.controller.host_api,
-                           'service_get_by_compute_host')
-        def _test(self, mock_service, mock_compute_node_get):
-            req = self._get_request(True)
-            mock_service.side_effect = exception.HostMappingNotFound(
-                name='foo')
-            hyper_id = self._get_hyper_id()
-            self.assertRaises(exc.HTTPNotFound, self.controller.show,
-                              req, hyper_id)
-            self.assertTrue(mock_service.called)
-            mock_compute_node_get.assert_called_once_with(mock.ANY, hyper_id)
-        _test(self)
+        self.controller.host_api.service_get_by_compute_host.side_effect = (
+            exception.HostMappingNotFound(name='foo'))
+        req = self._get_request(True)
+        hyper_id = self._get_hyper_id()
+        self.assertRaises(
+            exc.HTTPNotFound, self.controller.show, req, hyper_id)
+        self.assertTrue(
+            self.controller.host_api.service_get_by_compute_host.called)
+        self.controller.host_api.compute_node_get.assert_called_once_with(
+                mock.ANY, hyper_id)
 
     def test_show_noid(self):
         req = self._get_request(True)
@@ -611,20 +595,15 @@ def test_uptime_hypervisor_down(self):
                 mock.ANY, self.TEST_HYPERS_OBJ[0].host)
 
     def test_uptime_hypervisor_not_mapped_service_get(self):
-        @mock.patch.object(self.controller.host_api, 'compute_node_get')
-        @mock.patch.object(self.controller.host_api, 'get_host_uptime')
-        @mock.patch.object(self.controller.host_api,
-                           'service_get_by_compute_host',
-                           side_effect=exception.HostMappingNotFound(
-                               name='dummy'))
-        def _test(mock_get, _, __):
-            req = self._get_request(True)
-            hyper_id = self._get_hyper_id()
-            self.assertRaises(exc.HTTPNotFound,
-                              self.controller.uptime, req, hyper_id)
-            self.assertTrue(mock_get.called)
+        self.controller.host_api.service_get_by_compute_host.side_effect = (
+            exception.HostMappingNotFound(name='dummy'))
 
-        _test()
+        req = self._get_request(True)
+        hyper_id = self._get_hyper_id()
+        self.assertRaises(exc.HTTPNotFound,
+                          self.controller.uptime, req, hyper_id)
+        self.assertTrue(
+            self.controller.host_api.service_get_by_compute_host.called)
 
     def test_uptime_hypervisor_not_mapped(self):
         with mock.patch.object(self.controller.host_api, 'get_host_uptime',
@@ -644,30 +623,26 @@ def test_search(self):
         self.assertEqual(dict(hypervisors=self.INDEX_HYPER_DICTS), result)
 
     def test_search_non_exist(self):
-        with mock.patch.object(self.controller.host_api,
-                               'compute_node_search_by_hypervisor',
-                               return_value=[]) as mock_node_search:
-            req = self._get_request(True)
-            self.assertRaises(exc.HTTPNotFound, self.controller.search,
-                              req, 'a')
-            self.assertEqual(1, mock_node_search.call_count)
+        m_search = self.controller.host_api.compute_node_search_by_hypervisor
+        m_search.side_effect = None
+        m_search.return_value = []
+
+        req = self._get_request(True)
+        self.assertRaises(exc.HTTPNotFound, self.controller.search, req, 'a')
+        self.assertEqual(1, m_search.call_count)
 
     def test_search_unmapped(self):
+        m_search = self.controller.host_api.compute_node_search_by_hypervisor
+        m_search.side_effect = None
+        m_search.return_value = [mock.MagicMock()]
 
-        @mock.patch.object(self.controller.host_api,
-                           'compute_node_search_by_hypervisor')
-        @mock.patch.object(self.controller.host_api,
-                           'service_get_by_compute_host')
-        def _test(mock_service, mock_search):
-            mock_search.return_value = [mock.MagicMock()]
-            mock_service.side_effect = exception.HostMappingNotFound(
-                name='foo')
-            req = self._get_request(True)
-            self.assertRaises(exc.HTTPNotFound, self.controller.search,
-                              req, 'a')
-            self.assertTrue(mock_service.called)
+        self.controller.host_api.service_get_by_compute_host.side_effect = (
+            exception.HostMappingNotFound(name='foo'))
 
-        _test()
+        req = self._get_request(True)
+        self.assertRaises(exc.HTTPNotFound, self.controller.search, req, 'a')
+        self.assertTrue(
+            self.controller.host_api.service_get_by_compute_host.called)
 
     @mock.patch.object(objects.InstanceList, 'get_by_host',
                        side_effect=fake_instance_get_all_by_host)
@@ -702,15 +677,12 @@ def test_servers_not_mapped(self):
     def test_servers_compute_host_not_found(self):
         req = self._get_request(True)
 
-        with test.nested(
-            mock.patch.object(
-                self.controller.host_api, 'instance_get_all_by_host',
-                side_effect=fake_instance_get_all_by_host,
-            ),
-            mock.patch.object(
-                self.controller.host_api, 'service_get_by_compute_host',
-                side_effect=exception.ComputeHostNotFound(host='foo'),
-            ),
+        self.controller.host_api.service_get_by_compute_host.side_effect = (
+            exception.ComputeHostNotFound(host='foo'))
+        with mock.patch.object(
+            self.controller.host_api,
+            'instance_get_all_by_host',
+            side_effect=fake_instance_get_all_by_host,
         ):
             # The result should be empty since every attempt to fetch the
             # service for a hypervisor "failed"
@@ -718,24 +690,25 @@ def test_servers_compute_host_not_found(self):
             self.assertEqual({'hypervisors': []}, result)
 
     def test_servers_non_id(self):
-        with mock.patch.object(self.controller.host_api,
-                               'compute_node_search_by_hypervisor',
-                               return_value=[]) as mock_node_search:
-            req = self._get_request(True)
-            self.assertRaises(exc.HTTPNotFound,
-                              self.controller.servers,
-                              req, '115')
-            self.assertEqual(1, mock_node_search.call_count)
+        m_search = self.controller.host_api.compute_node_search_by_hypervisor
+        m_search.side_effect = None
+        m_search.return_value = []
+
+        req = self._get_request(True)
+        self.assertRaises(exc.HTTPNotFound,
+                          self.controller.servers,
+                          req, '115')
+        self.assertEqual(1, m_search.call_count)
 
     def test_servers_with_non_integer_hypervisor_id(self):
-        with mock.patch.object(self.controller.host_api,
-                               'compute_node_search_by_hypervisor',
-                               return_value=[]) as mock_node_search:
+        m_search = self.controller.host_api.compute_node_search_by_hypervisor
+        m_search.side_effect = None
+        m_search.return_value = []
 
-            req = self._get_request(True)
-            self.assertRaises(exc.HTTPNotFound,
-                              self.controller.servers, req, 'abc')
-            self.assertEqual(1, mock_node_search.call_count)
+        req = self._get_request(True)
+        self.assertRaises(
+            exc.HTTPNotFound, self.controller.servers, req, 'abc')
+        self.assertEqual(1, m_search.call_count)
 
     def test_servers_with_no_servers(self):
         with mock.patch.object(self.controller.host_api,
@@ -1089,15 +1062,13 @@ def test_index_with_servers_compute_host_not_found(self):
             use_admin_context=True,
             url='/os-hypervisors?with_servers=1')
 
-        with test.nested(
-            mock.patch.object(
-                self.controller.host_api, 'instance_get_all_by_host',
-                side_effect=fake_instance_get_all_by_host,
-            ),
-            mock.patch.object(
-                self.controller.host_api, 'service_get_by_compute_host',
-                side_effect=exception.ComputeHostNotFound(host='foo'),
-            ),
+        self.controller.host_api.service_get_by_compute_host.side_effect = (
+            exception.ComputeHostNotFound(host='foo'))
+
+        with mock.patch.object(
+            self.controller.host_api,
+            "instance_get_all_by_host",
+            side_effect=fake_instance_get_all_by_host,
         ):
             # The result should be empty since every attempt to fetch the
             # service for a hypervisor "failed"
@@ -1157,11 +1128,13 @@ def test_index_with_hostname_pattern_no_match(self):
             use_admin_context=True,
             url='/os-hypervisors?with_servers=yes&'
                 'hypervisor_hostname_pattern=shenzhen')
-        with mock.patch.object(self.controller.host_api,
-                               'compute_node_search_by_hypervisor',
-                               return_value=objects.ComputeNodeList()) as s:
-            self.assertRaises(exc.HTTPNotFound, self.controller.index, req)
-            s.assert_called_once_with(req.environ['nova.context'], 'shenzhen')
+        m_search = self.controller.host_api.compute_node_search_by_hypervisor
+        m_search.side_effect = None
+        m_search.return_value = objects.ComputeNodeList()
+
+        self.assertRaises(exc.HTTPNotFound, self.controller.index, req)
+        m_search.assert_called_once_with(
+            req.environ['nova.context'], 'shenzhen')
 
     def test_detail_with_hostname_pattern(self):
         """Test listing hypervisors with details and using the
@@ -1170,13 +1143,14 @@ def test_detail_with_hostname_pattern(self):
         req = self._get_request(
             use_admin_context=True,
             url='/os-hypervisors?hypervisor_hostname_pattern=shenzhen')
-        with mock.patch.object(
-            self.controller.host_api,
-            'compute_node_search_by_hypervisor',
-            return_value=objects.ComputeNodeList(objects=[TEST_HYPERS_OBJ[0]])
-        ) as s:
-            result = self.controller.detail(req)
-            s.assert_called_once_with(req.environ['nova.context'], 'shenzhen')
+        m_search = self.controller.host_api.compute_node_search_by_hypervisor
+        m_search.side_effect = None
+        m_search.return_value = objects.ComputeNodeList(
+            objects=[TEST_HYPERS_OBJ[0]])
+
+        result = self.controller.detail(req)
+        m_search.assert_called_once_with(
+            req.environ['nova.context'], 'shenzhen')
 
         expected = {'hypervisors': [self.DETAIL_HYPERS_DICTS[0]]}
 
@@ -1483,15 +1457,11 @@ def test_uptime(self):
             self.controller.uptime, req)
 
     def test_uptime_old_version(self):
-        with mock.patch.object(
-            self.controller.host_api, 'get_host_uptime',
-            return_value='fake uptime',
-        ):
-            req = self._get_request(use_admin_context=True, version='2.87')
-            hyper_id = self._get_hyper_id()
+        req = self._get_request(use_admin_context=True, version='2.87')
+        hyper_id = self._get_hyper_id()
 
-            # no exception == pass
-            self.controller.uptime(req, hyper_id)
+        # no exception == pass
+        self.controller.uptime(req, hyper_id)
 
     def test_uptime_noid(self):
         # the separate 'uptime' API has been removed, so skip this test
@@ -1526,34 +1496,36 @@ def test_uptime_hypervisor_not_mapped(self):
         pass
 
     def test_show_with_uptime_notimplemented(self):
-        with mock.patch.object(
-            self.controller.host_api, 'get_host_uptime',
-            side_effect=NotImplementedError,
-        ) as mock_get_uptime:
-            req = self._get_request(use_admin_context=True)
-            hyper_id = self._get_hyper_id()
+        self.controller.host_api.get_host_uptime.side_effect = (
+            NotImplementedError())
 
-            result = self.controller.show(req, hyper_id)
+        req = self._get_request(use_admin_context=True)
+        hyper_id = self._get_hyper_id()
 
-            expected_dict = copy.deepcopy(self.DETAIL_HYPERS_DICTS[0])
-            expected_dict.update({'uptime': None})
-            self.assertEqual({'hypervisor': expected_dict}, result)
-            self.assertEqual(1, mock_get_uptime.call_count)
+        result = self.controller.show(req, hyper_id)
+
+        expected_dict = copy.deepcopy(self.DETAIL_HYPERS_DICTS[0])
+        expected_dict.update({'uptime': None})
+        self.assertEqual({'hypervisor': expected_dict}, result)
+        self.assertEqual(
+            1, self.controller.host_api.get_host_uptime.call_count)
 
     def test_show_with_uptime_hypervisor_down(self):
-        with mock.patch.object(
-            self.controller.host_api, 'get_host_uptime',
-            side_effect=exception.ComputeServiceUnavailable(host='dummy')
-        ) as mock_get_uptime:
-            req = self._get_request(use_admin_context=True)
-            hyper_id = self._get_hyper_id()
+        self.controller.host_api.get_host_uptime.side_effect = (
+            exception.ComputeServiceUnavailable(host='dummy'))
 
-            result = self.controller.show(req, hyper_id)
+        req = self._get_request(use_admin_context=True)
+        hyper_id = self._get_hyper_id()
 
-            expected_dict = copy.deepcopy(self.DETAIL_HYPERS_DICTS[0])
-            expected_dict.update({'uptime': None})
-            self.assertEqual({'hypervisor': expected_dict}, result)
-            self.assertEqual(1, mock_get_uptime.call_count)
+        result = self.controller.show(req, hyper_id)
+
+        expected_dict = copy.deepcopy(self.DETAIL_HYPERS_DICTS[0])
+        expected_dict.update({'uptime': None})
+        self.assertEqual({'hypervisor': expected_dict}, result)
+        self.assertEqual(
+            1,
+            self.controller.host_api.get_host_uptime.call_count
+        )
 
     def test_show_old_version(self):
         # ensure things still work as expected here
diff --git a/nova/tests/unit/api/openstack/compute/test_limits.py b/nova/tests/unit/api/openstack/compute/test_limits.py
index a5ac0bca24c..69676e28acf 100644
--- a/nova/tests/unit/api/openstack/compute/test_limits.py
+++ b/nova/tests/unit/api/openstack/compute/test_limits.py
@@ -34,7 +34,6 @@
 from nova.limit import placement as placement_limit
 from nova import objects
 from nova.policies import limits as l_policies
-from nova import quota
 from nova import test
 from nova.tests.unit.api.openstack import fakes
 from nova.tests.unit import matchers
@@ -52,12 +51,12 @@ def stub_get_project_quotas(context, project_id, usages=True):
             return {k: dict(limit=v, in_use=v // 2)
                     for k, v in self.absolute_limits.items()}
 
-        mock_get_project_quotas = mock.patch.object(
+        patcher_get_project_quotas = mock.patch.object(
             nova.quota.QUOTAS,
             "get_project_quotas",
-            side_effect = stub_get_project_quotas)
-        mock_get_project_quotas.start()
-        self.addCleanup(mock_get_project_quotas.stop)
+            side_effect=stub_get_project_quotas)
+        self.mock_get_project_quotas = patcher_get_project_quotas.start()
+        self.addCleanup(patcher_get_project_quotas.stop)
         patcher = self.mock_can = mock.patch('nova.context.RequestContext.can')
         self.mock_can = patcher.start()
         self.addCleanup(patcher.stop)
@@ -154,16 +153,14 @@ def _get_project_quotas(context, project_id, usages=True):
             return {k: dict(limit=v, in_use=v // 2)
                     for k, v in self.absolute_limits.items()}
 
-        with mock.patch('nova.quota.QUOTAS.get_project_quotas') as \
-                get_project_quotas:
-            get_project_quotas.side_effect = _get_project_quotas
+        self.mock_get_project_quotas.side_effect = _get_project_quotas
 
-            response = request.get_response(self.controller)
+        response = request.get_response(self.controller)
 
-            body = jsonutils.loads(response.body)
-            self.assertEqual(expected, body)
-            get_project_quotas.assert_called_once_with(context, tenant_id,
-                                                       usages=True)
+        body = jsonutils.loads(response.body)
+        self.assertEqual(expected, body)
+        self.mock_get_project_quotas.assert_called_once_with(
+            context, tenant_id, usages=True)
 
     def _do_test_used_limits(self, reserved):
         request = self._get_index_request(tenant_id=None)
@@ -186,8 +183,7 @@ def _do_test_used_limits(self, reserved):
         def stub_get_project_quotas(context, project_id, usages=True):
             return limits
 
-        self.stub_out('nova.quota.QUOTAS.get_project_quotas',
-                      stub_get_project_quotas)
+        self.mock_get_project_quotas.side_effect = stub_get_project_quotas
 
         res = request.get_response(self.controller)
         body = jsonutils.loads(res.body)
@@ -211,14 +207,15 @@ def test_admin_can_fetch_limits_for_a_given_tenant_id(self):
                                            user_id=user_id,
                                            project_id=project_id)
         context = fake_req.environ["nova.context"]
-        with mock.patch.object(quota.QUOTAS, 'get_project_quotas',
-                              return_value={}) as mock_get_quotas:
-            fake_req.get_response(self.controller)
-            self.assertEqual(2, self.mock_can.call_count)
-            self.mock_can.assert_called_with(
-                l_policies.OTHER_PROJECT_LIMIT_POLICY_NAME)
-            mock_get_quotas.assert_called_once_with(context,
-                tenant_id, usages=True)
+        self.mock_get_project_quotas.side_effect = None
+        self.mock_get_project_quotas.return_value = {}
+
+        fake_req.get_response(self.controller)
+        self.assertEqual(2, self.mock_can.call_count)
+        self.mock_can.assert_called_with(
+            l_policies.OTHER_PROJECT_LIMIT_POLICY_NAME)
+        self.mock_get_project_quotas.assert_called_once_with(
+            context, tenant_id, usages=True)
 
     def _test_admin_can_fetch_used_limits_for_own_project(self, req_get):
         project_id = "123456"
@@ -230,11 +227,12 @@ def _test_admin_can_fetch_used_limits_for_own_project(self, req_get):
                                            project_id=project_id)
         context = fake_req.environ["nova.context"]
 
-        with mock.patch.object(quota.QUOTAS, 'get_project_quotas',
-                               return_value={}) as mock_get_quotas:
-            fake_req.get_response(self.controller)
-            mock_get_quotas.assert_called_once_with(context,
-                project_id, usages=True)
+        self.mock_get_project_quotas.side_effect = None
+        self.mock_get_project_quotas.return_value = {}
+
+        fake_req.get_response(self.controller)
+        self.mock_get_project_quotas.assert_called_once_with(
+            context, project_id, usages=True)
 
     def test_admin_can_fetch_used_limits_for_own_project(self):
         req_get = {}
@@ -262,12 +260,13 @@ def test_used_limits_fetched_for_context_project_id(self):
         project_id = "123456"
         fake_req = self._get_index_request(project_id=project_id)
         context = fake_req.environ["nova.context"]
-        with mock.patch.object(quota.QUOTAS, 'get_project_quotas',
-                               return_value={}) as mock_get_quotas:
-            fake_req.get_response(self.controller)
+        self.mock_get_project_quotas.side_effect = None
+        self.mock_get_project_quotas.return_value = {}
 
-            mock_get_quotas.assert_called_once_with(context,
-                project_id, usages=True)
+        fake_req.get_response(self.controller)
+
+        self.mock_get_project_quotas.assert_called_once_with(
+            context, project_id, usages=True)
 
     def test_used_ram_added(self):
         fake_req = self._get_index_request()
@@ -275,28 +274,26 @@ def test_used_ram_added(self):
         def stub_get_project_quotas(context, project_id, usages=True):
             return {'ram': {'limit': 512, 'in_use': 256}}
 
-        with mock.patch.object(quota.QUOTAS, 'get_project_quotas',
-                               side_effect=stub_get_project_quotas
-                               ) as mock_get_quotas:
+        self.mock_get_project_quotas.side_effect = stub_get_project_quotas
 
-            res = fake_req.get_response(self.controller)
-            body = jsonutils.loads(res.body)
-            abs_limits = body['limits']['absolute']
-            self.assertIn('totalRAMUsed', abs_limits)
-            self.assertEqual(256, abs_limits['totalRAMUsed'])
-            self.assertEqual(1, mock_get_quotas.call_count)
+        res = fake_req.get_response(self.controller)
+        body = jsonutils.loads(res.body)
+        abs_limits = body['limits']['absolute']
+        self.assertIn('totalRAMUsed', abs_limits)
+        self.assertEqual(256, abs_limits['totalRAMUsed'])
+        self.assertEqual(1, self.mock_get_project_quotas.call_count)
 
     def test_no_ram_quota(self):
         fake_req = self._get_index_request()
 
-        with mock.patch.object(quota.QUOTAS, 'get_project_quotas',
-                               return_value={}) as mock_get_quotas:
+        self.mock_get_project_quotas.side_effect = None
+        self.mock_get_project_quotas.return_value = {}
 
-            res = fake_req.get_response(self.controller)
-            body = jsonutils.loads(res.body)
-            abs_limits = body['limits']['absolute']
-            self.assertNotIn('totalRAMUsed', abs_limits)
-            self.assertEqual(1, mock_get_quotas.call_count)
+        res = fake_req.get_response(self.controller)
+        body = jsonutils.loads(res.body)
+        abs_limits = body['limits']['absolute']
+        self.assertNotIn('totalRAMUsed', abs_limits)
+        self.assertEqual(1, self.mock_get_project_quotas.call_count)
 
 
 class FakeHttplibSocket(object):
@@ -398,25 +395,24 @@ def _get_project_quotas(context, project_id, usages=True):
             return {k: dict(limit=v, in_use=v // 2)
                     for k, v in absolute_limits.items()}
 
-        with mock.patch('nova.quota.QUOTAS.get_project_quotas') as \
-                get_project_quotas:
-            get_project_quotas.side_effect = _get_project_quotas
-            response = self.controller.index(self.req)
-            expected_response = {
-                "limits": {
-                    "rate": [],
-                    "absolute": {
-                        "maxTotalRAMSize": 512,
-                        "maxTotalInstances": 5,
-                        "maxTotalCores": 21,
-                        "maxTotalKeypairs": 10,
-                        "totalRAMUsed": 256,
-                        "totalCoresUsed": 10,
-                        "totalInstancesUsed": 2,
-                    },
+        self.mock_get_project_quotas.side_effect = _get_project_quotas
+
+        response = self.controller.index(self.req)
+        expected_response = {
+            "limits": {
+                "rate": [],
+                "absolute": {
+                    "maxTotalRAMSize": 512,
+                    "maxTotalInstances": 5,
+                    "maxTotalCores": 21,
+                    "maxTotalKeypairs": 10,
+                    "totalRAMUsed": 256,
+                    "totalCoresUsed": 10,
+                    "totalInstancesUsed": 2,
                 },
-            }
-            self.assertEqual(expected_response, response)
+            },
+        }
+        self.assertEqual(expected_response, response)
 
 
 class LimitsControllerTestV239(BaseLimitTestSuite):
@@ -436,21 +432,20 @@ def _get_project_quotas(context, project_id, usages=True):
             return {k: dict(limit=v, in_use=v // 2)
                     for k, v in absolute_limits.items()}
 
-        with mock.patch('nova.quota.QUOTAS.get_project_quotas') as \
-                get_project_quotas:
-            get_project_quotas.side_effect = _get_project_quotas
-            response = self.controller.index(self.req)
-            # staring from version 2.39 there is no 'maxImageMeta' field
-            # in response after removing 'image-metadata' proxy API
-            expected_response = {
-                "limits": {
-                    "rate": [],
-                    "absolute": {
-                        "maxServerMeta": 1,
-                    },
+        self.mock_get_project_quotas.side_effect = _get_project_quotas
+
+        response = self.controller.index(self.req)
+        # starting from version 2.39 there is no 'maxImageMeta' field
+        # in response after removing 'image-metadata' proxy API
+        expected_response = {
+            "limits": {
+                "rate": [],
+                "absolute": {
+                    "maxServerMeta": 1,
                 },
-            }
-            self.assertEqual(expected_response, response)
+            },
+        }
+        self.assertEqual(expected_response, response)
 
 
 class LimitsControllerTestV275(BaseLimitTestSuite):
@@ -469,10 +464,9 @@ def _get_project_quotas(context, project_id, usages=True):
             return {k: dict(limit=v, in_use=v // 2)
                     for k, v in absolute_limits.items()}
 
-        with mock.patch('nova.quota.QUOTAS.get_project_quotas') as \
-                get_project_quotas:
-            get_project_quotas.side_effect = _get_project_quotas
-            self.controller.index(req)
+        self.mock_get_project_quotas.side_effect = _get_project_quotas
+        # no exception == pass
+        self.controller.index(req)
 
     def test_index_additional_query_param(self):
         req = fakes.HTTPRequest.blank("/?unkown=fake",
diff --git a/nova/tests/unit/api/openstack/compute/test_server_actions.py b/nova/tests/unit/api/openstack/compute/test_server_actions.py
index d07924abe84..b4daad1286a 100644
--- a/nova/tests/unit/api/openstack/compute/test_server_actions.py
+++ b/nova/tests/unit/api/openstack/compute/test_server_actions.py
@@ -66,11 +66,11 @@ def setUp(self):
 
         self.controller = self._get_controller()
         self.compute_api = self.controller.compute_api
-        # We don't care about anything getting as far as hitting the compute
-        # RPC API so we just mock it out here.
-        mock_rpcapi = mock.patch.object(self.compute_api, 'compute_rpcapi')
-        mock_rpcapi.start()
-        self.addCleanup(mock_rpcapi.stop)
+        # In most of the cases we don't care about anything getting as far as
+        # hitting the compute RPC API so we just mock it out here.
+        patcher_rpcapi = mock.patch.object(self.compute_api, 'compute_rpcapi')
+        self.mock_rpcapi = patcher_rpcapi.start()
+        self.addCleanup(patcher_rpcapi.stop)
         # The project_id here matches what is used by default in
         # fake_compute_get which need to match for policy checks.
         self.req = fakes.HTTPRequest.blank('',
@@ -1079,21 +1079,23 @@ def fake_block_device_mapping_get_all_by_instance(context, inst_id,
 
         snapshot = dict(id=_fake_id('d'))
 
+        self.mock_rpcapi.quiesce_instance.side_effect = (
+            exception.InstanceQuiesceNotSupported(
+                instance_id="fake", reason="test"
+            )
+        )
+
         with test.nested(
             mock.patch.object(
                 self.controller.compute_api.volume_api, 'get_absolute_limits',
                 return_value={'totalSnapshotsUsed': 0,
                               'maxTotalSnapshots': 10}),
-            mock.patch.object(self.controller.compute_api.compute_rpcapi,
-                'quiesce_instance',
-                side_effect=exception.InstanceQuiesceNotSupported(
-                    instance_id='fake', reason='test')),
             mock.patch.object(self.controller.compute_api.volume_api, 'get',
                               return_value=volume),
             mock.patch.object(self.controller.compute_api.volume_api,
                               'create_snapshot_force',
                               return_value=snapshot),
-        ) as (mock_get_limits, mock_quiesce, mock_vol_get, mock_vol_create):
+        ) as (mock_get_limits, mock_vol_get, mock_vol_create):
 
             if mock_vol_create_side_effect:
                 mock_vol_create.side_effect = mock_vol_create_side_effect
@@ -1125,7 +1127,7 @@ def fake_block_device_mapping_get_all_by_instance(context, inst_id,
             for k in extra_properties.keys():
                 self.assertEqual(properties[k], extra_properties[k])
 
-            mock_quiesce.assert_called_once_with(mock.ANY, mock.ANY)
+            self.mock_rpcapi.quiesce_instance.assert_called_once()
             mock_vol_get.assert_called_once_with(mock.ANY, volume['id'])
             mock_vol_create.assert_called_once_with(mock.ANY, volume['id'],
                                                     mock.ANY, mock.ANY)
@@ -1189,21 +1191,23 @@ def fake_block_device_mapping_get_all_by_instance(context, inst_id,
 
         snapshot = dict(id=_fake_id('d'))
 
+        self.mock_rpcapi.quiesce_instance.side_effect = (
+            exception.InstanceQuiesceNotSupported(
+                instance_id="fake", reason="test"
+            )
+        )
+
         with test.nested(
             mock.patch.object(
                 self.controller.compute_api.volume_api, 'get_absolute_limits',
                 return_value={'totalSnapshotsUsed': 0,
                               'maxTotalSnapshots': 10}),
-            mock.patch.object(self.controller.compute_api.compute_rpcapi,
-                'quiesce_instance',
-                side_effect=exception.InstanceQuiesceNotSupported(
-                    instance_id='fake', reason='test')),
             mock.patch.object(self.controller.compute_api.volume_api, 'get',
                               return_value=volume),
             mock.patch.object(self.controller.compute_api.volume_api,
                               'create_snapshot_force',
                               return_value=snapshot),
-        ) as (mock_get_limits, mock_quiesce, mock_vol_get, mock_vol_create):
+        ) as (mock_get_limits, mock_vol_get, mock_vol_create):
 
             response = self.controller._action_create_image(self.req,
                 FAKE_UUID, body=body)
@@ -1218,7 +1222,7 @@ def fake_block_device_mapping_get_all_by_instance(context, inst_id,
                 for key, val in extra_metadata.items():
                     self.assertEqual(properties[key], val)
 
-            mock_quiesce.assert_called_once_with(mock.ANY, mock.ANY)
+            self.mock_rpcapi.quiesce_instance.assert_called_once()
             mock_vol_get.assert_called_once_with(mock.ANY, volume['id'])
             mock_vol_create.assert_called_once_with(mock.ANY, volume['id'],
                                                     mock.ANY, mock.ANY)
diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py
index e98ea6ab062..538596f6b12 100644
--- a/nova/tests/unit/compute/test_compute.py
+++ b/nova/tests/unit/compute/test_compute.py
@@ -8620,16 +8620,13 @@ def test_create_instance_defaults_display_name(self):
 
     def test_create_instance_sets_system_metadata(self):
         # Make sure image properties are copied into system metadata.
-        with mock.patch.object(
-            self.compute_api.compute_task_api, 'schedule_and_build_instances',
-        ) as mock_sbi:
-            ref, resv_id = self.compute_api.create(
-                self.context,
-                flavor=self.default_flavor,
-                image_href='f5000000-0000-0000-0000-000000000000')
+        ref, resv_id = self.compute_api.create(
+            self.context,
+            flavor=self.default_flavor,
+            image_href='f5000000-0000-0000-0000-000000000000')
 
-            build_call = mock_sbi.call_args_list[0]
-            instance = build_call[1]['build_requests'][0].instance
+        build_call = self.schedule_and_build_instances_mock.call_args_list[0]
+        instance = build_call[1]['build_requests'][0].instance
 
         image_props = {'image_kernel_id': uuids.kernel_id,
                  'image_ramdisk_id': uuids.ramdisk_id,
@@ -8639,16 +8636,14 @@ def test_create_instance_sets_system_metadata(self):
             self.assertEqual(value, instance.system_metadata[key])
 
     def test_create_saves_flavor(self):
-        with mock.patch.object(
-            self.compute_api.compute_task_api, 'schedule_and_build_instances',
-        ) as mock_sbi:
-            ref, resv_id = self.compute_api.create(
-                self.context,
-                flavor=self.default_flavor,
-                image_href=uuids.image_href_id)
+        ref, resv_id = self.compute_api.create(
+            self.context,
+            flavor=self.default_flavor,
+            image_href=uuids.image_href_id)
+
+        build_call = self.schedule_and_build_instances_mock.call_args_list[0]
+        instance = build_call[1]['build_requests'][0].instance
 
-            build_call = mock_sbi.call_args_list[0]
-            instance = build_call[1]['build_requests'][0].instance
         self.assertIn('flavor', instance)
         self.assertEqual(self.default_flavor.flavorid,
                          instance.flavor.flavorid)
@@ -8656,19 +8651,18 @@ def test_create_saves_flavor(self):
 
     def test_create_instance_associates_security_groups(self):
         # Make sure create associates security groups.
-        with test.nested(
-                mock.patch.object(self.compute_api.compute_task_api,
-                                  'schedule_and_build_instances'),
-                mock.patch('nova.network.security_group_api.validate_name',
-                           return_value=uuids.secgroup_id),
-        ) as (mock_sbi, mock_secgroups):
+        with mock.patch(
+            "nova.network.security_group_api.validate_name",
+            return_value=uuids.secgroup_id,
+        ) as mock_secgroups:
             self.compute_api.create(
                 self.context,
                 flavor=self.default_flavor,
                 image_href=uuids.image_href_id,
                 security_groups=['testgroup'])
 
-            build_call = mock_sbi.call_args_list[0]
+            build_call = (
+                self.schedule_and_build_instances_mock.call_args_list[0])
             reqspec = build_call[1]['request_spec'][0]
 
         self.assertEqual(1, len(reqspec.security_groups))
@@ -8703,22 +8697,19 @@ def test_create_instance_associates_requested_networks(self):
         requested_networks = objects.NetworkRequestList(
             objects=[objects.NetworkRequest(port_id=uuids.port_instance)])
 
-        with test.nested(
-            mock.patch.object(
-                self.compute_api.compute_task_api,
-                'schedule_and_build_instances'),
-            mock.patch.object(
-                self.compute_api.network_api,
-                'create_resource_requests',
-                return_value=(None, [], objects.RequestLevelParams())),
-        ) as (mock_sbi, _mock_create_resreqs):
+        with mock.patch.object(
+            self.compute_api.network_api,
+            "create_resource_requests",
+            return_value=(None, [], objects.RequestLevelParams()),
+        ):
             self.compute_api.create(
                 self.context,
                 flavor=self.default_flavor,
                 image_href=uuids.image_href_id,
                 requested_networks=requested_networks)
 
-            build_call = mock_sbi.call_args_list[0]
+            build_call = (
+                self.schedule_and_build_instances_mock.call_args_list[0])
             reqspec = build_call[1]['request_spec'][0]
 
         self.assertEqual(1, len(reqspec.requested_networks))
@@ -10218,8 +10209,7 @@ def test_console_output_no_host(self):
                           self.compute_api.get_console_output,
                           self.context, instance)
 
-    @mock.patch.object(compute_utils, 'notify_about_instance_action')
-    def test_attach_interface(self, mock_notify):
+    def test_attach_interface(self):
         instance = self._create_fake_instance_obj()
         nwinfo = [fake_network_cache_model.new_vif()]
         network_id = nwinfo[0]['network']['id']
@@ -10239,8 +10229,12 @@ def test_attach_interface(self, mock_notify):
             mock.patch.object(
                 self.compute,
                 "_claim_pci_device_for_interface_attach",
-                return_value=None)
-        ) as (cap, mock_lock, mock_create_resource_req, mock_claim_pci):
+                return_value=None),
+            mock.patch.object(compute_utils, 'notify_about_instance_action'),
+        ) as (
+            cap, mock_lock, mock_create_resource_req, mock_claim_pci,
+            mock_notify
+        ):
             mock_create_resource_req.return_value = (
                 None, [], mock.sentinel.req_lvl_params)
             vif = self.compute.attach_interface(self.context,
@@ -11058,8 +11052,7 @@ def test__allocate_port_resource_for_instance_fails_to_update_pci(self):
             mock_remove_res.assert_called_once_with(
                 self.context, instance.uuid, mock.sentinel.resources)
 
-    @mock.patch.object(compute_utils, 'notify_about_instance_action')
-    def test_detach_interface(self, mock_notify):
+    def test_detach_interface(self):
         nwinfo, port_id = self.test_attach_interface()
         instance = self._create_fake_instance_obj()
         instance.info_cache = objects.InstanceInfoCache.new(
@@ -11092,10 +11085,13 @@ def test_detach_interface(self, mock_notify):
             mock.patch('nova.pci.request.get_instance_pci_request_from_vif',
                        return_value=pci_req),
             mock.patch.object(self.compute.rt, 'unclaim_pci_devices'),
-            mock.patch.object(instance, 'save')
+            mock.patch.object(instance, 'save'),
+            mock.patch.object(compute_utils, 'notify_about_instance_action'),
         ) as (
-                mock_remove_alloc, mock_deallocate, mock_lock,
-                mock_get_pci_req, mock_unclaim_pci, mock_instance_save):
+            mock_remove_alloc, mock_deallocate, mock_lock,
+            mock_get_pci_req, mock_unclaim_pci, mock_instance_save,
+            mock_notify
+        ):
             self.compute.detach_interface(self.context, instance, port_id)
 
             mock_deallocate.assert_called_once_with(
@@ -11902,17 +11898,16 @@ def fake_rebuild_instance(*args, **kwargs):
             instance.save()
 
         @mock.patch.object(objects.Service, 'get_by_compute_host')
-        @mock.patch.object(self.compute_api.compute_task_api,
-                           'rebuild_instance')
         @mock.patch.object(objects.ComputeNodeList, 'get_all_by_host')
         @mock.patch.object(objects.RequestSpec,
                            'get_by_instance_uuid')
         @mock.patch.object(self.compute_api.servicegroup_api, 'service_is_up')
-        def do_test(service_is_up, get_by_instance_uuid, get_all_by_host,
-                    rebuild_instance, get_service):
+        def do_test(
+            service_is_up, get_by_instance_uuid, get_all_by_host, get_service
+        ):
             service_is_up.return_value = False
             get_by_instance_uuid.return_value = fake_spec
-            rebuild_instance.side_effect = fake_rebuild_instance
+            self.rebuild_instance_mock.side_effect = fake_rebuild_instance
             get_all_by_host.return_value = objects.ComputeNodeList(
                 objects=[objects.ComputeNode(
                     host='fake_dest_host',
@@ -11930,7 +11925,7 @@ def do_test(service_is_up, get_by_instance_uuid, get_all_by_host,
                 host = None
             else:
                 host = 'fake_dest_host'
-            rebuild_instance.assert_called_once_with(
+            self.rebuild_instance_mock.assert_called_once_with(
                 ctxt,
                 instance=instance,
                 new_pass=None,
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 974c669bc76..147e4480199 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -8563,11 +8563,9 @@ def _test_revert_resize_instance_destroy_disks(self, is_shared=False):
         @mock.patch.object(self.compute.network_api, 'setup_networks_on_host')
         @mock.patch.object(self.compute.network_api, 'migrate_instance_start')
         @mock.patch.object(compute_utils, 'notify_usage_exists')
-        @mock.patch.object(self.migration, 'save')
         @mock.patch.object(objects.BlockDeviceMappingList,
                            'get_by_instance_uuid')
         def do_test(get_by_instance_uuid,
-                    migration_save,
                     notify_usage_exists,
                     migrate_instance_start,
                     setup_networks_on_host,
@@ -8639,7 +8637,6 @@ def _get_instance_nw_info(context, instance):
         @mock.patch.object(self.compute.network_api, 'migrate_instance_finish',
                            side_effect=_migrate_instance_finish)
         @mock.patch.object(self.compute.network_api, 'setup_networks_on_host')
-        @mock.patch.object(self.migration, 'save')
         @mock.patch.object(self.instance, 'save')
         @mock.patch.object(self.compute, '_set_instance_info')
         @mock.patch.object(db, 'instance_fault_create')
@@ -8653,7 +8650,6 @@ def do_test(notify_about_instance_usage,
                     fault_create,
                     set_instance_info,
                     instance_save,
-                    migration_save,
                     setup_networks_on_host,
                     migrate_instance_finish,
                     get_instance_nw_info,
@@ -8697,11 +8693,9 @@ def test_finish_revert_resize_migration_context(self):
         @mock.patch.object(self.compute.network_api, 'migrate_instance_start')
         @mock.patch.object(compute_utils, 'notify_usage_exists')
         @mock.patch.object(db, 'instance_extra_update_by_uuid')
-        @mock.patch.object(self.migration, 'save')
         @mock.patch.object(objects.BlockDeviceMappingList,
                            'get_by_instance_uuid')
         def do_revert_resize(mock_get_by_instance_uuid,
-                             mock_migration_save,
                              mock_extra_update,
                              mock_notify_usage_exists,
                              mock_migrate_instance_start,
@@ -8748,7 +8742,6 @@ def do_revert_resize(mock_get_by_instance_uuid,
         @mock.patch.object(compute_utils, 'notify_about_instance_action')
         @mock.patch.object(self.compute, "_set_instance_info")
         @mock.patch.object(self.instance, 'save')
-        @mock.patch.object(self.migration, 'save')
         @mock.patch.object(compute_utils, 'add_instance_fault_from_exc')
         @mock.patch.object(db, 'instance_fault_create')
         @mock.patch.object(db, 'instance_extra_update_by_uuid')
@@ -8772,7 +8765,6 @@ def do_finish_revert_resize(mock_attachment_complete,
                                     mock_extra_update,
                                     mock_fault_create,
                                     mock_fault_from_exc,
-                                    mock_mig_save,
                                     mock_inst_save,
                                     mock_set,
                                     mock_notify_about_instance_action,
@@ -8866,7 +8858,6 @@ def test_confirm_resize_deletes_allocations_and_update_scheduler(self):
         @mock.patch.object(self.compute, '_delete_scheduler_instance_info')
         @mock.patch('nova.objects.Instance.get_by_uuid')
         @mock.patch('nova.objects.Migration.get_by_id')
-        @mock.patch.object(self.migration, 'save')
         @mock.patch.object(self.compute, '_notify_about_instance_usage')
         @mock.patch.object(self.compute, 'network_api')
         @mock.patch.object(self.compute.driver, 'confirm_migration')
@@ -8875,7 +8866,7 @@ def test_confirm_resize_deletes_allocations_and_update_scheduler(self):
         @mock.patch.object(self.instance, 'save')
         def do_confirm_resize(mock_save, mock_drop, mock_delete,
                               mock_confirm, mock_nwapi, mock_notify,
-                              mock_mig_save, mock_mig_get, mock_inst_get,
+                              mock_mig_get, mock_inst_get,
                               mock_delete_scheduler_info):
 
             self._mock_rt()
@@ -8958,16 +8949,16 @@ def test_confirm_resize_driver_confirm_migration_fails(
         instance_get_by_uuid.assert_called_once()
 
     def test_confirm_resize_calls_virt_driver_with_old_pci(self):
-        @mock.patch.object(self.migration, 'save')
         @mock.patch.object(self.compute, '_notify_about_instance_usage')
         @mock.patch.object(self.compute, 'network_api')
         @mock.patch.object(self.compute.driver, 'confirm_migration')
         @mock.patch.object(self.compute, '_delete_allocation_after_move')
         @mock.patch.object(self.instance, 'drop_migration_context')
         @mock.patch.object(self.instance, 'save')
-        def do_confirm_resize(mock_save, mock_drop, mock_delete,
-                              mock_confirm, mock_nwapi, mock_notify,
-                              mock_mig_save):
+        def do_confirm_resize(
+            mock_save, mock_drop, mock_delete, mock_confirm, mock_nwapi,
+            mock_notify
+        ):
             # Mock virt driver confirm_resize() to save the provided
             # network_info, we will check it later.
             updated_nw_info = []
diff --git a/nova/tests/unit/compute/test_resource_tracker.py b/nova/tests/unit/compute/test_resource_tracker.py
index caa12cb754c..5aab64e72c4 100644
--- a/nova/tests/unit/compute/test_resource_tracker.py
+++ b/nova/tests/unit/compute/test_resource_tracker.py
@@ -4205,9 +4205,9 @@ def test_clean_compute_node_cache(self, mock_remove):
         invalid_nodename = "invalid-node"
         self.rt.compute_nodes[_NODENAME] = self.compute
         self.rt.compute_nodes[invalid_nodename] = mock.sentinel.compute
-        with mock.patch.object(
-            self.rt.reportclient, "invalidate_resource_provider",
-        ) as mock_invalidate:
-            self.rt.clean_compute_node_cache([self.compute])
-            mock_remove.assert_called_once_with(invalid_nodename)
-            mock_invalidate.assert_called_once_with(invalid_nodename)
+        mock_invalidate = self.rt.reportclient.invalidate_resource_provider
+
+        self.rt.clean_compute_node_cache([self.compute])
+
+        mock_remove.assert_called_once_with(invalid_nodename)
+        mock_invalidate.assert_called_once_with(invalid_nodename)
diff --git a/nova/tests/unit/db/main/test_api.py b/nova/tests/unit/db/main/test_api.py
index c9a9e83154a..e869d0403c3 100644
--- a/nova/tests/unit/db/main/test_api.py
+++ b/nova/tests/unit/db/main/test_api.py
@@ -279,33 +279,21 @@ def _test_pick_context_manager_disable_db_access(
             'No DB access allowed in ',
             mock_log.error.call_args[0][0])
 
-    @mock.patch.object(db, 'LOG')
-    @mock.patch.object(db, 'DISABLE_DB_ACCESS', return_value=True)
-    def test_pick_context_manager_writer_disable_db_access(
-        self, mock_DISABLE_DB_ACCESS, mock_log,
-    ):
+    def test_pick_context_manager_writer_disable_db_access(self):
         @db.pick_context_manager_writer
         def func(context, value):
             pass
 
         self._test_pick_context_manager_disable_db_access(func)
 
-    @mock.patch.object(db, 'LOG')
-    @mock.patch.object(db, 'DISABLE_DB_ACCESS', return_value=True)
-    def test_pick_context_manager_reader_disable_db_access(
-        self, mock_DISABLE_DB_ACCESS, mock_log,
-    ):
+    def test_pick_context_manager_reader_disable_db_access(self):
         @db.pick_context_manager_reader
         def func(context, value):
             pass
 
         self._test_pick_context_manager_disable_db_access(func)
 
-    @mock.patch.object(db, 'LOG')
-    @mock.patch.object(db, 'DISABLE_DB_ACCESS', return_value=True)
-    def test_pick_context_manager_reader_allow_async_disable_db_access(
-        self, mock_DISABLE_DB_ACCESS, mock_log,
-    ):
+    def test_pick_context_manager_reader_allow_async_disable_db_access(self):
         @db.pick_context_manager_reader_allow_async
         def func(context, value):
             pass
diff --git a/nova/tests/unit/pci/test_stats.py b/nova/tests/unit/pci/test_stats.py
index 804b76ffba4..b88cfd19efb 100644
--- a/nova/tests/unit/pci/test_stats.py
+++ b/nova/tests/unit/pci/test_stats.py
@@ -98,16 +98,7 @@ def _add_fake_devs_with_numa(self):
 
     def setUp(self):
         super(PciDeviceStatsTestCase, self).setUp()
-        self._setup_pci_stats()
-
-    def _setup_pci_stats(self, numa_topology=None):
-        """Exists for tests that need to setup pci_stats with a specific NUMA
-        topology, while still allowing tests that don't care to get the default
-        "empty" one.
-        """
-        if not numa_topology:
-            numa_topology = objects.NUMATopology()
-        self.pci_stats = stats.PciDeviceStats(numa_topology)
+        self.pci_stats = stats.PciDeviceStats(objects.NUMATopology())
         # The following two calls need to be made before adding the devices.
         patcher = fakes.fake_pci_whitelist()
         self.addCleanup(patcher.stop)
@@ -240,18 +231,18 @@ def test_support_requests_no_numa_info_pci_numa_policy_required(self):
         self.assertFalse(self.pci_stats.support_requests(pci_requests, cells))
 
     def test_filter_pools_for_socket_affinity_no_socket(self):
-        self._setup_pci_stats(
-            objects.NUMATopology(
-                cells=[objects.NUMACell(socket=None)]))
+        self.pci_stats.numa_topology = objects.NUMATopology(
+                cells=[objects.NUMACell(socket=None)])
+
         self.assertEqual(
             [],
             self.pci_stats._filter_pools_for_socket_affinity(
                 self.pci_stats.pools, [objects.InstanceNUMACell()]))
 
     def test_filter_pools_for_socket_affinity(self):
-        self._setup_pci_stats(
-            objects.NUMATopology(
-                cells=[objects.NUMACell(id=1, socket=1)]))
+        self.pci_stats.numa_topology = objects.NUMATopology(
+                cells=[objects.NUMACell(id=1, socket=1)])
+
         pools = self.pci_stats._filter_pools_for_socket_affinity(
             self.pci_stats.pools, [objects.InstanceNUMACell(id=1)])
         self.assertEqual(1, len(pools))
diff --git a/nova/tests/unit/test_metadata.py b/nova/tests/unit/test_metadata.py
index 630cb544188..1c78ddea51a 100644
--- a/nova/tests/unit/test_metadata.py
+++ b/nova/tests/unit/test_metadata.py
@@ -1458,20 +1458,17 @@ def fake_list_ports(context, fixed_ips, network_id, fields):
                        for c in range(ord('a'), ord('z'))]
         mock_client.list_subnets.return_value = {
             'subnets': subnet_list}
+        mock_client.list_ports.side_effect = fake_list_ports
 
-        with mock.patch.object(
-                mock_client, 'list_ports',
-                side_effect=fake_list_ports) as mock_list_ports:
-
-            response = fake_request(
-                self, self.mdinst,
-                relpath="/2009-04-04/user-data",
-                address="192.192.192.2",
-                fake_get_metadata_by_instance_id=self._fake_x_get_metadata,
-                headers={'X-Forwarded-For': '192.192.192.2',
-                         'X-Metadata-Provider': proxy_lb_id})
-
-            self.assertEqual(3, mock_list_ports.call_count)
+        response = fake_request(
+            self, self.mdinst,
+            relpath="/2009-04-04/user-data",
+            address="192.192.192.2",
+            fake_get_metadata_by_instance_id=self._fake_x_get_metadata,
+            headers={'X-Forwarded-For': '192.192.192.2',
+                     'X-Metadata-Provider': proxy_lb_id})
+
+        self.assertEqual(3, mock_client.list_ports.call_count)
 
         self.assertEqual(200, response.status_int)
 
diff --git a/nova/tests/unit/test_test.py b/nova/tests/unit/test_test.py
index 8381792de64..5642a6da746 100644
--- a/nova/tests/unit/test_test.py
+++ b/nova/tests/unit/test_test.py
@@ -361,21 +361,6 @@ def test_patch_exists_decorator_false(self):
         self.assertTrue(os.path.exists(os.path.dirname(__file__)))
         self.assertFalse(os.path.exists('non-existent/file'))
 
-    @test.patch_exists('fake_file1', True)
-    @test.patch_exists('fake_file2', True)
-    @test.patch_exists(__file__, False)
-    def test_patch_exists_multiple_decorators(self):
-        """Test that @patch_exists can be used multiple times on the
-        same method.
-        """
-        self.assertTrue(os.path.exists('fake_file1'))
-        self.assertTrue(os.path.exists('fake_file2'))
-        self.assertFalse(os.path.exists(__file__))
-
-        # Check non-patched parameters
-        self.assertTrue(os.path.exists(os.path.dirname(__file__)))
-        self.assertFalse(os.path.exists('non-existent/file'))
-
 
 class PatchOpenTestCase(test.NoDBTestCase):
     fake_contents = "These file contents don't really exist"
diff --git a/nova/tests/unit/virt/hyperv/test_vmops.py b/nova/tests/unit/virt/hyperv/test_vmops.py
index dd4dc52d5b6..1a71045ea27 100644
--- a/nova/tests/unit/virt/hyperv/test_vmops.py
+++ b/nova/tests/unit/virt/hyperv/test_vmops.py
@@ -1374,12 +1374,10 @@ def test_set_vm_state_exception(self):
     def test_get_vm_state(self):
         summary_info = {'EnabledState': os_win_const.HYPERV_VM_STATE_DISABLED}
 
-        with mock.patch.object(self._vmops._vmutils,
-                               'get_vm_summary_info') as mock_get_summary_info:
-            mock_get_summary_info.return_value = summary_info
+        self._vmops._vmutils.get_vm_summary_info.return_value = summary_info
 
-            response = self._vmops._get_vm_state(mock.sentinel.FAKE_VM_NAME)
-            self.assertEqual(response, os_win_const.HYPERV_VM_STATE_DISABLED)
+        response = self._vmops._get_vm_state(mock.sentinel.FAKE_VM_NAME)
+        self.assertEqual(response, os_win_const.HYPERV_VM_STATE_DISABLED)
 
     @mock.patch.object(vmops.VMOps, '_get_vm_state')
     def test_wait_for_power_off_true(self, mock_get_state):
@@ -1418,12 +1416,11 @@ def test_create_vm_com_port_pipes(self):
 
     def test_list_instance_uuids(self):
         fake_uuid = '4f54fb69-d3a2-45b7-bb9b-b6e6b3d893b3'
-        with mock.patch.object(self._vmops._vmutils,
-                               'list_instance_notes') as mock_list_notes:
-            mock_list_notes.return_value = [('fake_name', [fake_uuid])]
+        self._vmops._vmutils.list_instance_notes.return_value = (
+            [('fake_name', [fake_uuid])])
 
-            response = self._vmops.list_instance_uuids()
-            mock_list_notes.assert_called_once_with()
+        response = self._vmops.list_instance_uuids()
+        self._vmops._vmutils.list_instance_notes.assert_called_once_with()
 
         self.assertEqual(response, [fake_uuid])
 
diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py
index f49d9be5e56..45526830f23 100644
--- a/nova/tests/unit/virt/libvirt/test_driver.py
+++ b/nova/tests/unit/virt/libvirt/test_driver.py
@@ -6974,14 +6974,12 @@ def test_get_guest_config_with_rng_limits(self):
         self.assertEqual(cfg.devices[5].rate_bytes, 1024)
         self.assertEqual(cfg.devices[5].rate_period, 2)
 
-    @mock.patch('nova.virt.libvirt.driver.os.path.exists')
-    @test.patch_exists(SEV_KERNEL_PARAM_FILE, False)
-    def test_get_guest_config_with_rng_backend(self, mock_path):
+    @test.patch_exists(SEV_KERNEL_PARAM_FILE, result=False, other=True)
+    def test_get_guest_config_with_rng_backend(self):
         self.flags(virt_type='kvm',
                    rng_dev_path='/dev/hw_rng',
                    group='libvirt')
         self.flags(pointer_model='ps2mouse')
-        mock_path.return_value = True
 
         drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), True)
         instance_ref = objects.Instance(**self.test_instance)
@@ -7591,11 +7589,8 @@ def test_get_guest_config_armv7(self, mock_numa, mock_storage):
     @mock.patch.object(libvirt_driver.LibvirtDriver,
                        "_get_guest_storage_config")
     @mock.patch.object(libvirt_driver.LibvirtDriver, "_has_numa_support")
-    @mock.patch('os.path.exists', return_value=True)
-    @test.patch_exists(SEV_KERNEL_PARAM_FILE, False)
-    def test_get_guest_config_aarch64(
-        self, mock_path_exists, mock_numa, mock_storage,
-    ):
+    @test.patch_exists(SEV_KERNEL_PARAM_FILE, result=False, other=True)
+    def test_get_guest_config_aarch64(self, mock_numa, mock_storage):
         TEST_AMOUNT_OF_PCIE_SLOTS = 8
         CONF.set_override("num_pcie_ports", TEST_AMOUNT_OF_PCIE_SLOTS,
                 group='libvirt')
@@ -7615,7 +7610,6 @@ def test_get_guest_config_aarch64(
         cfg = drvr._get_guest_config(instance_ref,
                                      _fake_network_info(self),
                                      image_meta, disk_info)
-        self.assertTrue(mock_path_exists.called)
         self.assertEqual(cfg.os_mach_type, "virt")
 
         num_ports = 0
@@ -7632,10 +7626,9 @@ def test_get_guest_config_aarch64(
     @mock.patch.object(libvirt_driver.LibvirtDriver,
                        "_get_guest_storage_config")
     @mock.patch.object(libvirt_driver.LibvirtDriver, "_has_numa_support")
-    @mock.patch('os.path.exists', return_value=True)
-    @test.patch_exists(SEV_KERNEL_PARAM_FILE, False)
+    @test.patch_exists(SEV_KERNEL_PARAM_FILE, result=False, other=True)
     def test_get_guest_config_aarch64_with_graphics(
-        self, mock_path_exists, mock_numa, mock_storage,
+        self, mock_numa, mock_storage,
     ):
         self.mock_uname.return_value = fakelibvirt.os_uname(
             'Linux', '', '5.4.0-0-generic', '', fields.Architecture.AARCH64)
@@ -7645,7 +7638,6 @@ def test_get_guest_config_aarch64_with_graphics(
 
         cfg = self._get_guest_config_with_graphics()
 
-        self.assertTrue(mock_path_exists.called)
         self.assertEqual(cfg.os_mach_type, "virt")
 
         usbhost_exists = False
@@ -11407,8 +11399,6 @@ def test_check_can_live_migrate_source_block_migration_none_no_share(self):
             False,
             False)
 
-    @mock.patch('nova.virt.libvirt.driver.LibvirtDriver.'
-                '_assert_dest_node_has_enough_disk')
     @mock.patch('nova.virt.libvirt.driver.LibvirtDriver.'
                 '_assert_dest_node_has_enough_disk')
     @mock.patch('nova.virt.libvirt.driver.LibvirtDriver.'
@@ -11416,7 +11406,7 @@ def test_check_can_live_migrate_source_block_migration_none_no_share(self):
     @mock.patch('nova.virt.libvirt.driver.LibvirtDriver.'
                 '_check_shared_storage_test_file')
     def test_check_can_live_migration_source_disk_over_commit_none(self,
-            mock_check, mock_shared_block, mock_enough, mock_disk_check):
+            mock_check, mock_shared_block, mock_disk_check):
 
         mock_check.return_value = False
         mock_shared_block.return_value = False
@@ -15548,8 +15538,7 @@ def test_create_image_with_ephemerals(self, mock_get_ext):
             filename=filename, size=100 * units.Gi, ephemeral_size=mock.ANY,
             specified_fs=None)
 
-    @mock.patch.object(nova.virt.libvirt.imagebackend.Image, 'cache')
-    def test_create_image_resize_snap_backend(self, mock_cache):
+    def test_create_image_resize_snap_backend(self):
         drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
         instance = objects.Instance(**self.test_instance)
         instance.task_state = task_states.RESIZE_FINISH
@@ -22063,11 +22052,8 @@ def test_migrate_disk_and_power_off_resize_error_default_ephemeral(
                           self.drvr.migrate_disk_and_power_off,
                           'ctx', instance, '10.0.0.1', flavor_obj, None)
 
-    @mock.patch('nova.virt.libvirt.driver.LibvirtDriver'
-                '._get_instance_disk_info')
     @mock.patch('nova.virt.driver.block_device_info_get_ephemerals')
-    def test_migrate_disk_and_power_off_resize_error_eph(self, mock_get,
-                                                         mock_get_disk_info):
+    def test_migrate_disk_and_power_off_resize_error_eph(self, mock_get):
         mappings = [
             {
                  'device_name': '/dev/sdb4',
@@ -22114,7 +22100,6 @@ def test_migrate_disk_and_power_off_resize_error_eph(self, mock_get,
         # Old flavor, eph is 20, real disk is 3, target is 2, fail
         flavor = {'root_gb': 10, 'ephemeral_gb': 2}
         flavor_obj = objects.Flavor(**flavor)
-        mock_get_disk_info.return_value = fake_disk_info_json(instance)
 
         self.assertRaises(
             exception.InstanceFaultRollback,
@@ -25564,9 +25549,7 @@ def test_get_gpu_inventories_with_a_single_type(self):
         }
         self._test_get_gpu_inventories(drvr, expected, ['nvidia-11'])
 
-    @mock.patch('nova.virt.libvirt.driver.LibvirtDriver'
-                '._get_mdev_capable_devices')
-    def test_get_gpu_inventories_with_two_types(self, get_mdev_capable_devs):
+    def test_get_gpu_inventories_with_two_types(self):
         self.flags(enabled_mdev_types=['nvidia-11', 'nvidia-12'],
                    group='devices')
         # we need to call the below again to ensure the updated
@@ -28513,13 +28496,11 @@ class LVMSnapshotTests(_BaseSnapshotTests):
                 new=mock.Mock(return_value=None))
     @mock.patch('nova.virt.libvirt.utils.get_disk_type_from_path',
                 new=mock.Mock(return_value='lvm'))
-    @mock.patch('nova.virt.libvirt.utils.file_open',
-                side_effect=[io.BytesIO(b''), io.BytesIO(b'')])
     @mock.patch.object(libvirt_driver.imagebackend.images,
                        'convert_image')
     @mock.patch.object(libvirt_driver.imagebackend.lvm, 'volume_info')
     def _test_lvm_snapshot(self, disk_format, mock_volume_info,
-                           mock_convert_image, mock_file_open):
+                           mock_convert_image):
         self.flags(images_type='lvm',
                    images_volume_group='nova-vg', group='libvirt')
 
diff --git a/nova/tests/unit/virt/libvirt/volume/test_lightos.py b/nova/tests/unit/virt/libvirt/volume/test_lightos.py
index 554647acf40..67fead13df4 100644
--- a/nova/tests/unit/virt/libvirt/volume/test_lightos.py
+++ b/nova/tests/unit/virt/libvirt/volume/test_lightos.py
@@ -30,7 +30,7 @@ def test_libvirt_lightos_driver(self, mock_factory, mock_helper):
             device_scan_attempts=5)
 
     @mock.patch('os_brick.initiator.connector.InitiatorConnector.factory',
-        new=mock.Mock(return_value=mock.Mock()))
+        new=mock.Mock())
     def test_libvirt_lightos_driver_connect(self):
         lightos_driver = lightos.LibvirtLightOSVolumeDriver(
             self.fake_host)
@@ -40,15 +40,16 @@ def test_libvirt_lightos_driver_connect(self):
             'name': 'aLightVolume',
             'conf': config}
         connection_info = {'data': disk_info}
-        with mock.patch.object(lightos_driver.connector,
-                            'connect_volume',
-                            return_value={'path': '/dev/dms1234567'}):
-            lightos_driver.connect_volume(connection_info, None)
-            (lightos_driver.connector.connect_volume.
-                assert_called_once_with(
-                connection_info['data']))
-            self.assertEqual('/dev/dms1234567',
-                            connection_info['data']['device_path'])
+        lightos_driver.connector.connect_volume.return_value = (
+            {'path': '/dev/dms1234567'})
+
+        lightos_driver.connect_volume(connection_info, None)
+
+        lightos_driver.connector.connect_volume.assert_called_once_with(
+            connection_info['data'])
+        self.assertEqual(
+            '/dev/dms1234567',
+            connection_info['data']['device_path'])
 
     @mock.patch('os_brick.initiator.connector.InitiatorConnector.factory',
         new=mock.Mock(return_value=mock.Mock()))
diff --git a/nova/tests/unit/virt/libvirt/volume/test_nvme.py b/nova/tests/unit/virt/libvirt/volume/test_nvme.py
index fcb303b4c3e..5159f3aaf60 100644
--- a/nova/tests/unit/virt/libvirt/volume/test_nvme.py
+++ b/nova/tests/unit/virt/libvirt/volume/test_nvme.py
@@ -56,14 +56,15 @@ def test_libvirt_nvme_driver_connect(self):
             'name': 'aNVMEVolume',
             'conf': config}
         connection_info = {'data': disk_info}
-        with mock.patch.object(nvme_driver.connector,
-                               'connect_volume',
-                               return_value={'path': '/dev/dms1234567'}):
-            nvme_driver.connect_volume(connection_info, None)
-            nvme_driver.connector.connect_volume.assert_called_once_with(
-                connection_info['data'])
-            self.assertEqual('/dev/dms1234567',
-                             connection_info['data']['device_path'])
+        nvme_driver.connector.connect_volume.return_value = (
+            {'path': '/dev/dms1234567'})
+
+        nvme_driver.connect_volume(connection_info, None)
+
+        nvme_driver.connector.connect_volume.assert_called_once_with(
+            connection_info['data'])
+        self.assertEqual(
+            '/dev/dms1234567', connection_info['data']['device_path'])
 
     @mock.patch('os_brick.initiator.connector.InitiatorConnector.factory',
         new=mock.Mock(return_value=mock.Mock()))
diff --git a/nova/tests/unit/virt/test_block_device.py b/nova/tests/unit/virt/test_block_device.py
index aff6c5ef199..94d9297ca35 100644
--- a/nova/tests/unit/virt/test_block_device.py
+++ b/nova/tests/unit/virt/test_block_device.py
@@ -433,24 +433,23 @@ def test_driver_blank_block_device(self):
     def _test_call_wait_func(self, delete_on_termination, delete_fail=False):
         test_bdm = self.driver_classes['volume'](self.volume_bdm)
         test_bdm['delete_on_termination'] = delete_on_termination
-        with mock.patch.object(self.volume_api, 'delete') as vol_delete:
-            wait_func = mock.MagicMock()
-            mock_exception = exception.VolumeNotCreated(volume_id='fake-id',
-                                                        seconds=1,
-                                                        attempts=1,
-                                                        volume_status='error')
-            wait_func.side_effect = mock_exception
-
-            if delete_on_termination and delete_fail:
-                vol_delete.side_effect = Exception()
-
-            self.assertRaises(exception.VolumeNotCreated,
-                              test_bdm._call_wait_func,
-                              context=self.context,
-                              wait_func=wait_func,
-                              volume_api=self.volume_api,
-                              volume_id='fake-id')
-            self.assertEqual(delete_on_termination, vol_delete.called)
+        if delete_on_termination and delete_fail:
+            self.volume_api.delete.side_effect = Exception()
+
+        wait_func = mock.MagicMock()
+        mock_exception = exception.VolumeNotCreated(volume_id='fake-id',
+                                                    seconds=1,
+                                                    attempts=1,
+                                                    volume_status='error')
+        wait_func.side_effect = mock_exception
+
+        self.assertRaises(exception.VolumeNotCreated,
+                          test_bdm._call_wait_func,
+                          context=self.context,
+                          wait_func=wait_func,
+                          volume_api=self.volume_api,
+                          volume_id='fake-id')
+        self.assertEqual(delete_on_termination, self.volume_api.delete.called)
 
     def test_call_wait_delete_volume(self):
         self._test_call_wait_func(True)
@@ -483,25 +482,24 @@ def test_volume_delete_attachment(
             volume['shared_targets'] = True
             volume['service_uuid'] = uuids.service_uuid
 
+        if delete_attachment_raises:
+            self.volume_api.attachment_delete.side_effect = (
+                delete_attachment_raises)
+
+        self.virt_driver.get_volume_connector.return_value = connector
+
         with test.nested(
             mock.patch.object(driver_bdm, '_get_volume', return_value=volume),
-            mock.patch.object(self.virt_driver, 'get_volume_connector',
-                              return_value=connector),
             mock.patch('os_brick.initiator.utils.guard_connection'),
-            mock.patch.object(self.volume_api, 'attachment_delete'),
-        ) as (mock_get_volume, mock_get_connector, mock_guard,
-              vapi_attach_del):
-
-            if delete_attachment_raises:
-                vapi_attach_del.side_effect = delete_attachment_raises
+        ) as (mock_get_volume, mock_guard):
 
             driver_bdm.detach(elevated_context, instance,
                               self.volume_api, self.virt_driver,
                               attachment_id=attachment_id)
 
             mock_guard.assert_called_once_with(volume)
-            vapi_attach_del.assert_called_once_with(elevated_context,
-                                                    attachment_id)
+            self.volume_api.attachment_delete.assert_called_once_with(
+                elevated_context, attachment_id)
 
     def test_volume_delete_attachment_with_shared_targets(self):
         self.test_volume_delete_attachment(include_shared_targets=True)
@@ -952,31 +950,28 @@ def test_snapshot_attach_fail_volume(self):
 
         instance = fake_instance.fake_instance_obj(mock.sentinel.ctx,
                                                    **{'uuid': uuids.uuid})
-        with test.nested(
-            mock.patch.object(self.volume_api, 'get_snapshot',
-                              return_value=snapshot),
-            mock.patch.object(self.volume_api, 'create', return_value=volume),
-            mock.patch.object(self.volume_api, 'delete'),
-        ) as (vol_get_snap, vol_create, vol_delete):
-            wait_func = mock.MagicMock()
-            mock_exception = exception.VolumeNotCreated(volume_id=volume['id'],
-                                                        seconds=1,
-                                                        attempts=1,
-                                                        volume_status='error')
-            wait_func.side_effect = mock_exception
-            self.assertRaises(exception.VolumeNotCreated,
-                              test_bdm.attach, context=self.context,
-                              instance=instance,
-                              volume_api=self.volume_api,
-                              virt_driver=self.virt_driver,
-                              wait_func=wait_func)
-
-            vol_get_snap.assert_called_once_with(
-                self.context, 'fake-snapshot-id-1')
-            vol_create.assert_called_once_with(
-                self.context, 3, '', '', availability_zone=None,
-                snapshot=snapshot, volume_type=None)
-            vol_delete.assert_called_once_with(self.context, volume['id'])
+        self.volume_api.get_snapshot.return_value = snapshot
+        self.volume_api.create.return_value = volume
+        wait_func = mock.MagicMock()
+        mock_exception = exception.VolumeNotCreated(volume_id=volume['id'],
+                                                    seconds=1,
+                                                    attempts=1,
+                                                    volume_status='error')
+        wait_func.side_effect = mock_exception
+        self.assertRaises(exception.VolumeNotCreated,
+                          test_bdm.attach, context=self.context,
+                          instance=instance,
+                          volume_api=self.volume_api,
+                          virt_driver=self.virt_driver,
+                          wait_func=wait_func)
+
+        self.volume_api.get_snapshot.assert_called_once_with(
+            self.context, 'fake-snapshot-id-1')
+        self.volume_api.create.assert_called_once_with(
+            self.context, 3, '', '', availability_zone=None,
+            snapshot=snapshot, volume_type=None)
+        self.volume_api.delete.assert_called_once_with(
+            self.context, volume['id'])
 
     def test_snapshot_attach_volume(self):
         test_bdm = self.driver_classes['volsnapshot'](
@@ -984,19 +979,17 @@ def test_snapshot_attach_volume(self):
 
         instance = {'id': 'fake_id', 'uuid': uuids.uuid}
 
-        with test.nested(
-            mock.patch.object(self.driver_classes['volume'], 'attach'),
-            mock.patch.object(self.volume_api, 'get_snapshot'),
-            mock.patch.object(self.volume_api, 'create'),
-        ) as (mock_attach, mock_get_snapshot, mock_create):
+        with mock.patch.object(
+            self.driver_classes['volume'], 'attach'
+        ) as mock_attach:
             test_bdm.attach(self.context, instance, self.volume_api,
                             self.virt_driver)
             self.assertEqual('fake-volume-id-2', test_bdm.volume_id)
             mock_attach.assert_called_once_with(
                 self.context, instance, self.volume_api, self.virt_driver)
             # Make sure theses are not called
-            mock_get_snapshot.assert_not_called()
-            mock_create.assert_not_called()
+            self.volume_api.get_snapshot.assert_not_called()
+            self.volume_api.create.assert_not_called()
 
     def test_snapshot_attach_no_volume_and_no_volume_type(self):
         bdm = self.driver_classes['volsnapshot'](self.volsnapshot_bdm)
@@ -1006,15 +999,10 @@ def test_snapshot_attach_no_volume_and_no_volume_type(self):
         original_volume = {'id': uuids.original_volume_id,
                            'volume_type_id': 'original_volume_type'}
         new_volume = {'id': uuids.new_volume_id}
-        with test.nested(
-            mock.patch.object(self.driver_classes['volume'], 'attach'),
-            mock.patch.object(self.volume_api, 'get_snapshot',
-                              return_value=snapshot),
-            mock.patch.object(self.volume_api, 'get',
-                              return_value=original_volume),
-            mock.patch.object(self.volume_api, 'create',
-                              return_value=new_volume),
-        ) as (mock_attach, mock_get_snapshot, mock_get, mock_create):
+        self.volume_api.get_snapshot.return_value = snapshot
+        self.volume_api.get.return_value = original_volume
+        self.volume_api.create.return_value = new_volume
+        with mock.patch.object(self.driver_classes["volume"], "attach"):
             bdm.volume_id = None
             bdm.volume_type = None
             bdm.attach(self.context, instance, self.volume_api,
@@ -1022,10 +1010,11 @@ def test_snapshot_attach_no_volume_and_no_volume_type(self):
 
             # Assert that the original volume type is fetched, stored within
             # the bdm and then used to create the new snapshot based volume.
-            mock_get.assert_called_once_with(self.context,
-                                             uuids.original_volume_id)
+            self.volume_api.get.assert_called_once_with(
+                self.context, uuids.original_volume_id)
             self.assertEqual('original_volume_type', bdm.volume_type)
-            mock_create.assert_called_once_with(self.context, bdm.volume_size,
+            self.volume_api.create.assert_called_once_with(
+                self.context, bdm.volume_size,
                 '', '', volume_type='original_volume_type', snapshot=snapshot,
                 availability_zone=None)
 
@@ -1097,27 +1086,25 @@ def test_image_attach_fail_volume(self):
 
         instance = fake_instance.fake_instance_obj(mock.sentinel.ctx,
                                                    **{'uuid': uuids.uuid})
-        with test.nested(
-            mock.patch.object(self.volume_api, 'create', return_value=volume),
-            mock.patch.object(self.volume_api, 'delete'),
-        ) as (vol_create, vol_delete):
-            wait_func = mock.MagicMock()
-            mock_exception = exception.VolumeNotCreated(volume_id=volume['id'],
-                                                        seconds=1,
-                                                        attempts=1,
-                                                        volume_status='error')
-            wait_func.side_effect = mock_exception
-            self.assertRaises(exception.VolumeNotCreated,
-                              test_bdm.attach, context=self.context,
-                              instance=instance,
-                              volume_api=self.volume_api,
-                              virt_driver=self.virt_driver,
-                              wait_func=wait_func)
-
-            vol_create.assert_called_once_with(
-                self.context, 1, '', '', image_id=image['id'],
-                availability_zone=None, volume_type=None)
-            vol_delete.assert_called_once_with(self.context, volume['id'])
+        self.volume_api.create.return_value = volume
+        wait_func = mock.MagicMock()
+        mock_exception = exception.VolumeNotCreated(volume_id=volume['id'],
+                                                    seconds=1,
+                                                    attempts=1,
+                                                    volume_status='error')
+        wait_func.side_effect = mock_exception
+        self.assertRaises(exception.VolumeNotCreated,
+                          test_bdm.attach, context=self.context,
+                          instance=instance,
+                          volume_api=self.volume_api,
+                          virt_driver=self.virt_driver,
+                          wait_func=wait_func)
+
+        self.volume_api.create.assert_called_once_with(
+            self.context, 1, '', '', image_id=image['id'],
+            availability_zone=None, volume_type=None)
+        self.volume_api.delete.assert_called_once_with(
+            self.context, volume['id'])
 
     def test_image_attach_volume(self):
         test_bdm = self.driver_classes['volimage'](
@@ -1125,19 +1112,17 @@ def test_image_attach_volume(self):
 
         instance = {'id': 'fake_id', 'uuid': uuids.uuid}
 
-        with test.nested(
-            mock.patch.object(self.driver_classes['volume'], 'attach'),
-            mock.patch.object(self.volume_api, 'get_snapshot'),
-            mock.patch.object(self.volume_api, 'create'),
-        ) as (mock_attch, mock_get_snapshot, mock_create):
+        with mock.patch.object(
+            self.driver_classes['volume'], 'attach'
+        ) as mock_attach:
             test_bdm.attach(self.context, instance, self.volume_api,
                             self.virt_driver)
             self.assertEqual('fake-volume-id-2', test_bdm.volume_id)
-            mock_attch.assert_called_once_with(
+            mock_attach.assert_called_once_with(
                 self.context, instance, self.volume_api, self.virt_driver)
             # Make sure theses are not called
-            mock_get_snapshot.assert_not_called()
-            mock_create.assert_not_called()
+            self.volume_api.get_snapshot.assert_not_called()
+            self.volume_api.create.assert_not_called()
 
     def test_blank_attach_fail_volume(self):
         no_blank_volume = self.volblank_bdm_dict.copy()
@@ -1149,30 +1134,26 @@ def test_blank_attach_fail_volume(self):
                                                    **{'uuid': uuids.uuid})
         volume = {'id': 'fake-volume-id-2',
                   'display_name': '%s-blank-vol' % uuids.uuid}
+        self.volume_api.create.return_value = volume
+        wait_func = mock.MagicMock()
+        mock_exception = exception.VolumeNotCreated(volume_id=volume['id'],
+                                                    seconds=1,
+                                                    attempts=1,
+                                                    volume_status='error')
+        wait_func.side_effect = mock_exception
+        self.assertRaises(exception.VolumeNotCreated,
+                          test_bdm.attach, context=self.context,
+                          instance=instance,
+                          volume_api=self.volume_api,
+                          virt_driver=self.virt_driver,
+                          wait_func=wait_func)
 
-        with test.nested(
-            mock.patch.object(self.volume_api, 'create', return_value=volume),
-            mock.patch.object(self.volume_api, 'delete'),
-        ) as (vol_create, vol_delete):
-            wait_func = mock.MagicMock()
-            mock_exception = exception.VolumeNotCreated(volume_id=volume['id'],
-                                                        seconds=1,
-                                                        attempts=1,
-                                                        volume_status='error')
-            wait_func.side_effect = mock_exception
-            self.assertRaises(exception.VolumeNotCreated,
-                              test_bdm.attach, context=self.context,
-                              instance=instance,
-                              volume_api=self.volume_api,
-                              virt_driver=self.virt_driver,
-                              wait_func=wait_func)
-
-            vol_create.assert_called_once_with(
-                self.context, test_bdm.volume_size,
-                '%s-blank-vol' % uuids.uuid,
-                '', volume_type=None, availability_zone=None)
-            vol_delete.assert_called_once_with(
-                self.context, volume['id'])
+        self.volume_api.create.assert_called_once_with(
+            self.context, test_bdm.volume_size,
+            '%s-blank-vol' % uuids.uuid,
+            '', volume_type=None, availability_zone=None)
+        self.volume_api.delete.assert_called_once_with(
+            self.context, volume['id'])
 
     def test_blank_attach_volume(self):
         no_blank_volume = self.volblank_bdm_dict.copy()
@@ -1481,13 +1462,9 @@ def _test_boot_from_volume_source_snapshot_volume_type(
                   'display_name': 'fake-snapshot-vol'}
         self.stub_volume_create(volume)
 
-        with test.nested(
-            mock.patch.object(self.volume_api, 'get_snapshot',
-                              return_value=snapshot),
-            mock.patch.object(volume_class, 'attach')
-        ) as (
-            vol_get_snap, vol_attach
-        ):
+        self.volume_api.get_snapshot.return_value = snapshot
+
+        with mock.patch.object(volume_class, 'attach') as vol_attach:
             test_bdm.attach(self.context, instance, self.volume_api,
                             self.virt_driver)
 
diff --git a/nova/tests/unit/virt/vmwareapi/test_images.py b/nova/tests/unit/virt/vmwareapi/test_images.py
index 7cfec00c97f..b3a3cfd941e 100644
--- a/nova/tests/unit/virt/vmwareapi/test_images.py
+++ b/nova/tests/unit/virt/vmwareapi/test_images.py
@@ -117,13 +117,11 @@ def test_fetch_image_ova(self, mock_tar_open, mock_write_class,
              mock.patch.object(images.IMAGE_API, 'download'),
              mock.patch.object(images, 'image_transfer'),
              mock.patch.object(images, '_build_shadow_vm_config_spec'),
-             mock.patch.object(session, '_call_method'),
              mock.patch.object(vm_util, 'get_vmdk_info')
         ) as (mock_image_api_get,
               mock_image_api_download,
               mock_image_transfer,
               mock_build_shadow_vm_config_spec,
-              mock_call_method,
               mock_get_vmdk_info):
             image_data = {'id': 'fake-id',
                           'disk_format': 'vmdk',
@@ -172,7 +170,7 @@ def fake_extract(name):
                                                         mock_write_handle)
             mock_get_vmdk_info.assert_called_once_with(
                     session, mock.sentinel.vm_ref, 'fake-vm')
-            mock_call_method.assert_called_once_with(
+            session._call_method.assert_called_once_with(
                     session.vim, "UnregisterVM", mock.sentinel.vm_ref)
 
     @mock.patch('oslo_vmware.rw_handles.ImageReadHandle')
@@ -188,13 +186,11 @@ def test_fetch_image_stream_optimized(self,
              mock.patch.object(images.IMAGE_API, 'download'),
              mock.patch.object(images, 'image_transfer'),
              mock.patch.object(images, '_build_shadow_vm_config_spec'),
-             mock.patch.object(session, '_call_method'),
              mock.patch.object(vm_util, 'get_vmdk_info')
         ) as (mock_image_api_get,
               mock_image_api_download,
               mock_image_transfer,
               mock_build_shadow_vm_config_spec,
-              mock_call_method,
               mock_get_vmdk_info):
             image_data = {'id': 'fake-id',
                           'disk_format': 'vmdk',
@@ -220,7 +216,7 @@ def test_fetch_image_stream_optimized(self,
 
             mock_image_transfer.assert_called_once_with(mock_read_handle,
                                                         mock_write_handle)
-            mock_call_method.assert_called_once_with(
+            session._call_method.assert_called_once_with(
                     session.vim, "UnregisterVM", mock.sentinel.vm_ref)
             mock_get_vmdk_info.assert_called_once_with(
                     session, mock.sentinel.vm_ref, 'fake-vm')

From 69667a817cb65c3efbe4e3ada0e8c69c0a106087 Mon Sep 17 00:00:00 2001
From: Balazs Gibizer <gibi@redhat.com>
Date: Mon, 8 Aug 2022 19:44:41 +0200
Subject: [PATCH 37/93] Remove double mocking... again

I thought we fixed all the double mocking issues with
I3998d0d49583806ac1c3ae64f1b1fe343cefd20d but I was wrong.

While we used both mock and unittest.mock, fixtures.MockPatch
used the mock lib instead of the unittest.mock lib.
The patch Ibf4f36136f2c65adad64f75d665c00cf2de4b400 (Remove the PowerVM driver)
removed the last user of the mock lib from nova, so it also
removed mock from test-requirements. This caused
fixtures.MockPatch to start using unittest.mock too.

Before Ibf4f36136f2c65adad64f75d665c00cf2de4b400 a function could be mocked
twice, once with unittest.mock and once with fixtures.MockPatch (still
using mock). However, after that patch both paths of such double
mocking go through unittest.mock and the second one fails.

So this patch fixes double mocking so far hidden behind
fixtures.MockPatch.

This patch made the py310 and functional-py310 jobs voting on master,
however that has been dropped as part of the backport.

Conflicts:
    .zuul.yaml
    nova/tests/unit/virt/libvirt/test_host.py

Change-Id: Ic1352ec31996577a5d0ad18a057339df3e49de25
(cherry picked from commit bf654e3a4a8f690ad0bec0955690bf4fadf98dba)
---
 nova/tests/fixtures/libvirt.py                | 27 ++++++----
 nova/tests/fixtures/nova.py                   | 10 +++-
 .../compute/test_resource_tracker.py          | 10 ----
 nova/tests/functional/libvirt/base.py         |  8 +--
 nova/tests/functional/libvirt/test_reshape.py | 15 ++----
 nova/tests/functional/libvirt/test_vgpu.py    | 10 ++--
 .../regressions/test_bug_1896463.py           |  8 ---
 nova/tests/functional/test_aggregates.py      | 24 ++++-----
 nova/tests/functional/test_images.py          |  8 ++-
 nova/tests/functional/test_server_group.py    | 15 ++----
 nova/tests/functional/test_servers.py         | 19 +++----
 .../openstack/compute/test_create_backup.py   |  4 --
 .../openstack/compute/test_migrate_server.py  |  3 +-
 .../unit/api/openstack/compute/test_quotas.py |  5 +-
 .../compute/test_server_group_quotas.py       |  7 +--
 .../api/openstack/compute/test_servers.py     | 30 +++++------
 .../api/openstack/compute/test_volumes.py     | 10 ++--
 nova/tests/unit/compute/test_api.py           | 52 ++++++++++++-------
 nova/tests/unit/policies/test_servers.py      |  5 +-
 nova/tests/unit/virt/ironic/test_driver.py    |  3 --
 nova/tests/unit/virt/libvirt/test_driver.py   | 34 ++++++------
 nova/tests/unit/virt/libvirt/test_host.py     | 17 +++---
 nova/tests/unit/virt/libvirt/test_vif.py      | 34 ++++++------
 23 files changed, 160 insertions(+), 198 deletions(-)

diff --git a/nova/tests/fixtures/libvirt.py b/nova/tests/fixtures/libvirt.py
index f6d5d656a2e..0684bae7ddd 100644
--- a/nova/tests/fixtures/libvirt.py
+++ b/nova/tests/fixtures/libvirt.py
@@ -2220,8 +2220,8 @@ def setUp(self):
 
         self.useFixture(
             fixtures.MockPatch('nova.virt.libvirt.utils.get_fs_info'))
-        self.useFixture(
-            fixtures.MockPatch('nova.compute.utils.get_machine_ips'))
+        self.mock_get_machine_ips = self.useFixture(
+            fixtures.MockPatch('nova.compute.utils.get_machine_ips')).mock
 
         # libvirt driver needs to call out to the filesystem to get the
         # parent_ifname for the SRIOV VFs.
@@ -2231,20 +2231,25 @@ def setUp(self):
 
         self.useFixture(fixtures.MockPatch(
             'nova.pci.utils.get_mac_by_pci_address',
-            new=self.fake_get_mac_by_pci_address))
+            side_effect=self.fake_get_mac_by_pci_address))
 
         # libvirt calls out to sysfs to get the vfs ID during macvtap plug
-        self.useFixture(fixtures.MockPatch(
-            'nova.pci.utils.get_vf_num_by_pci_address', return_value=1))
+        self.mock_get_vf_num_by_pci_address = self.useFixture(
+            fixtures.MockPatch(
+                'nova.pci.utils.get_vf_num_by_pci_address', return_value=1
+            )
+        ).mock
 
         # libvirt calls out to privsep to set the mac and vlan of a macvtap
-        self.useFixture(fixtures.MockPatch(
-            'nova.privsep.linux_net.set_device_macaddr_and_vlan'))
+        self.mock_set_device_macaddr_and_vlan = self.useFixture(
+            fixtures.MockPatch(
+                'nova.privsep.linux_net.set_device_macaddr_and_vlan')).mock
 
         # libvirt calls out to privsep to set the port state during macvtap
         # plug
-        self.useFixture(fixtures.MockPatch(
-            'nova.privsep.linux_net.set_device_macaddr'))
+        self.mock_set_device_macaddr = self.useFixture(
+            fixtures.MockPatch(
+                'nova.privsep.linux_net.set_device_macaddr')).mock
 
         # Don't assume that the system running tests has a valid machine-id
         self.useFixture(fixtures.MockPatch(
@@ -2259,8 +2264,8 @@ def setUp(self):
         # Ensure tests perform the same on all host architectures
         fake_uname = os_uname(
             'Linux', '', '5.4.0-0-generic', '', obj_fields.Architecture.X86_64)
-        self.useFixture(
-            fixtures.MockPatch('os.uname', return_value=fake_uname))
+        self.mock_uname = self.useFixture(
+            fixtures.MockPatch('os.uname', return_value=fake_uname)).mock
 
         # ...and on all machine types
         fake_loaders = [
diff --git a/nova/tests/fixtures/nova.py b/nova/tests/fixtures/nova.py
index 810c6f62dde..27ca2fd77d4 100644
--- a/nova/tests/fixtures/nova.py
+++ b/nova/tests/fixtures/nova.py
@@ -1032,9 +1032,15 @@ def setUp(self):
         self.api = client.TestOpenStackClient(
             'fake', base_url, project_id=self.project_id,
             roles=['reader', 'member'])
+        self.alternative_api = client.TestOpenStackClient(
+            'fake', base_url, project_id=self.project_id,
+            roles=['reader', 'member'])
         self.admin_api = client.TestOpenStackClient(
             'admin', base_url, project_id=self.project_id,
             roles=['reader', 'member', 'admin'])
+        self.alternative_admin_api = client.TestOpenStackClient(
+            'admin', base_url, project_id=self.project_id,
+            roles=['reader', 'member', 'admin'])
         self.reader_api = client.TestOpenStackClient(
             'reader', base_url, project_id=self.project_id,
             roles=['reader'])
@@ -1130,9 +1136,9 @@ def evloop(*args, **kwargs):
         # Don't poison the function if it's already mocked
         import nova.virt.libvirt.host
         if not isinstance(nova.virt.libvirt.host.Host._init_events, mock.Mock):
-            self.useFixture(fixtures.MockPatch(
+            self.useFixture(fixtures.MonkeyPatch(
                 'nova.virt.libvirt.host.Host._init_events',
-                side_effect=evloop))
+                evloop))
 
 
 class IndirectionAPIFixture(fixtures.Fixture):
diff --git a/nova/tests/functional/compute/test_resource_tracker.py b/nova/tests/functional/compute/test_resource_tracker.py
index 81b7dfb68cf..758c15f371a 100644
--- a/nova/tests/functional/compute/test_resource_tracker.py
+++ b/nova/tests/functional/compute/test_resource_tracker.py
@@ -29,7 +29,6 @@
 from nova import context
 from nova import objects
 from nova import test
-from nova.tests import fixtures as nova_fixtures
 from nova.tests.functional import fixtures as func_fixtures
 from nova.tests.functional import integrated_helpers
 from nova.virt import driver as virt_driver
@@ -694,15 +693,6 @@ def test_end_to_end(self):
         feature a vm cannot be spawning using a custom trait and then start a
         compute service that provides that trait.
         """
-
-        self.useFixture(nova_fixtures.NeutronFixture(self))
-        self.useFixture(nova_fixtures.GlanceFixture(self))
-
-        # Start nova services.
-        self.api = self.useFixture(nova_fixtures.OSAPIFixture(
-            api_version='v2.1')).admin_api
-        self.api.microversion = 'latest'
-        self.start_service('conductor')
         # start nova-compute that will not have the additional trait.
         self._start_compute("fake-host-1")
 
diff --git a/nova/tests/functional/libvirt/base.py b/nova/tests/functional/libvirt/base.py
index 1553e8e59e0..b2f0095f818 100644
--- a/nova/tests/functional/libvirt/base.py
+++ b/nova/tests/functional/libvirt/base.py
@@ -51,12 +51,12 @@ def setUp(self):
         self.useFixture(fixtures.MockPatch(
             'nova.virt.libvirt.LibvirtDriver._get_local_gb_info',
             return_value={'total': 128, 'used': 44, 'free': 84}))
-        self.useFixture(fixtures.MockPatch(
+        self.mock_is_valid_hostname = self.useFixture(fixtures.MockPatch(
             'nova.virt.libvirt.driver.libvirt_utils.is_valid_hostname',
-            return_value=True))
-        self.useFixture(fixtures.MockPatch(
+            return_value=True)).mock
+        self.mock_file_open = self.useFixture(fixtures.MockPatch(
             'nova.virt.libvirt.driver.libvirt_utils.file_open',
-            side_effect=lambda *a, **k: io.BytesIO(b'')))
+            side_effect=lambda *a, **k: io.BytesIO(b''))).mock
         self.useFixture(fixtures.MockPatch(
             'nova.privsep.utils.supports_direct_io',
             return_value=True))
diff --git a/nova/tests/functional/libvirt/test_reshape.py b/nova/tests/functional/libvirt/test_reshape.py
index 5c73ffbf5f7..8249100111b 100644
--- a/nova/tests/functional/libvirt/test_reshape.py
+++ b/nova/tests/functional/libvirt/test_reshape.py
@@ -30,17 +30,7 @@
 
 class VGPUReshapeTests(base.ServersTestBase):
 
-    @mock.patch('nova.virt.libvirt.LibvirtDriver._get_local_gb_info',
-                return_value={'total': 128,
-                              'used': 44,
-                              'free': 84})
-    @mock.patch('nova.virt.libvirt.driver.libvirt_utils.is_valid_hostname',
-                return_value=True)
-    @mock.patch('nova.virt.libvirt.driver.libvirt_utils.file_open',
-                side_effect=[io.BytesIO(b''), io.BytesIO(b''),
-                             io.BytesIO(b'')])
-    def test_create_servers_with_vgpu(
-            self, mock_file_open, mock_valid_hostname, mock_get_fs_info):
+    def test_create_servers_with_vgpu(self):
         """Verify that vgpu reshape works with libvirt driver
 
         1) create two servers with an old tree where the VGPU resource is on
@@ -49,7 +39,8 @@ def test_create_servers_with_vgpu(
         3) check that the allocations of the servers are still valid
         4) create another server now against the new tree
         """
-
+        self.mock_file_open.side_effect = [
+            io.BytesIO(b''), io.BytesIO(b''), io.BytesIO(b'')]
         # NOTE(gibi): We cannot simply ask the virt driver to create an old
         # RP tree with vgpu on the root RP as that code path does not exist
         # any more. So we have to hack a "bit". We will create a compute
diff --git a/nova/tests/functional/libvirt/test_vgpu.py b/nova/tests/functional/libvirt/test_vgpu.py
index f25ce442214..e111f50de0d 100644
--- a/nova/tests/functional/libvirt/test_vgpu.py
+++ b/nova/tests/functional/libvirt/test_vgpu.py
@@ -49,11 +49,11 @@ class VGPUTestBase(base.ServersTestBase):
 
     def setUp(self):
         super(VGPUTestBase, self).setUp()
-        self.useFixture(fixtures.MockPatch(
-            'nova.virt.libvirt.LibvirtDriver._get_local_gb_info',
-            return_value={'total': 128,
-                          'used': 44,
-                          'free': 84}))
+        libvirt_driver.LibvirtDriver._get_local_gb_info.return_value = {
+            'total': 128,
+            'used': 44,
+            'free': 84,
+        }
         self.useFixture(fixtures.MockPatch(
             'nova.privsep.libvirt.create_mdev',
             side_effect=self._create_mdev))
diff --git a/nova/tests/functional/regressions/test_bug_1896463.py b/nova/tests/functional/regressions/test_bug_1896463.py
index 6663ebe8cd3..dc74791e0e5 100644
--- a/nova/tests/functional/regressions/test_bug_1896463.py
+++ b/nova/tests/functional/regressions/test_bug_1896463.py
@@ -51,14 +51,6 @@ def setUp(self):
         self.api_fixture = self.useFixture(nova_fixtures.OSAPIFixture(
             api_version='v2.1'))
 
-        self.useFixture(fixtures.MockPatch(
-            'nova.pci.utils.get_mac_by_pci_address',
-            return_value='52:54:00:1e:59:c6'))
-
-        self.useFixture(fixtures.MockPatch(
-            'nova.pci.utils.get_vf_num_by_pci_address',
-            return_value=1))
-
         self.admin_api = self.api_fixture.admin_api
         self.admin_api.microversion = 'latest'
         self.api = self.admin_api
diff --git a/nova/tests/functional/test_aggregates.py b/nova/tests/functional/test_aggregates.py
index 8dfb3455782..1ffa3ada92c 100644
--- a/nova/tests/functional/test_aggregates.py
+++ b/nova/tests/functional/test_aggregates.py
@@ -935,11 +935,11 @@ def setUp(self):
 
         # Start nova services.
         self.start_service('conductor')
-        self.admin_api = self.useFixture(
-            nova_fixtures.OSAPIFixture(api_version='v2.1')).admin_api
-        self.api = self.useFixture(
-            nova_fixtures.OSAPIFixture(api_version='v2.1',
-                                       project_id=uuids.non_admin)).api
+        api_fixture = self.useFixture(
+            nova_fixtures.OSAPIFixture(api_version='v2.1'))
+        self.admin_api = api_fixture.admin_api
+        self.api = api_fixture.api
+        self.api.project_id = uuids.non_admin
         # Add the AggregateMultiTenancyIsolation to the list of enabled
         # filters since it is not enabled by default.
         enabled_filters = CONF.filter_scheduler.enabled_filters
@@ -1037,15 +1037,15 @@ def setUp(self):
         self.glance = self.useFixture(nova_fixtures.GlanceFixture(self))
         self.useFixture(nova_fixtures.NeutronFixture(self))
         self.useFixture(func_fixtures.PlacementFixture())
-        # Intentionally keep these separate since we want to create the
-        # server with the non-admin user in a different project.
-        admin_api_fixture = self.useFixture(nova_fixtures.OSAPIFixture(
+        # Intentionally define different project ids for the two clients since
+        # we want to create the server with the non-admin user in a different
+        # project.
+        api_fixture = self.useFixture(nova_fixtures.OSAPIFixture(
             api_version='v2.1', project_id=uuids.admin_project))
-        self.admin_api = admin_api_fixture.admin_api
+        self.admin_api = api_fixture.admin_api
         self.admin_api.microversion = 'latest'
-        user_api_fixture = self.useFixture(nova_fixtures.OSAPIFixture(
-            api_version='v2.1', project_id=uuids.user_project))
-        self.api = user_api_fixture.api
+        self.api = api_fixture.api
+        self.api.project_id = uuids.user_project
         self.api.microversion = 'latest'
 
         self.start_service('conductor')
diff --git a/nova/tests/functional/test_images.py b/nova/tests/functional/test_images.py
index 340e883da96..e7e9f2a6c94 100644
--- a/nova/tests/functional/test_images.py
+++ b/nova/tests/functional/test_images.py
@@ -12,7 +12,6 @@
 
 from oslo_utils.fixture import uuidsentinel as uuids
 
-from nova.tests import fixtures as nova_fixtures
 from nova.tests.functional.api import client
 from nova.tests.functional import integrated_helpers
 
@@ -70,10 +69,9 @@ def test_admin_snapshot_user_image_access_member(self):
         server = self.api.post_server({"server": server})
         server = self._wait_for_state_change(server, 'ACTIVE')
 
-        # Create an admin API fixture with a unique project ID.
-        admin_api = self.useFixture(
-            nova_fixtures.OSAPIFixture(
-                project_id=uuids.admin_project)).admin_api
+        # Use an admin API with a unique project ID.
+        admin_api = self.api_fixture.alternative_admin_api
+        admin_api.project_id = uuids.admin_project
 
         # Create a snapshot of the server using the admin project.
         name = 'admin-created-snapshot'
diff --git a/nova/tests/functional/test_server_group.py b/nova/tests/functional/test_server_group.py
index 08e47b3971a..a64a04b2c9a 100644
--- a/nova/tests/functional/test_server_group.py
+++ b/nova/tests/functional/test_server_group.py
@@ -64,12 +64,12 @@ def setUp(self):
         self.useFixture(nova_fixtures.NeutronFixture(self))
 
         self.useFixture(func_fixtures.PlacementFixture())
-        api_fixture = self.useFixture(nova_fixtures.OSAPIFixture(
+        self.api_fixture = self.useFixture(nova_fixtures.OSAPIFixture(
             api_version='v2.1'))
 
-        self.api = api_fixture.api
+        self.api = self.api_fixture.api
         self.api.microversion = self.microversion
-        self.admin_api = api_fixture.admin_api
+        self.admin_api = self.api_fixture.admin_api
         self.admin_api.microversion = self.microversion
 
         self.start_service('conductor')
@@ -174,13 +174,8 @@ def test_get_groups_all_projects(self):
 
         # Create an API using project 'openstack1'.
         # This is a non-admin API.
-        #
-        # NOTE(sdague): this is actually very much *not* how this
-        # fixture should be used. This actually spawns a whole
-        # additional API server. Should be addressed in the future.
-        api_openstack1 = self.useFixture(nova_fixtures.OSAPIFixture(
-            api_version=self.api_major_version,
-            project_id=PROJECT_ID_ALT)).api
+        api_openstack1 = self.api_fixture.alternative_api
+        api_openstack1.project_id = PROJECT_ID_ALT
         api_openstack1.microversion = self.microversion
 
         # Create a server group in project 'openstack'
diff --git a/nova/tests/functional/test_servers.py b/nova/tests/functional/test_servers.py
index e77d4bf1ea2..440195cd196 100644
--- a/nova/tests/functional/test_servers.py
+++ b/nova/tests/functional/test_servers.py
@@ -1253,9 +1253,7 @@ def test_get_servers_detail_non_admin_with_deleted_flag(self):
     def test_get_servers_detail_filters(self):
         # We get the results only from the up cells, this ignoring the down
         # cells if list_records_by_skipping_down_cells config option is True.
-        api_fixture = self.useFixture(nova_fixtures.OSAPIFixture(
-            api_version='v2.1'))
-        self.admin_api = api_fixture.admin_api
+        self.admin_api = self.api_fixture.admin_api
         self.admin_api.microversion = '2.69'
         servers = self.admin_api.get_servers(
             search_opts={'hostname': "cell3-inst0"})
@@ -1263,9 +1261,7 @@ def test_get_servers_detail_filters(self):
         self.assertEqual(self.up_cell_insts[2], servers[0]['id'])
 
     def test_get_servers_detail_all_tenants_with_down_cells(self):
-        api_fixture = self.useFixture(nova_fixtures.OSAPIFixture(
-            api_version='v2.1'))
-        self.admin_api = api_fixture.admin_api
+        self.admin_api = self.api_fixture.admin_api
         self.admin_api.microversion = '2.69'
         servers = self.admin_api.get_servers(search_opts={'all_tenants': True})
         # 4 servers from the up cells and 4 servers from the down cells
@@ -1523,10 +1519,8 @@ class ServersTestV280(integrated_helpers._IntegratedTestBase):
 
     def setUp(self):
         super(ServersTestV280, self).setUp()
-        api_fixture = self.useFixture(nova_fixtures.OSAPIFixture(
-            api_version='v2.1'))
-        self.api = api_fixture.api
-        self.admin_api = api_fixture.admin_api
+        self.api = self.api_fixture.api
+        self.admin_api = self.api_fixture.admin_api
 
         self.api.microversion = '2.80'
         self.admin_api.microversion = '2.80'
@@ -1585,9 +1579,8 @@ def test_get_migrations_after_live_migrate_server_in_different_project(
 
         project_id_1 = '4906260553374bf0a5d566543b320516'
         project_id_2 = 'c850298c1b6b4796a8f197ac310b2469'
-        new_api_fixture = self.useFixture(nova_fixtures.OSAPIFixture(
-            api_version=self.api_major_version, project_id=project_id_1))
-        new_admin_api = new_api_fixture.admin_api
+        new_admin_api = self.api_fixture.alternative_admin_api
+        new_admin_api.project_id = project_id_1
         new_admin_api.microversion = '2.80'
 
         post = {
diff --git a/nova/tests/unit/api/openstack/compute/test_create_backup.py b/nova/tests/unit/api/openstack/compute/test_create_backup.py
index f7280a5a370..70978d11dea 100644
--- a/nova/tests/unit/api/openstack/compute/test_create_backup.py
+++ b/nova/tests/unit/api/openstack/compute/test_create_backup.py
@@ -40,10 +40,6 @@ def setUp(self):
         self.controller = getattr(self.create_backup, self.controller_name)()
         self.compute_api = self.controller.compute_api
 
-        patch_get = mock.patch.object(self.compute_api, 'get')
-        self.mock_get = patch_get.start()
-        self.addCleanup(patch_get.stop)
-
     @mock.patch.object(common, 'check_img_metadata_properties_quota')
     @mock.patch.object(api.API, 'backup')
     def test_create_backup_with_metadata(self, mock_backup, mock_check_image):
diff --git a/nova/tests/unit/api/openstack/compute/test_migrate_server.py b/nova/tests/unit/api/openstack/compute/test_migrate_server.py
index 683759eccc5..325b4927b30 100644
--- a/nova/tests/unit/api/openstack/compute/test_migrate_server.py
+++ b/nova/tests/unit/api/openstack/compute/test_migrate_server.py
@@ -530,9 +530,8 @@ def _test_migrate_validation_error(self, body):
                           self.req, fakes.FAKE_UUID, body=body)
 
     def _test_migrate_exception(self, exc_info, expected_result):
-        @mock.patch.object(self.compute_api, 'get')
         @mock.patch.object(self.compute_api, 'resize', side_effect=exc_info)
-        def _test(mock_resize, mock_get):
+        def _test(mock_resize):
             instance = objects.Instance(uuid=uuids.instance)
             self.assertRaises(expected_result,
                               self.controller._migrate,
diff --git a/nova/tests/unit/api/openstack/compute/test_quotas.py b/nova/tests/unit/api/openstack/compute/test_quotas.py
index 6cb8d9c7adb..7e4f9d13747 100644
--- a/nova/tests/unit/api/openstack/compute/test_quotas.py
+++ b/nova/tests/unit/api/openstack/compute/test_quotas.py
@@ -882,7 +882,8 @@ def setUp(self):
                      local_limit.KEY_PAIRS: 100,
                      local_limit.SERVER_GROUPS: 12,
                      local_limit.SERVER_GROUP_MEMBERS: 10}
-        self.useFixture(limit_fixture.LimitFixture(reglimits, {}))
+        self.limit_fixture = self.useFixture(
+            limit_fixture.LimitFixture(reglimits, {}))
 
     @mock.patch.object(placement_limit, "get_legacy_project_limits")
     def test_show_v21(self, mock_proj):
@@ -1098,7 +1099,7 @@ def test_defaults_v21_different_limit_values(self):
                      local_limit.KEY_PAIRS: 1,
                      local_limit.SERVER_GROUPS: 3,
                      local_limit.SERVER_GROUP_MEMBERS: 2}
-        self.useFixture(limit_fixture.LimitFixture(reglimits, {}))
+        self.limit_fixture.reglimits = reglimits
 
         req = fakes.HTTPRequest.blank("")
         response = self.controller.defaults(req, uuids.project_id)
diff --git a/nova/tests/unit/api/openstack/compute/test_server_group_quotas.py b/nova/tests/unit/api/openstack/compute/test_server_group_quotas.py
index a0404baffcf..81d1939e716 100644
--- a/nova/tests/unit/api/openstack/compute/test_server_group_quotas.py
+++ b/nova/tests/unit/api/openstack/compute/test_server_group_quotas.py
@@ -209,7 +209,8 @@ def setUp(self):
         self.flags(driver='nova.quota.UnifiedLimitsDriver', group='quota')
         self.req = fakes.HTTPRequest.blank('')
         self.controller = sg_v21.ServerGroupController()
-        self.useFixture(limit_fixture.LimitFixture({'server_groups': 10}, {}))
+        self.limit_fixture = self.useFixture(
+            limit_fixture.LimitFixture({'server_groups': 10}, {}))
 
     @mock.patch('nova.limit.local.enforce_db_limit')
     def test_create_server_group_during_recheck(self, mock_enforce):
@@ -236,7 +237,7 @@ def test_create_server_group_recheck_disabled(self, mock_enforce):
                                              delta=1)
 
     def test_create_group_fails_with_zero_quota(self):
-        self.useFixture(limit_fixture.LimitFixture({'server_groups': 0}, {}))
+        self.limit_fixture.reglimits = {'server_groups': 0}
         sgroup = {'name': 'test', 'policies': ['anti-affinity']}
         exc = self.assertRaises(webob.exc.HTTPForbidden,
                                 self.controller.create,
@@ -245,7 +246,7 @@ def test_create_group_fails_with_zero_quota(self):
         self.assertIn(msg, str(exc))
 
     def test_create_only_one_group_when_limit_is_one(self):
-        self.useFixture(limit_fixture.LimitFixture({'server_groups': 1}, {}))
+        self.limit_fixture.reglimits = {'server_groups': 1}
         policies = ['anti-affinity']
         sgroup = {'name': 'test', 'policies': policies}
         res_dict = self.controller.create(
diff --git a/nova/tests/unit/api/openstack/compute/test_servers.py b/nova/tests/unit/api/openstack/compute/test_servers.py
index 31739ed7ab2..4e2a694e15f 100644
--- a/nova/tests/unit/api/openstack/compute/test_servers.py
+++ b/nova/tests/unit/api/openstack/compute/test_servers.py
@@ -2087,10 +2087,10 @@ def _get_server_data_dict(self, uuid, image_bookmark, flavor_bookmark,
 
         return server_dict
 
-    @mock.patch('nova.compute.api.API.get_instance_host_status')
-    def _verify_host_status_policy_behavior(self, func, mock_get_host_status):
+    def _verify_host_status_policy_behavior(self, func):
         # Set policy to disallow both host_status cases and verify we don't
         # call the get_instance_host_status compute RPC API.
+        self.mock_get_instance_host_status.reset_mock()
         rules = {
             'os_compute_api:servers:show:host_status': '!',
             'os_compute_api:servers:show:host_status:unknown-only': '!',
@@ -2098,7 +2098,7 @@ def _verify_host_status_policy_behavior(self, func, mock_get_host_status):
         orig_rules = policy.get_rules()
         policy.set_rules(oslo_policy.Rules.from_dict(rules), overwrite=False)
         func()
-        mock_get_host_status.assert_not_called()
+        self.mock_get_instance_host_status.assert_not_called()
         # Restore the original rules.
         policy.set_rules(orig_rules)
 
@@ -2638,15 +2638,13 @@ class ServersControllerTestV275(ControllerTest):
 
     microversion = '2.75'
 
-    @mock.patch('nova.compute.api.API.get_all')
-    def test_get_servers_additional_query_param_old_version(self, mock_get):
+    def test_get_servers_additional_query_param_old_version(self):
         req = fakes.HTTPRequest.blank(self.path_with_query % 'unknown=1',
                                       use_admin_context=True,
                                       version='2.74')
         self.controller.index(req)
 
-    @mock.patch('nova.compute.api.API.get_all')
-    def test_get_servers_ignore_sort_key_old_version(self, mock_get):
+    def test_get_servers_ignore_sort_key_old_version(self):
         req = fakes.HTTPRequest.blank(
                 self.path_with_query % 'sort_key=deleted',
                 use_admin_context=True, version='2.74')
@@ -3584,13 +3582,13 @@ def setUp(self):
             },
         }
 
-    @mock.patch('nova.compute.api.API.get')
-    def _rebuild_server(self, mock_get, certs=None,
-                        conf_enabled=True, conf_certs=None):
+    def _rebuild_server(self, certs=None, conf_enabled=True, conf_certs=None):
         ctx = self.req.environ['nova.context']
-        mock_get.return_value = fakes.stub_instance_obj(ctx,
-            vm_state=vm_states.ACTIVE, trusted_certs=certs,
-            project_id=self.req_project_id, user_id=self.req_user_id)
+        self.mock_get.side_effect = None
+        self.mock_get.return_value = fakes.stub_instance_obj(
+            ctx, vm_state=vm_states.ACTIVE, trusted_certs=certs,
+            project_id=self.req_project_id, user_id=self.req_user_id
+        )
 
         self.flags(default_trusted_certificate_ids=conf_certs, group='glance')
 
@@ -3743,10 +3741,10 @@ def setUp(self):
             }
         }
 
-    @mock.patch('nova.compute.api.API.get')
-    def _rebuild_server(self, mock_get):
+    def _rebuild_server(self):
         ctx = self.req.environ['nova.context']
-        mock_get.return_value = fakes.stub_instance_obj(ctx,
+        self.mock_get.side_effect = None
+        self.mock_get.return_value = fakes.stub_instance_obj(ctx,
             vm_state=vm_states.ACTIVE, project_id=self.req_project_id,
             user_id=self.req_user_id)
         server = self.controller._action_rebuild(
diff --git a/nova/tests/unit/api/openstack/compute/test_volumes.py b/nova/tests/unit/api/openstack/compute/test_volumes.py
index a24c104c933..14d27d85460 100644
--- a/nova/tests/unit/api/openstack/compute/test_volumes.py
+++ b/nova/tests/unit/api/openstack/compute/test_volumes.py
@@ -1889,8 +1889,7 @@ def test_assisted_delete_missing_delete_info(self):
                 req, '5')
 
     def _test_assisted_delete_instance_conflict(self, api_error):
-        # unset the stub on volume_snapshot_delete from setUp
-        self.mock_volume_snapshot_delete.stop()
+        self.mock_volume_snapshot_delete.side_effect = api_error
         params = {
             'delete_info': jsonutils.dumps({'volume_id': '1'}),
         }
@@ -1899,10 +1898,9 @@ def _test_assisted_delete_instance_conflict(self, api_error):
                 urllib.parse.urlencode(params),
                 version=self.microversion)
         req.method = 'DELETE'
-        with mock.patch.object(compute_api.API, 'volume_snapshot_delete',
-                               side_effect=api_error):
-            self.assertRaises(
-                webob.exc.HTTPBadRequest, self.controller.delete, req, '5')
+
+        self.assertRaises(
+            webob.exc.HTTPBadRequest, self.controller.delete, req, '5')
 
     def test_assisted_delete_instance_invalid_state(self):
         api_error = exception.InstanceInvalidState(
diff --git a/nova/tests/unit/compute/test_api.py b/nova/tests/unit/compute/test_api.py
index 9e85ef633d3..14bb80c4c60 100644
--- a/nova/tests/unit/compute/test_api.py
+++ b/nova/tests/unit/compute/test_api.py
@@ -967,6 +967,31 @@ def _set_delete_shelved_part(self, inst, mock_image_delete):
 
         return snapshot_id
 
+    def _test_delete(self, delete_type, **attrs):
+        delete_time = datetime.datetime(
+            1955, 11, 5, 9, 30, tzinfo=iso8601.UTC)
+        timeutils.set_time_override(delete_time)
+        self.addCleanup(timeutils.clear_time_override)
+
+        with test.nested(
+            mock.patch.object(
+                self.compute_api.compute_rpcapi, 'confirm_resize'),
+            mock.patch.object(
+                self.compute_api.compute_rpcapi, 'terminate_instance'),
+            mock.patch.object(
+                self.compute_api.compute_rpcapi, 'soft_delete_instance'),
+        ) as (
+            mock_confirm, mock_terminate, mock_soft_delete
+        ):
+            self._do_delete(
+                delete_type,
+                mock_confirm,
+                mock_terminate,
+                mock_soft_delete,
+                delete_time,
+                **attrs
+            )
+
     @mock.patch.object(compute_utils,
                        'notify_about_instance_action')
     @mock.patch.object(objects.Migration, 'get_by_instance_and_status')
@@ -986,12 +1011,13 @@ def _set_delete_shelved_part(self, inst, mock_image_delete):
     @mock.patch.object(objects.BlockDeviceMappingList,
                        'get_by_instance_uuid', return_value=[])
     @mock.patch.object(objects.Instance, 'save')
-    def _test_delete(self, delete_type, mock_save, mock_bdm_get, mock_elevated,
-                     mock_get_cn, mock_up, mock_record, mock_inst_update,
-                     mock_deallocate, mock_inst_meta, mock_inst_destroy,
-                     mock_notify_legacy, mock_get_inst,
-                     mock_save_im, mock_image_delete, mock_mig_get,
-                     mock_notify, **attrs):
+    def _do_delete(
+        self, delete_type, mock_confirm, mock_terminate, mock_soft_delete,
+        delete_time, mock_save, mock_bdm_get, mock_elevated, mock_get_cn,
+        mock_up, mock_record, mock_inst_update, mock_deallocate,
+        mock_inst_meta, mock_inst_destroy, mock_notify_legacy, mock_get_inst,
+        mock_save_im, mock_image_delete, mock_mig_get, mock_notify, **attrs
+    ):
         expected_save_calls = [mock.call()]
         expected_record_calls = []
         expected_elevated_calls = []
@@ -1001,17 +1027,11 @@ def _test_delete(self, delete_type, mock_save, mock_bdm_get, mock_elevated,
         deltas = {'instances': -1,
                   'cores': -inst.flavor.vcpus,
                   'ram': -inst.flavor.memory_mb}
-        delete_time = datetime.datetime(1955, 11, 5, 9, 30,
-                                        tzinfo=iso8601.UTC)
-        self.useFixture(utils_fixture.TimeFixture(delete_time))
         task_state = (delete_type == 'soft_delete' and
                       task_states.SOFT_DELETING or task_states.DELETING)
         updates = {'progress': 0, 'task_state': task_state}
         if delete_type == 'soft_delete':
             updates['deleted_at'] = delete_time
-        rpcapi = self.compute_api.compute_rpcapi
-        mock_confirm = self.useFixture(
-            fixtures.MockPatchObject(rpcapi, 'confirm_resize')).mock
 
         def _reset_task_state(context, instance, migration, src_host,
                               cast=False):
@@ -1026,11 +1046,6 @@ def _reset_task_state(context, instance, migration, src_host,
             snapshot_id = self._set_delete_shelved_part(inst,
                                                         mock_image_delete)
 
-        mock_terminate = self.useFixture(
-            fixtures.MockPatchObject(rpcapi, 'terminate_instance')).mock
-        mock_soft_delete = self.useFixture(
-            fixtures.MockPatchObject(rpcapi, 'soft_delete_instance')).mock
-
         if inst.task_state == task_states.RESIZE_FINISH:
             self._test_delete_resizing_part(inst, deltas)
 
@@ -2636,9 +2651,6 @@ def test_pause(self, mock_save, mock_record):
 
         rpcapi = self.compute_api.compute_rpcapi
 
-        mock_pause = self.useFixture(
-            fixtures.MockPatchObject(rpcapi, 'pause_instance')).mock
-
         with mock.patch.object(rpcapi, 'pause_instance') as mock_pause:
             self.compute_api.pause(self.context, instance)
 
diff --git a/nova/tests/unit/policies/test_servers.py b/nova/tests/unit/policies/test_servers.py
index 3ed4bfe085a..2130c62e5f2 100644
--- a/nova/tests/unit/policies/test_servers.py
+++ b/nova/tests/unit/policies/test_servers.py
@@ -1229,10 +1229,9 @@ def fake_create(context, *args, **kwargs):
     @mock.patch('nova.compute.api.API._allow_resize_to_same_host')
     @mock.patch('nova.objects.RequestSpec.get_by_instance_uuid')
     @mock.patch('nova.objects.Instance.save')
-    @mock.patch('nova.api.openstack.common.get_instance')
     @mock.patch('nova.conductor.ComputeTaskAPI.resize_instance')
     def test_cross_cell_resize_server_policy(
-        self, mock_resize, mock_get, mock_save, mock_rs, mock_allow, m_net
+        self, mock_resize, mock_save, mock_rs, mock_allow, m_net
     ):
 
         # 'migrate' policy is checked before 'resize:cross_cell' so
@@ -1262,7 +1261,7 @@ def fake_get(*args, **kwargs):
             )
             return inst
 
-        mock_get.side_effect = fake_get
+        self.mock_get.side_effect = fake_get
 
         def fake_validate(context, instance,
             host_name, allow_cross_cell_resize):
diff --git a/nova/tests/unit/virt/ironic/test_driver.py b/nova/tests/unit/virt/ironic/test_driver.py
index 7b377b21c21..0b1cc7d47fc 100644
--- a/nova/tests/unit/virt/ironic/test_driver.py
+++ b/nova/tests/unit/virt/ironic/test_driver.py
@@ -2597,9 +2597,6 @@ def setUp(self):
         # that the thread completes.
         self.useFixture(nova_fixtures.SpawnIsSynchronousFixture())
 
-        self.mock_conn = self.useFixture(
-            fixtures.MockPatchObject(self.driver, '_ironic_connection')).mock
-
     @mock.patch.object(loopingcall, 'FixedIntervalLoopingCall')
     @mock.patch.object(FAKE_CLIENT.node, 'set_provision_state')
     def test_rescue(self, mock_sps, mock_looping):
diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py
index 45526830f23..33d16851b4c 100644
--- a/nova/tests/unit/virt/libvirt/test_driver.py
+++ b/nova/tests/unit/virt/libvirt/test_driver.py
@@ -740,16 +740,14 @@ def setUp(self):
                       'resolve_driver_format',
                       imagebackend.Image._get_driver_format)
 
-        self.useFixture(nova_fixtures.LibvirtFixture())
+        self.libvirt = self.useFixture(nova_fixtures.LibvirtFixture())
 
         # ensure tests perform the same on all host architectures; this is
         # already done by the fakelibvirt fixture but we want to change the
         # architecture in some tests
-        _p = mock.patch('os.uname')
-        self.mock_uname = _p.start()
+        self.mock_uname = self.libvirt.mock_uname
         self.mock_uname.return_value = fakelibvirt.os_uname(
             'Linux', '', '5.4.0-0-generic', '', fields.Architecture.X86_64)
-        self.addCleanup(_p.stop)
 
         self.test_instance = _create_test_instance()
         network_info = objects.InstanceInfoCache(
@@ -2260,6 +2258,8 @@ def test_device_metadata(self, mock_version):
         instance_ref.info_cache = objects.InstanceInfoCache(
             network_info=network_info)
 
+        pci_utils.get_mac_by_pci_address.side_effect = None
+        pci_utils.get_mac_by_pci_address.return_value = 'da:d1:f2:91:95:c1'
         with test.nested(
             mock.patch('nova.objects.VirtualInterfaceList'
                        '.get_by_instance_uuid', return_value=vifs),
@@ -2269,8 +2269,7 @@ def test_device_metadata(self, mock_version):
                        return_value=guest),
             mock.patch.object(nova.virt.libvirt.guest.Guest, 'get_xml_desc',
                               return_value=xml),
-            mock.patch.object(pci_utils, 'get_mac_by_pci_address',
-                              return_value='da:d1:f2:91:95:c1')):
+        ):
             metadata_obj = drvr._build_device_metadata(self.context,
                                                        instance_ref)
             metadata = metadata_obj.devices
@@ -15997,9 +15996,10 @@ def test_get_host_ip_addr(self):
         self.assertEqual(ip, CONF.my_ip)
 
     @mock.patch.object(libvirt_driver.LOG, 'warning')
-    @mock.patch('nova.compute.utils.get_machine_ips')
-    def test_check_my_ip(self, mock_ips, mock_log):
-        mock_ips.return_value = ['8.8.8.8', '75.75.75.75']
+    def test_check_my_ip(self, mock_log):
+
+        self.libvirt.mock_get_machine_ips.return_value = [
+            '8.8.8.8', '75.75.75.75']
         drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
         drvr._check_my_ip()
         mock_log.assert_called_once_with(u'my_ip address (%(my_ip)s) was '
@@ -16021,6 +16021,7 @@ def test_conn_event_handler(self):
         drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), True)
         service_mock = mock.MagicMock()
         service_mock.disabled.return_value = False
+        drvr._host._init_events.return_value = None
         with test.nested(
             mock.patch.object(drvr._host, "_connect",
                               side_effect=fakelibvirt.make_libvirtError(
@@ -16028,8 +16029,6 @@ def test_conn_event_handler(self):
                                   "Failed to connect to host",
                                   error_code=
                                   fakelibvirt.VIR_ERR_INTERNAL_ERROR)),
-            mock.patch.object(drvr._host, "_init_events",
-                              return_value=None),
             mock.patch.object(objects.Service, "get_by_compute_host",
                               return_value=service_mock)):
 
@@ -16044,6 +16043,7 @@ def test_command_with_broken_connection(self):
         drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), True)
         service_mock = mock.MagicMock()
         service_mock.disabled.return_value = False
+        drvr._host._init_events.return_value = None
         with test.nested(
             mock.patch.object(drvr._host, "_connect",
                               side_effect=fakelibvirt.make_libvirtError(
@@ -16051,8 +16051,6 @@ def test_command_with_broken_connection(self):
                                   "Failed to connect to host",
                                   error_code=
                                   fakelibvirt.VIR_ERR_INTERNAL_ERROR)),
-            mock.patch.object(drvr._host, "_init_events",
-                              return_value=None),
             mock.patch.object(host.Host, "has_min_version",
                               return_value=True),
             mock.patch.object(drvr, "_do_quality_warnings",
@@ -16072,11 +16070,10 @@ def test_service_resume_after_broken_connection(self):
         drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
         service_mock = mock.MagicMock()
         service_mock.disabled.return_value = True
+        drvr._host._init_events.return_value = None
         with test.nested(
             mock.patch.object(drvr._host, "_connect",
                               return_value=mock.MagicMock()),
-            mock.patch.object(drvr._host, "_init_events",
-                              return_value=None),
             mock.patch.object(host.Host, "has_min_version",
                               return_value=True),
             mock.patch.object(drvr, "_do_quality_warnings",
@@ -17564,12 +17561,11 @@ def get_host_capabilities_stub(self):
         got = drvr._get_cpu_info()
         self.assertEqual(want, got)
 
-    @mock.patch.object(pci_utils, 'get_ifname_by_pci_address',
-                return_value='ens1')
     @mock.patch.object(host.Host, 'list_pci_devices',
                        return_value=['pci_0000_04_00_3', 'pci_0000_04_10_7',
                                      'pci_0000_04_11_7'])
-    def test_get_pci_passthrough_devices(self, mock_list, mock_get_ifname):
+    def test_get_pci_passthrough_devices(self, mock_list):
+        pci_utils.get_ifname_by_pci_address.return_value = 'ens1'
 
         drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
 
@@ -17642,7 +17638,7 @@ def test_get_pci_passthrough_devices(self, mock_list, mock_get_ifname):
 
         # The first call for every VF is to determine parent_ifname and
         # the second call to determine the MAC address.
-        mock_get_ifname.assert_has_calls([
+        pci_utils.get_ifname_by_pci_address.assert_has_calls([
             mock.call('0000:04:10.7', pf_interface=True),
             mock.call('0000:04:11.7', pf_interface=True),
         ])
diff --git a/nova/tests/unit/virt/libvirt/test_host.py b/nova/tests/unit/virt/libvirt/test_host.py
index 741d834fb5e..1ac1fe21334 100644
--- a/nova/tests/unit/virt/libvirt/test_host.py
+++ b/nova/tests/unit/virt/libvirt/test_host.py
@@ -71,11 +71,10 @@ def setUp(self):
         self.useFixture(nova_fixtures.LibvirtFixture())
         self.host = host.Host("qemu:///system")
 
-    @mock.patch("nova.virt.libvirt.host.Host._init_events")
-    def test_repeat_initialization(self, mock_init_events):
+    def test_repeat_initialization(self):
         for i in range(3):
             self.host.initialize()
-        mock_init_events.assert_called_once_with()
+        self.host._init_events.assert_called_once_with()
 
     @mock.patch.object(fakelibvirt.virConnect, "registerCloseCallback")
     def test_close_callback(self, mock_close):
@@ -1113,8 +1112,9 @@ def test_get_pcinet_info(self):
         expect_vf = ["rx", "tx", "sg", "tso", "gso", "gro", "rxvlan", "txvlan"]
         self.assertEqual(expect_vf, actualvf)
 
-    @mock.patch.object(pci_utils, 'get_ifname_by_pci_address')
-    def test_get_pcidev_info_non_nic(self, mock_get_ifname):
+    def test_get_pcidev_info_non_nic(self):
+        pci_utils.get_mac_by_pci_address.side_effect = (
+            exception.PciDeviceNotFoundById('0000:04:00.3'))
         dev_name = "pci_0000_04_11_7"
         pci_dev = fakelibvirt.NodeDevice(
             self.host._get_connection(),
@@ -1128,11 +1128,10 @@ def test_get_pcidev_info_non_nic(self, mock_get_ifname):
             'parent_addr': '0000:04:00.3',
         }
         self.assertEqual(expect_vf, actual_vf)
-        mock_get_ifname.assert_not_called()
+        pci_utils.get_ifname_by_pci_address.assert_not_called()
 
-    @mock.patch.object(pci_utils, 'get_ifname_by_pci_address',
-                return_value='ens1')
-    def test_get_pcidev_info(self, mock_get_ifname):
+    def test_get_pcidev_info(self):
+        pci_utils.get_ifname_by_pci_address.return_value = 'ens1'
         devs = {
             "pci_0000_04_00_3", "pci_0000_04_10_7", "pci_0000_04_11_7",
             "pci_0000_04_00_1", "pci_0000_03_00_0", "pci_0000_03_00_1",
diff --git a/nova/tests/unit/virt/libvirt/test_vif.py b/nova/tests/unit/virt/libvirt/test_vif.py
index 43504efeb53..697300b9cfc 100644
--- a/nova/tests/unit/virt/libvirt/test_vif.py
+++ b/nova/tests/unit/virt/libvirt/test_vif.py
@@ -517,18 +517,17 @@ def setup_os_vif_objects(self):
     def setUp(self):
         super(LibvirtVifTestCase, self).setUp()
 
-        self.useFixture(nova_fixtures.LibvirtFixture(stub_os_vif=False))
+        self.libvirt = self.useFixture(
+            nova_fixtures.LibvirtFixture(stub_os_vif=False))
 
         # os_vif.initialize is typically done in nova-compute startup
         os_vif.initialize()
         self.setup_os_vif_objects()
 
         # multiqueue configuration is host OS specific
-        _a = mock.patch('os.uname')
-        self.mock_uname = _a.start()
+        self.mock_uname = self.libvirt.mock_uname
         self.mock_uname.return_value = fakelibvirt.os_uname(
             'Linux', '', '5.10.13-200-generic', '', 'x86_64')
-        self.addCleanup(_a.stop)
 
     def _get_node(self, xml):
         doc = etree.fromstring(xml)
@@ -983,14 +982,9 @@ def test_generic_driver_bridge(self):
                                   self.vif_bridge,
                                   self.vif_bridge['network']['bridge'])
 
-    @mock.patch.object(pci_utils, 'get_ifname_by_pci_address')
-    @mock.patch.object(pci_utils, 'get_vf_num_by_pci_address', return_value=1)
-    @mock.patch('nova.privsep.linux_net.set_device_macaddr')
-    @mock.patch('nova.privsep.linux_net.set_device_macaddr_and_vlan')
-    def _test_hw_veb_op(self, op, vlan, mock_set_macaddr_and_vlan,
-                        mock_set_macaddr, mock_get_vf_num,
-                        mock_get_ifname):
-        mock_get_ifname.side_effect = ['eth1', 'eth13']
+    def _test_hw_veb_op(self, op, vlan):
+        self.libvirt.mock_get_vf_num_by_pci_address.return_value = 1
+        pci_utils.get_ifname_by_pci_address.side_effect = ['eth1', 'eth13']
         vlan_id = int(vlan)
         port_state = 'up' if vlan_id > 0 else 'down'
         mac = ('00:00:00:00:00:00' if op.__name__ == 'unplug'
@@ -1005,10 +999,13 @@ def _test_hw_veb_op(self, op, vlan, mock_set_macaddr_and_vlan,
             'set_macaddr': [mock.call('eth13', mac, port_state=port_state)]
         }
         op(self.instance, self.vif_hw_veb_macvtap)
-        mock_get_ifname.assert_has_calls(calls['get_ifname'])
-        mock_get_vf_num.assert_has_calls(calls['get_vf_num'])
-        mock_set_macaddr.assert_has_calls(calls['set_macaddr'])
-        mock_set_macaddr_and_vlan.assert_called_once_with(
+        pci_utils.get_ifname_by_pci_address.assert_has_calls(
+            calls['get_ifname'])
+        self.libvirt.mock_get_vf_num_by_pci_address.assert_has_calls(
+            calls['get_vf_num'])
+        self.libvirt.mock_set_device_macaddr.assert_has_calls(
+            calls['set_macaddr'])
+        self.libvirt.mock_set_device_macaddr_and_vlan.assert_called_once_with(
             'eth1', 1, mock.ANY, vlan_id)
 
     def test_plug_hw_veb(self):
@@ -1218,9 +1215,8 @@ def test_hostdev_physical_driver(self):
         self.assertEqual(1, len(node))
         self._assertPciEqual(node, self.vif_hostdev_physical)
 
-    @mock.patch.object(pci_utils, 'get_ifname_by_pci_address',
-                       return_value='eth1')
-    def test_hw_veb_driver_macvtap(self, mock_get_ifname):
+    def test_hw_veb_driver_macvtap(self):
+        pci_utils.get_ifname_by_pci_address.return_value = 'eth1'
         d = vif.LibvirtGenericVIFDriver()
         xml = self._get_instance_xml(d, self.vif_hw_veb_macvtap)
         node = self._get_node(xml)

From f98858aa77e4443164fc09fae3667fb0f66edfbf Mon Sep 17 00:00:00 2001
From: Balazs Gibizer <gibi@redhat.com>
Date: Wed, 20 Jul 2022 12:03:45 +0200
Subject: [PATCH 38/93] Add compute restart capability for libvirt func tests

The existing generic restart_compute_service() call in the nova test
base class is not appropriate for the libvirt functional test that needs
to reconfigure the libvirt connection as it is not aware of the libvirt
specific mocking needed when a compute service is started.

So this patch adds a specific restart_compute_service() call
to nova.tests.functional.libvirt.base.ServersTestBase. This will be used
by a later patch testing [pci]device_spec reconfiguration scenarios.

This change showed that some of the existing libvirt functional tests
used the incomplete restart_compute_service from the base class. Others
used local mocking to inject new pci config to the restart. I moved all
these to the new function and removed the local mocking.

Change-Id: Ic717dc42ac6b6cace59d344acaf12f9d1ee35564
(cherry picked from commit 57c253a609e859fa21ba05b264f0ba4d0ade7b8b)
---
 nova/tests/functional/libvirt/base.py         | 100 ++++++++++++++++--
 .../libvirt/test_device_bus_migration.py      |   8 +-
 .../libvirt/test_numa_live_migration.py       |  12 +--
 .../functional/libvirt/test_numa_servers.py   |   6 +-
 .../libvirt/test_pci_sriov_servers.py         |  24 ++---
 nova/tests/functional/libvirt/test_reshape.py |   6 +-
 nova/tests/functional/libvirt/test_vgpu.py    |  25 +++--
 7 files changed, 127 insertions(+), 54 deletions(-)

diff --git a/nova/tests/functional/libvirt/base.py b/nova/tests/functional/libvirt/base.py
index b2f0095f818..68c6e294c16 100644
--- a/nova/tests/functional/libvirt/base.py
+++ b/nova/tests/functional/libvirt/base.py
@@ -114,7 +114,7 @@ def _get_connection(
     def start_compute(
         self, hostname='compute1', host_info=None, pci_info=None,
         mdev_info=None, vdpa_info=None, libvirt_version=None,
-        qemu_version=None,
+        qemu_version=None, cell_name=None, connection=None
     ):
         """Start a compute service.
 
@@ -124,16 +124,35 @@ def start_compute(
         :param host_info: A fakelibvirt.HostInfo object for the host. Defaults
             to a HostInfo with 2 NUMA nodes, 2 cores per node, 2 threads per
             core, and 16GB of RAM.
+        :param connection: A fake libvirt connection. You should not provide it
+            directly. However it is used by restart_compute_service to
+            implement restart without losing the hypervisor state.
         :returns: The hostname of the created service, which can be used to
             lookup the created service and UUID of the assocaited resource
             provider.
         """
+        if connection and (
+            host_info or
+            pci_info or
+            mdev_info or
+            vdpa_info or
+            libvirt_version or
+            qemu_version
+        ):
+            raise ValueError(
+                "Either an existing connection instance can be provided or a "
+                "list of parameters for a new connection"
+            )
 
         def _start_compute(hostname, host_info):
-            fake_connection = self._get_connection(
-                host_info, pci_info, mdev_info, vdpa_info, libvirt_version,
-                qemu_version, hostname,
-            )
+            if connection:
+                fake_connection = connection
+            else:
+                fake_connection = self._get_connection(
+                    host_info, pci_info, mdev_info, vdpa_info, libvirt_version,
+                    qemu_version, hostname,
+                )
+
             # If the compute is configured with PCI devices then we need to
             # make sure that the stubs around sysfs has the MAC address
             # information for the PCI PF devices
@@ -144,7 +163,8 @@ def _start_compute(hostname, host_info):
             # actually start the service.
             orig_con = self.mock_conn.return_value
             self.mock_conn.return_value = fake_connection
-            compute = self.start_service('compute', host=hostname)
+            compute = self.start_service(
+                'compute', host=hostname, cell_name=cell_name)
             # Once that's done, we need to tweak the compute "service" to
             # make sure it returns unique objects.
             compute.driver._host.get_connection = lambda: fake_connection
@@ -165,6 +185,74 @@ def _start_compute(hostname, host_info):
 
         return hostname
 
+    def restart_compute_service(
+        self,
+        hostname,
+        host_info=None,
+        pci_info=None,
+        mdev_info=None,
+        vdpa_info=None,
+        libvirt_version=None,
+        qemu_version=None,
+        keep_hypervisor_state=True,
+    ):
+        """Stops the service and starts a new one to have realistic restart
+
+        :param hostname: the hostname of the nova-compute service to be
+            restarted
+        :param keep_hypervisor_state: If True then we reuse the fake connection
+            from the existing driver. If False a new connection will be created
+            based on the other parameters provided
+        """
+        # We are intentionally not calling super() here. Nova's base test class
+        # defines starting and restarting the compute service with very
+        # different signatures, and those calls cannot be made aware of
+        # the intricacies of the libvirt fixture. So we simply hide that
+        # implementation.
+
+        if keep_hypervisor_state and (
+            host_info or
+            pci_info or
+            mdev_info or
+            vdpa_info or
+            libvirt_version or
+            qemu_version
+        ):
+            raise ValueError(
+                "Either keep_hypervisor_state=True or a list of libvirt "
+                "parameters can be provided but not both"
+            )
+
+        compute = self.computes.pop(hostname)
+        self.compute_rp_uuids.pop(hostname)
+
+        # NOTE(gibi): The service interface cannot be used to simulate a real
+        # service restart as the manager object will not be recreated after a
+        # service.stop() and service.start() therefore the manager state will
+        # survive. For example the resource tracker will not be recreated after
+        # a stop start. The service.kill() call cannot help as it deletes
+        # the service from the DB which is unrealistic and causes that some
+        # operation that refers to the killed host (e.g. evacuate) fails.
+        # So this helper method will stop the original service and then starts
+        # a brand new compute service for the same host and node. This way
+        # a new ComputeManager instance will be created and initialized during
+        # the service startup.
+        compute.stop()
+
+        # this service was running previously, so we have to make sure that
+        # we restart it in the same cell
+        cell_name = self.host_mappings[compute.host].cell_mapping.name
+
+        old_connection = compute.manager.driver._get_connection()
+
+        self.start_compute(
+            hostname, host_info, pci_info, mdev_info, vdpa_info,
+            libvirt_version, qemu_version, cell_name,
+            old_connection if keep_hypervisor_state else None
+        )
+
+        return self.computes[hostname]
+
 
 class LibvirtMigrationMixin(object):
     """A simple mixin to facilliate successful libvirt live migrations
diff --git a/nova/tests/functional/libvirt/test_device_bus_migration.py b/nova/tests/functional/libvirt/test_device_bus_migration.py
index 82a0d4556e2..3852e31c68b 100644
--- a/nova/tests/functional/libvirt/test_device_bus_migration.py
+++ b/nova/tests/functional/libvirt/test_device_bus_migration.py
@@ -51,7 +51,7 @@ def _assert_stashed_image_properties(self, server_id, properties):
 
     def _assert_stashed_image_properties_persist(self, server, properties):
         # Assert the stashed properties persist across a host reboot
-        self.restart_compute_service(self.compute)
+        self.restart_compute_service(self.compute_hostname)
         self._assert_stashed_image_properties(server['id'], properties)
 
         # Assert the stashed properties persist across a guest reboot
@@ -173,7 +173,7 @@ def test_default_image_property_persists_across_host_flag_changes(self):
         self.flags(pointer_model='ps2mouse')
         # Restart compute to pick up ps2 setting, which means the guest will
         # not get a prescribed pointer device
-        self.restart_compute_service(self.compute)
+        self.restart_compute_service(self.compute_hostname)
 
         # Create a server with default image properties
         default_image_properties1 = {
@@ -187,7 +187,7 @@ def test_default_image_property_persists_across_host_flag_changes(self):
         # Assert the defaults persist across a host flag change
         self.flags(pointer_model='usbtablet')
         # Restart compute to pick up usb setting
-        self.restart_compute_service(self.compute)
+        self.restart_compute_service(self.compute_hostname)
         self._assert_stashed_image_properties(
             server1['id'], default_image_properties1)
 
@@ -216,7 +216,7 @@ def test_default_image_property_persists_across_host_flag_changes(self):
         # https://bugs.launchpad.net/nova/+bug/1866106
         self.flags(pointer_model=None)
         # Restart compute to pick up None setting
-        self.restart_compute_service(self.compute)
+        self.restart_compute_service(self.compute_hostname)
         self._assert_stashed_image_properties(
             server1['id'], default_image_properties1)
         self._assert_stashed_image_properties(
diff --git a/nova/tests/functional/libvirt/test_numa_live_migration.py b/nova/tests/functional/libvirt/test_numa_live_migration.py
index 2f3897d6b2b..0e504d2df25 100644
--- a/nova/tests/functional/libvirt/test_numa_live_migration.py
+++ b/nova/tests/functional/libvirt/test_numa_live_migration.py
@@ -206,10 +206,8 @@ def _test(self, pin_dest):
         # Increase cpu_dedicated_set to 0-3, expecting the live migrated server
         # to end up on 2,3.
         self.flags(cpu_dedicated_set='0-3', group='compute')
-        self.computes['host_a'] = self.restart_compute_service(
-            self.computes['host_a'])
-        self.computes['host_b'] = self.restart_compute_service(
-            self.computes['host_b'])
+        self.restart_compute_service('host_a')
+        self.restart_compute_service('host_b')
 
         # Live migrate, RPC-pinning the destination host if asked
         if pin_dest:
@@ -333,10 +331,8 @@ def _test(self, pin_dest=False):
         # Increase cpu_dedicated_set to 0-3, expecting the live migrated server
         # to end up on 2,3.
         self.flags(cpu_dedicated_set='0-3', group='compute')
-        self.computes['host_a'] = self.restart_compute_service(
-            self.computes['host_a'])
-        self.computes['host_b'] = self.restart_compute_service(
-            self.computes['host_b'])
+        self.restart_compute_service('host_a')
+        self.restart_compute_service('host_b')
 
         # Live migrate, RPC-pinning the destination host if asked. This is a
         # rollback test, so server_a is expected to remain on host_a.
diff --git a/nova/tests/functional/libvirt/test_numa_servers.py b/nova/tests/functional/libvirt/test_numa_servers.py
index fd09a11e20a..8fd97294040 100644
--- a/nova/tests/functional/libvirt/test_numa_servers.py
+++ b/nova/tests/functional/libvirt/test_numa_servers.py
@@ -1187,10 +1187,8 @@ def test_vcpu_to_pcpu_reshape(self):
         self.flags(cpu_dedicated_set='0-7', group='compute')
         self.flags(vcpu_pin_set=None)
 
-        computes = {}
-        for host, compute in self.computes.items():
-            computes[host] = self.restart_compute_service(compute)
-        self.computes = computes
+        for host in list(self.computes.keys()):
+            self.restart_compute_service(host)
 
         # verify that the inventory, usages and allocation are correct after
         # the reshape
diff --git a/nova/tests/functional/libvirt/test_pci_sriov_servers.py b/nova/tests/functional/libvirt/test_pci_sriov_servers.py
index c9d277f498d..49be70aa7bd 100644
--- a/nova/tests/functional/libvirt/test_pci_sriov_servers.py
+++ b/nova/tests/functional/libvirt/test_pci_sriov_servers.py
@@ -914,11 +914,8 @@ def test_create_server_after_change_in_nonsriov_pf_to_sriov_pf(self):
         # Disable SRIOV capabilties in PF and delete the VFs
         self._disable_sriov_in_pf(pci_info_no_sriov)
 
-        fake_connection = self._get_connection(pci_info=pci_info_no_sriov,
-                                               hostname='test_compute0')
-        self.mock_conn.return_value = fake_connection
-
-        self.compute = self.start_service('compute', host='test_compute0')
+        self.start_compute('test_compute0', pci_info=pci_info_no_sriov)
+        self.compute = self.computes['test_compute0']
 
         ctxt = context.get_admin_context()
         pci_devices = objects.PciDeviceList.get_by_compute_node(
@@ -930,13 +927,9 @@ def test_create_server_after_change_in_nonsriov_pf_to_sriov_pf(self):
         self.assertEqual(1, len(pci_devices))
         self.assertEqual('type-PCI', pci_devices[0].dev_type)
 
-        # Update connection with original pci info with sriov PFs
-        fake_connection = self._get_connection(pci_info=pci_info,
-                                               hostname='test_compute0')
-        self.mock_conn.return_value = fake_connection
-
-        # Restart the compute service
-        self.restart_compute_service(self.compute)
+        # Restart the compute service with sriov PFs
+        self.restart_compute_service(
+            self.compute.host, pci_info=pci_info, keep_hypervisor_state=False)
 
         # Verify if PCI devices are of type type-PF or type-VF
         pci_devices = objects.PciDeviceList.get_by_compute_node(
@@ -1021,10 +1014,9 @@ def _test_detach_attach(self, first_port_id, second_port_id):
         host_info = fakelibvirt.HostInfo(cpu_nodes=2, cpu_sockets=1,
                                          cpu_cores=2, cpu_threads=2)
         pci_info = fakelibvirt.HostPCIDevicesInfo(num_pfs=1, num_vfs=1)
-        fake_connection = self._get_connection(host_info, pci_info)
-        self.mock_conn.return_value = fake_connection
-
-        self.compute = self.start_service('compute', host='test_compute0')
+        self.start_compute(
+            'test_compute0', host_info=host_info, pci_info=pci_info)
+        self.compute = self.computes['test_compute0']
 
         # Create server with a port
         server = self._create_server(networks=[{'port': first_port_id}])
diff --git a/nova/tests/functional/libvirt/test_reshape.py b/nova/tests/functional/libvirt/test_reshape.py
index 8249100111b..d0102f12476 100644
--- a/nova/tests/functional/libvirt/test_reshape.py
+++ b/nova/tests/functional/libvirt/test_reshape.py
@@ -72,11 +72,11 @@ def test_create_servers_with_vgpu(self):
         # ignore the content of the above HostMdevDeviceInfo
         self.flags(enabled_mdev_types='', group='devices')
 
-        hostname = self.start_compute(
+        self.hostname = self.start_compute(
             hostname='compute1',
             mdev_info=fakelibvirt.HostMdevDevicesInfo(devices=mdevs),
         )
-        self.compute = self.computes[hostname]
+        self.compute = self.computes[self.hostname]
 
         # create the VGPU resource in placement manually
         compute_rp_uuid = self.placement.get(
@@ -158,7 +158,7 @@ def test_create_servers_with_vgpu(self):
                 allocations[compute_rp_uuid]['resources'])
 
         # restart compute which will trigger a reshape
-        self.compute = self.restart_compute_service(self.compute)
+        self.compute = self.restart_compute_service(self.hostname)
 
         # verify that the inventory, usages and allocation are correct after
         # the reshape
diff --git a/nova/tests/functional/libvirt/test_vgpu.py b/nova/tests/functional/libvirt/test_vgpu.py
index e111f50de0d..686582120ad 100644
--- a/nova/tests/functional/libvirt/test_vgpu.py
+++ b/nova/tests/functional/libvirt/test_vgpu.py
@@ -113,8 +113,8 @@ def _create_mdev(self, physical_device, mdev_type, uuid=None):
                                                    parent=libvirt_parent)})
         return uuid
 
-    def start_compute(self, hostname):
-        hostname = super().start_compute(
+    def start_compute_with_vgpu(self, hostname):
+        hostname = self.start_compute(
             pci_info=fakelibvirt.HostPCIDevicesInfo(
                 num_pci=0, num_pfs=0, num_vfs=0, num_mdevcap=2,
             ),
@@ -197,7 +197,7 @@ def setUp(self):
             enabled_mdev_types=fakelibvirt.NVIDIA_11_VGPU_TYPE,
             group='devices')
 
-        self.compute1 = self.start_compute('host1')
+        self.compute1 = self.start_compute_with_vgpu('host1')
 
     def assert_vgpu_usage_for_compute(self, compute, expected):
         self.assert_mdev_usage(compute, expected_amount=expected)
@@ -211,7 +211,7 @@ def test_create_servers_with_vgpu(self):
 
     def test_resize_servers_with_vgpu(self):
         # Add another compute for the sake of resizing
-        self.compute2 = self.start_compute('host2')
+        self.compute2 = self.start_compute_with_vgpu('host2')
         server = self._create_server(
             image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
             flavor_id=self.flavor, host=self.compute1.host,
@@ -337,7 +337,7 @@ def setUp(self):
         # Prepare traits for later on
         self._create_trait('CUSTOM_NVIDIA_11')
         self._create_trait('CUSTOM_NVIDIA_12')
-        self.compute1 = self.start_compute('host1')
+        self.compute1 = self.start_compute_with_vgpu('host1')
 
     def test_create_servers_with_vgpu(self):
         self._create_server(
@@ -369,13 +369,12 @@ def test_create_servers_with_vgpu(self):
 
     def test_create_servers_with_specific_type(self):
         # Regenerate the PCI addresses so both pGPUs now support nvidia-12
-        connection = self.computes[
-            self.compute1.host].driver._host.get_connection()
-        connection.pci_info = fakelibvirt.HostPCIDevicesInfo(
+        pci_info = fakelibvirt.HostPCIDevicesInfo(
             num_pci=0, num_pfs=0, num_vfs=0, num_mdevcap=2,
             multiple_gpu_types=True)
         # Make a restart to update the Resource Providers
-        self.compute1 = self.restart_compute_service(self.compute1)
+        self.compute1 = self.restart_compute_service(
+            self.compute1.host, pci_info=pci_info, keep_hypervisor_state=False)
         pgpu1_rp_uuid = self._get_provider_uuid_by_name(
             self.compute1.host + '_' + fakelibvirt.MDEVCAP_DEV1_PCI_ADDR)
         pgpu2_rp_uuid = self._get_provider_uuid_by_name(
@@ -451,7 +450,7 @@ def setUp(self):
                    group='mdev_nvidia-12')
         self.flags(mdev_class='CUSTOM_NOTVGPU', group='mdev_mlx5_core')
 
-        self.compute1 = self.start_compute('host1')
+        self.compute1 = self.start_compute_with_vgpu('host1')
         # Regenerate the PCI addresses so they can support both mlx5 and
         # nvidia-12 types
         connection = self.computes[
@@ -460,7 +459,7 @@ def setUp(self):
             num_pci=0, num_pfs=0, num_vfs=0, num_mdevcap=2,
             generic_types=True)
         # Make a restart to update the Resource Providers
-        self.compute1 = self.restart_compute_service(self.compute1)
+        self.compute1 = self.restart_compute_service('host1')
 
     def test_create_servers_with_different_mdev_classes(self):
         physdev1_rp_uuid = self._get_provider_uuid_by_name(
@@ -498,7 +497,7 @@ def test_create_servers_with_different_mdev_classes(self):
 
     def test_resize_servers_with_mlx5(self):
         # Add another compute for the sake of resizing
-        self.compute2 = self.start_compute('host2')
+        self.compute2 = self.start_compute_with_vgpu('host2')
         # Regenerate the PCI addresses so they can support both mlx5 and
         # nvidia-12 types
         connection = self.computes[
@@ -507,7 +506,7 @@ def test_resize_servers_with_mlx5(self):
             num_pci=0, num_pfs=0, num_vfs=0, num_mdevcap=2,
             generic_types=True)
         # Make a restart to update the Resource Providers
-        self.compute2 = self.restart_compute_service(self.compute2)
+        self.compute2 = self.restart_compute_service('host2')
 
         # Use the new flavor for booting
         server = self._create_server(

From 041939361e393b808724b8590eb76b3aa075814e Mon Sep 17 00:00:00 2001
From: Sean Mooney <work@seanmooney.info>
Date: Mon, 7 Mar 2022 20:37:57 +0000
Subject: [PATCH 39/93] enable blocked VDPA move operations

This change adds functional tests for operations on servers with VDPA
devices that are expected to work but are currently blocked due to lack
of testing or qemu bugs.

Cold-migrate, resize, evacuate, and shelve are enabled
and tested by this patch.

Conflicts:
    nova/tests/functional/libvirt/test_pci_sriov_servers.py

Closes-Bug: #1970467
Change-Id: I6e220cf3231670d156632e075fcf7701df744773
(cherry picked from commit 95f96ed3aa201bc5b90e589b288fa4039bc9c0d2)
---
 doc/source/admin/index.rst                    |   1 +
 doc/source/admin/vdpa.rst                     |  92 ++++++
 nova/compute/api.py                           |   8 -
 .../libvirt/test_pci_sriov_servers.py         | 299 ++++++++++++++++--
 .../notes/vdpa-move-ops-a7b3799807807a92.yaml |  11 +
 5 files changed, 385 insertions(+), 26 deletions(-)
 create mode 100644 doc/source/admin/vdpa.rst
 create mode 100644 releasenotes/notes/vdpa-move-ops-a7b3799807807a92.yaml

diff --git a/doc/source/admin/index.rst b/doc/source/admin/index.rst
index e83f680df2e..34babb5f152 100644
--- a/doc/source/admin/index.rst
+++ b/doc/source/admin/index.rst
@@ -198,6 +198,7 @@ instance for these kind of workloads.
    virtual-gpu
    file-backed-memory
    ports-with-resource-requests
+   vdpa
    virtual-persistent-memory
    emulated-tpm
    uefi
diff --git a/doc/source/admin/vdpa.rst b/doc/source/admin/vdpa.rst
new file mode 100644
index 00000000000..8583d327ccc
--- /dev/null
+++ b/doc/source/admin/vdpa.rst
@@ -0,0 +1,92 @@
+============================
+Using ports vnic_type='vdpa'
+============================
+.. versionadded:: 23.0.0 (Wallaby)
+
+   Introduced support for vDPA.
+
+.. important::
+   The functionality described below is only supported by the
+   libvirt/KVM virt driver.
+
+The kernel vDPA (virtio Data Path Acceleration) framework
+provides a vendor independent framework for offloading data-plane
+processing to software or hardware virtio device backends.
+While the kernel vDPA framework supports many types of vDPA devices,
+at this time nova only supports ``virtio-net`` devices
+using the ``vhost-vdpa`` front-end driver. Support for ``virtio-blk`` or
+``virtio-gpu`` may be added in the future but is not currently planned
+for any specific release.
+
+vDPA device tracking
+~~~~~~~~~~~~~~~~~~~~
+When implementing support for vDPA based neutron ports one of the first
+decisions nova had to make was how to model the availability of vDPA devices
+and the capability to virtualize vDPA devices. As the initial use-case
+for this technology was to offload networking to hardware offload OVS via
+neutron ports the decision was made to extend the existing PCI tracker that
+is used for SR-IOV and pci-passthrough to support vDPA devices. As a result
+a simplification was made to assume that the parent device of a vDPA device
+is an SR-IOV Virtual Function (VF). As a result software-only vDPA devices such
+as those created by the kernel ``vdpa-sim`` sample module are not supported.
+
+To make vDPA devices available to be scheduled to guests the operator should
+include the device using the PCI address or vendor ID and product ID of the
+parent VF in the PCI ``device_spec``.
+See: :nova-doc:`pci-passthrough <admin/pci-passthrough>` for details.
+
+Nova will not create the VFs or vDPA devices automatically. It is expected
+that the operator will allocate them before starting the nova-compute agent.
+While no specific mechanism is prescribed to do this, udev rules or systemd
+service files are generally the recommended approach to ensure the devices
+are created consistently across reboots.
+
+.. note::
+   As vDPA is an offload only for the data plane and not the control plane a
+   vDPA control plane is required to properly support vDPA device passthrough.
+   At the time of writing only hardware offloaded OVS is supported when using
+   vDPA with nova. Because of this vDPA devices cannot be requested using the
+   PCI alias. While nova could allow vDPA devices to be requested by the
+   flavor using a PCI alias we would not be able to correctly configure the
+   device as there would be no suitable control plane. For this reason vDPA
+   devices are currently only consumable via neutron ports.
+
+Virt driver support
+~~~~~~~~~~~~~~~~~~~
+
+Supporting neutron ports with ``vnic_type=vdpa`` depends on the capability
+of the virt driver. At this time only the ``libvirt`` virt driver with KVM
+is fully supported. QEMU may also work but is untested.
+
+vDPA support depends on kernel 5.7+, Libvirt 6.9.0+ and QEMU 5.1+.
+
+vDPA lifecycle operations
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+At this time vDPA ports can only be added to a VM when it is first created.
+To do this the normal SR-IOV workflow is used whereby the port is first created
+in neutron and passed into nova as part of the server create request.
+
+.. code-block:: bash
+
+   openstack port create --network <my network> --vnic-type vdpa vdpa-port
+   openstack server create --flavor <my-flavor> --image <my-image> --port <vdpa-port uuid> vdpa-vm
+
+When vDPA support was first introduced no move operations were supported.
+As this documentation was added in the change that enabled some move operations,
+the following should be interpreted both as a retrospective and future-looking
+viewpoint and treated as a living document which will be updated as functionality evolves.
+
+23.0.0: initial support is added for creating a VM with vDPA ports, move operations
+are blocked in the API but implemented in code.
+26.0.0: support for all move operations except live migration is tested and API blocks are removed.
+25.x.y: (planned) api block removal backported to stable/Yoga
+24.x.y: (planned) api block removal backported to stable/Xena
+23.x.y: (planned) api block removal backported to stable/wallaby
+26.0.0: (in progress) interface attach/detach, suspend/resume and hot plug live migration
+are implemented to fully support all lifecycle operations on instances with vDPA ports.
+
+.. note::
+   The ``(planned)`` and ``(in progress)`` qualifiers will be removed when those items are
+   completed. If your current version of the document contains those qualifiers then those
+   lifecycle operations are unsupported.
diff --git a/nova/compute/api.py b/nova/compute/api.py
index 9a2cbd3325a..b9b6e9de7cc 100644
--- a/nova/compute/api.py
+++ b/nova/compute/api.py
@@ -4096,9 +4096,6 @@ def _validate_host_for_cold_migrate(
     # finally split resize and cold migration into separate code paths
     @block_extended_resource_request
     @block_port_accelerators()
-    # FIXME(sean-k-mooney): Cold migrate and resize to different hosts
-    # probably works but they have not been tested so block them for now
-    @reject_vdpa_instances(instance_actions.RESIZE)
     @block_accelerators()
     @check_instance_lock
     @check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.STOPPED])
@@ -4324,10 +4321,7 @@ def _allow_resize_to_same_host(self, cold_migrate, instance):
             allow_same_host = CONF.allow_resize_to_same_host
         return allow_same_host
 
-    # FIXME(sean-k-mooney): Shelve works but unshelve does not due to bug
-    # #1851545, so block it for now
     @block_port_accelerators()
-    @reject_vdpa_instances(instance_actions.SHELVE)
     @reject_vtpm_instances(instance_actions.SHELVE)
     @block_accelerators(until_service=54)
     @check_instance_lock
@@ -5469,8 +5463,6 @@ def live_migrate_abort(self, context, instance, migration_id,
 
     @block_extended_resource_request
     @block_port_accelerators()
-    # FIXME(sean-k-mooney): rebuild works but we have not tested evacuate yet
-    @reject_vdpa_instances(instance_actions.EVACUATE)
     @reject_vtpm_instances(instance_actions.EVACUATE)
     @block_accelerators(until_service=SUPPORT_ACCELERATOR_SERVICE_FOR_REBUILD)
     @check_instance_state(vm_state=[vm_states.ACTIVE, vm_states.STOPPED,
diff --git a/nova/tests/functional/libvirt/test_pci_sriov_servers.py b/nova/tests/functional/libvirt/test_pci_sriov_servers.py
index 49be70aa7bd..c228fb04cf8 100644
--- a/nova/tests/functional/libvirt/test_pci_sriov_servers.py
+++ b/nova/tests/functional/libvirt/test_pci_sriov_servers.py
@@ -1105,7 +1105,7 @@ def setUp(self):
         # fixture already stubbed.
         self.neutron = self.useFixture(base.LibvirtNeutronFixture(self))
 
-    def start_compute(self):
+    def start_vdpa_compute(self, hostname='compute-0'):
         vf_ratio = self.NUM_VFS // self.NUM_PFS
 
         pci_info = fakelibvirt.HostPCIDevicesInfo(
@@ -1143,7 +1143,7 @@ def start_compute(self):
                 driver_name='mlx5_core')
             vdpa_info.add_device(f'vdpa_vdpa{idx}', idx, vf)
 
-        return super().start_compute(
+        return super().start_compute(hostname=hostname,
             pci_info=pci_info, vdpa_info=vdpa_info,
             libvirt_version=self.FAKE_LIBVIRT_VERSION,
             qemu_version=self.FAKE_QEMU_VERSION)
@@ -1198,7 +1198,7 @@ def fake_create(cls, xml, host):
             fake_create,
         )
 
-        hostname = self.start_compute()
+        hostname = self.start_vdpa_compute()
         num_pci = self.NUM_PFS + self.NUM_VFS
 
         # both the PF and VF with vDPA capabilities (dev_type=vdpa) should have
@@ -1231,12 +1231,16 @@ def fake_create(cls, xml, host):
             port['binding:profile'],
         )
 
-    def _test_common(self, op, *args, **kwargs):
-        self.start_compute()
-
+    def _create_port_and_server(self):
         # create the port and a server, with the port attached to the server
         vdpa_port = self.create_vdpa_port()
         server = self._create_server(networks=[{'port': vdpa_port['id']}])
+        return vdpa_port, server
+
+    def _test_common(self, op, *args, **kwargs):
+        self.start_vdpa_compute()
+
+        vdpa_port, server = self._create_port_and_server()
 
         # attempt the unsupported action and ensure it fails
         ex = self.assertRaises(
@@ -1247,13 +1251,11 @@ def _test_common(self, op, *args, **kwargs):
             ex.response.text)
 
     def test_attach_interface(self):
-        self.start_compute()
-
+        self.start_vdpa_compute()
         # create the port and a server, but don't attach the port to the server
         # yet
         vdpa_port = self.create_vdpa_port()
         server = self._create_server(networks='none')
-
         # attempt to attach the port to the server
         ex = self.assertRaises(
             client.OpenStackApiException,
@@ -1265,21 +1267,282 @@ def test_attach_interface(self):
     def test_detach_interface(self):
         self._test_common(self._detach_interface, uuids.vdpa_port)
 
-    def test_shelve(self):
-        self._test_common(self._shelve_server)
+    def test_shelve_offload(self):
+        hostname = self.start_vdpa_compute()
+        vdpa_port, server = self._create_port_and_server()
+        # assert the port is bound to the vm and the compute host
+        port = self.neutron.show_port(vdpa_port['id'])['port']
+        self.assertEqual(server['id'], port['device_id'])
+        self.assertEqual(hostname, port['binding:host_id'])
+        num_pci = self.NUM_PFS + self.NUM_VFS
+        # -2 we claim the vdpa device which makes the parent PF unavailable
+        self.assertPCIDeviceCounts(hostname, total=num_pci, free=num_pci - 2)
+        server = self._shelve_server(server)
+        # now that the vm is shelve offloaded it should not be bound
+        # to any host but should still be owned by the vm
+        port = self.neutron.show_port(vdpa_port['id'])['port']
+        self.assertEqual(server['id'], port['device_id'])
+        # FIXME(sean-k-mooney): we should be unbinding the port from
+        # the host when we shelve offload but we don't today.
+        # This is unrelated to vdpa port and is a general issue.
+        self.assertEqual(hostname, port['binding:host_id'])
+        self.assertIn('binding:profile', port)
+        self.assertIsNone(server['OS-EXT-SRV-ATTR:hypervisor_hostname'])
+        self.assertIsNone(server['OS-EXT-SRV-ATTR:host'])
+        self.assertPCIDeviceCounts(hostname, total=num_pci, free=num_pci)
 
-    def test_suspend(self):
-        self._test_common(self._suspend_server)
+    def test_unshelve_to_same_host(self):
+        hostname = self.start_vdpa_compute()
+        num_pci = self.NUM_PFS + self.NUM_VFS
+        self.assertPCIDeviceCounts(hostname, total=num_pci, free=num_pci)
+
+        vdpa_port, server = self._create_port_and_server()
+        self.assertPCIDeviceCounts(hostname, total=num_pci, free=num_pci - 2)
+        self.assertEqual(
+            hostname, server['OS-EXT-SRV-ATTR:hypervisor_hostname'])
+        port = self.neutron.show_port(vdpa_port['id'])['port']
+        self.assertEqual(hostname, port['binding:host_id'])
+
+        server = self._shelve_server(server)
+        self.assertPCIDeviceCounts(hostname, total=num_pci, free=num_pci)
+        self.assertIsNone(server['OS-EXT-SRV-ATTR:hypervisor_hostname'])
+        port = self.neutron.show_port(vdpa_port['id'])['port']
+        # FIXME(sean-k-mooney): shelve  offload should unbind the port
+        # self.assertEqual('', port['binding:host_id'])
+        self.assertEqual(hostname, port['binding:host_id'])
+
+        server = self._unshelve_server(server)
+        self.assertPCIDeviceCounts(hostname, total=num_pci, free=num_pci - 2)
+        self.assertEqual(
+            hostname, server['OS-EXT-SRV-ATTR:hypervisor_hostname'])
+        port = self.neutron.show_port(vdpa_port['id'])['port']
+        self.assertEqual(hostname, port['binding:host_id'])
+
+    def test_unshelve_to_different_host(self):
+        source = self.start_vdpa_compute(hostname='source')
+        dest = self.start_vdpa_compute(hostname='dest')
+
+        num_pci = self.NUM_PFS + self.NUM_VFS
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci)
+        self.assertPCIDeviceCounts(dest, total=num_pci, free=num_pci)
+
+        # ensure we boot the vm on the "source" compute
+        self.api.put_service(
+            self.computes['dest'].service_ref.uuid, {'status': 'disabled'})
+        vdpa_port, server = self._create_port_and_server()
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci - 2)
+        self.assertEqual(
+            source, server['OS-EXT-SRV-ATTR:hypervisor_hostname'])
+        port = self.neutron.show_port(vdpa_port['id'])['port']
+        self.assertEqual(source, port['binding:host_id'])
+
+        server = self._shelve_server(server)
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci)
+        self.assertIsNone(server['OS-EXT-SRV-ATTR:hypervisor_hostname'])
+        port = self.neutron.show_port(vdpa_port['id'])['port']
+        # FIXME(sean-k-mooney): shelve should unbind the port
+        # self.assertEqual('', port['binding:host_id'])
+        self.assertEqual(source, port['binding:host_id'])
+
+        # force the unshelve to the other host
+        self.api.put_service(
+            self.computes['source'].service_ref.uuid, {'status': 'disabled'})
+        self.api.put_service(
+            self.computes['dest'].service_ref.uuid, {'status': 'enabled'})
+        self.assertPCIDeviceCounts(dest, total=num_pci, free=num_pci)
+        server = self._unshelve_server(server)
+        # the dest devices should be claimed
+        self.assertPCIDeviceCounts(dest, total=num_pci, free=num_pci - 2)
+        # and the source host devices should still be free
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci)
+        self.assertEqual(
+            dest, server['OS-EXT-SRV-ATTR:hypervisor_hostname'])
+        port = self.neutron.show_port(vdpa_port['id'])['port']
+        self.assertEqual(dest, port['binding:host_id'])
 
     def test_evacute(self):
-        self._test_common(self._evacuate_server)
+        source = self.start_vdpa_compute(hostname='source')
+        dest = self.start_vdpa_compute(hostname='dest')
 
-    def test_resize(self):
-        flavor_id = self._create_flavor()
-        self._test_common(self._resize_server, flavor_id)
+        num_pci = self.NUM_PFS + self.NUM_VFS
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci)
+        self.assertPCIDeviceCounts(dest, total=num_pci, free=num_pci)
+
+        # ensure we boot the vm on the "source" compute
+        self.api.put_service(
+            self.computes['dest'].service_ref.uuid, {'status': 'disabled'})
+        vdpa_port, server = self._create_port_and_server()
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci - 2)
+        self.assertEqual(
+            source, server['OS-EXT-SRV-ATTR:hypervisor_hostname'])
+        port = self.neutron.show_port(vdpa_port['id'])['port']
+        self.assertEqual(source, port['binding:host_id'])
+
+        # stop the source compute and enable the dest
+        self.api.put_service(
+            self.computes['dest'].service_ref.uuid, {'status': 'enabled'})
+        self.computes['source'].stop()
+        # Down the source compute to enable the evacuation
+        self.api.put_service(
+            self.computes['source'].service_ref.uuid, {'forced_down': True})
+
+        self.assertPCIDeviceCounts(dest, total=num_pci, free=num_pci)
+        server = self._evacuate_server(server)
+        self.assertPCIDeviceCounts(dest, total=num_pci, free=num_pci - 2)
+        self.assertEqual(
+            dest, server['OS-EXT-SRV-ATTR:hypervisor_hostname'])
+        port = self.neutron.show_port(vdpa_port['id'])['port']
+        self.assertEqual(dest, port['binding:host_id'])
+
+        # as the source compute is offline the pci claims will not be cleaned
+        # up on the source compute.
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci - 2)
+        # but if you fix/restart the source node the allocations for evacuated
+        # instances should be released.
+        self.restart_compute_service(source)
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci)
+
+    def test_resize_same_host(self):
+        self.flags(allow_resize_to_same_host=True)
+        num_pci = self.NUM_PFS + self.NUM_VFS
+        source = self.start_vdpa_compute()
+        vdpa_port, server = self._create_port_and_server()
+        # before we resize the vm should be using 1 VF but that will mark
+        # the PF as unavailable so we assert 2 devices are in use.
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci - 2)
+        flavor_id = self._create_flavor(name='new-flavor')
+        self.assertNotEqual(server['flavor']['original_name'], 'new-flavor')
+        with mock.patch(
+            'nova.virt.libvirt.driver.LibvirtDriver'
+            '.migrate_disk_and_power_off', return_value='{}',
+        ):
+            server = self._resize_server(server, flavor_id)
+            self.assertEqual(
+                server['flavor']['original_name'], 'new-flavor')
+            # in resize verify the VF claims should be doubled even
+            # for same host resize so assert that 3 devices are in use:
+            # 1 PF and 2 VFs.
+            self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci - 3)
+            server = self._confirm_resize(server)
+            # but once we confirm it should be reduced back to 1 PF and 1 VF
+            self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci - 2)
+            # assert the hostname has not changed as part
+            # of the resize.
+            self.assertEqual(
+                source, server['OS-EXT-SRV-ATTR:hypervisor_hostname'])
+
+    def test_resize_different_host(self):
+        self.flags(allow_resize_to_same_host=False)
+        source = self.start_vdpa_compute(hostname='source')
+        dest = self.start_vdpa_compute(hostname='dest')
+
+        num_pci = self.NUM_PFS + self.NUM_VFS
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci)
+        self.assertPCIDeviceCounts(dest, total=num_pci, free=num_pci)
+
+        # ensure we boot the vm on the "source" compute
+        self.api.put_service(
+            self.computes['dest'].service_ref.uuid, {'status': 'disabled'})
+        vdpa_port, server = self._create_port_and_server()
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci - 2)
+        flavor_id = self._create_flavor(name='new-flavor')
+        self.assertNotEqual(server['flavor']['original_name'], 'new-flavor')
+        # disable the source compute and enable the dest
+        self.api.put_service(
+            self.computes['source'].service_ref.uuid, {'status': 'disabled'})
+        self.api.put_service(
+            self.computes['dest'].service_ref.uuid, {'status': 'enabled'})
+        with mock.patch(
+            'nova.virt.libvirt.driver.LibvirtDriver'
+            '.migrate_disk_and_power_off', return_value='{}',
+        ):
+            server = self._resize_server(server, flavor_id)
+            self.assertEqual(
+                server['flavor']['original_name'], 'new-flavor')
+            self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci - 2)
+            self.assertPCIDeviceCounts(dest, total=num_pci, free=num_pci - 2)
+            server = self._confirm_resize(server)
+            self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci)
+            self.assertPCIDeviceCounts(dest, total=num_pci, free=num_pci - 2)
+            self.assertEqual(
+                dest, server['OS-EXT-SRV-ATTR:hypervisor_hostname'])
+
+    def test_resize_revert(self):
+        self.flags(allow_resize_to_same_host=False)
+        source = self.start_vdpa_compute(hostname='source')
+        dest = self.start_vdpa_compute(hostname='dest')
+
+        num_pci = self.NUM_PFS + self.NUM_VFS
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci)
+        self.assertPCIDeviceCounts(dest, total=num_pci, free=num_pci)
+
+        # ensure we boot the vm on the "source" compute
+        self.api.put_service(
+            self.computes['dest'].service_ref.uuid, {'status': 'disabled'})
+        vdpa_port, server = self._create_port_and_server()
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci - 2)
+        flavor_id = self._create_flavor(name='new-flavor')
+        self.assertNotEqual(server['flavor']['original_name'], 'new-flavor')
+        # disable the source compute and enable the dest
+        self.api.put_service(
+            self.computes['source'].service_ref.uuid, {'status': 'disabled'})
+        self.api.put_service(
+            self.computes['dest'].service_ref.uuid, {'status': 'enabled'})
+        with mock.patch(
+            'nova.virt.libvirt.driver.LibvirtDriver'
+            '.migrate_disk_and_power_off', return_value='{}',
+        ):
+            server = self._resize_server(server, flavor_id)
+            self.assertEqual(
+                server['flavor']['original_name'], 'new-flavor')
+            # in resize verify both the dest and source pci claims should be
+            # present.
+            self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci - 2)
+            self.assertPCIDeviceCounts(dest, total=num_pci, free=num_pci - 2)
+            server = self._revert_resize(server)
+            # but once we revert the dest claims should be freed.
+            self.assertPCIDeviceCounts(dest, total=num_pci, free=num_pci)
+            self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci - 2)
+            self.assertEqual(
+                source, server['OS-EXT-SRV-ATTR:hypervisor_hostname'])
 
     def test_cold_migrate(self):
-        self._test_common(self._migrate_server)
+        source = self.start_vdpa_compute(hostname='source')
+        dest = self.start_vdpa_compute(hostname='dest')
+
+        num_pci = self.NUM_PFS + self.NUM_VFS
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci)
+        self.assertPCIDeviceCounts(dest, total=num_pci, free=num_pci)
+
+        # ensure we boot the vm on the "source" compute
+        self.api.put_service(
+            self.computes['dest'].service_ref.uuid, {'status': 'disabled'})
+        vdpa_port, server = self._create_port_and_server()
+        self.assertEqual(
+            source, server['OS-EXT-SRV-ATTR:hypervisor_hostname'])
+
+        self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci - 2)
+        # enable the dest; we do not need to disable the source since cold
+        # migrate won't happen to the same host in the libvirt driver
+        self.api.put_service(
+            self.computes['dest'].service_ref.uuid, {'status': 'enabled'})
+        with mock.patch(
+            'nova.virt.libvirt.driver.LibvirtDriver'
+            '.migrate_disk_and_power_off', return_value='{}',
+        ):
+            server = self._migrate_server(server)
+            self.assertEqual(
+                dest, server['OS-EXT-SRV-ATTR:hypervisor_hostname'])
+            self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci - 2)
+            self.assertPCIDeviceCounts(dest, total=num_pci, free=num_pci - 2)
+            server = self._confirm_resize(server)
+            self.assertPCIDeviceCounts(source, total=num_pci, free=num_pci)
+            self.assertPCIDeviceCounts(dest, total=num_pci, free=num_pci - 2)
+            self.assertEqual(
+                dest, server['OS-EXT-SRV-ATTR:hypervisor_hostname'])
+
+    def test_suspend(self):
+        self._test_common(self._suspend_server)
 
 
 class PCIServersTest(_PCIServersTestBase):
diff --git a/releasenotes/notes/vdpa-move-ops-a7b3799807807a92.yaml b/releasenotes/notes/vdpa-move-ops-a7b3799807807a92.yaml
new file mode 100644
index 00000000000..2580f73d35b
--- /dev/null
+++ b/releasenotes/notes/vdpa-move-ops-a7b3799807807a92.yaml
@@ -0,0 +1,11 @@
+---
+fixes:
+  - |
+    When vDPA was first introduced move operations were implemented in the code
+    but untested either in a real environment or in functional tests. Due to
+    this gap nova elected to block move operations for instances with vDPA
+    devices. All move operations except for live migration have now been tested
+    and found to indeed work so the API blocks have now been removed and
+    functional tests introduced. Other operations such as suspend and
+    live migration require code changes to support and will be enabled as new
+    features in the future.

From 4a2b44c7cf55d1d79d5a2dd638bd0def3af0f5af Mon Sep 17 00:00:00 2001
From: melanie witt <melwittt@gmail.com>
Date: Tue, 16 Aug 2022 06:49:53 +0000
Subject: [PATCH 40/93] Adapt websocketproxy tests for SimpleHTTPServer fix

In response to bug 1927677 we added a workaround to
NovaProxyRequestHandler to respond with a 400 Bad Request if an open
redirect is attempted:

  Ie36401c782f023d1d5f2623732619105dc2cfa24
  I95f68be76330ff09e5eabb5ef8dd9a18f5547866

Recently in python 3.10.6, a fix has landed in cpython to respond with
a 301 Moved Permanently to a sanitized URL that has had extra leading
'/' characters removed.

This breaks our existing unit tests which assume a 400 Bad Request as
the only expected response.

This adds handling of a 301 Moved Permanently response and asserts that
the redirect location is the expected sanitized URL. Doing this instead
of checking for a given python version will enable the tests to continue
to work if and when the cpython fix gets backported to older python
versions.

While updating the tests, the opportunity was taken to commonize the
code of two unit tests that were nearly identical.

Related-Bug: #1927677
Closes-Bug: #1986545

Change-Id: I27441d15cc6fa2ff7715ba15aa900961aadbf54a
(cherry picked from commit 15769b883ed4a86d62b141ea30d3f1590565d8e0)
---
 .../tests/unit/console/test_websocketproxy.py | 61 ++++++++-----------
 1 file changed, 26 insertions(+), 35 deletions(-)

diff --git a/nova/tests/unit/console/test_websocketproxy.py b/nova/tests/unit/console/test_websocketproxy.py
index e05ae520d9a..0c897e3e911 100644
--- a/nova/tests/unit/console/test_websocketproxy.py
+++ b/nova/tests/unit/console/test_websocketproxy.py
@@ -589,12 +589,12 @@ def test_malformed_cookie(self, validate, check_port):
         self.wh.socket.assert_called_with('node1', 10000, connect=True)
         self.wh.do_proxy.assert_called_with('<socket>')
 
-    def test_reject_open_redirect(self):
+    def test_reject_open_redirect(self, url='//example.com/%2F..'):
         # This will test the behavior when an attempt is made to cause an open
         # redirect. It should be rejected.
         mock_req = mock.MagicMock()
         mock_req.makefile().readline.side_effect = [
-            b'GET //example.com/%2F.. HTTP/1.1\r\n',
+            f'GET {url} HTTP/1.1\r\n'.encode('utf-8'),
             b''
         ]
 
@@ -619,41 +619,32 @@ def test_reject_open_redirect(self):
         result = output.readlines()
 
         # Verify no redirect happens and instead a 400 Bad Request is returned.
-        self.assertIn('400 URI must not start with //', result[0].decode())
+        # NOTE: As of python 3.10.6 there is a fix for this vulnerability,
+        # which will cause a 301 Moved Permanently error to be returned
+        # instead that redirects to a sanitized version of the URL with extra
+        # leading '/' characters removed.
+        # See https://github.com/python/cpython/issues/87389 for details.
+        # We will consider either response to be valid for this test. This will
+        # also help if and when the above fix gets backported to older versions
+        # of python.
+        errmsg = result[0].decode()
+        expected_nova = '400 URI must not start with //'
+        expected_cpython = '301 Moved Permanently'
+
+        self.assertTrue(expected_nova in errmsg or expected_cpython in errmsg)
+
+        # If we detect the cpython fix, verify that the redirect location is
+        # now the same url but with extra leading '/' characters removed.
+        if expected_cpython in errmsg:
+            location = result[3].decode()
+            location = location.removeprefix('Location: ').rstrip('\r\n')
+            self.assertTrue(
+                location.startswith('/example.com/%2F..'),
+                msg='Redirect location is not the expected sanitized URL',
+            )
 
     def test_reject_open_redirect_3_slashes(self):
-        # This will test the behavior when an attempt is made to cause an open
-        # redirect. It should be rejected.
-        mock_req = mock.MagicMock()
-        mock_req.makefile().readline.side_effect = [
-            b'GET ///example.com/%2F.. HTTP/1.1\r\n',
-            b''
-        ]
-
-        # Collect the response data to verify at the end. The
-        # SimpleHTTPRequestHandler writes the response data by calling the
-        # request socket sendall() method.
-        self.data = b''
-
-        def fake_sendall(data):
-            self.data += data
-
-        mock_req.sendall.side_effect = fake_sendall
-
-        client_addr = ('8.8.8.8', 54321)
-        mock_server = mock.MagicMock()
-        # This specifies that the server will be able to handle requests other
-        # than only websockets.
-        mock_server.only_upgrade = False
-
-        # Constructing a handler will process the mock_req request passed in.
-        websocketproxy.NovaProxyRequestHandler(
-            mock_req, client_addr, mock_server)
-
-        # Verify no redirect happens and instead a 400 Bad Request is returned.
-        self.data = self.data.decode()
-        self.assertIn('Error code: 400', self.data)
-        self.assertIn('Message: URI must not start with //', self.data)
+        self.test_reject_open_redirect(url='///example.com/%2F..')
 
     @mock.patch('nova.objects.ConsoleAuthToken.validate')
     def test_no_compute_rpcapi_with_invalid_token(self, mock_validate):

From b881dd25b4abb3c54934d8ebbccb2ac602c83177 Mon Sep 17 00:00:00 2001
From: John Garbutt <john.garbutt@stackhpc.com>
Date: Wed, 18 May 2022 19:06:36 +0100
Subject: [PATCH 41/93] Ironic: retry when node not available

After a baremetal instance is deleted, and its allocation is removed
in placement, the ironic node might start cleaning. Eventually nova
will notice and update the inventory to be reserved.
During this window, a new instance may have already picked this
ironic node.

When that race happens today the build fails with an error:
"Failed to reserve node ..."

This change tries to ensure the remaining alternative hosts are
attempted before aborting the build.
Clearly the race is still there, but this makes it less painful.

Related-Bug: #1974070
Change-Id: Ie5cdc17219c86927ab3769605808cb9d9fa9fa4d
(cherry picked from commit 8a476061c5e034016668cd9e5a20c4430ef6b68d)
(cherry picked from commit d71e9f6ec4933f9430db55537a36678b16ce895a)
---
 nova/compute/manager.py                     |  3 +-
 nova/tests/unit/compute/test_compute_mgr.py | 36 +++++++++++++++++++++
 nova/tests/unit/virt/ironic/test_driver.py  | 22 +++++++++++--
 nova/virt/ironic/driver.py                  | 12 +++++++
 4 files changed, 70 insertions(+), 3 deletions(-)

diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 0762098328a..258e0c39ffa 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -2708,7 +2708,8 @@ def _build_resources(self, context, instance, requested_networks,
                     block_device_mapping)
             resources['block_device_info'] = block_device_info
         except (exception.InstanceNotFound,
-                exception.UnexpectedDeletingTaskStateError):
+                exception.UnexpectedDeletingTaskStateError,
+                exception.ComputeResourcesUnavailable):
             with excutils.save_and_reraise_exception():
                 self._build_resources_cleanup(instance, network_info)
         except (exception.UnexpectedTaskStateError,
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 147e4480199..e948bafca48 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -7667,6 +7667,42 @@ def test_failed_bdm_prep_from_delete_raises_unexpected(self, mock_clean,
         mock_prepspawn.assert_called_once_with(self.instance)
         mock_failedspawn.assert_called_once_with(self.instance)
 
+    @mock.patch.object(virt_driver.ComputeDriver, 'failed_spawn_cleanup')
+    @mock.patch.object(virt_driver.ComputeDriver, 'prepare_for_spawn')
+    @mock.patch.object(virt_driver.ComputeDriver,
+                       'prepare_networks_before_block_device_mapping')
+    @mock.patch.object(virt_driver.ComputeDriver,
+                       'clean_networks_preparation')
+    def test_failed_prepare_for_spawn(self, mock_clean, mock_prepnet,
+                                      mock_prepspawn, mock_failedspawn):
+        mock_prepspawn.side_effect = exception.ComputeResourcesUnavailable(
+                reason="asdf")
+        with mock.patch.object(self.compute,
+                    '_build_networks_for_instance',
+                    return_value=self.network_info
+                ) as _build_networks_for_instance:
+
+            try:
+                with self.compute._build_resources(self.context, self.instance,
+                        self.requested_networks, self.security_groups,
+                        self.image, self.block_device_mapping,
+                        self.resource_provider_mapping, self.accel_uuids):
+                    pass
+            except Exception as e:
+                self.assertIsInstance(e,
+                    exception.ComputeResourcesUnavailable)
+
+            _build_networks_for_instance.assert_has_calls(
+                    [mock.call(self.context, self.instance,
+                        self.requested_networks, self.security_groups,
+                        self.resource_provider_mapping,
+                        self.network_arqs)])
+
+        mock_prepnet.assert_not_called()
+        mock_clean.assert_called_once_with(self.instance, self.network_info)
+        mock_prepspawn.assert_called_once_with(self.instance)
+        mock_failedspawn.assert_called_once_with(self.instance)
+
     @mock.patch.object(virt_driver.ComputeDriver, 'failed_spawn_cleanup')
     @mock.patch.object(virt_driver.ComputeDriver, 'prepare_for_spawn')
     @mock.patch.object(manager.ComputeManager, '_build_networks_for_instance')
diff --git a/nova/tests/unit/virt/ironic/test_driver.py b/nova/tests/unit/virt/ironic/test_driver.py
index 0b1cc7d47fc..ea5c0dbc05a 100644
--- a/nova/tests/unit/virt/ironic/test_driver.py
+++ b/nova/tests/unit/virt/ironic/test_driver.py
@@ -2499,7 +2499,10 @@ def test_ironicclient_bad_response(self, mock_error):
 
     @mock.patch.object(cw.IronicClientWrapper, 'call')
     def test_prepare_for_spawn(self, mock_call):
-        node = ironic_utils.get_test_node(driver='fake')
+        node = ironic_utils.get_test_node(
+            driver='fake', instance_uuid=None,
+            provision_state=ironic_states.AVAILABLE,
+            power_state=ironic_states.POWER_OFF)
         self.mock_conn.get_node.return_value = node
         instance = fake_instance.fake_instance_obj(self.ctx,
                                                    node=node.uuid)
@@ -2531,7 +2534,10 @@ def test_prepare_for_spawn_invalid_instance(self):
                           instance)
 
     def test_prepare_for_spawn_conflict(self):
-        node = ironic_utils.get_test_node(driver='fake')
+        node = ironic_utils.get_test_node(
+            driver='fake', instance_uuid=None,
+            provision_state=ironic_states.AVAILABLE,
+            power_state=ironic_states.POWER_OFF)
         self.mock_conn.get_node.return_value = node
         self.mock_conn.update_node.side_effect = sdk_exc.ConflictException
         instance = fake_instance.fake_instance_obj(self.ctx, node=node.id)
@@ -2539,6 +2545,18 @@ def test_prepare_for_spawn_conflict(self):
                           self.driver.prepare_for_spawn,
                           instance)
 
+    def test_prepare_for_spawn_not_available(self):
+        node = ironic_utils.get_test_node(
+            driver='fake', instance_uuid=None,
+            provision_state=ironic_states.CLEANWAIT,
+            power_state=ironic_states.POWER_OFF)
+        self.mock_conn.get_node.return_value = node
+        self.mock_conn.update_node.side_effect = sdk_exc.ConflictException
+        instance = fake_instance.fake_instance_obj(self.ctx, node=node.id)
+        self.assertRaises(exception.ComputeResourcesUnavailable,
+                          self.driver.prepare_for_spawn,
+                          instance)
+
     @mock.patch.object(ironic_driver.IronicDriver, '_cleanup_deploy')
     def test_failed_spawn_cleanup(self, mock_cleanup):
         node = ironic_utils.get_test_node(driver='fake')
diff --git a/nova/virt/ironic/driver.py b/nova/virt/ironic/driver.py
index 7970f185412..f21694da478 100644
--- a/nova/virt/ironic/driver.py
+++ b/nova/virt/ironic/driver.py
@@ -397,6 +397,18 @@ def prepare_for_spawn(self, instance):
                 _("Ironic node uuid not supplied to "
                   "driver for instance %s.") % instance.uuid)
         node = self._get_node(node_uuid)
+
+        # Its possible this node has just moved from deleting
+        # to cleaning. Placement will update the inventory
+        # as all reserved, but this instance might have got here
+        # before that happened, but after the previous allocation
+        # got deleted. We trigger a re-schedule to another node.
+        if (self._node_resources_used(node) or
+                self._node_resources_unavailable(node)):
+            msg = "Chosen ironic node %s is not available" % node_uuid
+            LOG.info(msg, instance=instance)
+            raise exception.ComputeResourcesUnavailable(reason=msg)
+
         self._set_instance_id(node, instance)
 
     def failed_spawn_cleanup(self, instance):

From 4954f993680c75fd9d3d507f2dcd00300c9b3d44 Mon Sep 17 00:00:00 2001
From: Balazs Gibizer <gibi@redhat.com>
Date: Fri, 15 Jul 2022 12:43:58 +0200
Subject: [PATCH 42/93] Reproduce bug 1981813 in func env

There is a stable/yoga-only change in test_pci_sriov_servers.py because
the unittest.mock switch[1] only happened in zed.

[1] https://review.opendev.org/q/topic:unittest.mock+status:merged+project:openstack/nova

Related-Bug: #1981813
Change-Id: I9367b7ed475917bdb05eb3f209ce1a4e646534e2
(cherry picked from commit f8c91eb75fc5504a37fc3b4be1d65d33dbc9b511)
---
 nova/tests/fixtures/libvirt.py                |  9 ++-
 .../libvirt/test_pci_sriov_servers.py         | 74 +++++++++++++++++++
 2 files changed, 80 insertions(+), 3 deletions(-)

diff --git a/nova/tests/fixtures/libvirt.py b/nova/tests/fixtures/libvirt.py
index 0684bae7ddd..5ccf01e40f9 100644
--- a/nova/tests/fixtures/libvirt.py
+++ b/nova/tests/fixtures/libvirt.py
@@ -2225,9 +2225,12 @@ def setUp(self):
 
         # libvirt driver needs to call out to the filesystem to get the
         # parent_ifname for the SRIOV VFs.
-        self.useFixture(fixtures.MockPatch(
-            'nova.pci.utils.get_ifname_by_pci_address',
-            return_value='fake_pf_interface_name'))
+        self.mock_get_ifname_by_pci_address = self.useFixture(
+            fixtures.MockPatch(
+                "nova.pci.utils.get_ifname_by_pci_address",
+                return_value="fake_pf_interface_name",
+            )
+        ).mock
 
         self.useFixture(fixtures.MockPatch(
             'nova.pci.utils.get_mac_by_pci_address',
diff --git a/nova/tests/functional/libvirt/test_pci_sriov_servers.py b/nova/tests/functional/libvirt/test_pci_sriov_servers.py
index c228fb04cf8..c1618751a9e 100644
--- a/nova/tests/functional/libvirt/test_pci_sriov_servers.py
+++ b/nova/tests/functional/libvirt/test_pci_sriov_servers.py
@@ -28,6 +28,7 @@
 
 import nova
 from nova import context
+from nova import exception
 from nova.network import constants
 from nova import objects
 from nova.objects import fields
@@ -951,6 +952,79 @@ def test_create_server_after_change_in_nonsriov_pf_to_sriov_pf(self):
             ],
         )
 
+    def test_change_bound_port_vnic_type_kills_compute_at_restart(self):
+        """Create a server with a direct port and change the vnic_type of the
+        bound port to macvtap. Then restart the compute service.
+
+        As the vnic_type is changed on the port but the vif_type is hwveb
+        instead of macvtap the vif plug logic will try to look up the netdev
+        of the parent VF. Howvere that VF consumed by the instance so the
+        netdev does not exists. This causes that the compute service will fail
+        with an exception during startup
+        """
+        pci_info = fakelibvirt.HostPCIDevicesInfo(num_pfs=1, num_vfs=2)
+        self.start_compute(pci_info=pci_info)
+
+        # create a direct port
+        port = self.neutron.network_4_port_1
+        self.neutron.create_port({'port': port})
+
+        # create a server using the VF via neutron
+        server = self._create_server(networks=[{'port': port['id']}])
+
+        # update the vnic_type of the port in neutron
+        port = copy.deepcopy(port)
+        port['binding:vnic_type'] = 'macvtap'
+        self.neutron.update_port(port['id'], {"port": port})
+
+        compute = self.computes['compute1']
+
+        # Force an update on the instance info cache to ensure nova gets the
+        # information about the updated port
+        with context.target_cell(
+            context.get_admin_context(),
+            self.host_mappings['compute1'].cell_mapping
+        ) as cctxt:
+            compute.manager._heal_instance_info_cache(cctxt)
+
+        def fake_get_ifname_by_pci_address(pci_addr: str, pf_interface=False):
+            # we want to fail the netdev lookup only if the pci_address is
+            # already consumed by our instance. So we look into the instance
+            # definition to see if the device is attached to the instance as VF
+            conn = compute.manager.driver._host.get_connection()
+            dom = conn.lookupByUUIDString(server['id'])
+            dev = dom._def['devices']['nics'][0]
+            lookup_addr = pci_addr.replace(':', '_').replace('.', '_')
+            if (
+                dev['type'] == 'hostdev' and
+                dev['source'] == 'pci_' + lookup_addr
+            ):
+                # nova tried to look up the netdev of an already consumed VF.
+                # So we have to fail
+                raise exception.PciDeviceNotFoundById(id=pci_addr)
+
+        # We need to simulate the actual failure manually as in our functional
+        # environment all the PCI lookup is mocked. In reality nova tries to
+        # look up the netdev of the pci device on the host used by the port as
+        # the parent of the macvtap. However, as the originally direct port is
+        # bound to the instance, the VF pci device is already consumed by the
+        # instance and therefore there is no netdev for the VF.
+        with mock.patch(
+            'nova.pci.utils.get_ifname_by_pci_address',
+            side_effect=fake_get_ifname_by_pci_address,
+        ):
+            # This is bug 1981813 as the compute service fails to start with an
+            # exception.
+            # Nova cannot prevent the vnic_type change on a bound port. Neutron
+            # should prevent that instead. But the nova-compute should still
+            # be able to start up and only log an ERROR for this instance in
+            # inconsistent state.
+            self.assertRaises(
+                exception.PciDeviceNotFoundById,
+                self.restart_compute_service,
+                'compute1',
+            )
+
 
 class SRIOVAttachDetachTest(_PCIServersTestBase):
     # no need for aliases as these test will request SRIOV via neutron

From a28c82719545d5c8ee7f3ff1361b3a796e05095a Mon Sep 17 00:00:00 2001
From: Balazs Gibizer <gibi@redhat.com>
Date: Fri, 15 Jul 2022 13:48:46 +0200
Subject: [PATCH 43/93] Gracefully ERROR in _init_instance if vnic_type changed

If the vnic_type of a bound port changes from "direct" to "macvtap" and
then the compute service is restarted then during _init_instance nova
tries to plug the vif of the changed port. However as it now has macvtap
vnic_type nova tries to look up the netdev of the parent VF. Still that
VF is consumed by the instance so there is no such netdev on the host
OS. This error killed the compute service at startup due to unhandled
exception. This patch adds the exception handler, logs an ERROR and
continues initializing other instances on the host.

Also this patch adds a detailed ERROR log when nova detects that the
vnic_type changed during _heal_instance_info_cache periodic.

Closes-Bug: #1981813
Change-Id: I1719f8eda04e8d15a3b01f0612977164c4e55e85
(cherry picked from commit e43bf900dc8ca66578603bed333c56b215b1876e)
---
 nova/compute/manager.py                       |  14 ++
 nova/network/neutron.py                       |  34 ++++
 .../libvirt/test_pci_sriov_servers.py         |  23 ++-
 nova/tests/unit/compute/test_compute_mgr.py   |  30 ++++
 nova/tests/unit/network/test_neutron.py       | 149 ++++++++++++++++++
 ...813-vnic-type-change-9f3e16fae885b57f.yaml |   9 ++
 6 files changed, 252 insertions(+), 7 deletions(-)
 create mode 100644 releasenotes/notes/bug-1981813-vnic-type-change-9f3e16fae885b57f.yaml

diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 0762098328a..435578d40c3 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -1242,6 +1242,20 @@ def _init_instance(self, context, instance):
                           'updated.', instance=instance)
             self._set_instance_obj_error_state(instance)
             return
+        except exception.PciDeviceNotFoundById:
+            # This is bug 1981813 where the bound port vnic_type has changed
+            # from direct to macvtap. Nova does not support that and it
+            # already printed an ERROR when the change is detected during
+            # _heal_instance_info_cache. Now we print an ERROR again and skip
+            # plugging the vifs but let the service startup continue to init
+            # the other instances
+            LOG.exception(
+                'Virtual interface plugging failed for instance. Probably the '
+                'vnic_type of the bound port has been changed. Nova does not '
+                'support such change.',
+                instance=instance
+            )
+            return
 
         if instance.task_state == task_states.RESIZE_MIGRATING:
             # We crashed during resize/migration, so roll back for safety
diff --git a/nova/network/neutron.py b/nova/network/neutron.py
index 3c9da4e9370..1e703658f87 100644
--- a/nova/network/neutron.py
+++ b/nova/network/neutron.py
@@ -3383,6 +3383,25 @@ def _build_vif_model(self, context, client, current_neutron_port,
             delegate_create=True,
         )
 
+    def _log_error_if_vnic_type_changed(
+        self, port_id, old_vnic_type, new_vnic_type, instance
+    ):
+        if old_vnic_type and old_vnic_type != new_vnic_type:
+            LOG.error(
+                'The vnic_type of the bound port %s has '
+                'been changed in neutron from "%s" to '
+                '"%s". Changing vnic_type of a bound port '
+                'is not supported by Nova. To avoid '
+                'breaking the connectivity of the instance '
+                'please change the port vnic_type back to '
+                '"%s".',
+                port_id,
+                old_vnic_type,
+                new_vnic_type,
+                old_vnic_type,
+                instance=instance
+            )
+
     def _build_network_info_model(self, context, instance, networks=None,
                                   port_ids=None, admin_client=None,
                                   preexisting_port_ids=None,
@@ -3456,6 +3475,12 @@ def _build_network_info_model(self, context, instance, networks=None,
                         preexisting_port_ids)
                     for index, vif in enumerate(nw_info):
                         if vif['id'] == refresh_vif_id:
+                            self._log_error_if_vnic_type_changed(
+                                vif['id'],
+                                vif['vnic_type'],
+                                refreshed_vif['vnic_type'],
+                                instance,
+                            )
                             # Update the existing entry.
                             nw_info[index] = refreshed_vif
                             LOG.debug('Updated VIF entry in instance network '
@@ -3505,6 +3530,7 @@ def _build_network_info_model(self, context, instance, networks=None,
             networks, port_ids = self._gather_port_ids_and_networks(
                     context, instance, networks, port_ids, client)
 
+        old_nw_info = instance.get_network_info()
         nw_info = network_model.NetworkInfo()
         for port_id in port_ids:
             current_neutron_port = current_neutron_port_map.get(port_id)
@@ -3512,6 +3538,14 @@ def _build_network_info_model(self, context, instance, networks=None,
                 vif = self._build_vif_model(
                     context, client, current_neutron_port, networks,
                     preexisting_port_ids)
+                for old_vif in old_nw_info:
+                    if old_vif['id'] == port_id:
+                        self._log_error_if_vnic_type_changed(
+                            port_id,
+                            old_vif['vnic_type'],
+                            vif['vnic_type'],
+                            instance,
+                        )
                 nw_info.append(vif)
             elif nw_info_refresh:
                 LOG.info('Port %s from network info_cache is no '
diff --git a/nova/tests/functional/libvirt/test_pci_sriov_servers.py b/nova/tests/functional/libvirt/test_pci_sriov_servers.py
index c1618751a9e..c8add15480f 100644
--- a/nova/tests/functional/libvirt/test_pci_sriov_servers.py
+++ b/nova/tests/functional/libvirt/test_pci_sriov_servers.py
@@ -986,6 +986,14 @@ def test_change_bound_port_vnic_type_kills_compute_at_restart(self):
             self.host_mappings['compute1'].cell_mapping
         ) as cctxt:
             compute.manager._heal_instance_info_cache(cctxt)
+            self.assertIn(
+                'The vnic_type of the bound port %s has been changed in '
+                'neutron from "direct" to "macvtap". Changing vnic_type of a '
+                'bound port is not supported by Nova. To avoid breaking the '
+                'connectivity of the instance please change the port '
+                'vnic_type back to "direct".' % port['id'],
+                self.stdlog.logger.output,
+            )
 
         def fake_get_ifname_by_pci_address(pci_addr: str, pf_interface=False):
             # we want to fail the netdev lookup only if the pci_address is
@@ -1013,17 +1021,18 @@ def fake_get_ifname_by_pci_address(pci_addr: str, pf_interface=False):
             'nova.pci.utils.get_ifname_by_pci_address',
             side_effect=fake_get_ifname_by_pci_address,
         ):
-            # This is bug 1981813 as the compute service fails to start with an
-            # exception.
             # Nova cannot prevent the vnic_type change on a bound port. Neutron
             # should prevent that instead. But the nova-compute should still
             # be able to start up and only log an ERROR for this instance in
             # inconsistent state.
-            self.assertRaises(
-                exception.PciDeviceNotFoundById,
-                self.restart_compute_service,
-                'compute1',
-            )
+            self.restart_compute_service('compute1')
+
+        self.assertIn(
+            'Virtual interface plugging failed for instance. Probably the '
+            'vnic_type of the bound port has been changed. Nova does not '
+            'support such change.',
+            self.stdlog.logger.output,
+        )
 
 
 class SRIOVAttachDetachTest(_PCIServersTestBase):
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 147e4480199..1b0469b8e65 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -1306,6 +1306,36 @@ def test_init_instance_with_binding_failed_vif_type(self):
             self.compute._init_instance(self.context, instance)
             set_error_state.assert_called_once_with(instance)
 
+    def test_init_instance_vif_plug_fails_missing_pci(self):
+        instance = fake_instance.fake_instance_obj(
+                self.context,
+                uuid=uuids.instance,
+                info_cache=None,
+                power_state=power_state.RUNNING,
+                vm_state=vm_states.ACTIVE,
+                task_state=None,
+                host=self.compute.host,
+                expected_attrs=['info_cache'])
+
+        with test.nested(
+            mock.patch.object(context, 'get_admin_context',
+                return_value=self.context),
+            mock.patch.object(objects.Instance, 'get_network_info',
+                return_value=network_model.NetworkInfo()),
+            mock.patch.object(self.compute.driver, 'plug_vifs',
+                side_effect=exception.PciDeviceNotFoundById("pci-addr")),
+            mock.patch("nova.compute.manager.LOG.exception"),
+        ) as (get_admin_context, get_nw_info, plug_vifs, log_exception):
+            # as this does not raise, we are sure that the compute service
+            # continues initializing the rest of the instances
+            self.compute._init_instance(self.context, instance)
+            log_exception.assert_called_once_with(
+                "Virtual interface plugging failed for instance. Probably the "
+                "vnic_type of the bound port has been changed. Nova does not "
+                "support such change.",
+                instance=instance
+            )
+
     def _test__validate_pinning_configuration(self, supports_pcpus=True):
         instance_1 = fake_instance.fake_instance_obj(
             self.context, uuid=uuids.instance_1)
diff --git a/nova/tests/unit/network/test_neutron.py b/nova/tests/unit/network/test_neutron.py
index 03e65bb6081..5cde8d482d3 100644
--- a/nova/tests/unit/network/test_neutron.py
+++ b/nova/tests/unit/network/test_neutron.py
@@ -3383,6 +3383,155 @@ def test_build_network_info_model_empty(
         mocked_client.list_ports.assert_called_once_with(
             tenant_id=uuids.fake, device_id=uuids.instance)
 
+    @mock.patch.object(
+        neutronapi.API,
+        '_get_physnet_tunneled_info',
+        new=mock.Mock(return_value=(None, False)))
+    @mock.patch.object(
+        neutronapi.API,
+        '_get_preexisting_port_ids',
+        new=mock.Mock(return_value=[]))
+    @mock.patch.object(
+        neutronapi.API,
+        '_get_subnets_from_port',
+        new=mock.Mock(return_value=[model.Subnet(cidr='1.0.0.0/8')]))
+    @mock.patch.object(
+        neutronapi.API,
+        '_get_floating_ips_by_fixed_and_port',
+        new=mock.Mock(return_value=[{'floating_ip_address': '10.0.0.1'}]))
+    @mock.patch.object(neutronapi, 'get_client')
+    def test_build_network_info_model_full_vnic_type_change(
+        self, mock_get_client
+    ):
+        mocked_client = mock.create_autospec(client.Client)
+        mock_get_client.return_value = mocked_client
+        fake_inst = objects.Instance()
+        fake_inst.project_id = uuids.fake
+        fake_inst.uuid = uuids.instance
+        fake_ports = [
+            {
+                "id": "port1",
+                "network_id": "net-id",
+                "tenant_id": uuids.fake,
+                "admin_state_up": True,
+                "status": "ACTIVE",
+                "fixed_ips": [{"ip_address": "1.1.1.1"}],
+                "mac_address": "de:ad:be:ef:00:01",
+                "binding:vif_type": model.VIF_TYPE_BRIDGE,
+                "binding:vnic_type": model.VNIC_TYPE_DIRECT,
+                "binding:vif_details": {},
+            },
+        ]
+        mocked_client.list_ports.return_value = {'ports': fake_ports}
+        fake_inst.info_cache = objects.InstanceInfoCache.new(
+            self.context, uuids.instance)
+        fake_inst.info_cache.network_info = model.NetworkInfo.hydrate([])
+
+        # build the network info first
+        nw_infos = self.api._build_network_info_model(
+            self.context,
+            fake_inst,
+            force_refresh=True,
+        )
+
+        self.assertEqual(1, len(nw_infos))
+        fake_inst.info_cache.network_info = nw_infos
+
+        # change the vnic_type of the port and rebuild the network info
+        fake_ports[0]["binding:vnic_type"] = model.VNIC_TYPE_MACVTAP
+        with mock.patch(
+            "nova.network.neutron.API._log_error_if_vnic_type_changed"
+        ) as mock_log:
+            nw_infos = self.api._build_network_info_model(
+                self.context,
+                fake_inst,
+                force_refresh=True,
+            )
+
+        mock_log.assert_called_once_with(
+            fake_ports[0]["id"], "direct", "macvtap", fake_inst)
+        self.assertEqual(1, len(nw_infos))
+
+    @mock.patch.object(
+        neutronapi.API,
+        '_get_physnet_tunneled_info',
+        new=mock.Mock(return_value=(None, False)))
+    @mock.patch.object(
+        neutronapi.API,
+        '_get_preexisting_port_ids',
+        new=mock.Mock(return_value=[]))
+    @mock.patch.object(
+        neutronapi.API,
+        '_get_subnets_from_port',
+        new=mock.Mock(return_value=[model.Subnet(cidr='1.0.0.0/8')]))
+    @mock.patch.object(
+        neutronapi.API,
+        '_get_floating_ips_by_fixed_and_port',
+        new=mock.Mock(return_value=[{'floating_ip_address': '10.0.0.1'}]))
+    @mock.patch.object(neutronapi, 'get_client')
+    def test_build_network_info_model_single_vnic_type_change(
+        self, mock_get_client
+    ):
+        mocked_client = mock.create_autospec(client.Client)
+        mock_get_client.return_value = mocked_client
+        fake_inst = objects.Instance()
+        fake_inst.project_id = uuids.fake
+        fake_inst.uuid = uuids.instance
+        fake_ports = [
+            {
+                "id": "port1",
+                "network_id": "net-id",
+                "tenant_id": uuids.fake,
+                "admin_state_up": True,
+                "status": "ACTIVE",
+                "fixed_ips": [{"ip_address": "1.1.1.1"}],
+                "mac_address": "de:ad:be:ef:00:01",
+                "binding:vif_type": model.VIF_TYPE_BRIDGE,
+                "binding:vnic_type": model.VNIC_TYPE_DIRECT,
+                "binding:vif_details": {},
+            },
+        ]
+        fake_nets = [
+            {
+                "id": "net-id",
+                "name": "foo",
+                "tenant_id": uuids.fake,
+            }
+        ]
+        mocked_client.list_ports.return_value = {'ports': fake_ports}
+        fake_inst.info_cache = objects.InstanceInfoCache.new(
+            self.context, uuids.instance)
+        fake_inst.info_cache.network_info = model.NetworkInfo.hydrate([])
+
+        # build the network info first
+        nw_infos = self.api._build_network_info_model(
+            self.context,
+            fake_inst,
+            fake_nets,
+            [fake_ports[0]["id"]],
+            refresh_vif_id=fake_ports[0]["id"],
+        )
+
+        self.assertEqual(1, len(nw_infos))
+        fake_inst.info_cache.network_info = nw_infos
+
+        # change the vnic_type of the port and rebuild the network info
+        fake_ports[0]["binding:vnic_type"] = model.VNIC_TYPE_MACVTAP
+        with mock.patch(
+                "nova.network.neutron.API._log_error_if_vnic_type_changed"
+        ) as mock_log:
+            nw_infos = self.api._build_network_info_model(
+                self.context,
+                fake_inst,
+                fake_nets,
+                [fake_ports[0]["id"]],
+                refresh_vif_id=fake_ports[0]["id"],
+            )
+
+        mock_log.assert_called_once_with(
+            fake_ports[0]["id"], "direct", "macvtap", fake_inst)
+        self.assertEqual(1, len(nw_infos))
+
     @mock.patch.object(neutronapi, 'get_client')
     def test_get_subnets_from_port(self, mock_get_client):
         mocked_client = mock.create_autospec(client.Client)
diff --git a/releasenotes/notes/bug-1981813-vnic-type-change-9f3e16fae885b57f.yaml b/releasenotes/notes/bug-1981813-vnic-type-change-9f3e16fae885b57f.yaml
new file mode 100644
index 00000000000..a5a3b7c8c2c
--- /dev/null
+++ b/releasenotes/notes/bug-1981813-vnic-type-change-9f3e16fae885b57f.yaml
@@ -0,0 +1,9 @@
+---
+fixes:
+  - |
+    `Bug #1981813 <https://bugs.launchpad.net/nova/+bug/1981813>`_: Now nova
+    detects if the ``vnic_type`` of a bound port has been changed in neutron
+    and leaves an ERROR message in the compute service log as such change on a
+    bound port is not supported. Also the restart of the nova-compute service
+    will not crash any more after such port change. Nova will log an ERROR and
+    skip the  initialization of the instance with such port during the startup.

From 9f6ca77a184379e90e10d6705fbd78208debb612 Mon Sep 17 00:00:00 2001
From: Jorge San Emeterio <jsanemet@redhat.com>
Date: Tue, 11 Oct 2022 13:14:12 +0200
Subject: [PATCH 44/93] Improving logging at '_allocate_mdevs'.

Adding both 'info' and 'debug' messages with the intention of telling
which mdevs are available, which get allocated and whether new ones
are created.

Closes-Bug: #1992451
Change-Id: Ibd331df51fd4eaeed4831a98469f06a4ce0cd452
(cherry picked from commit 6feb3350b048606297068841e3feba110bb0b0ab)
(cherry picked from commit 03374cf4a2ff98c938691a209d6a3fb14a06d3a0)
---
 nova/virt/libvirt/driver.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py
index 7c0abcb1506..cc5e4b5da52 100644
--- a/nova/virt/libvirt/driver.py
+++ b/nova/virt/libvirt/driver.py
@@ -8115,6 +8115,7 @@ def _get_existing_mdevs_not_assigned(self, parent, requested_types=None):
         :param requested_types: Filter out the result for only mediated devices
                                 having those types.
         """
+        LOG.debug('Searching for available mdevs...')
         allocated_mdevs = self._get_all_assigned_mediated_devices()
         mdevs = self._get_mediated_devices(requested_types)
         available_mdevs = set()
@@ -8130,6 +8131,7 @@ def _get_existing_mdevs_not_assigned(self, parent, requested_types=None):
                 available_mdevs.add(mdev["uuid"])
 
         available_mdevs -= set(allocated_mdevs)
+        LOG.info('Available mdevs at: %s.', available_mdevs)
         return available_mdevs
 
     def _create_new_mediated_device(self, parent, uuid=None):
@@ -8141,6 +8143,7 @@ def _create_new_mediated_device(self, parent, uuid=None):
 
         :returns: the newly created mdev UUID or None if not possible
         """
+        LOG.debug('Attempting to create new mdev...')
         supported_types = self.supported_vgpu_types
         # Try to see if we can still create a new mediated device
         devices = self._get_mdev_capable_devices(supported_types)
@@ -8152,6 +8155,7 @@ def _create_new_mediated_device(self, parent, uuid=None):
                 # The device is not the one that was called, not creating
                 # the mdev
                 continue
+            LOG.debug('Trying on: %s.', dev_name)
             dev_supported_type = self._get_vgpu_type_per_pgpu(dev_name)
             if dev_supported_type and device['types'][
                     dev_supported_type]['availableInstances'] > 0:
@@ -8161,7 +8165,13 @@ def _create_new_mediated_device(self, parent, uuid=None):
                 pci_addr = "{}:{}:{}.{}".format(*dev_name[4:].split('_'))
                 chosen_mdev = nova.privsep.libvirt.create_mdev(
                     pci_addr, dev_supported_type, uuid=uuid)
+                LOG.info('Created mdev: %s on pGPU: %s.',
+                         chosen_mdev, pci_addr)
                 return chosen_mdev
+            LOG.debug('Failed: No available instances on device.')
+        LOG.info('Failed to create mdev. '
+                 'No free space found among the following devices: %s.',
+                 [dev['dev_id'] for dev in devices])
 
     @utils.synchronized(VGPU_RESOURCE_SEMAPHORE)
     def _allocate_mdevs(self, allocations):
@@ -8244,6 +8254,8 @@ def _allocate_mdevs(self, allocations):
                 # Take the first available mdev
                 chosen_mdev = mdevs_available.pop()
             else:
+                LOG.debug('No available mdevs where found. '
+                          'Creating an new one...')
                 chosen_mdev = self._create_new_mediated_device(parent_device)
             if not chosen_mdev:
                 # If we can't find devices having available VGPUs, just raise
@@ -8251,6 +8263,7 @@ def _allocate_mdevs(self, allocations):
                     reason='mdev-capable resource is not available')
             else:
                 chosen_mdevs.append(chosen_mdev)
+                LOG.info('Allocated mdev: %s.', chosen_mdev)
         return chosen_mdevs
 
     def _detach_mediated_devices(self, guest):

From 516f0de1f6a54cd24d8ebc906c1e3fd3bab0d32e Mon Sep 17 00:00:00 2001
From: Dan Smith <dansmith@redhat.com>
Date: Thu, 10 Nov 2022 09:55:48 -0800
Subject: [PATCH 45/93] [stable-only][cve] Check VMDK create-type against an
 allowed list

NOTE(sbauza): Stable policy allows us to proactively merge a backport without waiting for the parent patch to be merged (exception to rule #4 in [1]). Marking [stable-only] in order to silence nova-tox-validate-backport

[1] https://docs.openstack.org/project-team-guide/stable-branches.html#appropriate-fixes

Related-Bug: #1996188
Change-Id: I5a399f1d3d702bfb76c067893e9c924904c8c360
---
 nova/conf/compute.py                |  9 ++++++
 nova/tests/unit/virt/test_images.py | 46 +++++++++++++++++++++++++++++
 nova/virt/images.py                 | 31 +++++++++++++++++++
 3 files changed, 86 insertions(+)

diff --git a/nova/conf/compute.py b/nova/conf/compute.py
index 5abe7694f80..352080011ad 100644
--- a/nova/conf/compute.py
+++ b/nova/conf/compute.py
@@ -1007,6 +1007,15 @@
 * ``[scheduler]query_placement_for_image_type_support`` - enables
   filtering computes based on supported image types, which is required
   to be enabled for this to take effect.
+"""),
+    cfg.ListOpt('vmdk_allowed_types',
+                default=['streamOptimized', 'monolithicSparse'],
+                help="""
+A list of strings describing allowed VMDK "create-type" subformats
+that will be allowed. This is recommended to only include
+single-file-with-sparse-header variants to avoid potential host file
+exposure due to processing named extents. If this list is empty, then no
+form of VMDK image will be allowed.
 """),
     cfg.BoolOpt('packing_host_numa_cells_allocation_strategy',
         default=True,
diff --git a/nova/tests/unit/virt/test_images.py b/nova/tests/unit/virt/test_images.py
index 085b169db3c..563330b5414 100644
--- a/nova/tests/unit/virt/test_images.py
+++ b/nova/tests/unit/virt/test_images.py
@@ -16,6 +16,8 @@
 
 import mock
 from oslo_concurrency import processutils
+from oslo_serialization import jsonutils
+from oslo_utils import imageutils
 
 from nova.compute import utils as compute_utils
 from nova import exception
@@ -135,3 +137,47 @@ def test_convert_image_without_direct_io_support(self, mock_execute,
                     '-O', 'out_format', '-f', 'in_format', 'source', 'dest')
         mock_disk_op_sema.__enter__.assert_called_once()
         self.assertTupleEqual(expected, mock_execute.call_args[0])
+
+    def test_convert_image_vmdk_allowed_list_checking(self):
+        info = {'format': 'vmdk',
+                'format-specific': {
+                    'type': 'vmdk',
+                    'data': {
+                        'create-type': 'monolithicFlat',
+                }}}
+
+        # If the format is not in the allowed list, we should get an error
+        self.assertRaises(exception.ImageUnacceptable,
+                          images.check_vmdk_image, 'foo',
+                          imageutils.QemuImgInfo(jsonutils.dumps(info),
+                                                 format='json'))
+
+        # With the format in the allowed list, no error
+        self.flags(vmdk_allowed_types=['streamOptimized', 'monolithicFlat',
+                                       'monolithicSparse'],
+                   group='compute')
+        images.check_vmdk_image('foo',
+                                imageutils.QemuImgInfo(jsonutils.dumps(info),
+                                                       format='json'))
+
+        # With an empty list, allow nothing
+        self.flags(vmdk_allowed_types=[], group='compute')
+        self.assertRaises(exception.ImageUnacceptable,
+                          images.check_vmdk_image, 'foo',
+                          imageutils.QemuImgInfo(jsonutils.dumps(info),
+                                                 format='json'))
+
+    @mock.patch.object(images, 'fetch')
+    @mock.patch('nova.privsep.qemu.unprivileged_qemu_img_info')
+    def test_fetch_checks_vmdk_rules(self, mock_info, mock_fetch):
+        info = {'format': 'vmdk',
+                'format-specific': {
+                    'type': 'vmdk',
+                    'data': {
+                        'create-type': 'monolithicFlat',
+                }}}
+        mock_info.return_value = jsonutils.dumps(info)
+        with mock.patch('os.path.exists', return_value=True):
+            e = self.assertRaises(exception.ImageUnacceptable,
+                                  images.fetch_to_raw, None, 'foo', 'anypath')
+            self.assertIn('Invalid VMDK create-type specified', str(e))
diff --git a/nova/virt/images.py b/nova/virt/images.py
index 5358f3766ac..f13c8722909 100644
--- a/nova/virt/images.py
+++ b/nova/virt/images.py
@@ -110,6 +110,34 @@ def get_info(context, image_href):
     return IMAGE_API.get(context, image_href)
 
 
+def check_vmdk_image(image_id, data):
+    # Check some rules about VMDK files. Specifically we want to make
+    # sure that the "create-type" of the image is one that we allow.
+    # Some types of VMDK files can reference files outside the disk
+    # image and we do not want to allow those for obvious reasons.
+
+    types = CONF.compute.vmdk_allowed_types
+
+    if not len(types):
+        LOG.warning('Refusing to allow VMDK image as vmdk_allowed_'
+                    'types is empty')
+        msg = _('Invalid VMDK create-type specified')
+        raise exception.ImageUnacceptable(image_id=image_id, reason=msg)
+
+    try:
+        create_type = data.format_specific['data']['create-type']
+    except KeyError:
+        msg = _('Unable to determine VMDK create-type')
+        raise exception.ImageUnacceptable(image_id=image_id, reason=msg)
+
+    if create_type not in CONF.compute.vmdk_allowed_types:
+        LOG.warning('Refusing to process VMDK file with create-type of %r '
+                    'which is not in allowed set of: %s', create_type,
+                    ','.join(CONF.compute.vmdk_allowed_types))
+        msg = _('Invalid VMDK create-type specified')
+        raise exception.ImageUnacceptable(image_id=image_id, reason=msg)
+
+
 def fetch_to_raw(context, image_href, path, trusted_certs=None):
     path_tmp = "%s.part" % path
     fetch(context, image_href, path_tmp, trusted_certs)
@@ -129,6 +157,9 @@ def fetch_to_raw(context, image_href, path, trusted_certs=None):
                 reason=(_("fmt=%(fmt)s backed by: %(backing_file)s") %
                         {'fmt': fmt, 'backing_file': backing_file}))
 
+        if fmt == 'vmdk':
+            check_vmdk_image(image_href, data)
+
         if fmt != "raw" and CONF.force_raw_images:
             staged = "%s.converted" % path
             LOG.debug("%s was %s, converting to raw", image_href, fmt)

From c07495d9d64dd0635d72fc7ff67d73a656a40d13 Mon Sep 17 00:00:00 2001
From: Kashyap Chamarthy <kchamart@redhat.com>
Date: Tue, 26 Jul 2022 16:02:17 +0200
Subject: [PATCH 46/93] Add a workaround to skip hypervisor version check on LM

When turned on, this will disable the version-checking of hypervisors
during live-migration.  This can be useful for operators in certain
scenarios when upgrading.  E.g. if you want to relocate all instances
off a compute node due to an emergency hardware issue, and you only have
another old compute node ready at the time.

Note, though: libvirt will do its own internal compatibility checks, and
might still reject live migration if the destination is incompatible.

Closes-Bug: #1982853

Change-Id: Iec387dcbc49ddb91ebf5cfd188224eaf6021c0e1
Signed-off-by: Kashyap Chamarthy <kchamart@redhat.com>
(cherry picked from commit 00ed8a232bc22f48011e95a0b47750520a5b4d47)
---
 nova/conductor/tasks/live_migrate.py          |  5 ++--
 nova/conf/workarounds.py                      |  7 +++++
 .../unit/conductor/tasks/test_live_migrate.py | 30 +++++++++++++++++++
 ...-version-check-on-lm-a87f2dcb4f8bf0f2.yaml | 13 ++++++++
 4 files changed, 53 insertions(+), 2 deletions(-)
 create mode 100644 releasenotes/notes/skip-hypervisor-version-check-on-lm-a87f2dcb4f8bf0f2.yaml

diff --git a/nova/conductor/tasks/live_migrate.py b/nova/conductor/tasks/live_migrate.py
index 1acae88b264..f8819b0dc85 100644
--- a/nova/conductor/tasks/live_migrate.py
+++ b/nova/conductor/tasks/live_migrate.py
@@ -347,8 +347,9 @@ def _check_compatible_with_source_hypervisor(self, destination):
 
         source_version = source_info.hypervisor_version
         destination_version = destination_info.hypervisor_version
-        if source_version > destination_version:
-            raise exception.DestinationHypervisorTooOld()
+        if not CONF.workarounds.skip_hypervisor_version_check_on_lm:
+            if source_version > destination_version:
+                raise exception.DestinationHypervisorTooOld()
         return source_info, destination_info
 
     def _call_livem_checks_on_host(self, destination, provider_mapping):
diff --git a/nova/conf/workarounds.py b/nova/conf/workarounds.py
index 6c52eae8e5d..2ec53282cdb 100644
--- a/nova/conf/workarounds.py
+++ b/nova/conf/workarounds.py
@@ -409,6 +409,13 @@
 with the destination host. When using QEMU >= 2.9 and libvirt >=
 4.4.0, libvirt will do the correct thing with respect to checking CPU
 compatibility on the destination host during live migration.
+"""),
+    cfg.BoolOpt(
+        'skip_hypervisor_version_check_on_lm',
+        default=False,
+        help="""
+When this is enabled, it will skip version-checking of hypervisors
+during live migration.
 """),
 ]
 
diff --git a/nova/tests/unit/conductor/tasks/test_live_migrate.py b/nova/tests/unit/conductor/tasks/test_live_migrate.py
index cb40c076c82..dd4ee7c3fec 100644
--- a/nova/tests/unit/conductor/tasks/test_live_migrate.py
+++ b/nova/tests/unit/conductor/tasks/test_live_migrate.py
@@ -345,6 +345,36 @@ def test_check_compatible_fails_with_hypervisor_too_old(
                           mock.call(self.destination)],
                          mock_get_info.call_args_list)
 
+    @mock.patch.object(live_migrate.LiveMigrationTask, '_get_compute_info')
+    def test_skip_hypervisor_version_check_on_lm_raise_ex(self, mock_get_info):
+        host1 = {'hypervisor_type': 'a', 'hypervisor_version': 7}
+        host2 = {'hypervisor_type': 'a', 'hypervisor_version': 6}
+        self.flags(group='workarounds',
+                   skip_hypervisor_version_check_on_lm=False)
+        mock_get_info.side_effect = [objects.ComputeNode(**host1),
+                                     objects.ComputeNode(**host2)]
+        self.assertRaises(exception.DestinationHypervisorTooOld,
+                          self.task._check_compatible_with_source_hypervisor,
+                          self.destination)
+        self.assertEqual([mock.call(self.instance_host),
+                          mock.call(self.destination)],
+                         mock_get_info.call_args_list)
+
+    @mock.patch.object(live_migrate.LiveMigrationTask, '_get_compute_info')
+    def test_skip_hypervisor_version_check_on_lm_do_not_raise_ex(
+        self, mock_get_info
+    ):
+        host1 = {'hypervisor_type': 'a', 'hypervisor_version': 7}
+        host2 = {'hypervisor_type': 'a', 'hypervisor_version': 6}
+        self.flags(group='workarounds',
+                   skip_hypervisor_version_check_on_lm=True)
+        mock_get_info.side_effect = [objects.ComputeNode(**host1),
+                                     objects.ComputeNode(**host2)]
+        self.task._check_compatible_with_source_hypervisor(self.destination)
+        self.assertEqual([mock.call(self.instance_host),
+                          mock.call(self.destination)],
+                         mock_get_info.call_args_list)
+
     @mock.patch.object(compute_rpcapi.ComputeAPI,
                        'check_can_live_migrate_destination')
     def test_check_requested_destination(self, mock_check):
diff --git a/releasenotes/notes/skip-hypervisor-version-check-on-lm-a87f2dcb4f8bf0f2.yaml b/releasenotes/notes/skip-hypervisor-version-check-on-lm-a87f2dcb4f8bf0f2.yaml
new file mode 100644
index 00000000000..00fe6a24c70
--- /dev/null
+++ b/releasenotes/notes/skip-hypervisor-version-check-on-lm-a87f2dcb4f8bf0f2.yaml
@@ -0,0 +1,13 @@
+---
+feature:
+  - |
+    Adds a workaround that allows one to disable hypervisor
+    version-check on live migration.  This workaround option can be
+    useful in certain scenarios when upgrading.  E.g. if you want to
+    relocate all instances off a compute node due to an emergency
+    hardware issue, and you only have another old compute node ready at
+    the time.
+
+    To enable this, use the config attribute
+    ``[workarounds]skip_hypervisor_version_check_on_lm=True`` in
+    ``nova.conf``.  The option defaults to ``False``.

From 286aadf289ef1b001ba7816e20b8cd53ba7a8618 Mon Sep 17 00:00:00 2001
From: Rajesh Tailor <ratailor@redhat.com>
Date: Tue, 11 Oct 2022 18:01:17 +0530
Subject: [PATCH 47/93] Handle InstanceInvalidState exception

When instance task state is 'deleting' or 'migrating', then
get_vnc_console throws 500 error, as InstanceInvalidState
exception is not handled there.

This change handles InstanceInvalidState in api layer in
get_vnc_console call.

Closes-Bug: #1968618
Change-Id: Ia738a0972b050f549f446c85171d3f33e60ada4f
(cherry picked from commit ec40d5aee34e9428e2a19231fc3df4d23d75b779)
(cherry picked from commit 71855163a944e437f9c48a5765f683b55a28c720)
---
 nova/api/openstack/compute/remote_consoles.py |  3 +++
 .../api_sample_tests/test_remote_consoles.py  | 20 +++++++++++++++++++
 .../openstack/compute/test_remote_consoles.py | 12 +++++++++++
 3 files changed, 35 insertions(+)

diff --git a/nova/api/openstack/compute/remote_consoles.py b/nova/api/openstack/compute/remote_consoles.py
index 36015542aa3..7d374ef432e 100644
--- a/nova/api/openstack/compute/remote_consoles.py
+++ b/nova/api/openstack/compute/remote_consoles.py
@@ -56,6 +56,9 @@ def get_vnc_console(self, req, id, body):
             raise webob.exc.HTTPNotFound(explanation=e.format_message())
         except exception.InstanceNotReady as e:
             raise webob.exc.HTTPConflict(explanation=e.format_message())
+        except exception.InstanceInvalidState as e:
+            common.raise_http_conflict_for_instance_invalid_state(
+                e, 'get_vnc_console', id)
         except NotImplementedError:
             common.raise_feature_not_supported()
 
diff --git a/nova/tests/functional/api_sample_tests/test_remote_consoles.py b/nova/tests/functional/api_sample_tests/test_remote_consoles.py
index 986826bfee0..e304402ee94 100644
--- a/nova/tests/functional/api_sample_tests/test_remote_consoles.py
+++ b/nova/tests/functional/api_sample_tests/test_remote_consoles.py
@@ -13,6 +13,10 @@
 #    License for the specific language governing permissions and limitations
 #    under the License.
 
+from unittest import mock
+
+from nova.compute import api as compute
+from nova import exception
 from nova.tests.functional.api_sample_tests import test_servers
 
 HTTP_RE = r'(https?://)([\w\d:#@%/;$()~_?\+-=\\.&](#!)?)*'
@@ -38,6 +42,22 @@ def test_get_vnc_console(self):
         self._verify_response('get-vnc-console-post-resp', {'url': HTTP_RE},
                               response, 200)
 
+    @mock.patch.object(compute.API, 'get_vnc_console')
+    def test_get_vnc_console_instance_invalid_state(self,
+                                                    mock_get_vnc_console):
+        uuid = self._post_server()
+
+        def fake_get_vnc_console(*args, **kwargs):
+            raise exception.InstanceInvalidState(
+                attr='fake_attr', state='fake_state', method='fake_method',
+                instance_uuid=uuid)
+
+        mock_get_vnc_console.side_effect = fake_get_vnc_console
+        response = self._do_post('servers/%s/action' % uuid,
+                                 'get-vnc-console-post-req',
+                                 {'action': 'os-getVNCConsole'})
+        self.assertEqual(409, response.status_code)
+
     def test_get_spice_console(self):
         uuid = self._post_server()
         response = self._do_post('servers/%s/action' % uuid,
diff --git a/nova/tests/unit/api/openstack/compute/test_remote_consoles.py b/nova/tests/unit/api/openstack/compute/test_remote_consoles.py
index 6427b1abf03..f62093bbb79 100644
--- a/nova/tests/unit/api/openstack/compute/test_remote_consoles.py
+++ b/nova/tests/unit/api/openstack/compute/test_remote_consoles.py
@@ -103,6 +103,18 @@ def test_get_vnc_console_no_instance_on_console_get(self):
             'get_vnc_console',
             exception.InstanceNotFound(instance_id=fakes.FAKE_UUID))
 
+    def test_get_vnc_console_instance_invalid_state(self):
+        body = {'os-getVNCConsole': {'type': 'novnc'}}
+        self._check_console_failure(
+            self.controller.get_vnc_console,
+            webob.exc.HTTPConflict,
+            body,
+            'get_vnc_console',
+            exception.InstanceInvalidState(
+                attr='fake-attr', state='fake-state', method='fake-method',
+                instance_uuid=fakes.FAKE_UUID)
+        )
+
     def test_get_vnc_console_invalid_type(self):
         body = {'os-getVNCConsole': {'type': 'invalid'}}
         self._check_console_failure(

From 4073aa51f79be54e2e6e8143666a7c1f9a00e03d Mon Sep 17 00:00:00 2001
From: Rajesh Tailor <ratailor@redhat.com>
Date: Wed, 10 Aug 2022 18:15:04 +0530
Subject: [PATCH 48/93] Fix rescue volume-based instance

As of now, when attempting to rescue a volume-based instance
using an image without the hw_rescue_device and/or hw_rescue_bus
properties set, the rescue api call fails (as non-stable rescue
for volume-based instances is not supported) leaving the instance
in error state.

This change checks for hw_rescue_device/hw_rescue_bus image
properties before attempting to rescue and if the property
is not set, then fail with proper error message, without changing
instance state.

Related-Bug: #1978958
Closes-Bug: #1926601
Change-Id: Id4c8c5f3b32985ac7d3d7c833b82e0876f7367c1
(cherry picked from commit 6eed55bf55469f4ceaa7d4d4eb1be635e14bc73b)
(cherry picked from commit d00a848a735f98b028f5930798ee69ef205c8e2e)
---
 nova/compute/api.py                           |   6 +
 nova/tests/functional/test_server_rescue.py   |  86 +++++++++--
 nova/tests/unit/compute/test_api.py           | 133 +++++++++++++++++-
 ...olume-based-instance-c6e3fba236d90be7.yaml |   6 +
 4 files changed, 220 insertions(+), 11 deletions(-)
 create mode 100644 releasenotes/notes/rescue-volume-based-instance-c6e3fba236d90be7.yaml

diff --git a/nova/compute/api.py b/nova/compute/api.py
index b9b6e9de7cc..6a581339b4c 100644
--- a/nova/compute/api.py
+++ b/nova/compute/api.py
@@ -4544,6 +4544,7 @@ def rescue(self, context, instance, rescue_password=None,
                allow_bfv_rescue=False):
         """Rescue the given instance."""
 
+        image_meta = None
         if rescue_image_ref:
             try:
                 image_meta = image_meta_obj.ImageMeta.from_image_ref(
@@ -4564,6 +4565,8 @@ def rescue(self, context, instance, rescue_password=None,
                             "image properties set")
                 raise exception.UnsupportedRescueImage(
                     image=rescue_image_ref)
+        else:
+            image_meta = instance.image_meta
 
         bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
                     context, instance.uuid)
@@ -4572,6 +4575,9 @@ def rescue(self, context, instance, rescue_password=None,
         volume_backed = compute_utils.is_volume_backed_instance(
             context, instance, bdms)
 
+        allow_bfv_rescue &= 'hw_rescue_bus' in image_meta.properties and \
+            'hw_rescue_device' in image_meta.properties
+
         if volume_backed and allow_bfv_rescue:
             cn = objects.ComputeNode.get_by_host_and_nodename(
                 context, instance.host, instance.node)
diff --git a/nova/tests/functional/test_server_rescue.py b/nova/tests/functional/test_server_rescue.py
index fa96c10344a..8f5b9129437 100644
--- a/nova/tests/functional/test_server_rescue.py
+++ b/nova/tests/functional/test_server_rescue.py
@@ -10,6 +10,10 @@
 # License for the specific language governing permissions and limitations
 # under the License.
 
+import datetime
+
+from oslo_utils.fixture import uuidsentinel as uuids
+
 from nova.tests import fixtures as nova_fixtures
 from nova.tests.functional.api import client
 from nova.tests.functional import integrated_helpers
@@ -23,7 +27,37 @@ def setUp(self):
         self.useFixture(nova_fixtures.CinderFixture(self))
         self._start_compute(host='host1')
 
-    def _create_bfv_server(self):
+    def _create_image(self, metadata=None):
+        image = {
+            'id': uuids.stable_rescue_image,
+            'name': 'fake-image-rescue-property',
+            'created_at': datetime.datetime(2011, 1, 1, 1, 2, 3),
+            'updated_at': datetime.datetime(2011, 1, 1, 1, 2, 3),
+            'deleted_at': None,
+            'deleted': False,
+            'status': 'active',
+            'is_public': False,
+            'container_format': 'raw',
+            'disk_format': 'raw',
+            'size': '25165824',
+            'min_ram': 0,
+            'min_disk': 0,
+            'protected': False,
+            'visibility': 'public',
+            'tags': ['tag1', 'tag2'],
+            'properties': {
+                'kernel_id': 'nokernel',
+                'ramdisk_id': 'nokernel',
+                'hw_rescue_device': 'disk',
+                'hw_rescue_bus': 'scsi',
+            },
+        }
+        if metadata:
+            image['properties'].update(metadata)
+        return self.glance.create(None, image)
+
+    def _create_bfv_server(self, metadata=None):
+        image = self._create_image(metadata=metadata)
         server_request = self._build_server(networks=[])
         server_request.pop('imageRef')
         server_request['block_device_mapping_v2'] = [{
@@ -33,7 +67,7 @@ def _create_bfv_server(self):
             'destination_type': 'volume'}]
         server = self.api.post_server({'server': server_request})
         self._wait_for_state_change(server, 'ACTIVE')
-        return server
+        return server, image
 
 
 class DisallowBFVRescuev286(BFVRescue):
@@ -43,10 +77,10 @@ class DisallowBFVRescuev286(BFVRescue):
     microversion = '2.86'
 
     def test_bfv_rescue_not_supported(self):
-        server = self._create_bfv_server()
+        server, image = self._create_bfv_server()
         ex = self.assertRaises(client.OpenStackApiException,
             self.api.post_server_action, server['id'], {'rescue': {
-            'rescue_image_ref': '155d900f-4e14-4e4c-a73d-069cbf4541e6'}})
+            'rescue_image_ref': image['id']}})
         self.assertEqual(400, ex.response.status_code)
         self.assertIn('Cannot rescue a volume-backed instance',
                       ex.response.text)
@@ -60,10 +94,10 @@ class DisallowBFVRescuev286WithTrait(BFVRescue):
     microversion = '2.86'
 
     def test_bfv_rescue_not_supported(self):
-        server = self._create_bfv_server()
+        server, image = self._create_bfv_server()
         ex = self.assertRaises(client.OpenStackApiException,
             self.api.post_server_action, server['id'], {'rescue': {
-            'rescue_image_ref': '155d900f-4e14-4e4c-a73d-069cbf4541e6'}})
+            'rescue_image_ref': image['id']}})
         self.assertEqual(400, ex.response.status_code)
         self.assertIn('Cannot rescue a volume-backed instance',
                       ex.response.text)
@@ -77,10 +111,10 @@ class DisallowBFVRescuev287WithoutTrait(BFVRescue):
     microversion = '2.87'
 
     def test_bfv_rescue_not_supported(self):
-        server = self._create_bfv_server()
+        server, image = self._create_bfv_server()
         ex = self.assertRaises(client.OpenStackApiException,
             self.api.post_server_action, server['id'], {'rescue': {
-            'rescue_image_ref': '155d900f-4e14-4e4c-a73d-069cbf4541e6'}})
+            'rescue_image_ref': image['id']}})
         self.assertEqual(400, ex.response.status_code)
         self.assertIn('Host unable to rescue a volume-backed instance',
                       ex.response.text)
@@ -94,7 +128,41 @@ class AllowBFVRescuev287WithTrait(BFVRescue):
     microversion = '2.87'
 
     def test_bfv_rescue_supported(self):
-        server = self._create_bfv_server()
+        server, image = self._create_bfv_server()
         self.api.post_server_action(server['id'], {'rescue': {
+            'rescue_image_ref': image['id']}})
+        self._wait_for_state_change(server, 'RESCUE')
+
+
+class DisallowBFVRescuev287WithoutRescueImageProperties(BFVRescue):
+    """Asserts that BFV rescue requests fail with microversion 2.87 (or later)
+    when the required hw_rescue_device and hw_rescue_bus image properties
+    are not set on the image.
+    """
+    compute_driver = 'fake.MediumFakeDriver'
+    microversion = '2.87'
+
+    def test_bfv_rescue_failed(self):
+        server, image = self._create_bfv_server()
+        # try rescue without hw_rescue_device and hw_rescue_bus properties set
+        ex = self.assertRaises(client.OpenStackApiException,
+            self.api.post_server_action, server['id'], {'rescue': {
             'rescue_image_ref': '155d900f-4e14-4e4c-a73d-069cbf4541e6'}})
+        self.assertEqual(400, ex.response.status_code)
+        self.assertIn('Cannot rescue a volume-backed instance',
+                      ex.response.text)
+
+
+class AllowBFVRescuev287WithRescueImageProperties(BFVRescue):
+    """Asserts that BFV rescue requests pass with microversion 2.87 (or later)
+    when the required hw_rescue_device and hw_rescue_bus image properties
+    are set on the image.
+    """
+    compute_driver = 'fake.RescueBFVDriver'
+    microversion = '2.87'
+
+    def test_bfv_rescue_done(self):
+        server, image = self._create_bfv_server()
+        self.api.post_server_action(server['id'], {'rescue': {
+            'rescue_image_ref': image['id']}})
         self._wait_for_state_change(server, 'RESCUE')
diff --git a/nova/tests/unit/compute/test_api.py b/nova/tests/unit/compute/test_api.py
index 14bb80c4c60..089ef74a880 100644
--- a/nova/tests/unit/compute/test_api.py
+++ b/nova/tests/unit/compute/test_api.py
@@ -5635,7 +5635,10 @@ def test_rescue_bfv_with_required_trait(self, mock_get_bdms,
                     destination_type='volume', volume_type=None,
                     snapshot_id=None, volume_id=uuids.volume_id,
                     volume_size=None)])
-        rescue_image_meta_obj = image_meta_obj.ImageMeta.from_dict({})
+        rescue_image_meta_obj = image_meta_obj.ImageMeta.from_dict({
+            'properties': {'hw_rescue_device': 'disk',
+                           'hw_rescue_bus': 'scsi'}
+        })
 
         with test.nested(
             mock.patch.object(self.compute_api.placementclient,
@@ -5687,6 +5690,7 @@ def test_rescue_bfv_with_required_trait(self, mock_get_bdms,
             # Assert that the instance task state as set in the compute API
             self.assertEqual(task_states.RESCUING, instance.task_state)
 
+    @mock.patch('nova.objects.instance.Instance.image_meta')
     @mock.patch('nova.objects.compute_node.ComputeNode'
                 '.get_by_host_and_nodename')
     @mock.patch('nova.compute.utils.is_volume_backed_instance',
@@ -5695,7 +5699,8 @@ def test_rescue_bfv_with_required_trait(self, mock_get_bdms,
                 '.get_by_instance_uuid')
     def test_rescue_bfv_without_required_trait(self, mock_get_bdms,
                                                mock_is_volume_backed,
-                                               mock_get_cn):
+                                               mock_get_cn,
+                                               mock_image_meta):
         instance = self._create_instance_obj()
         bdms = objects.BlockDeviceMappingList(objects=[
                 objects.BlockDeviceMapping(
@@ -5703,6 +5708,12 @@ def test_rescue_bfv_without_required_trait(self, mock_get_bdms,
                     destination_type='volume', volume_type=None,
                     snapshot_id=None, volume_id=uuids.volume_id,
                     volume_size=None)])
+
+        instance.image_meta = image_meta_obj.ImageMeta.from_dict({
+            'properties': {'hw_rescue_device': 'disk',
+                           'hw_rescue_bus': 'scsi'}
+        })
+
         with test.nested(
             mock.patch.object(self.compute_api.placementclient,
                               'get_provider_traits'),
@@ -5740,6 +5751,124 @@ def test_rescue_bfv_without_required_trait(self, mock_get_bdms,
             mock_get_traits.assert_called_once_with(
                 self.context, uuids.cn)
 
+    @mock.patch('nova.objects.image_meta.ImageMeta.from_image_ref')
+    @mock.patch('nova.objects.compute_node.ComputeNode'
+                '.get_by_host_and_nodename')
+    @mock.patch('nova.compute.utils.is_volume_backed_instance',
+                return_value=True)
+    @mock.patch('nova.objects.block_device.BlockDeviceMappingList'
+                '.get_by_instance_uuid')
+    def test_rescue_bfv_with_required_image_properties(
+            self, mock_get_bdms, mock_is_volume_backed, mock_get_cn,
+            mock_image_meta_obj_from_ref):
+        instance = self._create_instance_obj()
+        bdms = objects.BlockDeviceMappingList(objects=[
+            objects.BlockDeviceMapping(
+                boot_index=0, image_id=uuids.image_id, source_type='image',
+                destination_type='volume', volume_type=None,
+                snapshot_id=None, volume_id=uuids.volume_id,
+                volume_size=None)])
+        rescue_image_meta_obj = image_meta_obj.ImageMeta.from_dict({
+            'properties': {'hw_rescue_device': 'disk',
+                           'hw_rescue_bus': 'scsi'}
+        })
+
+        with test.nested(
+            mock.patch.object(self.compute_api.placementclient,
+                              'get_provider_traits'),
+            mock.patch.object(self.compute_api.volume_api, 'get'),
+            mock.patch.object(self.compute_api.volume_api, 'check_attached'),
+            mock.patch.object(instance, 'save'),
+            mock.patch.object(self.compute_api, '_record_action_start'),
+            mock.patch.object(self.compute_api.compute_rpcapi,
+                              'rescue_instance')
+        ) as (
+                mock_get_traits, mock_get_volume, mock_check_attached,
+                mock_instance_save, mock_record_start, mock_rpcapi_rescue
+        ):
+            # Mock out the returned compute node, image_meta, bdms and volume
+            mock_image_meta_obj_from_ref.return_value = rescue_image_meta_obj
+            mock_get_bdms.return_value = bdms
+            mock_get_volume.return_value = mock.sentinel.volume
+            mock_get_cn.return_value = mock.Mock(uuid=uuids.cn)
+
+            # Ensure the required trait is returned, allowing BFV rescue
+            mock_trait_info = mock.Mock(traits=[ot.COMPUTE_RESCUE_BFV])
+            mock_get_traits.return_value = mock_trait_info
+
+            # Try to rescue the instance
+            self.compute_api.rescue(self.context, instance,
+                                    rescue_image_ref=uuids.rescue_image_id,
+                                    allow_bfv_rescue=True)
+
+            # Assert all of the calls made in the compute API
+            mock_get_bdms.assert_called_once_with(self.context, instance.uuid)
+            mock_get_volume.assert_called_once_with(
+                self.context, uuids.volume_id)
+            mock_check_attached.assert_called_once_with(
+                self.context, mock.sentinel.volume)
+            mock_is_volume_backed.assert_called_once_with(
+                self.context, instance, bdms)
+            mock_get_cn.assert_called_once_with(
+                self.context, instance.host, instance.node)
+            mock_get_traits.assert_called_once_with(self.context, uuids.cn)
+            mock_instance_save.assert_called_once_with(
+                expected_task_state=[None])
+            mock_record_start.assert_called_once_with(
+                self.context, instance, instance_actions.RESCUE)
+            mock_rpcapi_rescue.assert_called_once_with(
+                self.context, instance=instance, rescue_password=None,
+                rescue_image_ref=uuids.rescue_image_id, clean_shutdown=True)
+
+            # Assert that the instance task state as set in the compute API
+            self.assertEqual(task_states.RESCUING, instance.task_state)
+
+    @mock.patch('nova.objects.image_meta.ImageMeta.from_image_ref')
+    @mock.patch('nova.compute.utils.is_volume_backed_instance',
+                return_value=True)
+    @mock.patch('nova.objects.block_device.BlockDeviceMappingList'
+                '.get_by_instance_uuid')
+    def test_rescue_bfv_without_required_image_properties(
+            self, mock_get_bdms, mock_is_volume_backed,
+            mock_image_meta_obj_from_ref):
+        instance = self._create_instance_obj()
+        bdms = objects.BlockDeviceMappingList(objects=[
+            objects.BlockDeviceMapping(
+                boot_index=0, image_id=uuids.image_id, source_type='image',
+                destination_type='volume', volume_type=None,
+                snapshot_id=None, volume_id=uuids.volume_id,
+                volume_size=None)])
+        rescue_image_meta_obj = image_meta_obj.ImageMeta.from_dict({
+            'properties': {}
+        })
+
+        with test.nested(
+            mock.patch.object(self.compute_api.volume_api, 'get'),
+            mock.patch.object(self.compute_api.volume_api, 'check_attached'),
+        ) as (
+            mock_get_volume, mock_check_attached
+        ):
+            # Mock out the returned bdms, volume and image_meta
+            mock_get_bdms.return_value = bdms
+            mock_get_volume.return_value = mock.sentinel.volume
+            mock_image_meta_obj_from_ref.return_value = rescue_image_meta_obj
+
+            # Assert that any attempt to rescue a bfv instance using an image
+            # that lacks the hw_rescue_device and hw_rescue_bus properties
+            # fails and raises InstanceNotRescuable
+            self.assertRaises(exception.InstanceNotRescuable,
+                              self.compute_api.rescue, self.context, instance,
+                              rescue_image_ref=None, allow_bfv_rescue=True)
+
+            # Assert the calls made in the compute API prior to the failure
+            mock_get_bdms.assert_called_once_with(self.context, instance.uuid)
+            mock_get_volume.assert_called_once_with(
+                self.context, uuids.volume_id)
+            mock_check_attached.assert_called_once_with(
+                self.context, mock.sentinel.volume)
+            mock_is_volume_backed.assert_called_once_with(
+                self.context, instance, bdms)
+
     @mock.patch('nova.compute.utils.is_volume_backed_instance',
                 return_value=True)
     @mock.patch('nova.objects.block_device.BlockDeviceMappingList'
diff --git a/releasenotes/notes/rescue-volume-based-instance-c6e3fba236d90be7.yaml b/releasenotes/notes/rescue-volume-based-instance-c6e3fba236d90be7.yaml
new file mode 100644
index 00000000000..7e80059b801
--- /dev/null
+++ b/releasenotes/notes/rescue-volume-based-instance-c6e3fba236d90be7.yaml
@@ -0,0 +1,6 @@
+---
+fixes:
+  - |
+    Fix rescuing volume based instance by adding a check for 'hw_rescue_bus'
+    and 'hw_rescue_device' properties in image metadata before attempting
+    to rescue instance.

From f57900ad20d21aac5cb91aed87edc121f9008115 Mon Sep 17 00:00:00 2001
From: Sean Mooney <work@seanmooney.info>
Date: Tue, 21 Jun 2022 12:04:20 +0100
Subject: [PATCH 49/93] add reproducer test for bug 1890244

This change adds a test to simulate validating
an instance group policy where the group has been
deleted but is still referenced in the scheduler hint.

Change-Id: I803e6286a773d9e53639ab0cd449fc72bb3be613
Related-Bug: #1890244
(cherry picked from commit 84a84f7f2fff58cf6254d6267af0ca5cee64c53b)
---
 nova/tests/unit/compute/test_compute_mgr.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index 1b0469b8e65..c6be4b49562 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -7615,6 +7615,24 @@ def test_validate_instance_group_policy_handles_hint_list(self, mock_get):
                                                      instance, hints)
         mock_get.assert_called_once_with(self.context, uuids.group_hint)
 
+    @mock.patch('nova.objects.InstanceGroup.get_by_hint')
+    def test_validate_instance_group_policy_deleted_group(self, mock_get):
+        """Tests that _validate_instance_group_policy handles the case
+        where the scheduler hint has a group but that group has been deleted.
+        This test is a reproducer for bug #1890244
+        """
+        instance = objects.Instance(uuid=uuids.instance)
+        hints = {'group': [uuids.group_hint]}
+        mock_get.side_effect = exception.InstanceGroupNotFound(
+            group_uuid=uuids.group_hint
+        )
+        # FIXME(sean-k-mooney): this should not leak the exception
+        self.assertRaises(
+            exception.InstanceGroupNotFound,
+            self.compute._validate_instance_group_policy, self.context,
+            instance, hints)
+        mock_get.assert_called_once_with(self.context, uuids.group_hint)
+
     @mock.patch('nova.objects.InstanceGroup.get_by_uuid')
     @mock.patch('nova.objects.InstanceList.get_uuids_by_host')
     @mock.patch('nova.objects.InstanceGroup.get_by_hint')

From 7934b9ec57d7060fbcf27706aa98ebf5a83f920a Mon Sep 17 00:00:00 2001
From: Sean Mooney <work@seanmooney.info>
Date: Tue, 21 Jun 2022 12:23:45 +0100
Subject: [PATCH 50/93] ignore deleted server groups in validation

This change simply catches the exception raised when
we lookup a servergroup via a hint and the validation
upcall is enabled.

Change-Id: I858b4da35382a9f4dcf88f4b6db340e1f34eb82d
Closes-Bug: #1890244
(cherry picked from commit cd2c2f359bbd4913cfe73199847bc35b2664aaa9)
---
 nova/compute/manager.py                       | 21 ++--
 nova/objects/request_spec.py                  |  1 +
 .../regressions/test_bug_1890244.py           | 96 +++++++++++++++++++
 nova/tests/unit/compute/test_compute_mgr.py   | 13 ++-
 nova/tests/unit/objects/test_request_spec.py  | 24 +++++
 ...-with-deleted-groups-4f685fd1d6b84192.yaml | 13 +++
 6 files changed, 155 insertions(+), 13 deletions(-)
 create mode 100644 nova/tests/functional/regressions/test_bug_1890244.py
 create mode 100644 releasenotes/notes/fix-group-policy-validation-with-deleted-groups-4f685fd1d6b84192.yaml

diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 435578d40c3..2be17277a52 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -1732,27 +1732,32 @@ def _validate_instance_group_policy(self, context, instance,
         # hosts.  This is a validation step to make sure that starting the
         # instance here doesn't violate the policy.
         if scheduler_hints is not None:
-            # only go through here if scheduler_hints is provided, even if it
-            # is empty.
+            # only go through here if scheduler_hints is provided,
+            # even if it is empty.
             group_hint = scheduler_hints.get('group')
             if not group_hint:
                 return
             else:
-                # The RequestSpec stores scheduler_hints as key=list pairs so
-                # we need to check the type on the value and pull the single
-                # entry out. The API request schema validates that
+                # The RequestSpec stores scheduler_hints as key=list pairs
+                # so we need to check the type on the value and pull the
+                # single entry out. The API request schema validates that
                 # the 'group' hint is a single value.
                 if isinstance(group_hint, list):
                     group_hint = group_hint[0]
-
-                group = objects.InstanceGroup.get_by_hint(context, group_hint)
+                try:
+                    group = objects.InstanceGroup.get_by_hint(
+                        context, group_hint
+                    )
+                except exception.InstanceGroupNotFound:
+                    return
         else:
             # TODO(ganso): a call to DB can be saved by adding request_spec
             # to rpcapi payload of live_migration, pre_live_migration and
             # check_can_live_migrate_destination
             try:
                 group = objects.InstanceGroup.get_by_instance_uuid(
-                    context, instance.uuid)
+                    context, instance.uuid
+                )
             except exception.InstanceGroupNotFound:
                 return
 
diff --git a/nova/objects/request_spec.py b/nova/objects/request_spec.py
index 9ce77a40435..cc542932318 100644
--- a/nova/objects/request_spec.py
+++ b/nova/objects/request_spec.py
@@ -645,6 +645,7 @@ def _from_db_object(context, spec, db_spec):
             except exception.InstanceGroupNotFound:
                 # NOTE(danms): Instance group may have been deleted
                 spec.instance_group = None
+                spec.scheduler_hints.pop('group', None)
 
         if data_migrated:
             spec.save()
diff --git a/nova/tests/functional/regressions/test_bug_1890244.py b/nova/tests/functional/regressions/test_bug_1890244.py
new file mode 100644
index 00000000000..bf969eebe77
--- /dev/null
+++ b/nova/tests/functional/regressions/test_bug_1890244.py
@@ -0,0 +1,96 @@
+# Copyright 2017 Ericsson
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from nova import context
+from nova import objects
+from nova import test
+from nova.tests import fixtures as nova_fixtures
+from nova.tests.functional import fixtures as func_fixtures
+from nova.tests.functional import integrated_helpers
+
+
+class IgnoreDeletedServerGroupsTest(
+    test.TestCase, integrated_helpers.InstanceHelperMixin,
+):
+    """Regression test for bug 1890244
+
+    If instances are created as members of server groups it
+    should be possible to evacuate them if the server groups are
+    deleted prior to the host failure.
+    """
+
+    def setUp(self):
+        super().setUp()
+        # Stub out external dependencies.
+        self.useFixture(nova_fixtures.NeutronFixture(self))
+        self.useFixture(nova_fixtures.GlanceFixture(self))
+        self.useFixture(func_fixtures.PlacementFixture())
+        # Start nova controller services.
+        api_fixture = self.useFixture(nova_fixtures.OSAPIFixture(
+            api_version='v2.1'))
+        self.api = api_fixture.admin_api
+        self.start_service('conductor')
+        # Use a custom weigher to make sure that we have a predictable
+        # scheduling sort order.
+        self.useFixture(nova_fixtures.HostNameWeigherFixture())
+        self.start_service('scheduler')
+        # Start two computes, one where the server will be created and another
+        # where we'll evacuate it to.
+        self.src = self._start_compute('host1')
+        self.dest = self._start_compute('host2')
+        self.notifier = self.useFixture(
+            nova_fixtures.NotificationFixture(self)
+        )
+
+    def test_evacuate_after_group_delete(self):
+        # Create an anti-affinity group for the server.
+        body = {
+            'server_group': {
+                'name': 'test-group',
+                'policies': ['anti-affinity']
+            }
+        }
+        group_id = self.api.api_post(
+            '/os-server-groups', body).body['server_group']['id']
+
+        # Create a server in the group which should land on host1 due to our
+        # custom weigher.
+        body = {'server': self._build_server()}
+        body['os:scheduler_hints'] = {'group': group_id}
+        server = self.api.post_server(body)
+        server = self._wait_for_state_change(server, 'ACTIVE')
+        self.assertEqual('host1', server['OS-EXT-SRV-ATTR:host'])
+
+        # Down the source compute to enable the evacuation
+        self.api.microversion = '2.11'     # Cap for the force-down call.
+        self.api.force_down_service('host1', 'nova-compute', True)
+        self.api.microversion = 'latest'
+        self.src.stop()
+
+        # assert the server currently has a server group
+        reqspec = objects.RequestSpec.get_by_instance_uuid(
+            context.get_admin_context(), server['id'])
+        self.assertIsNotNone(reqspec.instance_group)
+        self.assertIn('group', reqspec.scheduler_hints)
+        # then delete it so that we need to clean it up on evac
+        self.api.api_delete(f'/os-server-groups/{group_id}')
+
+        # Initiate evacuation
+        server = self._evacuate_server(
+            server, expected_host='host2', expected_migration_status='done'
+        )
+        reqspec = objects.RequestSpec.get_by_instance_uuid(
+            context.get_admin_context(), server['id'])
+        self.assertIsNone(reqspec.instance_group)
+        self.assertNotIn('group', reqspec.scheduler_hints)
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index c6be4b49562..62f15d0d93d 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -7626,11 +7626,14 @@ def test_validate_instance_group_policy_deleted_group(self, mock_get):
         mock_get.side_effect = exception.InstanceGroupNotFound(
             group_uuid=uuids.group_hint
         )
-        # FIXME(sean-k-mooney): this should not leak the exception
-        self.assertRaises(
-            exception.InstanceGroupNotFound,
-            self.compute._validate_instance_group_policy, self.context,
-            instance, hints)
+        # This implicitly asserts that no exception is raised since
+        # uncaught exceptions would be treated as a test failure.
+        self.compute._validate_instance_group_policy(
+            self.context, instance, hints
+        )
+        # and this just asserts that we did in fact invoke the method
+        # that raises, to ensure that if we refactor in the future
+        # this test will fail if the function we mock is no longer called.
         mock_get.assert_called_once_with(self.context, uuids.group_hint)
 
     @mock.patch('nova.objects.InstanceGroup.get_by_uuid')
diff --git a/nova/tests/unit/objects/test_request_spec.py b/nova/tests/unit/objects/test_request_spec.py
index 31797f8133b..e51b5c33686 100644
--- a/nova/tests/unit/objects/test_request_spec.py
+++ b/nova/tests/unit/objects/test_request_spec.py
@@ -615,6 +615,30 @@ def test_get_by_instance_uuid(self, mock_get_ig, get_by_uuid):
         self.assertIsInstance(req_obj.instance_group, objects.InstanceGroup)
         self.assertEqual('fresh', req_obj.instance_group.name)
 
+    @mock.patch.object(
+        request_spec.RequestSpec, '_get_by_instance_uuid_from_db'
+    )
+    @mock.patch('nova.objects.InstanceGroup.get_by_uuid')
+    def test_get_by_instance_uuid_deleted_group(
+            self, mock_get_ig, get_by_uuid
+    ):
+        fake_spec_obj = fake_request_spec.fake_spec_obj()
+        fake_spec_obj.scheduler_hints['group'] = ['fresh']
+        fake_spec = fake_request_spec.fake_db_spec(fake_spec_obj)
+        get_by_uuid.return_value = fake_spec
+        mock_get_ig.side_effect = exception.InstanceGroupNotFound(
+            group_uuid=uuids.instgroup
+        )
+
+        req_obj = request_spec.RequestSpec.get_by_instance_uuid(
+            self.context, fake_spec['instance_uuid']
+        )
+        # assert that both the instance_group object and scheduler hint
+        # are cleared if the instance_group was deleted since the request
+        # spec was last saved to the db.
+        self.assertIsNone(req_obj.instance_group, objects.InstanceGroup)
+        self.assertEqual({'hint': ['over-there']}, req_obj.scheduler_hints)
+
     @mock.patch('nova.objects.request_spec.RequestSpec.save')
     @mock.patch.object(
         request_spec.RequestSpec, '_get_by_instance_uuid_from_db')
diff --git a/releasenotes/notes/fix-group-policy-validation-with-deleted-groups-4f685fd1d6b84192.yaml b/releasenotes/notes/fix-group-policy-validation-with-deleted-groups-4f685fd1d6b84192.yaml
new file mode 100644
index 00000000000..7f7d42bd0e0
--- /dev/null
+++ b/releasenotes/notes/fix-group-policy-validation-with-deleted-groups-4f685fd1d6b84192.yaml
@@ -0,0 +1,13 @@
+---
+fixes:
+  - |
+    When the server group policy validation upcall is enabled
+    nova will assert that the policy is not violated on move operations
+    and initial instance creation. As noted in `bug 1890244`_, if a
+    server was created in a server group and that group was later deleted
+    the validation upcall would fail due to an uncaught exception if the
+    server group was deleted. This prevented evacuate and other move
+    operations from functioning. This has now been fixed and nova will
+    ignore deleted server groups.
+
+    .. _bug 1890244: https://bugs.launchpad.net/nova/+bug/1890244

From ce2cc54bfe236554badb9f6bf53a958417e5525d Mon Sep 17 00:00:00 2001
From: Stephen Finucane <sfinucan@redhat.com>
Date: Fri, 8 Apr 2022 11:38:54 +0100
Subject: [PATCH 51/93] db: Resolve additional SAWarning warnings

Resolving the following SAWarning warnings:

  Coercing Subquery object into a select() for use in IN(); please pass
  a select() construct explicitly

  SELECT statement has a cartesian product between FROM element(s)
  "foo" and FROM element "bar". Apply join condition(s) between each
  element to resolve.

While the first of these was a trivial fix, the second one is a little
more involved. It was caused by attempting to build a query across
tables that had no relationship as part of our archive logic. For
example, consider the following queries, generated early in
'_get_fk_stmts':

  SELECT instances.uuid
  FROM instances, security_group_instance_association
  WHERE security_group_instance_association.instance_uuid = instances.uuid
    AND instances.id IN (__[POSTCOMPILE_id_1])

  SELECT security_groups.id
  FROM security_groups, security_group_instance_association, instances
  WHERE security_group_instance_association.security_group_id = security_groups.id
    AND instances.id IN (__[POSTCOMPILE_id_1])

While the first of these is fine, the second is clearly wrong: why are
we filtering on a field that is of no relevance to our join? These were
generated because we were attempting to archive one or more instances
(in this case, the instance with id=1) and needed to find related tables
to archive at the same time. A related table is any table that
references our "source" table - 'instances' here - by way of a foreign
key. For each of *these* tables, we then lookup each foreign key and
join back to the source table, filtering by matching entries in the
source table. The issue here is that we're looking up every foreign key.
What we actually want to do is lookup only the foreign keys that point
back to our source table. This flaw is why we were generating the second
SELECT above: the 'security_group_instance_association' has two foreign
keys, one pointing to our 'instances' table but also another pointing to
the 'security_groups' table. We want the first but not the second.

Resolve this by checking if the table that each foreign key points to is
actually the source table and simply skip if not. With this issue
resolved, we can enable errors on SAWarning warnings in general without
any filters.

Conflicts:
    nova/tests/fixtures/nova.py

NOTE(melwitt): The conflict is because change
Ia1da21577d859885838de10110dd473f72af285d (db: Trivial rewrapping of
warning filters) is not in Yoga.

Change-Id: I63208c7bd5f9f4c3d5e4a40bd0f6253d0f042a37
Signed-off-by: Stephen Finucane <sfinucan@redhat.com>
(cherry picked from commit 8142b9dc47b7096ab9d8180f0b5b1e52d513e2dc)
---
 nova/db/main/api.py          |  7 +++++++
 nova/objects/cell_mapping.py | 12 ++++++++----
 nova/tests/fixtures/nova.py  | 10 ++++++++++
 3 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/nova/db/main/api.py b/nova/db/main/api.py
index 4c40be905ef..39775d4f461 100644
--- a/nova/db/main/api.py
+++ b/nova/db/main/api.py
@@ -4176,6 +4176,12 @@ def _get_fk_stmts(metadata, conn, table, column, records):
             fk_column = fk_table.c.id
 
         for fk in fk_table.foreign_keys:
+            if table != fk.column.table:
+                # if the foreign key doesn't actually point to the table we're
+                # archiving entries from then it's not relevant; trying to
+                # resolve this would result in a cartesian product
+                continue
+
             # We need to find the records in the referring (child) table that
             # correspond to the records in our (parent) table so we can archive
             # them.
@@ -4225,6 +4231,7 @@ def _get_fk_stmts(metadata, conn, table, column, records):
                 # deque.
                 fk_delete = fk_table.delete().where(fk_column.in_(fk_records))
                 deletes.appendleft(fk_delete)
+
         # Repeat for any possible nested child tables.
         i, d = _get_fk_stmts(metadata, conn, fk_table, fk_column, fk_records)
         inserts.extendleft(i)
diff --git a/nova/objects/cell_mapping.py b/nova/objects/cell_mapping.py
index 595ec43e480..13551824205 100644
--- a/nova/objects/cell_mapping.py
+++ b/nova/objects/cell_mapping.py
@@ -279,11 +279,15 @@ def _get_by_project_id_from_db(context, project_id):
         # SELECT DISTINCT cell_id FROM instance_mappings \
         #   WHERE project_id = $project_id;
         cell_ids = context.session.query(
-            api_db_models.InstanceMapping.cell_id).filter_by(
-            project_id=project_id).distinct().subquery()
+            api_db_models.InstanceMapping.cell_id
+        ).filter_by(
+            project_id=project_id
+        ).distinct()
         # SELECT cell_mappings WHERE cell_id IN ($cell_ids);
-        return context.session.query(api_db_models.CellMapping).filter(
-            api_db_models.CellMapping.id.in_(cell_ids)).all()
+        return context.session.query(
+            api_db_models.CellMapping).filter(
+            api_db_models.CellMapping.id.in_(cell_ids)
+        ).all()
 
     @classmethod
     def get_by_project_id(cls, context, project_id):
diff --git a/nova/tests/fixtures/nova.py b/nova/tests/fixtures/nova.py
index 27ca2fd77d4..f9e011dd67d 100644
--- a/nova/tests/fixtures/nova.py
+++ b/nova/tests/fixtures/nova.py
@@ -904,6 +904,16 @@ def setUp(self):
             message='Implicit coercion of SELECT and textual SELECT .*',
             category=sqla_exc.SADeprecationWarning)
 
+        # Enable general SQLAlchemy warnings also to ensure we're not doing
+        # silly stuff. It's possible that we'll need to filter things out here
+        # with future SQLAlchemy versions, but that's a good thing
+
+        warnings.filterwarnings(
+            'error',
+            module='nova',
+            category=sqla_exc.SAWarning,
+        )
+
         self.addCleanup(self._reset_warning_filters)
 
     def _reset_warning_filters(self):

From 71aa17a487136be5e938192857721d9119222811 Mon Sep 17 00:00:00 2001
From: Sylvain Bauza <sbauza@redhat.com>
Date: Thu, 21 Jul 2022 18:21:51 +0200
Subject: [PATCH 52/93] Reproducer for bug 1951656

Due to a new mdev naming, we can't parse it.

Change-Id: I0f785178b132dfef668829558dea9f7e674abadb
Related-Bug: #1951656
(cherry picked from commit 185201974775bab966f4e5ca3bbdc31b8269fa4c)
(cherry picked from commit 857df72d3166a8f7e8a8cdfeabb62ad6ead46565)
---
 .../regressions/test_bug_1951656.py           | 83 +++++++++++++++++++
 1 file changed, 83 insertions(+)
 create mode 100644 nova/tests/functional/regressions/test_bug_1951656.py

diff --git a/nova/tests/functional/regressions/test_bug_1951656.py b/nova/tests/functional/regressions/test_bug_1951656.py
new file mode 100644
index 00000000000..9aad191072c
--- /dev/null
+++ b/nova/tests/functional/regressions/test_bug_1951656.py
@@ -0,0 +1,83 @@
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+from oslo_utils import uuidutils
+
+
+from nova.tests.fixtures import libvirt as fakelibvirt
+from nova.tests.functional.libvirt import test_vgpu
+from nova.virt.libvirt import utils as libvirt_utils
+
+
+class VGPUTestsLibvirt7_7(test_vgpu.VGPUTestBase):
+
+    def _create_mdev(self, physical_device, mdev_type, uuid=None):
+        # We need to fake the newly created sysfs object by adding a new
+        # FakeMdevDevice in the existing persisted Connection object so
+        # when asking to get the existing mdevs, we would see it.
+        if not uuid:
+            uuid = uuidutils.generate_uuid()
+        mdev_name = libvirt_utils.mdev_uuid2name(uuid)
+        libvirt_parent = self.pci2libvirt_address(physical_device)
+
+        # Libvirt 7.7 now creates mdevs with a parent_addr suffix.
+        new_mdev_name = '_'.join([mdev_name, libvirt_parent])
+
+        # Here, we get the right compute thanks to self._current_host, which
+        # was modified just before
+        connection = self.computes[
+            self._current_host].driver._host.get_connection()
+        connection.mdev_info.devices.update(
+            {mdev_name: fakelibvirt.FakeMdevDevice(dev_name=new_mdev_name,
+                                                   type_id=mdev_type,
+                                                   parent=libvirt_parent)})
+        return uuid
+
+    def setUp(self):
+        super(VGPUTestsLibvirt7_7, self).setUp()
+        extra_spec = {"resources:VGPU": "1"}
+        self.flavor = self._create_flavor(extra_spec=extra_spec)
+
+        # Start compute1 supporting only nvidia-11
+        self.flags(
+            enabled_mdev_types=fakelibvirt.NVIDIA_11_VGPU_TYPE,
+            group='devices')
+
+        self.compute1 = self.start_compute_with_vgpu('host1')
+
+    def test_create_servers_with_vgpu(self):
+
+        # Create a single instance against a specific compute node.
+        self._create_server(
+            image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
+            flavor_id=self.flavor, host=self.compute1.host,
+            networks='auto', expected_state='ACTIVE')
+
+        # TODO(sbauza): Modify this once bug #1851656 is fixed.
+        # mdev_name2uuid() raises a badly formed hexadecimal UUID string error
+        self.assertRaises(ValueError,
+                          self.assert_mdev_usage,
+                          self.compute1, expected_amount=1)
+
+        # Now, the problem is that we can't create new instances with VGPUs
+        # from this host.
+        server = self._create_server(
+            image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
+            flavor_id=self.flavor, host=self.compute1.host,
+            networks='auto', expected_state='ERROR')
+        # The error is due to a bad mdev name parsing
+        self.assertIn('fault', server)
+        # since we only have one host, we have a RescheduledException as this
+        # service was creating an exception and we can't use another one.
+        self.assertIn('Exceeded maximum number of retries',
+                      server['fault']['message'])

From 28053917200e3e242148672efda0e1a2b043dc48 Mon Sep 17 00:00:00 2001
From: Billy Olsen <billy.olsen@gmail.com>
Date: Thu, 21 Apr 2022 19:42:27 -0700
Subject: [PATCH 53/93] Handle mdev devices in libvirt 7.7+

Libvirt 7.7 changed the mdev device naming to include the parent PCI
device when listing node devices. The domain, however, will still only
see the UUID and not see the parent PCI device. Changing the parsing to
simply drop the PCI identifier is not enough as the device cannot be
found when attempting to lookup the new ID.

Modify the Libvirt Driver's _get_mediated_device_information to tolerate
different formats of the mdev name. This first uses the legacy behavior
by trying to lookup the device name that is passed in (typically
mdev_<uuid> format) and if that is not found, iterates the list of mdev
node devices until the right UUID is found and selects that one.

Note that the lookup of the mdev device by UUID is needed in order
to keep the ability to recreate assigned mediated devices on a reboot of
the compute node.

Additionally, the libvirt utils parsing method mdev_name2uuid, has
been updated to tolerate both mdev_<uuid> and mdev_<uuid>_<pciid>
formats.

Closes-Bug: 1951656

Change-Id: Ifed0fa16053228990a6a8df8d4c666521db7e329
(cherry picked from commit a28b907c4f0dbba6e141a8fbea807e6cb0438977)
(cherry picked from commit 98d8c9eaa3c415cc234193e6a9115db887751363)
---
 .../regressions/test_bug_1951656.py           | 22 +++-------
 nova/tests/unit/virt/libvirt/test_config.py   | 26 +++++++++++
 nova/virt/libvirt/config.py                   |  3 ++
 nova/virt/libvirt/driver.py                   | 43 +++++++++++++++++--
 nova/virt/libvirt/host.py                     |  2 +-
 nova/virt/libvirt/utils.py                    | 28 +++++++++---
 6 files changed, 97 insertions(+), 27 deletions(-)

diff --git a/nova/tests/functional/regressions/test_bug_1951656.py b/nova/tests/functional/regressions/test_bug_1951656.py
index 9aad191072c..d705ff6fe31 100644
--- a/nova/tests/functional/regressions/test_bug_1951656.py
+++ b/nova/tests/functional/regressions/test_bug_1951656.py
@@ -63,21 +63,11 @@ def test_create_servers_with_vgpu(self):
             flavor_id=self.flavor, host=self.compute1.host,
             networks='auto', expected_state='ACTIVE')
 
-        # TODO(sbauza): Modify this once bug #1851656 is fixed.
-        # mdev_name2uuid() raises a badly formed hexadecimal UUID string error
-        self.assertRaises(ValueError,
-                          self.assert_mdev_usage,
-                          self.compute1, expected_amount=1)
-
-        # Now, the problem is that we can't create new instances with VGPUs
-        # from this host.
-        server = self._create_server(
+        self.assert_mdev_usage(self.compute1, expected_amount=1)
+
+        self._create_server(
             image_uuid='155d900f-4e14-4e4c-a73d-069cbf4541e6',
             flavor_id=self.flavor, host=self.compute1.host,
-            networks='auto', expected_state='ERROR')
-        # The error is due to a bad mdev name parsing
-        self.assertIn('fault', server)
-        # since we only have one host, we have a RescheduledException as this
-        # service was creating an exception and we can't use another one.
-        self.assertIn('Exceeded maximum number of retries',
-                      server['fault']['message'])
+            networks='auto', expected_state='ACTIVE')
+
+        self.assert_mdev_usage(self.compute1, expected_amount=2)
diff --git a/nova/tests/unit/virt/libvirt/test_config.py b/nova/tests/unit/virt/libvirt/test_config.py
index 396edfd0248..c7577745ab4 100644
--- a/nova/tests/unit/virt/libvirt/test_config.py
+++ b/nova/tests/unit/virt/libvirt/test_config.py
@@ -3135,6 +3135,32 @@ def test_config_mdev_device(self):
                               config.LibvirtConfigNodeDeviceMdevInformation)
         self.assertEqual("nvidia-11", obj.mdev_information.type)
         self.assertEqual(12, obj.mdev_information.iommu_group)
+        self.assertIsNone(obj.mdev_information.uuid)
+
+    def test_config_mdev_device_uuid(self):
+        xmlin = """
+        <device>
+          <name>mdev_b2107403_110c_45b0_af87_32cc91597b8a_0000_41_00_0</name>
+          <path>/sys/devices/pci0000:40/0000:40:03.1/0000:41:00.0/b2107403-110c-45b0-af87-32cc91597b8a</path>
+          <parent>pci_0000_41_00_0</parent>
+          <driver>
+            <name>vfio_mdev</name>
+          </driver>
+          <capability type='mdev'>
+            <type id='nvidia-442'/>
+            <uuid>b2107403-110c-45b0-af87-32cc91597b8a</uuid>
+            <iommuGroup number='57'/>
+          </capability>
+        </device>"""
+
+        obj = config.LibvirtConfigNodeDevice()
+        obj.parse_str(xmlin)
+        self.assertIsInstance(obj.mdev_information,
+                              config.LibvirtConfigNodeDeviceMdevInformation)
+        self.assertEqual("nvidia-442", obj.mdev_information.type)
+        self.assertEqual(57, obj.mdev_information.iommu_group)
+        self.assertEqual("b2107403-110c-45b0-af87-32cc91597b8a",
+                         obj.mdev_information.uuid)
 
     def test_config_vdpa_device(self):
         xmlin = """
diff --git a/nova/virt/libvirt/config.py b/nova/virt/libvirt/config.py
index 1a81be3ade5..47e92e3ca91 100644
--- a/nova/virt/libvirt/config.py
+++ b/nova/virt/libvirt/config.py
@@ -3299,6 +3299,7 @@ def __init__(self, **kwargs):
                                         root_name="capability", **kwargs)
         self.type = None
         self.iommu_group = None
+        self.uuid = None
 
     def parse_dom(self, xmldoc):
         super(LibvirtConfigNodeDeviceMdevInformation,
@@ -3308,6 +3309,8 @@ def parse_dom(self, xmldoc):
                 self.type = c.get('id')
             if c.tag == "iommuGroup":
                 self.iommu_group = int(c.get('number'))
+            if c.tag == "uuid":
+                self.uuid = c.text
 
 
 class LibvirtConfigNodeDeviceVpdCap(LibvirtConfigObject):
diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py
index cc5e4b5da52..e792655a7de 100644
--- a/nova/virt/libvirt/driver.py
+++ b/nova/virt/libvirt/driver.py
@@ -8019,15 +8019,52 @@ def _get_mdev_capable_devices(self, types=None):
 
     def _get_mediated_device_information(self, devname):
         """Returns a dict of a mediated device."""
-        virtdev = self._host.device_lookup_by_name(devname)
+        # LP #1951656 - In Libvirt 7.7, the mdev name now includes the PCI
+        # address of the parent device (e.g. mdev_<uuid>_<pci_address>) due to
+        # the mdevctl allowing for multiple mediated devs having the same UUID
+        # defined (only one can be active at a time). Since the guest
+        # information doesn't have the parent ID, try to lookup which
+        # mediated device is available that matches the UUID. If multiple
+        # devices are found that match the UUID, then this is an error
+        # condition.
+        try:
+            virtdev = self._host.device_lookup_by_name(devname)
+        except libvirt.libvirtError as ex:
+            if ex.get_error_code() != libvirt.VIR_ERR_NO_NODE_DEVICE:
+                raise
+            mdevs = [dev for dev in self._host.list_mediated_devices()
+                     if dev.startswith(devname)]
+            # If no matching devices are found, simply raise the original
+            # exception indicating that no devices are found.
+            if not mdevs:
+                raise
+            elif len(mdevs) > 1:
+                msg = ("The mediated device name %(devname)s refers to a UUID "
+                       "that is present in multiple libvirt mediated devices. "
+                       "Matching libvirt mediated devices are %(devices)s. "
+                       "Mediated device UUIDs must be unique for Nova." %
+                       {'devname': devname,
+                        'devices': ', '.join(mdevs)})
+                raise exception.InvalidLibvirtMdevConfig(reason=msg)
+
+            LOG.debug('Found requested device %s as %s. Using that.',
+                      devname, mdevs[0])
+            virtdev = self._host.device_lookup_by_name(mdevs[0])
         xmlstr = virtdev.XMLDesc(0)
         cfgdev = vconfig.LibvirtConfigNodeDevice()
         cfgdev.parse_str(xmlstr)
+        # Starting with Libvirt 7.3, the uuid information is available in the
+        # node device information. If it's there, use that. Otherwise,
+        # fall back to the previous behavior of parsing the uuid from the
+        # devname.
+        if cfgdev.mdev_information.uuid:
+            mdev_uuid = cfgdev.mdev_information.uuid
+        else:
+            mdev_uuid = libvirt_utils.mdev_name2uuid(cfgdev.name)
 
         device = {
             "dev_id": cfgdev.name,
-            # name is like mdev_00ead764_fdc0_46b6_8db9_2963f5c815b4
-            "uuid": libvirt_utils.mdev_name2uuid(cfgdev.name),
+            "uuid": mdev_uuid,
             # the physical GPU PCI device
             "parent": cfgdev.parent,
             "type": cfgdev.mdev_information.type,
diff --git a/nova/virt/libvirt/host.py b/nova/virt/libvirt/host.py
index 92f58e6899a..ebcc1125345 100644
--- a/nova/virt/libvirt/host.py
+++ b/nova/virt/libvirt/host.py
@@ -1503,7 +1503,7 @@ def list_mdev_capable_devices(self, flags=0):
     def list_mediated_devices(self, flags=0):
         """Lookup mediated devices.
 
-        :returns: a list of virNodeDevice instance
+        :returns: a list of strings with the names of the mediated devices
         """
         return self._list_devices("mdev", flags=flags)
 
diff --git a/nova/virt/libvirt/utils.py b/nova/virt/libvirt/utils.py
index 834f242c792..a1b9459b7e6 100644
--- a/nova/virt/libvirt/utils.py
+++ b/nova/virt/libvirt/utils.py
@@ -581,17 +581,31 @@ def get_default_machine_type(arch: str) -> ty.Optional[str]:
 
 
 def mdev_name2uuid(mdev_name: str) -> str:
-    """Convert an mdev name (of the form mdev_<uuid_with_underscores>) to a
-    uuid (of the form 8-4-4-4-12).
+    """Convert an mdev name (of the form mdev_<uuid_with_underscores> or
+    mdev_<uuid_with_underscores>_<pciaddress>) to a uuid
+    (of the form 8-4-4-4-12).
+
+    :param mdev_name: the name of the mdev to parse the UUID from
+    :returns: string containing the uuid
     """
-    return str(uuid.UUID(mdev_name[5:].replace('_', '-')))
+    mdev_uuid = mdev_name[5:].replace('_', '-')
+    # Unconditionally remove the PCI address from the name
+    mdev_uuid = mdev_uuid[:36]
+    return str(uuid.UUID(mdev_uuid))
+
 
+def mdev_uuid2name(mdev_uuid: str, parent: ty.Optional[str] = None) -> str:
+    """Convert an mdev uuid (of the form 8-4-4-4-12) and optionally its parent
+    device to a name (of the form mdev_<uuid_with_underscores>[_<pciid>]).
 
-def mdev_uuid2name(mdev_uuid: str) -> str:
-    """Convert an mdev uuid (of the form 8-4-4-4-12) to a name (of the form
-    mdev_<uuid_with_underscores>).
+    :param mdev_uuid: the uuid of the mediated device
+    :param parent: the parent device id for the mediated device
+    :returns: name of the mdev to reference in libvirt
     """
-    return "mdev_" + mdev_uuid.replace('-', '_')
+    name = "mdev_" + mdev_uuid.replace('-', '_')
+    if parent and parent.startswith('pci_'):
+        name = name + parent[3:]  # drop 'pci' but keep the '_' separator
+    return name
 
 
 def get_flags_by_flavor_specs(flavor: 'objects.Flavor') -> ty.Set[str]:

From 6c1b862274546a32a43e1184f24101ebb6c30680 Mon Sep 17 00:00:00 2001
From: Alexey Stupnikov <aleksey.stupnikov@gmail.com>
Date: Fri, 8 Jul 2022 17:56:38 +0200
Subject: [PATCH 54/93] Remove deleted projects from flavor access list

Previously Nova was unable to remove deleted projects from flavor's
access lists. This patch lifts the described limitation and improves the
logic of the nova/api/openstack/identity.py library by raising
webob.exc.HTTPBadRequest with two distinct explanations:

- "Nova was unable to find Keystone service endpoint." is used when
  Keystone identity service version 3.0 was not found.
- "Project ID <id> is not a valid project." is used when the specified
  project is not found.

Closes-bug: #1980845
Change-Id: Icbf3bdd944f9a6c38f25ddea0b521ca48ee87a7f
(cherry picked from commit 8c6daaacbedc33e738ce85aec0ead5f6947d60bf)
(cherry picked from commit 2ea2b556da5f10d662641bd96b0a07735d2b9607)
---
 nova/api/openstack/compute/flavor_access.py   |  9 ++++++-
 nova/api/openstack/identity.py                | 22 +++++++++-------
 .../openstack/compute/test_flavor_access.py   | 25 ++++++++++++++++++-
 3 files changed, 45 insertions(+), 11 deletions(-)

diff --git a/nova/api/openstack/compute/flavor_access.py b/nova/api/openstack/compute/flavor_access.py
index e17e6f0ddcd..fc8df15db5b 100644
--- a/nova/api/openstack/compute/flavor_access.py
+++ b/nova/api/openstack/compute/flavor_access.py
@@ -93,7 +93,14 @@ def _remove_tenant_access(self, req, id, body):
 
         vals = body['removeTenantAccess']
         tenant = vals['tenant']
-        identity.verify_project_id(context, tenant)
+        # It doesn't really matter if project exists or not: we can delete
+        # it from flavor's access list in both cases.
+        try:
+            identity.verify_project_id(context, tenant)
+        except webob.exc.HTTPBadRequest as identity_exc:
+            msg = "Project ID %s is not a valid project." % tenant
+            if msg not in identity_exc.explanation:
+                raise
 
         # NOTE(gibi): We have to load a flavor from the db here as
         # flavor.remove_access() will try to emit a notification and that needs
diff --git a/nova/api/openstack/identity.py b/nova/api/openstack/identity.py
index 7ffc623fede..15ec884aea8 100644
--- a/nova/api/openstack/identity.py
+++ b/nova/api/openstack/identity.py
@@ -27,24 +27,27 @@ def verify_project_id(context, project_id):
     """verify that a project_id exists.
 
     This attempts to verify that a project id exists. If it does not,
-    an HTTPBadRequest is emitted.
+    an HTTPBadRequest is emitted. Also HTTPBadRequest is emitted
+    if Keystone identity service version 3.0 is not found.
 
     """
     adap = utils.get_ksa_adapter(
         'identity', ksa_auth=context.get_auth_plugin(),
         min_version=(3, 0), max_version=(3, 'latest'))
 
-    failure = webob.exc.HTTPBadRequest(
-            explanation=_("Project ID %s is not a valid project.") %
-            project_id)
     try:
         resp = adap.get('/projects/%s' % project_id)
     except kse.EndpointNotFound:
         LOG.error(
-            "Keystone identity service version 3.0 was not found. This might "
-            "be because your endpoint points to the v2.0 versioned endpoint "
-            "which is not supported. Please fix this.")
-        raise failure
+            "Keystone identity service version 3.0 was not found. This "
+            "might be caused by Nova misconfiguration or Keystone "
+            "problems.")
+        msg = _("Nova was unable to find Keystone service endpoint.")
+        # TODO(astupnik). It may be reasonable to switch to HTTP 503
+        # (HTTP Service Unavailable) instead of HTTP Bad Request here.
+        # If proper Keystone service is inaccessible, then technically
+        # this is a server side error and not an error in Nova.
+        raise webob.exc.HTTPBadRequest(explanation=msg)
     except kse.ClientException:
         # something is wrong, like there isn't a keystone v3 endpoint,
         # or nova isn't configured for the interface to talk to it;
@@ -57,7 +60,8 @@ def verify_project_id(context, project_id):
         return True
     elif resp.status_code == 404:
         # we got access, and we know this project is not there
-        raise failure
+        msg = _("Project ID %s is not a valid project.") % project_id
+        raise webob.exc.HTTPBadRequest(explanation=msg)
     elif resp.status_code == 403:
         # we don't have enough permission to verify this, so default
         # to "it's ok".
diff --git a/nova/tests/unit/api/openstack/compute/test_flavor_access.py b/nova/tests/unit/api/openstack/compute/test_flavor_access.py
index 8c25a2efc27..1c5c34e758d 100644
--- a/nova/tests/unit/api/openstack/compute/test_flavor_access.py
+++ b/nova/tests/unit/api/openstack/compute/test_flavor_access.py
@@ -353,14 +353,37 @@ def test_add_tenant_access_with_invalid_tenant(self, mock_verify):
         mock_verify.assert_called_once_with(
             req.environ['nova.context'], 'proj2')
 
+    @mock.patch('nova.objects.Flavor.remove_access')
     @mock.patch('nova.api.openstack.identity.verify_project_id',
                 side_effect=exc.HTTPBadRequest(
                     explanation="Project ID proj2 is not a valid project."))
-    def test_remove_tenant_access_with_invalid_tenant(self, mock_verify):
+    def test_remove_tenant_access_with_invalid_tenant(self,
+                                                      mock_verify,
+                                                      mock_remove_access):
         """Tests the case that the tenant does not exist in Keystone."""
         req = fakes.HTTPRequest.blank(self._prefix + '/flavors/2/action',
                                       use_admin_context=True)
         body = {'removeTenantAccess': {'tenant': 'proj2'}}
+
+        self.flavor_action_controller._remove_tenant_access(
+            req, '2', body=body)
+        mock_verify.assert_called_once_with(
+            req.environ['nova.context'], 'proj2')
+        mock_remove_access.assert_called_once_with('proj2')
+
+    @mock.patch('nova.api.openstack.identity.verify_project_id',
+                side_effect=exc.HTTPBadRequest(
+                    explanation="Nova was unable to find Keystone "
+                                "service endpoint."))
+    def test_remove_tenant_access_missing_keystone_endpoint(self,
+                                                            mock_verify):
+        """Tests the case that Keystone identity service endpoint
+        version 3.0 was not found.
+        """
+        req = fakes.HTTPRequest.blank(self._prefix + '/flavors/2/action',
+                                      use_admin_context=True)
+        body = {'removeTenantAccess': {'tenant': 'proj2'}}
+
         self.assertRaises(exc.HTTPBadRequest,
                           self.flavor_action_controller._remove_tenant_access,
                           req, '2', body=body)

From 4d8efa2d196f72fdde33136a0b50c4ee8da3c941 Mon Sep 17 00:00:00 2001
From: melanie witt <melwittt@gmail.com>
Date: Wed, 15 Feb 2023 22:37:40 +0000
Subject: [PATCH 55/93] Use force=True for os-brick disconnect during delete

The 'force' parameter of os-brick's disconnect_volume() method allows
callers to ignore flushing errors and ensure that devices are being
removed from the host.

We should use force=True when we are going to delete an instance to
avoid leaving leftover devices connected to the compute host which
could then potentially be reused to map volumes to an instance that
should not have access to those volumes.

We can use force=True even when disconnecting a volume that will not be
deleted on termination because os-brick will always attempt to flush
and disconnect gracefully before forcefully removing devices.

Closes-Bug: #2004555

Change-Id: I3629b84d3255a8fe9d8a7cea8c6131d7c40899e8
(cherry picked from commit db455548a12beac1153ce04eca5e728d7b773901)
(cherry picked from commit efb01985db88d6333897018174649b425feaa1b4)
(cherry picked from commit 8b4b99149a35663fc11d7d163082747b1b210b4d)
---
 .../admin/configuration/cross-cell-resize.rst |  2 +-
 doc/source/admin/configuration/index.rst      |  1 +
 .../configuration/service-user-token.rst      | 59 +++++++++++++++++
 doc/source/admin/live-migration-usage.rst     |  2 +-
 .../admin/migrate-instance-with-snapshot.rst  |  2 +-
 doc/source/admin/support-compute.rst          | 64 -------------------
 doc/source/install/compute-install-obs.rst    | 20 ++++++
 doc/source/install/compute-install-rdo.rst    | 20 ++++++
 doc/source/install/compute-install-ubuntu.rst | 20 ++++++
 doc/source/install/controller-install-obs.rst | 20 ++++++
 doc/source/install/controller-install-rdo.rst | 20 ++++++
 .../install/controller-install-ubuntu.rst     | 20 ++++++
 nova/cmd/status.py                            | 11 ++++
 nova/tests/unit/cmd/test_status.py            | 16 +++++
 nova/tests/unit/virt/hyperv/test_vmops.py     |  2 +-
 nova/tests/unit/virt/hyperv/test_volumeops.py | 26 ++++++--
 nova/tests/unit/virt/libvirt/test_driver.py   | 61 ++++++++++++++++--
 .../virt/libvirt/volume/test_fibrechannel.py  | 20 ++++++
 .../unit/virt/libvirt/volume/test_iscsi.py    |  9 +++
 .../unit/virt/libvirt/volume/test_lightos.py  |  8 ++-
 .../unit/virt/libvirt/volume/test_nvme.py     |  8 ++-
 .../unit/virt/libvirt/volume/test_scaleio.py  |  8 ++-
 .../unit/virt/libvirt/volume/test_storpool.py | 16 ++++-
 .../virt/libvirt/volume/test_vzstorage.py     |  8 ++-
 nova/virt/hyperv/vmops.py                     |  2 +-
 nova/virt/hyperv/volumeops.py                 | 12 ++--
 nova/virt/libvirt/driver.py                   |  7 +-
 nova/virt/libvirt/volume/fibrechannel.py      |  7 +-
 nova/virt/libvirt/volume/fs.py                |  2 +-
 nova/virt/libvirt/volume/iscsi.py             |  7 +-
 nova/virt/libvirt/volume/lightos.py           |  7 +-
 nova/virt/libvirt/volume/nvme.py              |  6 +-
 nova/virt/libvirt/volume/quobyte.py           |  2 +-
 nova/virt/libvirt/volume/scaleio.py           |  7 +-
 nova/virt/libvirt/volume/smbfs.py             |  2 +-
 nova/virt/libvirt/volume/storpool.py          |  5 +-
 nova/virt/libvirt/volume/volume.py            |  2 +-
 nova/virt/libvirt/volume/vzstorage.py         |  5 +-
 .../service-user-token-421d067c16257782.yaml  | 11 ++++
 39 files changed, 413 insertions(+), 114 deletions(-)
 create mode 100644 doc/source/admin/configuration/service-user-token.rst
 create mode 100644 releasenotes/notes/service-user-token-421d067c16257782.yaml

diff --git a/doc/source/admin/configuration/cross-cell-resize.rst b/doc/source/admin/configuration/cross-cell-resize.rst
index e51e4257748..0c34fd13f51 100644
--- a/doc/source/admin/configuration/cross-cell-resize.rst
+++ b/doc/source/admin/configuration/cross-cell-resize.rst
@@ -284,7 +284,7 @@ Troubleshooting
 Timeouts
 ~~~~~~~~
 
-Configure a :ref:`service user <user_token_timeout>` in case the user token
+Configure a :ref:`service user <service_user_token>` in case the user token
 times out, e.g. during the snapshot and download of a large server image.
 
 If RPC calls are timing out with a ``MessagingTimeout`` error in the logs,
diff --git a/doc/source/admin/configuration/index.rst b/doc/source/admin/configuration/index.rst
index 233597b1fe4..f5b6fde9dac 100644
--- a/doc/source/admin/configuration/index.rst
+++ b/doc/source/admin/configuration/index.rst
@@ -19,6 +19,7 @@ A list of config options based on different topics can be found below:
 .. toctree::
    :maxdepth: 1
 
+   /admin/configuration/service-user-token
    /admin/configuration/api
    /admin/configuration/resize
    /admin/configuration/cross-cell-resize
diff --git a/doc/source/admin/configuration/service-user-token.rst b/doc/source/admin/configuration/service-user-token.rst
new file mode 100644
index 00000000000..740730af1d0
--- /dev/null
+++ b/doc/source/admin/configuration/service-user-token.rst
@@ -0,0 +1,59 @@
+.. _service_user_token:
+
+===================
+Service User Tokens
+===================
+
+.. note::
+
+   Configuration of service user tokens is **required** for every Nova service
+   for security reasons. See https://bugs.launchpad.net/nova/+bug/2004555 for
+   details.
+
+Configure Nova to send service user tokens alongside regular user tokens when
+making REST API calls to other services. The identity service (Keystone) will
+authenticate a request using the service user token if the regular user token
+has expired.
+
+This is important when long-running operations such as live migration or
+snapshot take long enough to exceed the expiry of the user token. Without the
+service token, if a long-running operation exceeds the expiry of the user
+token, post operations such as cleanup after a live migration could fail when
+Nova calls other service APIs like block-storage (Cinder) or networking
+(Neutron).
+
+The service token is also used by services to validate whether the API caller
+is a service. Some service APIs are restricted to service users only.
+
+To set up service tokens, create a ``nova`` service user and ``service`` role
+in the identity service (Keystone) and assign the ``service`` role to the
+``nova`` service user.
+
+Then, configure the :oslo.config:group:`service_user` section of the Nova
+configuration file, for example:
+
+.. code-block:: ini
+
+   [service_user]
+   send_service_user_token = true
+   auth_url = https://104.130.216.102/identity
+   auth_strategy = keystone
+   auth_type = password
+   project_domain_name = Default
+   project_name = service
+   user_domain_name = Default
+   username = nova
+   password = secretservice
+   ...
+
+And configure the other identity options as necessary for the service user,
+much like you would configure nova to work with the image service (Glance) or
+networking service (Neutron).
+
+.. note::
+
+   Please note that the role assigned to the :oslo.config:group:`service_user`
+   needs to be in the configured
+   :oslo.config:option:`keystone_authtoken.service_token_roles` of other
+   services such as block-storage (Cinder), image (Glance), and networking
+   (Neutron).
diff --git a/doc/source/admin/live-migration-usage.rst b/doc/source/admin/live-migration-usage.rst
index 783ab5e27c2..a1e7f187566 100644
--- a/doc/source/admin/live-migration-usage.rst
+++ b/doc/source/admin/live-migration-usage.rst
@@ -320,4 +320,4 @@ To make live-migration succeed, you have several options:
 
 If live migrations routinely timeout or fail during cleanup operations due
 to the user token timing out, consider configuring nova to use
-:ref:`service user tokens <user_token_timeout>`.
+:ref:`service user tokens <service_user_token>`.
diff --git a/doc/source/admin/migrate-instance-with-snapshot.rst b/doc/source/admin/migrate-instance-with-snapshot.rst
index 65059679abb..230431091e0 100644
--- a/doc/source/admin/migrate-instance-with-snapshot.rst
+++ b/doc/source/admin/migrate-instance-with-snapshot.rst
@@ -67,7 +67,7 @@ Create a snapshot of the instance
 
    If snapshot operations routinely fail because the user token times out
    while uploading a large disk image, consider configuring nova to use
-   :ref:`service user tokens <user_token_timeout>`.
+   :ref:`service user tokens <service_user_token>`.
 
 #. Use the :command:`openstack image list` command to check the status
    until the status is ``ACTIVE``:
diff --git a/doc/source/admin/support-compute.rst b/doc/source/admin/support-compute.rst
index 8522e51d795..31e32fd1ddc 100644
--- a/doc/source/admin/support-compute.rst
+++ b/doc/source/admin/support-compute.rst
@@ -478,67 +478,3 @@ Ensure the ``compute`` endpoint in the identity service catalog is pointing
 at ``/v2.1`` instead of ``/v2``. The former route supports microversions,
 while the latter route is considered the legacy v2.0 compatibility-mode
 route which renders all requests as if they were made on the legacy v2.0 API.
-
-
-.. _user_token_timeout:
-
-User token times out during long-running operations
----------------------------------------------------
-
-Problem
-~~~~~~~
-
-Long-running operations such as live migration or snapshot can sometimes
-overrun the expiry of the user token. In such cases, post operations such
-as cleaning up after a live migration can fail when the nova-compute service
-needs to cleanup resources in other services, such as in the block-storage
-(cinder) or networking (neutron) services.
-
-For example:
-
-.. code-block:: console
-
-  2018-12-17 13:47:29.591 16987 WARNING nova.virt.libvirt.migration [req-7bc758de-b2e4-461b-a971-f79be6cd4703 313d1247d7b845da9c731eec53e50a26 2f693c782fa748c2baece8db95b4ba5b - default default] [instance: ead8ecc3-f473-4672-a67b-c44534c6042d] Live migration not completed after 2400 sec
-  2018-12-17 13:47:30.097 16987 WARNING nova.virt.libvirt.driver [req-7bc758de-b2e4-461b-a971-f79be6cd4703 313d1247d7b845da9c731eec53e50a26 2f693c782fa748c2baece8db95b4ba5b - default default] [instance: ead8ecc3-f473-4672-a67b-c44534c6042d] Migration operation was cancelled
-  2018-12-17 13:47:30.299 16987 ERROR nova.virt.libvirt.driver [req-7bc758de-b2e4-461b-a971-f79be6cd4703 313d1247d7b845da9c731eec53e50a26 2f693c782fa748c2baece8db95b4ba5b - default default] [instance: ead8ecc3-f473-4672-a67b-c44534c6042d] Live Migration failure: operation aborted: migration job: canceled by client: libvirtError: operation aborted: migration job: canceled by client
-  2018-12-17 13:47:30.685 16987 INFO nova.compute.manager [req-7bc758de-b2e4-461b-a971-f79be6cd4703 313d1247d7b845da9c731eec53e50a26 2f693c782fa748c2baece8db95b4ba5b - default default] [instance: ead8ecc3-f473-4672-a67b-c44534c6042d] Swapping old allocation on 3e32d595-bd1f-4136-a7f4-c6703d2fbe18 held by migration 17bec61d-544d-47e0-a1c1-37f9d7385286 for instance
-  2018-12-17 13:47:32.450 16987 ERROR nova.volume.cinder [req-7bc758de-b2e4-461b-a971-f79be6cd4703 313d1247d7b845da9c731eec53e50a26 2f693c782fa748c2baece8db95b4ba5b - default default] Delete attachment failed for attachment 58997d5b-24f0-4073-819e-97916fb1ee19. Error: The request you have made requires authentication. (HTTP 401) Code: 401: Unauthorized: The request you have made requires authentication. (HTTP 401)
-
-Solution
-~~~~~~~~
-
-Configure nova to use service user tokens to supplement the regular user token
-used to initiate the operation. The identity service (keystone) will then
-authenticate a request using the service user token if the user token has
-already expired.
-
-To use, create a service user in the identity service similar as you would when
-creating the ``nova`` service user.
-
-Then configure the :oslo.config:group:`service_user` section of the nova
-configuration file, for example:
-
-.. code-block:: ini
-
-  [service_user]
-  send_service_user_token = True
-  auth_type = password
-  project_domain_name = Default
-  project_name = service
-  user_domain_name = Default
-  password = secretservice
-  username = nova
-  auth_url = https://104.130.216.102/identity
-  ...
-
-And configure the other identity options as necessary for the service user,
-much like you would configure nova to work with the image service (glance)
-or networking service.
-
-.. note::
-
-  Please note that the role of the :oslo.config:group:`service_user` you
-  configure needs to be a superset of
-  :oslo.config:option:`keystone_authtoken.service_token_roles` (The option
-  :oslo.config:option:`keystone_authtoken.service_token_roles` is configured
-  in cinder, glance and neutron).
diff --git a/doc/source/install/compute-install-obs.rst b/doc/source/install/compute-install-obs.rst
index c5c1d29fb3d..c227b6eba43 100644
--- a/doc/source/install/compute-install-obs.rst
+++ b/doc/source/install/compute-install-obs.rst
@@ -92,6 +92,26 @@ Install and configure components
         Comment out or remove any other options in the ``[keystone_authtoken]``
         section.
 
+   * In the ``[service_user]`` section, configure :ref:`service user
+     tokens <service_user_token>`:
+
+     .. path /etc/nova/nova.conf
+     .. code-block:: ini
+
+        [service_user]
+        send_service_user_token = true
+        auth_url = https://controller/identity
+        auth_strategy = keystone
+        auth_type = password
+        project_domain_name = Default
+        project_name = service
+        user_domain_name = Default
+        username = nova
+        password = NOVA_PASS
+
+     Replace ``NOVA_PASS`` with the password you chose for the ``nova`` user in
+     the Identity service.
+
    * In the ``[DEFAULT]`` section, configure the ``my_ip`` option:
 
      .. path /etc/nova/nova.conf
diff --git a/doc/source/install/compute-install-rdo.rst b/doc/source/install/compute-install-rdo.rst
index 0a5ad685a62..0c6203a6673 100644
--- a/doc/source/install/compute-install-rdo.rst
+++ b/doc/source/install/compute-install-rdo.rst
@@ -84,6 +84,26 @@ Install and configure components
         Comment out or remove any other options in the ``[keystone_authtoken]``
         section.
 
+   * In the ``[service_user]`` section, configure :ref:`service user
+     tokens <service_user_token>`:
+
+     .. path /etc/nova/nova.conf
+     .. code-block:: ini
+
+        [service_user]
+        send_service_user_token = true
+        auth_url = https://controller/identity
+        auth_strategy = keystone
+        auth_type = password
+        project_domain_name = Default
+        project_name = service
+        user_domain_name = Default
+        username = nova
+        password = NOVA_PASS
+
+     Replace ``NOVA_PASS`` with the password you chose for the ``nova`` user in
+     the Identity service.
+
    * In the ``[DEFAULT]`` section, configure the ``my_ip`` option:
 
      .. path /etc/nova/nova.conf
diff --git a/doc/source/install/compute-install-ubuntu.rst b/doc/source/install/compute-install-ubuntu.rst
index 8605c73316e..baf0585e52b 100644
--- a/doc/source/install/compute-install-ubuntu.rst
+++ b/doc/source/install/compute-install-ubuntu.rst
@@ -74,6 +74,26 @@ Install and configure components
         Comment out or remove any other options in the
         ``[keystone_authtoken]`` section.
 
+   * In the ``[service_user]`` section, configure :ref:`service user
+     tokens <service_user_token>`:
+
+     .. path /etc/nova/nova.conf
+     .. code-block:: ini
+
+        [service_user]
+        send_service_user_token = true
+        auth_url = https://controller/identity
+        auth_strategy = keystone
+        auth_type = password
+        project_domain_name = Default
+        project_name = service
+        user_domain_name = Default
+        username = nova
+        password = NOVA_PASS
+
+     Replace ``NOVA_PASS`` with the password you chose for the ``nova`` user in
+     the Identity service.
+
    * In the ``[DEFAULT]`` section, configure the ``my_ip`` option:
 
      .. path /etc/nova/nova.conf
diff --git a/doc/source/install/controller-install-obs.rst b/doc/source/install/controller-install-obs.rst
index 18499612c3e..01b7bb0f5ab 100644
--- a/doc/source/install/controller-install-obs.rst
+++ b/doc/source/install/controller-install-obs.rst
@@ -260,6 +260,26 @@ Install and configure components
         Comment out or remove any other options in the ``[keystone_authtoken]``
         section.
 
+   * In the ``[service_user]`` section, configure :ref:`service user
+     tokens <service_user_token>`:
+
+     .. path /etc/nova/nova.conf
+     .. code-block:: ini
+
+        [service_user]
+        send_service_user_token = true
+        auth_url = https://controller/identity
+        auth_strategy = keystone
+        auth_type = password
+        project_domain_name = Default
+        project_name = service
+        user_domain_name = Default
+        username = nova
+        password = NOVA_PASS
+
+     Replace ``NOVA_PASS`` with the password you chose for the ``nova`` user in
+     the Identity service.
+
    * In the ``[DEFAULT]`` section, configure the ``my_ip`` option to use the
      management interface IP address of the controller node:
 
diff --git a/doc/source/install/controller-install-rdo.rst b/doc/source/install/controller-install-rdo.rst
index fd2419631ec..b6098f1776b 100644
--- a/doc/source/install/controller-install-rdo.rst
+++ b/doc/source/install/controller-install-rdo.rst
@@ -247,6 +247,26 @@ Install and configure components
         Comment out or remove any other options in the ``[keystone_authtoken]``
         section.
 
+   * In the ``[service_user]`` section, configure :ref:`service user
+     tokens <service_user_token>`:
+
+     .. path /etc/nova/nova.conf
+     .. code-block:: ini
+
+        [service_user]
+        send_service_user_token = true
+        auth_url = https://controller/identity
+        auth_strategy = keystone
+        auth_type = password
+        project_domain_name = Default
+        project_name = service
+        user_domain_name = Default
+        username = nova
+        password = NOVA_PASS
+
+     Replace ``NOVA_PASS`` with the password you chose for the ``nova`` user in
+     the Identity service.
+
    * In the ``[DEFAULT]`` section, configure the ``my_ip`` option to use the
      management interface IP address of the controller node:
 
diff --git a/doc/source/install/controller-install-ubuntu.rst b/doc/source/install/controller-install-ubuntu.rst
index 7282b0b2e22..1363a98ba8b 100644
--- a/doc/source/install/controller-install-ubuntu.rst
+++ b/doc/source/install/controller-install-ubuntu.rst
@@ -237,6 +237,26 @@ Install and configure components
         Comment out or remove any other options in the ``[keystone_authtoken]``
         section.
 
+   * In the ``[service_user]`` section, configure :ref:`service user
+     tokens <service_user_token>`:
+
+     .. path /etc/nova/nova.conf
+     .. code-block:: ini
+
+        [service_user]
+        send_service_user_token = true
+        auth_url = https://controller/identity
+        auth_strategy = keystone
+        auth_type = password
+        project_domain_name = Default
+        project_name = service
+        user_domain_name = Default
+        username = nova
+        password = NOVA_PASS
+
+     Replace ``NOVA_PASS`` with the password you chose for the ``nova`` user in
+     the Identity service.
+
    * In the ``[DEFAULT]`` section, configure the ``my_ip`` option to use the
      management interface IP address of the controller node:
 
diff --git a/nova/cmd/status.py b/nova/cmd/status.py
index 8a7041b062b..2f310f08714 100644
--- a/nova/cmd/status.py
+++ b/nova/cmd/status.py
@@ -336,6 +336,15 @@ def _check_machine_type_set(self):
 
         return upgradecheck.Result(upgradecheck.Code.SUCCESS)
 
+    def _check_service_user_token(self):
+        if not CONF.service_user.send_service_user_token:
+            msg = (_("""
+Service user token configuration is required for all Nova services.
+For more details see the following:
+https://docs.openstack.org/nova/latest/admin/configuration/service-user-token.html"""))  # noqa
+            return upgradecheck.Result(upgradecheck.Code.FAILURE, msg)
+        return upgradecheck.Result(upgradecheck.Code.SUCCESS)
+
     # The format of the check functions is to return an upgradecheck.Result
     # object with the appropriate upgradecheck.Code and details set. If the
     # check hits warnings or failures then those should be stored in the
@@ -361,6 +370,8 @@ def _check_machine_type_set(self):
         (_('Older than N-1 computes'), _check_old_computes),
         # Added in Wallaby
         (_('hw_machine_type unset'), _check_machine_type_set),
+        # Added in Bobcat
+        (_('Service User Token Configuration'), _check_service_user_token),
     )
 
 
diff --git a/nova/tests/unit/cmd/test_status.py b/nova/tests/unit/cmd/test_status.py
index ba85590697e..2d33c890b77 100644
--- a/nova/tests/unit/cmd/test_status.py
+++ b/nova/tests/unit/cmd/test_status.py
@@ -502,3 +502,19 @@ def test_instances_not_found_without_hw_machine_type(self):
             upgradecheck.Code.SUCCESS,
             result.code
         )
+
+
+class TestUpgradeCheckServiceUserToken(test.NoDBTestCase):
+
+    def setUp(self):
+        super().setUp()
+        self.cmd = status.UpgradeCommands()
+
+    def test_service_user_token_not_configured(self):
+        result = self.cmd._check_service_user_token()
+        self.assertEqual(upgradecheck.Code.FAILURE, result.code)
+
+    def test_service_user_token_configured(self):
+        self.flags(send_service_user_token=True, group='service_user')
+        result = self.cmd._check_service_user_token()
+        self.assertEqual(upgradecheck.Code.SUCCESS, result.code)
diff --git a/nova/tests/unit/virt/hyperv/test_vmops.py b/nova/tests/unit/virt/hyperv/test_vmops.py
index 1a71045ea27..0110b595c7e 100644
--- a/nova/tests/unit/virt/hyperv/test_vmops.py
+++ b/nova/tests/unit/virt/hyperv/test_vmops.py
@@ -1129,7 +1129,7 @@ def test_destroy(self, mock_unplug_vifs, mock_power_off,
         mock_unplug_vifs.assert_called_once_with(
             mock_instance, mock.sentinel.fake_network_info)
         mock_disconnect_volumes.assert_called_once_with(
-            mock.sentinel.FAKE_BD_INFO)
+            mock.sentinel.FAKE_BD_INFO, force=True)
         mock_delete_disk_files.assert_called_once_with(
             mock_instance.name)
 
diff --git a/nova/tests/unit/virt/hyperv/test_volumeops.py b/nova/tests/unit/virt/hyperv/test_volumeops.py
index da7262085d8..4a088b6030e 100644
--- a/nova/tests/unit/virt/hyperv/test_volumeops.py
+++ b/nova/tests/unit/virt/hyperv/test_volumeops.py
@@ -140,7 +140,13 @@ def test_disconnect_volumes(self, mock_get_volume_driver):
 
         self._volumeops.disconnect_volumes(block_device_info)
         fake_volume_driver.disconnect_volume.assert_called_once_with(
-            block_device_mapping[0]['connection_info'])
+            block_device_mapping[0]['connection_info'], force=False)
+
+        # Verify force=True
+        fake_volume_driver.disconnect_volume.reset_mock()
+        self._volumeops.disconnect_volumes(block_device_info, force=True)
+        fake_volume_driver.disconnect_volume.assert_called_once_with(
+            block_device_mapping[0]['connection_info'], force=True)
 
     @mock.patch('time.sleep')
     @mock.patch.object(volumeops.VolumeOps, '_get_volume_driver')
@@ -180,7 +186,7 @@ def _test_attach_volume(self, mock_get_volume_driver, mock_sleep,
 
         if attach_failed:
             fake_volume_driver.disconnect_volume.assert_called_once_with(
-                fake_conn_info)
+                fake_conn_info, force=False)
             mock_sleep.assert_has_calls(
                 [mock.call(CONF.hyperv.volume_attach_retry_interval)] *
                     CONF.hyperv.volume_attach_retry_count)
@@ -202,7 +208,13 @@ def test_disconnect_volume(self, mock_get_volume_driver):
         mock_get_volume_driver.assert_called_once_with(
             mock.sentinel.conn_info)
         fake_volume_driver.disconnect_volume.assert_called_once_with(
-            mock.sentinel.conn_info)
+            mock.sentinel.conn_info, force=False)
+
+        # Verify force=True
+        fake_volume_driver.disconnect_volume.reset_mock()
+        self._volumeops.disconnect_volume(mock.sentinel.conn_info, force=True)
+        fake_volume_driver.disconnect_volume.assert_called_once_with(
+            mock.sentinel.conn_info, force=True)
 
     @mock.patch.object(volumeops.VolumeOps, '_get_volume_driver')
     def test_detach_volume(self, mock_get_volume_driver):
@@ -346,7 +358,13 @@ def test_disconnect_volume(self):
         self._base_vol_driver.disconnect_volume(conn_info)
 
         self._conn.disconnect_volume.assert_called_once_with(
-            conn_info['data'])
+            conn_info['data'], force=False)
+
+        # Verify force=True
+        self._conn.disconnect_volume.reset_mock()
+        self._base_vol_driver.disconnect_volume(conn_info, force=True)
+        self._conn.disconnect_volume.assert_called_once_with(
+            conn_info['data'], force=True)
 
     @mock.patch.object(volumeops.BaseVolumeDriver, '_get_disk_res_path')
     def _test_get_disk_resource_path_by_conn_info(self,
diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py
index 33d16851b4c..0eada9ee140 100644
--- a/nova/tests/unit/virt/libvirt/test_driver.py
+++ b/nova/tests/unit/virt/libvirt/test_driver.py
@@ -9222,7 +9222,7 @@ def test_disconnect_multiattach_single_connection(
         drvr._disconnect_volume(
             self.context, fake_connection_info, fake_instance_1)
         mock_volume_driver.disconnect_volume.assert_called_once_with(
-            fake_connection_info, fake_instance_1)
+            fake_connection_info, fake_instance_1, force=False)
 
     @mock.patch.object(libvirt_driver.LibvirtDriver, '_detach_encryptor')
     @mock.patch('nova.objects.InstanceList.get_uuids_by_host')
@@ -9596,7 +9596,12 @@ def test_detach_volume_order_with_encryptors(self, mock_get_guest,
                 device_name='vdc',
             ),
             mock.call.detach_encryptor(**encryption),
-            mock.call.disconnect_volume(connection_info, instance)])
+            mock.call.disconnect_volume(
+                connection_info,
+                instance,
+                force=False,
+            )
+        ])
         get_device_conf_func = mock_detach_with_retry.mock_calls[0][1][2]
         self.assertEqual(mock_guest.get_disk, get_device_conf_func.func)
         self.assertEqual(('vdc',), get_device_conf_func.args)
@@ -19811,16 +19816,64 @@ def test_cleanup_destroy_secrets(self, mock_disconnect_volume):
                 self.context,
                 mock.sentinel.connection_info,
                 instance,
-                destroy_secrets=False
+                destroy_secrets=False,
+                force=True
             ),
             mock.call(
                 self.context,
                 mock.sentinel.connection_info,
                 instance,
-                destroy_secrets=True
+                destroy_secrets=True,
+                force=True
             )
         ])
 
+    @mock.patch('nova.virt.libvirt.driver.LibvirtDriver._get_volume_driver')
+    @mock.patch(
+        'nova.virt.libvirt.driver.LibvirtDriver._should_disconnect_target',
+        new=mock.Mock(return_value=True))
+    @mock.patch('nova.virt.libvirt.driver.LibvirtDriver._detach_encryptor',
+                new=mock.Mock())
+    @mock.patch('nova.virt.libvirt.driver.LibvirtDriver._undefine_domain',
+                new=mock.Mock())
+    @mock.patch('nova.virt.libvirt.driver.LibvirtDriver._get_vpmems',
+                new=mock.Mock(return_value=None))
+    def test_cleanup_disconnect_volume(self, mock_vol_driver):
+        """Verify that we call disconnect_volume() with force=True
+
+        cleanup() is called by destroy() when an instance is being deleted and
+        force=True should be passed down to os-brick's disconnect_volume()
+        call, which will ensure removal of devices regardless of errors.
+
+        We need to ensure that devices are removed when an instance is being
+        deleted to avoid leaving leftover devices that could later be
+        erroneously connected by external entities (example: multipathd) to
+        instances that should not have access to the volumes.
+
+        See https://bugs.launchpad.net/nova/+bug/2004555 for details.
+        """
+        connection_info = mock.MagicMock()
+        block_device_info = {
+            'block_device_mapping': [
+                {
+                    'connection_info': connection_info
+                }
+            ]
+        }
+        instance = objects.Instance(self.context, **self.test_instance)
+        drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI())
+
+        drvr.cleanup(
+            self.context,
+            instance,
+            network_info={},
+            block_device_info=block_device_info,
+            destroy_vifs=False,
+            destroy_disks=False,
+        )
+        mock_vol_driver.return_value.disconnect_volume.assert_called_once_with(
+            connection_info, instance, force=True)
+
     @mock.patch.object(libvirt_driver.LibvirtDriver, '_get_volume_encryption')
     @mock.patch.object(libvirt_driver.LibvirtDriver, '_allow_native_luksv1')
     def test_swap_volume_native_luks_blocked(self, mock_allow_native_luksv1,
diff --git a/nova/tests/unit/virt/libvirt/volume/test_fibrechannel.py b/nova/tests/unit/virt/libvirt/volume/test_fibrechannel.py
index 89a59f2f1ab..f0d403e3004 100644
--- a/nova/tests/unit/virt/libvirt/volume/test_fibrechannel.py
+++ b/nova/tests/unit/virt/libvirt/volume/test_fibrechannel.py
@@ -81,3 +81,23 @@ def test_extend_volume(self):
         self.assertEqual(requested_size, new_size)
         libvirt_driver.connector.extend_volume.assert_called_once_with(
            connection_info['data'])
+
+    def test_disconnect_volume(self):
+        device_path = '/dev/fake-dev'
+        connection_info = {'data': {'device_path': device_path}}
+
+        libvirt_driver = fibrechannel.LibvirtFibreChannelVolumeDriver(
+                                                                self.fake_host)
+        libvirt_driver.connector.disconnect_volume = mock.MagicMock()
+        libvirt_driver.disconnect_volume(
+            connection_info, mock.sentinel.instance)
+
+        libvirt_driver.connector.disconnect_volume.assert_called_once_with(
+            connection_info['data'], connection_info['data'], force=False)
+
+        # Verify force=True
+        libvirt_driver.connector.disconnect_volume.reset_mock()
+        libvirt_driver.disconnect_volume(
+            connection_info, mock.sentinel.instance, force=True)
+        libvirt_driver.connector.disconnect_volume.assert_called_once_with(
+            connection_info['data'], connection_info['data'], force=True)
diff --git a/nova/tests/unit/virt/libvirt/volume/test_iscsi.py b/nova/tests/unit/virt/libvirt/volume/test_iscsi.py
index f8a64abea5f..540c9c822d0 100644
--- a/nova/tests/unit/virt/libvirt/volume/test_iscsi.py
+++ b/nova/tests/unit/virt/libvirt/volume/test_iscsi.py
@@ -57,10 +57,19 @@ def test_libvirt_iscsi_driver_disconnect_volume_with_devicenotfound(self,
                 device=device_path))
         libvirt_driver.disconnect_volume(connection_info,
                                          mock.sentinel.instance)
+        libvirt_driver.connector.disconnect_volume.assert_called_once_with(
+            connection_info['data'], None, force=False)
 
         msg = mock_LOG_warning.call_args_list[0]
         self.assertIn('Ignoring VolumeDeviceNotFound', msg[0][0])
 
+        # Verify force=True
+        libvirt_driver.connector.disconnect_volume.reset_mock()
+        libvirt_driver.disconnect_volume(
+            connection_info, mock.sentinel.instance, force=True)
+        libvirt_driver.connector.disconnect_volume.assert_called_once_with(
+            connection_info['data'], None, force=True)
+
     def test_extend_volume(self):
         device_path = '/dev/fake-dev'
         connection_info = {'data': {'device_path': device_path}}
diff --git a/nova/tests/unit/virt/libvirt/volume/test_lightos.py b/nova/tests/unit/virt/libvirt/volume/test_lightos.py
index 67fead13df4..1eb9583d4cf 100644
--- a/nova/tests/unit/virt/libvirt/volume/test_lightos.py
+++ b/nova/tests/unit/virt/libvirt/volume/test_lightos.py
@@ -62,7 +62,13 @@ def test_libvirt_lightos_driver_disconnect(self):
         connection_info = {'data': disk_info}
         lightos_driver.disconnect_volume(connection_info, None)
         lightos_driver.connector.disconnect_volume.assert_called_once_with(
-            disk_info, None)
+            disk_info, None, force=False)
+
+        # Verify force=True
+        lightos_driver.connector.disconnect_volume.reset_mock()
+        lightos_driver.disconnect_volume(connection_info, None, force=True)
+        lightos_driver.connector.disconnect_volume.assert_called_once_with(
+            disk_info, None, force=True)
 
     @mock.patch('os_brick.initiator.connector.InitiatorConnector.factory',
         new=mock.Mock(return_value=mock.Mock()))
diff --git a/nova/tests/unit/virt/libvirt/volume/test_nvme.py b/nova/tests/unit/virt/libvirt/volume/test_nvme.py
index 5159f3aaf60..2803903e9fc 100644
--- a/nova/tests/unit/virt/libvirt/volume/test_nvme.py
+++ b/nova/tests/unit/virt/libvirt/volume/test_nvme.py
@@ -77,7 +77,13 @@ def test_libvirt_nvme_driver_disconnect(self):
         connection_info = {'data': disk_info}
         nvme_driver.disconnect_volume(connection_info, None)
         nvme_driver.connector.disconnect_volume.assert_called_once_with(
-            disk_info, None)
+            disk_info, None, force=False)
+
+        # Verify force=True
+        nvme_driver.connector.disconnect_volume.reset_mock()
+        nvme_driver.disconnect_volume(connection_info, None, force=True)
+        nvme_driver.connector.disconnect_volume.assert_called_once_with(
+            disk_info, None, force=True)
 
     @mock.patch('os_brick.initiator.connector.InitiatorConnector.factory',
         new=mock.Mock(return_value=mock.Mock()))
diff --git a/nova/tests/unit/virt/libvirt/volume/test_scaleio.py b/nova/tests/unit/virt/libvirt/volume/test_scaleio.py
index 6d9247cd2d9..ed5ab08a6e6 100644
--- a/nova/tests/unit/virt/libvirt/volume/test_scaleio.py
+++ b/nova/tests/unit/virt/libvirt/volume/test_scaleio.py
@@ -49,7 +49,13 @@ def test_libvirt_scaleio_driver_disconnect(self):
         conn = {'data': mock.sentinel.conn_data}
         sio.disconnect_volume(conn, mock.sentinel.instance)
         sio.connector.disconnect_volume.assert_called_once_with(
-            mock.sentinel.conn_data, None)
+            mock.sentinel.conn_data, None, force=False)
+
+        # Verify force=True
+        sio.connector.disconnect_volume.reset_mock()
+        sio.disconnect_volume(conn, mock.sentinel.instance, force=True)
+        sio.connector.disconnect_volume.assert_called_once_with(
+            mock.sentinel.conn_data, None, force=True)
 
     @mock.patch('os_brick.initiator.connector.InitiatorConnector.factory',
         new=mock.Mock(return_value=mock.Mock()))
diff --git a/nova/tests/unit/virt/libvirt/volume/test_storpool.py b/nova/tests/unit/virt/libvirt/volume/test_storpool.py
index e14954f1487..9ceac072602 100644
--- a/nova/tests/unit/virt/libvirt/volume/test_storpool.py
+++ b/nova/tests/unit/virt/libvirt/volume/test_storpool.py
@@ -53,9 +53,11 @@ def connect_volume(self, connection_info):
         }
         return {'type': 'block', 'path': test_attached[v]['path']}
 
-    def disconnect_volume(self, connection_info, device_info):
+    def disconnect_volume(self, connection_info, device_info, **kwargs):
         self.inst.assertIn('client_id', connection_info)
         self.inst.assertIn('volume', connection_info)
+        self.inst.assertIn('force', kwargs)
+        self.inst.assertEqual(self.inst.force, kwargs.get('force'))
 
         v = connection_info['volume']
         if v not in test_attached:
@@ -86,6 +88,11 @@ def factory(self, proto, helper):
 class LibvirtStorPoolVolumeDriverTestCase(
         test_volume.LibvirtVolumeBaseTestCase):
 
+    def setUp(self):
+        super().setUp()
+        # This is for testing the force flag of disconnect_volume()
+        self.force = False
+
     def mock_storpool(f):
         def _config_inner_inner1(inst, *args, **kwargs):
             @mock.patch(
@@ -175,3 +182,10 @@ def test_storpool_attach_detach_extend(self):
 
         libvirt_driver.disconnect_volume(ci_2, mock.sentinel.instance)
         self.assertDictEqual({}, test_attached)
+
+        # Connect the volume again so we can detach it again
+        libvirt_driver.connect_volume(ci_2, mock.sentinel.instance)
+        # Verify force=True
+        self.force = True
+        libvirt_driver.disconnect_volume(
+            ci_2, mock.sentinel.instance, force=True)
diff --git a/nova/tests/unit/virt/libvirt/volume/test_vzstorage.py b/nova/tests/unit/virt/libvirt/volume/test_vzstorage.py
index 883cebb55a1..032ceb4fe59 100644
--- a/nova/tests/unit/virt/libvirt/volume/test_vzstorage.py
+++ b/nova/tests/unit/virt/libvirt/volume/test_vzstorage.py
@@ -95,7 +95,13 @@ def test_libvirt_vzstorage_driver_disconnect(self):
         conn = {'data': mock.sentinel.conn_data}
         drv.disconnect_volume(conn, mock.sentinel.instance)
         drv.connector.disconnect_volume.assert_called_once_with(
-            mock.sentinel.conn_data, None)
+            mock.sentinel.conn_data, None, force=False)
+
+        # Verify force=True
+        drv.connector.disconnect_volume.reset_mock()
+        drv.disconnect_volume(conn, mock.sentinel.instance, force=True)
+        drv.connector.disconnect_volume.assert_called_once_with(
+            mock.sentinel.conn_data, None, force=True)
 
     def test_libvirt_vzstorage_driver_get_config(self):
         libvirt_driver = vzstorage.LibvirtVZStorageVolumeDriver(self.fake_host)
diff --git a/nova/virt/hyperv/vmops.py b/nova/virt/hyperv/vmops.py
index 3ec7e90c306..08adeada761 100644
--- a/nova/virt/hyperv/vmops.py
+++ b/nova/virt/hyperv/vmops.py
@@ -747,7 +747,7 @@ def destroy(self, instance, network_info, block_device_info,
             # should be disconnected even if the VM doesn't exist anymore,
             # so they are not leaked.
             self.unplug_vifs(instance, network_info)
-            self._volumeops.disconnect_volumes(block_device_info)
+            self._volumeops.disconnect_volumes(block_device_info, force=True)
 
             if destroy_disks:
                 self._delete_disk_files(instance_name)
diff --git a/nova/virt/hyperv/volumeops.py b/nova/virt/hyperv/volumeops.py
index da5b40f3751..d2bfed2441e 100644
--- a/nova/virt/hyperv/volumeops.py
+++ b/nova/virt/hyperv/volumeops.py
@@ -59,10 +59,10 @@ def attach_volumes(self, volumes, instance_name):
         for vol in volumes:
             self.attach_volume(vol['connection_info'], instance_name)
 
-    def disconnect_volumes(self, block_device_info):
+    def disconnect_volumes(self, block_device_info, force=False):
         mapping = driver.block_device_info_get_mapping(block_device_info)
         for vol in mapping:
-            self.disconnect_volume(vol['connection_info'])
+            self.disconnect_volume(vol['connection_info'], force=force)
 
     def attach_volume(self, connection_info, instance_name,
                       disk_bus=constants.CTRL_TYPE_SCSI):
@@ -116,9 +116,9 @@ def _attach_volume(self, connection_info, instance_name,
             volume_driver.set_disk_qos_specs(connection_info,
                                              qos_specs)
 
-    def disconnect_volume(self, connection_info):
+    def disconnect_volume(self, connection_info, force=False):
         volume_driver = self._get_volume_driver(connection_info)
-        volume_driver.disconnect_volume(connection_info)
+        volume_driver.disconnect_volume(connection_info, force=force)
 
     def detach_volume(self, connection_info, instance_name):
         LOG.debug("Detaching volume: %(connection_info)s "
@@ -231,8 +231,8 @@ def _connector(self):
     def connect_volume(self, connection_info):
         return self._connector.connect_volume(connection_info['data'])
 
-    def disconnect_volume(self, connection_info):
-        self._connector.disconnect_volume(connection_info['data'])
+    def disconnect_volume(self, connection_info, force=False):
+        self._connector.disconnect_volume(connection_info['data'], force=force)
 
     def get_disk_resource_path(self, connection_info):
         disk_paths = self._connector.get_volume_paths(connection_info['data'])
diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py
index 542943282fd..615a009e062 100644
--- a/nova/virt/libvirt/driver.py
+++ b/nova/virt/libvirt/driver.py
@@ -1639,7 +1639,7 @@ def _cleanup(self, context, instance, network_info, block_device_info=None,
             try:
                 self._disconnect_volume(
                     context, connection_info, instance,
-                    destroy_secrets=destroy_secrets)
+                    destroy_secrets=destroy_secrets, force=True)
             except Exception as exc:
                 with excutils.save_and_reraise_exception() as ctxt:
                     if cleanup_instance_disks:
@@ -1956,7 +1956,7 @@ def _should_disconnect_target(self, context, instance, multiattach,
         return (False if connection_count > 1 else True)
 
     def _disconnect_volume(self, context, connection_info, instance,
-                           encryption=None, destroy_secrets=True):
+                           encryption=None, destroy_secrets=True, force=False):
         self._detach_encryptor(
             context,
             connection_info,
@@ -1968,7 +1968,8 @@ def _disconnect_volume(self, context, connection_info, instance,
         multiattach = connection_info.get('multiattach', False)
         if self._should_disconnect_target(
                 context, instance, multiattach, vol_driver, volume_id):
-            vol_driver.disconnect_volume(connection_info, instance)
+            vol_driver.disconnect_volume(
+                connection_info, instance, force=force)
         else:
             LOG.info('Detected multiple connections on this host for '
                      'volume: %(volume)s, skipping target disconnect.',
diff --git a/nova/virt/libvirt/volume/fibrechannel.py b/nova/virt/libvirt/volume/fibrechannel.py
index b50db3aa1c0..1f890c95c12 100644
--- a/nova/virt/libvirt/volume/fibrechannel.py
+++ b/nova/virt/libvirt/volume/fibrechannel.py
@@ -59,7 +59,7 @@ def connect_volume(self, connection_info, instance):
             connection_info['data']['multipath_id'] = \
                 device_info['multipath_id']
 
-    def disconnect_volume(self, connection_info, instance):
+    def disconnect_volume(self, connection_info, instance, force=False):
         """Detach the volume from instance_name."""
 
         LOG.debug("calling os-brick to detach FC Volume", instance=instance)
@@ -69,11 +69,12 @@ def disconnect_volume(self, connection_info, instance):
         # the 2nd param of disconnect_volume and be consistent
         # with the rest of the connectors.
         self.connector.disconnect_volume(connection_info['data'],
-                                         connection_info['data'])
+                                         connection_info['data'],
+                                         force=force)
         LOG.debug("Disconnected FC Volume", instance=instance)
 
         super(LibvirtFibreChannelVolumeDriver,
-              self).disconnect_volume(connection_info, instance)
+              self).disconnect_volume(connection_info, instance, force=force)
 
     def extend_volume(self, connection_info, instance, requested_size):
         """Extend the volume."""
diff --git a/nova/virt/libvirt/volume/fs.py b/nova/virt/libvirt/volume/fs.py
index 5fb9af4a520..992ef45016e 100644
--- a/nova/virt/libvirt/volume/fs.py
+++ b/nova/virt/libvirt/volume/fs.py
@@ -116,7 +116,7 @@ def connect_volume(self, connection_info, instance):
         connection_info['data']['device_path'] = \
             self._get_device_path(connection_info)
 
-    def disconnect_volume(self, connection_info, instance):
+    def disconnect_volume(self, connection_info, instance, force=False):
         """Disconnect the volume."""
         vol_name = connection_info['data']['name']
         mountpoint = self._get_mount_path(connection_info)
diff --git a/nova/virt/libvirt/volume/iscsi.py b/nova/virt/libvirt/volume/iscsi.py
index 564bac14cc7..2b25972a495 100644
--- a/nova/virt/libvirt/volume/iscsi.py
+++ b/nova/virt/libvirt/volume/iscsi.py
@@ -66,19 +66,20 @@ def connect_volume(self, connection_info, instance):
 
         connection_info['data']['device_path'] = device_info['path']
 
-    def disconnect_volume(self, connection_info, instance):
+    def disconnect_volume(self, connection_info, instance, force=False):
         """Detach the volume from instance_name."""
 
         LOG.debug("calling os-brick to detach iSCSI Volume", instance=instance)
         try:
-            self.connector.disconnect_volume(connection_info['data'], None)
+            self.connector.disconnect_volume(
+                connection_info['data'], None, force=force)
         except os_brick_exception.VolumeDeviceNotFound as exc:
             LOG.warning('Ignoring VolumeDeviceNotFound: %s', exc)
             return
         LOG.debug("Disconnected iSCSI Volume", instance=instance)
 
         super(LibvirtISCSIVolumeDriver,
-              self).disconnect_volume(connection_info, instance)
+              self).disconnect_volume(connection_info, instance, force=force)
 
     def extend_volume(self, connection_info, instance, requested_size):
         """Extend the volume."""
diff --git a/nova/virt/libvirt/volume/lightos.py b/nova/virt/libvirt/volume/lightos.py
index d6d393994e5..6a22bf6dc63 100644
--- a/nova/virt/libvirt/volume/lightos.py
+++ b/nova/virt/libvirt/volume/lightos.py
@@ -42,14 +42,15 @@ def connect_volume(self, connection_info, instance):
         LOG.debug("Connecting NVMe volume with device_info %s", device_info)
         connection_info['data']['device_path'] = device_info['path']
 
-    def disconnect_volume(self, connection_info, instance):
+    def disconnect_volume(self, connection_info, instance, force=False):
         """Detach the volume from the instance."""
         LOG.debug("Disconnecting NVMe disk. instance:%s, volume_id:%s",
                   connection_info.get("instance", ""),
                   connection_info.get("volume_id", ""))
-        self.connector.disconnect_volume(connection_info['data'], None)
+        self.connector.disconnect_volume(
+            connection_info['data'], None, force=force)
         super(LibvirtLightOSVolumeDriver, self).disconnect_volume(
-            connection_info, instance)
+            connection_info, instance, force=force)
 
     def extend_volume(self, connection_info, instance, requested_size=None):
         """Extend the volume."""
diff --git a/nova/virt/libvirt/volume/nvme.py b/nova/virt/libvirt/volume/nvme.py
index 74365528122..e2977c3572b 100644
--- a/nova/virt/libvirt/volume/nvme.py
+++ b/nova/virt/libvirt/volume/nvme.py
@@ -45,13 +45,13 @@ def connect_volume(self, connection_info, instance):
 
         connection_info['data']['device_path'] = device_info['path']
 
-    def disconnect_volume(self, connection_info, instance):
+    def disconnect_volume(self, connection_info, instance, force=False):
         """Detach the volume from the instance."""
         LOG.debug("Disconnecting NVMe disk", instance=instance)
         self.connector.disconnect_volume(
-            connection_info['data'], None)
+            connection_info['data'], None, force=force)
         super(LibvirtNVMEVolumeDriver,
-              self).disconnect_volume(connection_info, instance)
+              self).disconnect_volume(connection_info, instance, force=force)
 
     def extend_volume(self, connection_info, instance, requested_size):
         """Extend the volume."""
diff --git a/nova/virt/libvirt/volume/quobyte.py b/nova/virt/libvirt/volume/quobyte.py
index bb7a770e57e..2eb4bcfb428 100644
--- a/nova/virt/libvirt/volume/quobyte.py
+++ b/nova/virt/libvirt/volume/quobyte.py
@@ -189,7 +189,7 @@ def connect_volume(self, connection_info, instance):
                       instance=instance)
 
     @utils.synchronized('connect_qb_volume')
-    def disconnect_volume(self, connection_info, instance):
+    def disconnect_volume(self, connection_info, instance, force=False):
         """Disconnect the volume."""
 
         mount_path = self._get_mount_path(connection_info)
diff --git a/nova/virt/libvirt/volume/scaleio.py b/nova/virt/libvirt/volume/scaleio.py
index 7c414c2870f..04a9423e8ea 100644
--- a/nova/virt/libvirt/volume/scaleio.py
+++ b/nova/virt/libvirt/volume/scaleio.py
@@ -57,12 +57,13 @@ def connect_volume(self, connection_info, instance):
                   instance=instance)
         connection_info['data']['device_path'] = device_info['path']
 
-    def disconnect_volume(self, connection_info, instance):
-        self.connector.disconnect_volume(connection_info['data'], None)
+    def disconnect_volume(self, connection_info, instance, force=False):
+        self.connector.disconnect_volume(
+            connection_info['data'], None, force=force)
         LOG.debug("Disconnected volume", instance=instance)
 
         super(LibvirtScaleIOVolumeDriver, self).disconnect_volume(
-            connection_info, instance)
+            connection_info, instance, force=force)
 
     def extend_volume(self, connection_info, instance, requested_size):
         LOG.debug("calling os-brick to extend ScaleIO Volume",
diff --git a/nova/virt/libvirt/volume/smbfs.py b/nova/virt/libvirt/volume/smbfs.py
index d112af750cb..9de1ce23cd3 100644
--- a/nova/virt/libvirt/volume/smbfs.py
+++ b/nova/virt/libvirt/volume/smbfs.py
@@ -52,7 +52,7 @@ def connect_volume(self, connection_info, instance):
         device_path = self._get_device_path(connection_info)
         connection_info['data']['device_path'] = device_path
 
-    def disconnect_volume(self, connection_info, instance):
+    def disconnect_volume(self, connection_info, instance, force=False):
         """Disconnect the volume."""
         smbfs_share = connection_info['data']['export']
         mount_path = self._get_mount_path(connection_info)
diff --git a/nova/virt/libvirt/volume/storpool.py b/nova/virt/libvirt/volume/storpool.py
index 0e71221f5b2..e6dffca39a6 100644
--- a/nova/virt/libvirt/volume/storpool.py
+++ b/nova/virt/libvirt/volume/storpool.py
@@ -47,10 +47,11 @@ def connect_volume(self, connection_info, instance):
                   device_info, instance=instance)
         connection_info['data']['device_path'] = device_info['path']
 
-    def disconnect_volume(self, connection_info, instance):
+    def disconnect_volume(self, connection_info, instance, force=False):
         LOG.debug("Detaching StorPool volume %s",
                   connection_info['data']['volume'], instance=instance)
-        self.connector.disconnect_volume(connection_info['data'], None)
+        self.connector.disconnect_volume(
+            connection_info['data'], None, force=force)
         LOG.debug("Detached StorPool volume", instance=instance)
 
     def extend_volume(self, connection_info, instance, requested_size):
diff --git a/nova/virt/libvirt/volume/volume.py b/nova/virt/libvirt/volume/volume.py
index 6d650c80e64..f76c3618b27 100644
--- a/nova/virt/libvirt/volume/volume.py
+++ b/nova/virt/libvirt/volume/volume.py
@@ -135,7 +135,7 @@ def connect_volume(self, connection_info, instance):
         """Connect the volume."""
         pass
 
-    def disconnect_volume(self, connection_info, instance):
+    def disconnect_volume(self, connection_info, instance, force=False):
         """Disconnect the volume."""
         pass
 
diff --git a/nova/virt/libvirt/volume/vzstorage.py b/nova/virt/libvirt/volume/vzstorage.py
index 85ffb450765..babfdef55c6 100644
--- a/nova/virt/libvirt/volume/vzstorage.py
+++ b/nova/virt/libvirt/volume/vzstorage.py
@@ -126,9 +126,10 @@ def _connect_volume(connection_info, instance):
 
         return _connect_volume(connection_info, instance)
 
-    def disconnect_volume(self, connection_info, instance):
+    def disconnect_volume(self, connection_info, instance, force=False):
         """Detach the volume from instance_name."""
         LOG.debug("calling os-brick to detach Vzstorage Volume",
                 instance=instance)
-        self.connector.disconnect_volume(connection_info['data'], None)
+        self.connector.disconnect_volume(
+            connection_info['data'], None, force=force)
         LOG.debug("Disconnected Vzstorage Volume", instance=instance)
diff --git a/releasenotes/notes/service-user-token-421d067c16257782.yaml b/releasenotes/notes/service-user-token-421d067c16257782.yaml
new file mode 100644
index 00000000000..d3af14fbb85
--- /dev/null
+++ b/releasenotes/notes/service-user-token-421d067c16257782.yaml
@@ -0,0 +1,11 @@
+upgrade:
+  - |
+    Configuration of service user tokens is now **required** for all Nova services
+    to ensure security of block-storage volume data.
+
+    All Nova configuration files must configure the ``[service_user]`` section as
+    described in the `documentation`__.
+
+    See https://bugs.launchpad.net/nova/+bug/2004555 for more details.
+
+    __ https://docs.openstack.org/nova/latest/admin/configuration/service-user-token.html

From 98c3e3707c08a07f7ca5996086b165512f604ad6 Mon Sep 17 00:00:00 2001
From: melanie witt <melwittt@gmail.com>
Date: Tue, 9 May 2023 03:11:25 +0000
Subject: [PATCH 56/93] Enable use of service user token with admin context

When the [service_user] section is configured in nova.conf, nova will
have the ability to send a service user token alongside the user's
token. The service user token is sent when nova calls other services'
REST APIs to authenticate as a service, and service calls can sometimes
have elevated privileges.

Currently, however, nova does not have the ability to send a service user
token with an admin context. This means that when nova makes REST API
calls to other services with an anonymous admin RequestContext (such as
in nova-manage or periodic tasks), it will not be authenticated as a
service.

This adds a keyword argument to service_auth.get_auth_plugin() to
enable callers to provide a user_auth object instead of attempting to
extract the user_auth from the RequestContext.

The cinder and neutron client modules are also adjusted to make use of
the new user_auth keyword argument so that nova calls made with
anonymous admin request contexts can authenticate as a service when
configured.

Related-Bug: #2004555

Change-Id: I14df2d55f4b2f0be58f1a6ad3f19e48f7a6bfcb4
(cherry picked from commit 41c64b94b0af333845e998f6cc195e72ca5ab6bc)
(cherry picked from commit 1f781423ee4224c0871ab4aafec191bb2f7ef0e4)
(cherry picked from commit 0d6dd6c67f56c9d4ed36246d14f119da6bca0a5a)
---
 nova/network/neutron.py                 |  8 +++++---
 nova/service_auth.py                    |  6 ++++--
 nova/tests/unit/network/test_neutron.py | 16 ++++++++++++++++
 nova/tests/unit/test_service_auth.py    | 10 ++++++++++
 nova/tests/unit/volume/test_cinder.py   | 11 +++++++++++
 nova/volume/cinder.py                   |  8 +++++---
 6 files changed, 51 insertions(+), 8 deletions(-)

diff --git a/nova/network/neutron.py b/nova/network/neutron.py
index 1e703658f87..faf455d9b86 100644
--- a/nova/network/neutron.py
+++ b/nova/network/neutron.py
@@ -223,13 +223,15 @@ def _get_auth_plugin(context, admin=False):
     # support some services (metadata API) where an admin context is used
     # without an auth token.
     global _ADMIN_AUTH
+    user_auth = None
     if admin or (context.is_admin and not context.auth_token):
         if not _ADMIN_AUTH:
             _ADMIN_AUTH = _load_auth_plugin(CONF)
-        return _ADMIN_AUTH
+        user_auth = _ADMIN_AUTH
 
-    if context.auth_token:
-        return service_auth.get_auth_plugin(context)
+    if context.auth_token or user_auth:
+        # When user_auth = None, user_auth will be extracted from the context.
+        return service_auth.get_auth_plugin(context, user_auth=user_auth)
 
     # We did not get a user token and we should not be using
     # an admin token so log an error
diff --git a/nova/service_auth.py b/nova/service_auth.py
index f5ae0646d8a..aa8fd8fa123 100644
--- a/nova/service_auth.py
+++ b/nova/service_auth.py
@@ -30,8 +30,10 @@ def reset_globals():
     _SERVICE_AUTH = None
 
 
-def get_auth_plugin(context):
-    user_auth = context.get_auth_plugin()
+def get_auth_plugin(context, user_auth=None):
+    # user_auth may be passed in when the RequestContext is anonymous, such as
+    # when get_admin_context() is used for API calls by nova-manage.
+    user_auth = user_auth or context.get_auth_plugin()
 
     if CONF.service_user.send_service_user_token:
         global _SERVICE_AUTH
diff --git a/nova/tests/unit/network/test_neutron.py b/nova/tests/unit/network/test_neutron.py
index 5cde8d482d3..458777c3a36 100644
--- a/nova/tests/unit/network/test_neutron.py
+++ b/nova/tests/unit/network/test_neutron.py
@@ -143,6 +143,22 @@ def test_non_admin_with_service_token(self, mock_load):
         self.assertIsInstance(cl.httpclient.auth,
                               service_token.ServiceTokenAuthWrapper)
 
+    @mock.patch('nova.service_auth._SERVICE_AUTH')
+    @mock.patch('nova.network.neutron._ADMIN_AUTH')
+    @mock.patch.object(ks_loading, 'load_auth_from_conf_options')
+    def test_admin_with_service_token(
+        self, mock_load, mock_admin_auth, mock_service_auth
+    ):
+        self.flags(send_service_user_token=True, group='service_user')
+
+        admin_context = context.get_admin_context()
+
+        cl = neutronapi.get_client(admin_context)
+        self.assertIsInstance(cl.httpclient.auth,
+                              service_token.ServiceTokenAuthWrapper)
+        self.assertEqual(mock_admin_auth, cl.httpclient.auth.user_auth)
+        self.assertEqual(mock_service_auth, cl.httpclient.auth.service_auth)
+
     @mock.patch.object(client.Client, "list_networks",
                        side_effect=exceptions.Unauthorized())
     def test_Unauthorized_user(self, mock_list_networks):
diff --git a/nova/tests/unit/test_service_auth.py b/nova/tests/unit/test_service_auth.py
index db2a2e28992..ceb2a93b02b 100644
--- a/nova/tests/unit/test_service_auth.py
+++ b/nova/tests/unit/test_service_auth.py
@@ -55,3 +55,13 @@ def test_get_auth_plugin_wraps_bad_config(self, mock_load):
         result = service_auth.get_auth_plugin(self.ctx)
         self.assertEqual(1, mock_load.call_count)
         self.assertNotIsInstance(result, service_token.ServiceTokenAuthWrapper)
+
+    @mock.patch.object(ks_loading, 'load_auth_from_conf_options',
+                       new=mock.Mock())
+    def test_get_auth_plugin_user_auth(self):
+        self.flags(send_service_user_token=True, group='service_user')
+        user_auth = mock.Mock()
+
+        result = service_auth.get_auth_plugin(self.ctx, user_auth=user_auth)
+
+        self.assertEqual(user_auth, result.user_auth)
diff --git a/nova/tests/unit/volume/test_cinder.py b/nova/tests/unit/volume/test_cinder.py
index f4ee7383d45..ffa46ce2aa1 100644
--- a/nova/tests/unit/volume/test_cinder.py
+++ b/nova/tests/unit/volume/test_cinder.py
@@ -1275,3 +1275,14 @@ def test_admin_context_without_token(self,
         admin_ctx = context.get_admin_context()
         params = cinder._get_cinderclient_parameters(admin_ctx)
         self.assertEqual(params[0], mock_admin_auth)
+
+    @mock.patch('nova.service_auth._SERVICE_AUTH')
+    @mock.patch('nova.volume.cinder._ADMIN_AUTH')
+    def test_admin_context_without_user_token_but_with_service_token(
+        self, mock_admin_auth, mock_service_auth
+    ):
+        self.flags(send_service_user_token=True, group='service_user')
+        admin_ctx = context.get_admin_context()
+        params = cinder._get_cinderclient_parameters(admin_ctx)
+        self.assertEqual(mock_admin_auth, params[0].user_auth)
+        self.assertEqual(mock_service_auth, params[0].service_auth)
diff --git a/nova/volume/cinder.py b/nova/volume/cinder.py
index 01efcfec19b..f5328148d24 100644
--- a/nova/volume/cinder.py
+++ b/nova/volume/cinder.py
@@ -91,12 +91,14 @@ def _get_auth(context):
     # from them generated from 'context.get_admin_context'
     # which only set is_admin=True but is without token.
     # So add load_auth_plugin when this condition appear.
+    user_auth = None
     if context.is_admin and not context.auth_token:
         if not _ADMIN_AUTH:
             _ADMIN_AUTH = _load_auth_plugin(CONF)
-        return _ADMIN_AUTH
-    else:
-        return service_auth.get_auth_plugin(context)
+        user_auth = _ADMIN_AUTH
+
+    # When user_auth = None, user_auth will be extracted from the context.
+    return service_auth.get_auth_plugin(context, user_auth=user_auth)
 
 
 # NOTE(efried): Bug #1752152

From aa295b4ad71b59ba9ba612f07f1f108a8a25473b Mon Sep 17 00:00:00 2001
From: Jorge San Emeterio <jsanemet@redhat.com>
Date: Wed, 8 Feb 2023 15:33:54 +0100
Subject: [PATCH 57/93] Have host look for CPU controller of cgroupsv2
 location.

Make the host class look under '/sys/fs/cgroup/cgroup.controllers' for support of the cpu controller. The host will try searching through cgroupsv1 first, just like up until now, and in the case that fails, it will try cgroupsv2 then. The host will not support the feature if both checks fail.

This new check needs to be mocked by all tests that focus on this piece of code, as it touches a system file that requires privileges. For this purpose, the CGroupsFixture is defined to easily add such mocking to all test cases that require it.

I also removed old mocking at test_driver.py in favor of the fixture from above.

Conflicts:
    nova/tests/unit/virt/libvirt/test_driver.py

NOTE(auniyal):
- as new cgroup fixture is added, removed old mocking in few more unit test cases in test_driver
- did not remove test_guest_cpu_shares_with_multi_vcpu from test_driver

Partial-Bug: #2008102
Change-Id: I99b57c27c8a4425389bec2b7f05af660bab85610
(cherry picked from commit 973ff4fc1a0586937d13f2b39e517422713b1003)
(cherry picked from commit eb3fe4ddc621380afa32ec9aec0c285f36f99ee3)
(cherry picked from commit 9e86be5a5365b1896d489de7149e471fd22881d6)
---
 nova/tests/fixtures/nova.py                   | 71 +++++++++++++++++++
 nova/tests/functional/libvirt/base.py         |  1 +
 .../tests/functional/libvirt/test_evacuate.py |  1 +
 nova/tests/functional/libvirt/test_vpmem.py   |  1 +
 .../regressions/test_bug_1595962.py           |  1 +
 nova/tests/unit/virt/libvirt/test_driver.py   | 51 +++++--------
 nova/tests/unit/virt/libvirt/test_host.py     | 64 +++++++++++++----
 nova/tests/unit/virt/test_virt_drivers.py     |  1 +
 nova/virt/libvirt/host.py                     | 31 +++++++-
 9 files changed, 171 insertions(+), 51 deletions(-)

diff --git a/nova/tests/fixtures/nova.py b/nova/tests/fixtures/nova.py
index f9e011dd67d..458c15be116 100644
--- a/nova/tests/fixtures/nova.py
+++ b/nova/tests/fixtures/nova.py
@@ -1356,6 +1356,77 @@ def setUp(self):
             nova.privsep.sys_admin_pctxt, 'client_mode', False))
 
 
+class CGroupsFixture(fixtures.Fixture):
+    """Mocks checks made for available subsystems on the host's control group.
+
+    The fixture mocks all calls made on the host to verify the capabilities
+    provided by its kernel. Through this, one can simulate the underlying
+    system hosts work on top of and have tests react to expected outcomes from
+    such.
+
+    Use sample:
+    >>> cgroups = self.useFixture(CGroupsFixture())
+    >>> cgroups = self.useFixture(CGroupsFixture(version=2))
+    >>> cgroups = self.useFixture(CGroupsFixture())
+    ... cgroups.version = 2
+
+    :attr version: Arranges mocks to simulate the host interact with nova
+                   following the given version of cgroups.
+                   Available values are:
+                        - 0: All checks related to cgroups will return False.
+                        - 1: Checks related to cgroups v1 will return True.
+                        - 2: Checks related to cgroups v2 will return True.
+                   Defaults to 1.
+    """
+
+    def __init__(self, version=1):
+        self._cpuv1 = None
+        self._cpuv2 = None
+
+        self._version = version
+
+    @property
+    def version(self):
+        return self._version
+
+    @version.setter
+    def version(self, value):
+        self._version = value
+        self._update_mocks()
+
+    def setUp(self):
+        super().setUp()
+        self._cpuv1 = self.useFixture(fixtures.MockPatch(
+            'nova.virt.libvirt.host.Host._has_cgroupsv1_cpu_controller')).mock
+        self._cpuv2 = self.useFixture(fixtures.MockPatch(
+            'nova.virt.libvirt.host.Host._has_cgroupsv2_cpu_controller')).mock
+        self._update_mocks()
+
+    def _update_mocks(self):
+        if not self._cpuv1:
+            return
+
+        if not self._cpuv2:
+            return
+
+        if self.version == 0:
+            self._cpuv1.return_value = False
+            self._cpuv2.return_value = False
+            return
+
+        if self.version == 1:
+            self._cpuv1.return_value = True
+            self._cpuv2.return_value = False
+            return
+
+        if self.version == 2:
+            self._cpuv1.return_value = False
+            self._cpuv2.return_value = True
+            return
+
+        raise ValueError(f"Unknown cgroups version: '{self.version}'.")
+
+
 class NoopQuotaDriverFixture(fixtures.Fixture):
     """A fixture to run tests using the NoopQuotaDriver.
 
diff --git a/nova/tests/functional/libvirt/base.py b/nova/tests/functional/libvirt/base.py
index 68c6e294c16..85b884c3ba9 100644
--- a/nova/tests/functional/libvirt/base.py
+++ b/nova/tests/functional/libvirt/base.py
@@ -42,6 +42,7 @@ def setUp(self):
         super(ServersTestBase, self).setUp()
 
         self.useFixture(nova_fixtures.LibvirtImageBackendFixture())
+        self.useFixture(nova_fixtures.CGroupsFixture())
         self.libvirt = self.useFixture(nova_fixtures.LibvirtFixture())
         self.useFixture(nova_fixtures.OSBrickFixture())
 
diff --git a/nova/tests/functional/libvirt/test_evacuate.py b/nova/tests/functional/libvirt/test_evacuate.py
index 531cefc63ca..9da04661afe 100644
--- a/nova/tests/functional/libvirt/test_evacuate.py
+++ b/nova/tests/functional/libvirt/test_evacuate.py
@@ -427,6 +427,7 @@ def setUp(self):
         self.useFixture(nova_fixtures.NeutronFixture(self))
         self.useFixture(nova_fixtures.GlanceFixture(self))
         self.useFixture(func_fixtures.PlacementFixture())
+        self.useFixture(nova_fixtures.CGroupsFixture())
         fake_network.set_stub_network_methods(self)
 
         api_fixture = self.useFixture(
diff --git a/nova/tests/functional/libvirt/test_vpmem.py b/nova/tests/functional/libvirt/test_vpmem.py
index d1cad0e376c..b76e154997c 100644
--- a/nova/tests/functional/libvirt/test_vpmem.py
+++ b/nova/tests/functional/libvirt/test_vpmem.py
@@ -75,6 +75,7 @@ def setUp(self):
             'nova.privsep.libvirt.get_pmem_namespaces',
             return_value=self.fake_pmem_namespaces))
         self.useFixture(nova_fixtures.LibvirtImageBackendFixture())
+        self.useFixture(nova_fixtures.CGroupsFixture())
         self.useFixture(fixtures.MockPatch(
             'nova.virt.libvirt.LibvirtDriver._get_local_gb_info',
             return_value={'total': 128,
diff --git a/nova/tests/functional/regressions/test_bug_1595962.py b/nova/tests/functional/regressions/test_bug_1595962.py
index ebdf82f21a3..78916d09b71 100644
--- a/nova/tests/functional/regressions/test_bug_1595962.py
+++ b/nova/tests/functional/regressions/test_bug_1595962.py
@@ -47,6 +47,7 @@ def setUp(self):
            'nova.virt.libvirt.guest.libvirt',
            fakelibvirt))
         self.useFixture(nova_fixtures.LibvirtFixture())
+        self.useFixture(nova_fixtures.CGroupsFixture())
 
         self.admin_api = api_fixture.admin_api
         self.api = api_fixture.api
diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py
index 33d16851b4c..fdfa566b876 100644
--- a/nova/tests/unit/virt/libvirt/test_driver.py
+++ b/nova/tests/unit/virt/libvirt/test_driver.py
@@ -741,6 +741,7 @@ def setUp(self):
                       imagebackend.Image._get_driver_format)
 
         self.libvirt = self.useFixture(nova_fixtures.LibvirtFixture())
+        self.cgroups = self.useFixture(nova_fixtures.CGroupsFixture())
 
         # ensure tests perform the same on all host architectures; this is
         # already done by the fakelibvirt fixture but we want to change the
@@ -2956,9 +2957,7 @@ def test_get_live_migrate_numa_info_empty(self, _):
                     'fake-instance-numa-topology',
                     'fake-flavor', 'fake-image-meta').obj_to_primitive())
 
-    @mock.patch.object(
-        host.Host, "is_cpu_control_policy_capable", return_value=True)
-    def test_get_guest_config_numa_host_instance_fits(self, is_able):
+    def test_get_guest_config_numa_host_instance_fits(self):
         self.flags(cpu_shared_set=None, cpu_dedicated_set=None,
                    group='compute')
         instance_ref = objects.Instance(**self.test_instance)
@@ -2995,9 +2994,7 @@ def test_get_guest_config_numa_host_instance_fits(self, is_able):
 
     @mock.patch('nova.privsep.utils.supports_direct_io',
                 new=mock.Mock(return_value=True))
-    @mock.patch.object(
-        host.Host, "is_cpu_control_policy_capable", return_value=True)
-    def test_get_guest_config_numa_host_instance_no_fit(self, is_able):
+    def test_get_guest_config_numa_host_instance_no_fit(self):
         instance_ref = objects.Instance(**self.test_instance)
         image_meta = objects.ImageMeta.from_dict(self.test_image_meta)
         flavor = objects.Flavor(memory_mb=4096, vcpus=4, root_gb=496,
@@ -3388,10 +3385,7 @@ def test_get_guest_memory_backing_config_file_backed_hugepages(self):
                           self._test_get_guest_memory_backing_config,
                           host_topology, inst_topology, numa_tune)
 
-    @mock.patch.object(
-        host.Host, "is_cpu_control_policy_capable", return_value=True)
-    def test_get_guest_config_numa_host_instance_pci_no_numa_info(
-            self, is_able):
+    def test_get_guest_config_numa_host_instance_pci_no_numa_info(self):
         self.flags(cpu_shared_set='3', cpu_dedicated_set=None,
                    group='compute')
 
@@ -3440,9 +3434,7 @@ def test_get_guest_config_numa_host_instance_pci_no_numa_info(
 
     @mock.patch('nova.privsep.utils.supports_direct_io',
                 new=mock.Mock(return_value=True))
-    @mock.patch.object(
-        host.Host, "is_cpu_control_policy_capable", return_value=True)
-    def test_get_guest_config_numa_host_instance_2pci_no_fit(self, is_able):
+    def test_get_guest_config_numa_host_instance_2pci_no_fit(self):
         self.flags(cpu_shared_set='3', cpu_dedicated_set=None,
                    group='compute')
         instance_ref = objects.Instance(**self.test_instance)
@@ -3550,10 +3542,7 @@ def test_get_guest_config_numa_other_arch_qemu(self):
             exception.NUMATopologyUnsupported,
             None)
 
-    @mock.patch.object(
-        host.Host, "is_cpu_control_policy_capable", return_value=True)
-    def test_get_guest_config_numa_host_instance_fit_w_cpu_pinset(
-            self, is_able):
+    def test_get_guest_config_numa_host_instance_fit_w_cpu_pinset(self):
         self.flags(cpu_shared_set='2-3', cpu_dedicated_set=None,
                    group='compute')
 
@@ -3591,9 +3580,7 @@ def test_get_guest_config_numa_host_instance_fit_w_cpu_pinset(
             self.assertEqual(0, len(cfg.cputune.vcpupin))
             self.assertIsNone(cfg.cpu.numa)
 
-    @mock.patch.object(
-        host.Host, "is_cpu_control_policy_capable", return_value=True)
-    def test_get_guest_config_non_numa_host_instance_topo(self, is_able):
+    def test_get_guest_config_non_numa_host_instance_topo(self):
         instance_topology = objects.InstanceNUMATopology(cells=[
             objects.InstanceNUMACell(
                 id=0, cpuset=set([0]), pcpuset=set(), memory=1024),
@@ -3640,9 +3627,7 @@ def test_get_guest_config_non_numa_host_instance_topo(self, is_able):
                 self.assertEqual(instance_cell.memory * units.Ki,
                                  numa_cfg_cell.memory)
 
-    @mock.patch.object(
-        host.Host, "is_cpu_control_policy_capable", return_value=True)
-    def test_get_guest_config_numa_host_instance_topo(self, is_able):
+    def test_get_guest_config_numa_host_instance_topo(self):
         self.flags(cpu_shared_set='0-5', cpu_dedicated_set=None,
                    group='compute')
 
@@ -7035,9 +7020,7 @@ def test_get_guest_config_with_rng_dev_not_present(self, mock_path):
                           [],
                           image_meta, disk_info)
 
-    @mock.patch.object(
-        host.Host, "is_cpu_control_policy_capable", return_value=True)
-    def test_guest_cpu_shares_with_multi_vcpu(self, is_able):
+    def test_guest_cpu_shares_with_multi_vcpu(self):
         self.flags(virt_type='kvm', group='libvirt')
 
         drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), True)
@@ -7055,9 +7038,7 @@ def test_guest_cpu_shares_with_multi_vcpu(self, is_able):
 
         self.assertEqual(4096, cfg.cputune.shares)
 
-    @mock.patch.object(
-        host.Host, "is_cpu_control_policy_capable", return_value=True)
-    def test_get_guest_config_with_cpu_quota(self, is_able):
+    def test_get_guest_config_with_cpu_quota(self):
         self.flags(virt_type='kvm', group='libvirt')
 
         drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), True)
@@ -7393,9 +7374,7 @@ def test_get_guest_config_disk_cachemodes_network(
         self.flags(images_type='rbd', group='libvirt')
         self._test_get_guest_config_disk_cachemodes('rbd')
 
-    @mock.patch.object(
-        host.Host, "is_cpu_control_policy_capable", return_value=True)
-    def test_get_guest_config_with_bogus_cpu_quota(self, is_able):
+    def test_get_guest_config_with_bogus_cpu_quota(self):
         self.flags(virt_type='kvm', group='libvirt')
 
         drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), True)
@@ -7413,9 +7392,10 @@ def test_get_guest_config_with_bogus_cpu_quota(self, is_able):
                           drvr._get_guest_config,
                           instance_ref, [], image_meta, disk_info)
 
-    @mock.patch.object(
-        host.Host, "is_cpu_control_policy_capable", return_value=False)
-    def test_get_update_guest_cputune(self, is_able):
+    def test_get_update_guest_cputune(self):
+        # No CPU controller on the host
+        self.cgroups.version = 0
+
         drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), True)
         instance_ref = objects.Instance(**self.test_instance)
         instance_ref.flavor.extra_specs = {'quota:cpu_shares': '10000',
@@ -21662,6 +21642,7 @@ def setUp(self):
         self.flags(sysinfo_serial="none", group="libvirt")
         self.flags(instances_path=self.useFixture(fixtures.TempDir()).path)
         self.useFixture(nova_fixtures.LibvirtFixture())
+        self.useFixture(nova_fixtures.CGroupsFixture())
         os_vif.initialize()
 
         self.drvr = libvirt_driver.LibvirtDriver(
diff --git a/nova/tests/unit/virt/libvirt/test_host.py b/nova/tests/unit/virt/libvirt/test_host.py
index a46a3e46a5f..4d48762c910 100644
--- a/nova/tests/unit/virt/libvirt/test_host.py
+++ b/nova/tests/unit/virt/libvirt/test_host.py
@@ -1556,25 +1556,59 @@ def test_compare_cpu(self, mock_compareCPU):
         self.host.compare_cpu("cpuxml")
         mock_compareCPU.assert_called_once_with("cpuxml", 0)
 
-    def test_is_cpu_control_policy_capable_ok(self):
+    def test_is_cpu_control_policy_capable_via_neither(self):
+        self.useFixture(nova_fixtures.CGroupsFixture(version=0))
+        self.assertFalse(self.host.is_cpu_control_policy_capable())
+
+    def test_is_cpu_control_policy_capable_via_cgroupsv1(self):
+        self.useFixture(nova_fixtures.CGroupsFixture(version=1))
+        self.assertTrue(self.host.is_cpu_control_policy_capable())
+
+    def test_is_cpu_control_policy_capable_via_cgroupsv2(self):
+        self.useFixture(nova_fixtures.CGroupsFixture(version=2))
+        self.assertTrue(self.host.is_cpu_control_policy_capable())
+
+    def test_has_cgroupsv1_cpu_controller_ok(self):
         m = mock.mock_open(
-            read_data="""cg /cgroup/cpu,cpuacct cg opt1,cpu,opt3 0 0
-cg /cgroup/memory cg opt1,opt2 0 0
-""")
-        with mock.patch('builtins.open', m, create=True):
-            self.assertTrue(self.host.is_cpu_control_policy_capable())
+            read_data=(
+                "cg /cgroup/cpu,cpuacct cg opt1,cpu,opt3 0 0"
+                "cg /cgroup/memory cg opt1,opt2 0 0"
+            )
+        )
+        with mock.patch("builtins.open", m, create=True):
+            self.assertTrue(self.host._has_cgroupsv1_cpu_controller())
 
-    def test_is_cpu_control_policy_capable_ko(self):
+    def test_has_cgroupsv1_cpu_controller_ko(self):
         m = mock.mock_open(
-            read_data="""cg /cgroup/cpu,cpuacct cg opt1,opt2,opt3 0 0
-cg /cgroup/memory cg opt1,opt2 0 0
-""")
-        with mock.patch('builtins.open', m, create=True):
-            self.assertFalse(self.host.is_cpu_control_policy_capable())
+            read_data=(
+                "cg /cgroup/cpu,cpuacct cg opt1,opt2,opt3 0 0"
+                "cg /cgroup/memory cg opt1,opt2 0 0"
+            )
+        )
+        with mock.patch("builtins.open", m, create=True):
+            self.assertFalse(self.host._has_cgroupsv1_cpu_controller())
 
-    @mock.patch('builtins.open', side_effect=IOError)
-    def test_is_cpu_control_policy_capable_ioerror(self, mock_open):
-        self.assertFalse(self.host.is_cpu_control_policy_capable())
+    @mock.patch("builtins.open", side_effect=IOError)
+    def test_has_cgroupsv1_cpu_controller_ioerror(self, _):
+        self.assertFalse(self.host._has_cgroupsv1_cpu_controller())
+
+    def test_has_cgroupsv2_cpu_controller_ok(self):
+        m = mock.mock_open(
+            read_data="cpuset cpu io memory hugetlb pids rdma misc"
+        )
+        with mock.patch("builtins.open", m, create=True):
+            self.assertTrue(self.host._has_cgroupsv2_cpu_controller())
+
+    def test_has_cgroupsv2_cpu_controller_ko(self):
+        m = mock.mock_open(
+            read_data="memory pids"
+        )
+        with mock.patch("builtins.open", m, create=True):
+            self.assertFalse(self.host._has_cgroupsv2_cpu_controller())
+
+    @mock.patch("builtins.open", side_effect=IOError)
+    def test_has_cgroupsv2_cpu_controller_ioerror(self, _):
+        self.assertFalse(self.host._has_cgroupsv2_cpu_controller())
 
     def test_get_canonical_machine_type(self):
         # this test relies on configuration from the FakeLibvirtFixture
diff --git a/nova/tests/unit/virt/test_virt_drivers.py b/nova/tests/unit/virt/test_virt_drivers.py
index 8dcad485bca..e275cd3e3aa 100644
--- a/nova/tests/unit/virt/test_virt_drivers.py
+++ b/nova/tests/unit/virt/test_virt_drivers.py
@@ -832,6 +832,7 @@ def setUp(self):
         # This is needed for the live migration tests which spawn off the
         # operation for monitoring.
         self.useFixture(nova_fixtures.SpawnIsSynchronousFixture())
+        self.useFixture(nova_fixtures.CGroupsFixture())
         # When destroying an instance, os-vif will try to execute some commands
         # which hang tests so let's just stub out the unplug call to os-vif
         # since we don't care about it.
diff --git a/nova/virt/libvirt/host.py b/nova/virt/libvirt/host.py
index ebcc1125345..b1a94e5f315 100644
--- a/nova/virt/libvirt/host.py
+++ b/nova/virt/libvirt/host.py
@@ -1548,15 +1548,44 @@ def is_cpu_control_policy_capable(self):
         CONFIG_CGROUP_SCHED may be disabled in some kernel configs to
         improve scheduler latency.
         """
+        return self._has_cgroupsv1_cpu_controller() or \
+               self._has_cgroupsv2_cpu_controller()
+
+    def _has_cgroupsv1_cpu_controller(self):
+        LOG.debug(f"Searching host: '{self.get_hostname()}' "
+                  "for CPU controller through CGroups V1...")
         try:
             with open("/proc/self/mounts", "r") as fd:
                 for line in fd.readlines():
                     # mount options and split options
                     bits = line.split()[3].split(",")
                     if "cpu" in bits:
+                        LOG.debug("CPU controller found on host.")
+                        return True
+                LOG.debug("CPU controller missing on host.")
+                return False
+        except IOError as ex:
+            LOG.debug(f"Search failed due to: '{ex}'. "
+                      "Maybe the host is not running under CGroups V1. "
+                      "Deemed host to be missing controller by this approach.")
+            return False
+
+    def _has_cgroupsv2_cpu_controller(self):
+        LOG.debug(f"Searching host: '{self.get_hostname()}' "
+                  "for CPU controller through CGroups V2...")
+        try:
+            with open("/sys/fs/cgroup/cgroup.controllers", "r") as fd:
+                for line in fd.readlines():
+                    bits = line.split()
+                    if "cpu" in bits:
+                        LOG.debug("CPU controller found on host.")
                         return True
+                LOG.debug("CPU controller missing on host.")
                 return False
-        except IOError:
+        except IOError as ex:
+            LOG.debug(f"Search failed due to: '{ex}'. "
+                      "Maybe the host is not running under CGroups V2. "
+                      "Deemed host to be missing controller by this approach.")
             return False
 
     def get_canonical_machine_type(self, arch, machine) -> str:

From acb31f01b1f10e1101c520657b2e6432a99e4b35 Mon Sep 17 00:00:00 2001
From: Elod Illes <elod.illes@est.tech>
Date: Thu, 11 May 2023 16:19:38 +0200
Subject: [PATCH 58/93] CI: fix backport validator for new branch naming

validate-backport job started to fail as only old stable branch naming
is accepted. This patch extends the script to allow numbers and dot as
well in the branch names (like stable/2023.1).

Change-Id: Icbdcd5d124717e195d55d9e42530611ed812fadd
(cherry picked from commit fe125da63b6508788654f0dab721f13005c09d25)
(cherry picked from commit 09f85a8a922e4ad68271886d2389042d4f4d6896)
(cherry picked from commit abd9a34a6014730620cee15a44f328e48e57398e)
---
 tools/check-cherry-picks.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/check-cherry-picks.sh b/tools/check-cherry-picks.sh
index 46cef8c2250..3042aa16593 100755
--- a/tools/check-cherry-picks.sh
+++ b/tools/check-cherry-picks.sh
@@ -23,7 +23,7 @@ hashes=$(git show --format='%b' --quiet $commit_hash | sed -nr 's/^.cherry picke
 checked=0
 branches+=""
 for hash in $hashes; do
-    branch=$(git branch -a --contains "$hash" 2>/dev/null| grep -oE '(master|stable/[a-z]+)')
+    branch=$(git branch -a --contains "$hash" 2>/dev/null| grep -oE '(master|stable/[a-z0-9.]+)')
     if [ $? -ne 0 ]; then
         echo "Cherry pick hash $hash not on any master or stable branches"
         exit 1

From 9a6a421c045a8031ff0923cca9aa7195fe987896 Mon Sep 17 00:00:00 2001
From: Sylvain Bauza <sbauza@redhat.com>
Date: Wed, 3 May 2023 17:00:14 +0200
Subject: [PATCH 59/93] Fix get_segments_id with subnets without segment_id

Unfortunately, when we merged Ie166f3b51fddeaf916cda7c5ac34bbcdda0fd17a we
forgot that subnets can have no segment_id field.

Change-Id: Idb35b7e3c69fe8efe498abe4ebcc6cad8918c4ed
Closes-Bug: #2018375
(cherry picked from commit 6d7bd6a03446d5227d515b2b4c0da632ef4aa4a1)
(cherry picked from commit 6b8d9d419170fb0ec2c6df561a0874e6362382c1)
(cherry picked from commit 77db64237b23050d94df113a38412c5333d23357)
---
 nova/network/neutron.py                 |  2 +-
 nova/tests/unit/network/test_neutron.py | 18 +++++++++++++++++-
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/nova/network/neutron.py b/nova/network/neutron.py
index faf455d9b86..cc9efa79d71 100644
--- a/nova/network/neutron.py
+++ b/nova/network/neutron.py
@@ -3924,7 +3924,7 @@ def get_segment_ids_for_network(
                 'Failed to get segment IDs for network %s' % network_id) from e
         # The segment field of an unconfigured subnet could be None
         return [subnet['segment_id'] for subnet in subnets
-                                     if subnet['segment_id'] is not None]
+                                     if subnet.get('segment_id') is not None]
 
     def get_segment_id_for_subnet(
         self,
diff --git a/nova/tests/unit/network/test_neutron.py b/nova/tests/unit/network/test_neutron.py
index 458777c3a36..32f2efc95d1 100644
--- a/nova/tests/unit/network/test_neutron.py
+++ b/nova/tests/unit/network/test_neutron.py
@@ -7432,7 +7432,7 @@ def test_get_segment_ids_for_network_passes(self, mock_client):
             network_id=uuids.network_id, fields='segment_id')
 
     @mock.patch.object(neutronapi, 'get_client')
-    def test_get_segment_ids_for_network_with_no_segments(self, mock_client):
+    def test_get_segment_ids_for_network_with_segments_none(self, mock_client):
         subnets = {'subnets': [{'segment_id': None}]}
         mocked_client = mock.create_autospec(client.Client)
         mock_client.return_value = mocked_client
@@ -7447,6 +7447,22 @@ def test_get_segment_ids_for_network_with_no_segments(self, mock_client):
         mocked_client.list_subnets.assert_called_once_with(
             network_id=uuids.network_id, fields='segment_id')
 
+    @mock.patch.object(neutronapi, 'get_client')
+    def test_get_segment_ids_for_network_with_no_segments(self, mock_client):
+        subnets = {'subnets': [{}]}
+        mocked_client = mock.create_autospec(client.Client)
+        mock_client.return_value = mocked_client
+        mocked_client.list_subnets.return_value = subnets
+        with mock.patch.object(
+            self.api, 'has_segment_extension', return_value=True,
+        ):
+            res = self.api.get_segment_ids_for_network(
+                self.context, uuids.network_id)
+        self.assertEqual([], res)
+        mock_client.assert_called_once_with(self.context, admin=True)
+        mocked_client.list_subnets.assert_called_once_with(
+            network_id=uuids.network_id, fields='segment_id')
+
     @mock.patch.object(neutronapi, 'get_client')
     def test_get_segment_ids_for_network_fails(self, mock_client):
         mocked_client = mock.create_autospec(client.Client)

From cd0403dd3b1099bd13da503500a50249db8e49ea Mon Sep 17 00:00:00 2001
From: Yusuke Okada <okada.yusuke@fujitsu.com>
Date: Wed, 8 Feb 2023 22:10:31 -0500
Subject: [PATCH 60/93] Fix failed count for anti-affinity check

The late anti-affinity check runs in the compute manager to prevent
parallel scheduling requests from invalidating the anti-affinity server
group policy. When the check fails, the instance is re-scheduled.
However, this failure is counted as a real instance boot failure of the
compute host and can lead to de-prioritization of the compute host
in the scheduler via BuildFailureWeigher. As the late anti-affinity
check does not indicate any fault of the compute host itself it
should not be counted towards the build failure counter.
This patch adds new build results to handle this case.

Closes-Bug: #1996732
Change-Id: I2ba035c09ace20e9835d9d12a5c5bee17d616718
Signed-off-by: Yusuke Okada <okada.yusuke@fujitsu.com>
(cherry picked from commit 56d320a203a13f262a2e94e491af222032e453d3)
(cherry picked from commit 1b56714e9119ab4152e6f33985a499b2d83a491b)
(cherry picked from commit 2f1d65774fbcf5c25c4ba53583b6a802a03f4c4d)
---
 nova/compute/build_results.py               |   8 ++
 nova/compute/manager.py                     |  33 +++--
 nova/exception.py                           |   9 ++
 nova/tests/functional/test_server_group.py  |  80 +++++++++++
 nova/tests/unit/compute/test_compute_mgr.py | 149 +++++++++++++++++++-
 5 files changed, 265 insertions(+), 14 deletions(-)

diff --git a/nova/compute/build_results.py b/nova/compute/build_results.py
index ca9ed51410f..a091c89ff65 100644
--- a/nova/compute/build_results.py
+++ b/nova/compute/build_results.py
@@ -24,3 +24,11 @@
 ACTIVE = 'active'  # Instance is running
 FAILED = 'failed'  # Instance failed to build and was not rescheduled
 RESCHEDULED = 'rescheduled'  # Instance failed to build, but was rescheduled
+# Instance failed by policy violation (such as affinity or anti-affinity)
+# and was not rescheduled. In this case, the node's failed count won't be
+# increased.
+FAILED_BY_POLICY = 'failed_by_policy'
+# Instance failed by policy violation (such as affinity or anti-affinity)
+# but was rescheduled. In this case, the node's failed count won't be
+# increased.
+RESCHEDULED_BY_POLICY = 'rescheduled_by_policy'
diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 4c8ed675524..44185ef667a 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -1804,11 +1804,8 @@ def _do_validation(context, instance, group):
                 else:
                     max_server = 1
                 if len(members_on_host) >= max_server:
-                    msg = _("Anti-affinity instance group policy "
-                            "was violated.")
-                    raise exception.RescheduledException(
-                            instance_uuid=instance.uuid,
-                            reason=msg)
+                    raise exception.GroupAffinityViolation(
+                        instance_uuid=instance.uuid, policy='Anti-affinity')
 
             # NOTE(ganso): The check for affinity below does not work and it
             # can easily be violated because the lock happens in different
@@ -1818,10 +1815,8 @@ def _do_validation(context, instance, group):
             elif group.policy and 'affinity' == group.policy:
                 group_hosts = group.get_hosts(exclude=[instance.uuid])
                 if group_hosts and self.host not in group_hosts:
-                    msg = _("Affinity instance group policy was violated.")
-                    raise exception.RescheduledException(
-                            instance_uuid=instance.uuid,
-                            reason=msg)
+                    raise exception.GroupAffinityViolation(
+                        instance_uuid=instance.uuid, policy='Affinity')
 
         _do_validation(context, instance, group)
 
@@ -2256,6 +2251,9 @@ def _locked_do_build_and_run_instance(*args, **kwargs):
                         self.reportclient.delete_allocation_for_instance(
                             context, instance.uuid, force=True)
 
+                    if result in (build_results.FAILED_BY_POLICY,
+                                  build_results.RESCHEDULED_BY_POLICY):
+                        return
                     if result in (build_results.FAILED,
                                   build_results.RESCHEDULED):
                         self._build_failed(node)
@@ -2354,6 +2352,8 @@ def _do_build_and_run_instance(self, context, instance, image,
                 self._nil_out_instance_obj_host_and_node(instance)
                 self._set_instance_obj_error_state(instance,
                                                    clean_task_state=True)
+                if isinstance(e, exception.RescheduledByPolicyException):
+                    return build_results.FAILED_BY_POLICY
                 return build_results.FAILED
             LOG.debug(e.format_message(), instance=instance)
             # This will be used for logging the exception
@@ -2380,6 +2380,10 @@ def _do_build_and_run_instance(self, context, instance, image,
                     injected_files, requested_networks, security_groups,
                     block_device_mapping, request_spec=request_spec,
                     host_lists=[host_list])
+
+            if isinstance(e, exception.RescheduledByPolicyException):
+                return build_results.RESCHEDULED_BY_POLICY
+
             return build_results.RESCHEDULED
         except (exception.InstanceNotFound,
                 exception.UnexpectedDeletingTaskStateError):
@@ -2597,6 +2601,17 @@ def _build_and_run_instance(self, context, instance, image, injected_files,
                     bdms=block_device_mapping)
             raise exception.BuildAbortException(instance_uuid=instance.uuid,
                     reason=e.format_message())
+        except exception.GroupAffinityViolation as e:
+            LOG.exception('Failed to build and run instance',
+                          instance=instance)
+            self._notify_about_instance_usage(context, instance,
+                    'create.error', fault=e)
+            compute_utils.notify_about_instance_create(
+                    context, instance, self.host,
+                    phase=fields.NotificationPhase.ERROR, exception=e,
+                    bdms=block_device_mapping)
+            raise exception.RescheduledByPolicyException(
+                    instance_uuid=instance.uuid, reason=str(e))
         except Exception as e:
             LOG.exception('Failed to build and run instance',
                           instance=instance)
diff --git a/nova/exception.py b/nova/exception.py
index 4588898aae9..2d6ce3785b6 100644
--- a/nova/exception.py
+++ b/nova/exception.py
@@ -1477,6 +1477,15 @@ class RescheduledException(NovaException):
                 "%(reason)s")
 
 
+class RescheduledByPolicyException(RescheduledException):
+    msg_fmt = _("Build of instance %(instance_uuid)s was re-scheduled: "
+                "%(reason)s")
+
+
+class GroupAffinityViolation(NovaException):
+    msg_fmt = _("%(policy)s instance group policy was violated")
+
+
 class InstanceFaultRollback(NovaException):
     def __init__(self, inner_exception=None):
         message = _("Instance rollback performed due to: %s")
diff --git a/nova/tests/functional/test_server_group.py b/nova/tests/functional/test_server_group.py
index a64a04b2c9a..a562df84078 100644
--- a/nova/tests/functional/test_server_group.py
+++ b/nova/tests/functional/test_server_group.py
@@ -19,6 +19,7 @@
 from nova.compute import instance_actions
 from nova import context
 from nova.db.main import api as db
+from nova import objects
 from nova import test
 from nova.tests import fixtures as nova_fixtures
 from nova.tests.functional.api import client
@@ -494,6 +495,85 @@ def test_soft_affinity_not_supported(self):
         self.assertIn('Invalid input', ex.response.text)
         self.assertIn('soft-affinity', ex.response.text)
 
+    @mock.patch('nova.scheduler.filters.affinity_filter.'
+        'ServerGroupAffinityFilter.host_passes', return_value=True)
+    def test_failed_count_with_affinity_violation(self, mock_host_passes):
+        """Check failed count not incremented after violation of the late
+        affinity check. https://bugs.launchpad.net/nova/+bug/1996732
+        """
+
+        created_group = self.api.post_server_groups(self.affinity)
+        flavor = self.api.get_flavors()[2]
+
+        # Ensure the first instance is on compute1
+        with utils.temporary_mutation(self.admin_api, microversion='2.53'):
+            compute2_service_id = self.admin_api.get_services(
+            host=self.compute2.host, binary='nova-compute')[0]['id']
+            self.admin_api.put_service(compute2_service_id,
+                                        {'status': 'disabled'})
+
+        self._boot_a_server_to_group(created_group, flavor=flavor)
+
+        # Ensure the second instance is on compute2
+        with utils.temporary_mutation(self.admin_api, microversion='2.53'):
+            self.admin_api.put_service(compute2_service_id,
+                                        {'status': 'enabled'})
+            compute1_service_id = self.admin_api.get_services(
+            host=self.compute.host, binary='nova-compute')[0]['id']
+            self.admin_api.put_service(compute1_service_id,
+                                        {'status': 'disabled'})
+
+        # Expects GroupAffinityViolation exception
+        failed_server = self._boot_a_server_to_group(created_group,
+                                                     flavor=flavor,
+                                                     expected_status='ERROR')
+
+        self.assertEqual('Exceeded maximum number of retries. Exhausted all '
+                         'hosts available for retrying build failures for '
+                         'instance %s.' % failed_server['id'],
+                         failed_server['fault']['message'])
+
+        ctxt = context.get_admin_context()
+        computes = objects.ComputeNodeList.get_all(ctxt)
+
+        for node in computes:
+            self.assertEqual(node.stats.get('failed_builds'), '0')
+
+    @mock.patch('nova.scheduler.filters.affinity_filter.'
+        'ServerGroupAntiAffinityFilter.host_passes', return_value=True)
+    def test_failed_count_with_anti_affinity_violation(self, mock_host_passes):
+        """Check failed count after violation of the late affinity check.
+        https://bugs.launchpad.net/nova/+bug/1996732
+        """
+
+        created_group = self.api.post_server_groups(self.anti_affinity)
+        flavor = self.api.get_flavors()[2]
+
+        # Ensure two instances are scheduled on the same host
+        with utils.temporary_mutation(self.admin_api, microversion='2.53'):
+            compute2_service_id = self.admin_api.get_services(
+            host=self.compute2.host, binary='nova-compute')[0]['id']
+            self.admin_api.put_service(compute2_service_id,
+                                        {'status': 'disabled'})
+
+        self._boot_a_server_to_group(created_group, flavor=flavor)
+
+        # Expects GroupAffinityViolation exception
+        failed_server = self._boot_a_server_to_group(created_group,
+                                                     flavor=flavor,
+                                                     expected_status='ERROR')
+
+        self.assertEqual('Exceeded maximum number of retries. Exhausted all '
+                         'hosts available for retrying build failures for '
+                         'instance %s.' % failed_server['id'],
+                         failed_server['fault']['message'])
+
+        ctxt = context.get_admin_context()
+        computes = objects.ComputeNodeList.get_all(ctxt)
+
+        for node in computes:
+            self.assertEqual(node.stats.get('failed_builds'), '0')
+
 
 class ServerGroupAffinityConfTest(ServerGroupTestBase):
     api_major_version = 'v2.1'
diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py
index f7c9400342b..0d39a570c02 100644
--- a/nova/tests/unit/compute/test_compute_mgr.py
+++ b/nova/tests/unit/compute/test_compute_mgr.py
@@ -6546,13 +6546,14 @@ def test_build_and_run_instance_with_unlimited_max_concurrent_builds(self):
         self.compute = manager.ComputeManager()
         self._test_build_and_run_instance()
 
+    @mock.patch.object(manager.ComputeManager, '_build_succeeded')
     @mock.patch.object(objects.InstanceActionEvent,
                        'event_finish_with_failure')
     @mock.patch.object(objects.InstanceActionEvent, 'event_start')
     @mock.patch.object(objects.Instance, 'save')
     @mock.patch.object(manager.ComputeManager, '_build_and_run_instance')
     def _test_build_and_run_instance(self, mock_build, mock_save,
-                                     mock_start, mock_finish):
+                                     mock_start, mock_finish, mock_succeeded):
         self._do_build_instance_update(mock_save)
 
         orig_do_build_and_run = self.compute._do_build_and_run_instance
@@ -6585,6 +6586,7 @@ def _wrapped_do_build_and_run_instance(*args, **kwargs):
                 self.requested_networks, self.security_groups,
                 self.block_device_mapping, self.node, self.limits,
                 self.filter_properties, {}, self.accel_uuids)
+        mock_succeeded.assert_called_once_with(self.node)
 
     # This test when sending an icehouse compatible rpc call to juno compute
     # node, NetworkRequest object can load from three items tuple.
@@ -6612,6 +6614,7 @@ def test_build_and_run_instance_with_icehouse_requested_network(
         self.assertEqual('10.0.0.1', str(requested_network.address))
         self.assertEqual(uuids.port_instance, requested_network.port_id)
 
+    @mock.patch.object(manager.ComputeManager, '_build_failed')
     @mock.patch.object(objects.InstanceActionEvent,
                        'event_finish_with_failure')
     @mock.patch.object(objects.InstanceActionEvent, 'event_start')
@@ -6627,7 +6630,7 @@ def test_build_and_run_instance_with_icehouse_requested_network(
     def test_build_abort_exception(self, mock_build_run,
                                    mock_build, mock_set, mock_nil, mock_add,
                                    mock_clean_vol, mock_clean_net, mock_save,
-                                   mock_start, mock_finish):
+                                   mock_start, mock_finish, mock_failed):
         self._do_build_instance_update(mock_save)
         mock_build_run.side_effect = exception.BuildAbortException(reason='',
                                         instance_uuid=self.instance.uuid)
@@ -6670,7 +6673,9 @@ def _wrapped_do_build_and_run_instance(*args, **kwargs):
                 mock.ANY, mock.ANY)
         mock_nil.assert_called_once_with(self.instance)
         mock_set.assert_called_once_with(self.instance, clean_task_state=True)
+        mock_failed.assert_called_once_with(self.node)
 
+    @mock.patch.object(manager.ComputeManager, '_build_failed')
     @mock.patch.object(objects.InstanceActionEvent,
                        'event_finish_with_failure')
     @mock.patch.object(objects.InstanceActionEvent, 'event_start')
@@ -6681,8 +6686,8 @@ def _wrapped_do_build_and_run_instance(*args, **kwargs):
     @mock.patch.object(conductor_api.ComputeTaskAPI, 'build_instances')
     @mock.patch.object(manager.ComputeManager, '_build_and_run_instance')
     def test_rescheduled_exception(self, mock_build_run,
-                                   mock_build, mock_set, mock_nil,
-                                   mock_save, mock_start, mock_finish):
+                                   mock_build, mock_set, mock_nil, mock_save,
+                                   mock_start, mock_finish, mock_failed):
         self._do_build_instance_update(mock_save, reschedule_update=True)
         mock_build_run.side_effect = exception.RescheduledException(reason='',
                 instance_uuid=self.instance.uuid)
@@ -6729,6 +6734,7 @@ def _wrapped_do_build_and_run_instance(*args, **kwargs):
                 self.admin_pass, self.injected_files, self.requested_networks,
                 self.security_groups, self.block_device_mapping,
                 request_spec={}, host_lists=[fake_host_list])
+        mock_failed.assert_called_once_with(self.node)
 
     @mock.patch.object(manager.ComputeManager, '_shutdown_instance')
     @mock.patch.object(manager.ComputeManager, '_build_networks_for_instance')
@@ -7082,6 +7088,139 @@ def _wrapped_do_build_and_run_instance(*args, **kwargs):
                 self.security_groups, self.block_device_mapping,
                 request_spec={}, host_lists=[fake_host_list])
 
+    @mock.patch('nova.compute.resource_tracker.ResourceTracker.instance_claim',
+                new=mock.MagicMock())
+    @mock.patch.object(objects.InstanceActionEvent,
+                       'event_finish_with_failure')
+    @mock.patch.object(objects.InstanceActionEvent, 'event_start')
+    @mock.patch.object(objects.Instance, 'save')
+    @mock.patch.object(manager.ComputeManager,
+                       '_nil_out_instance_obj_host_and_node')
+    @mock.patch.object(conductor_api.ComputeTaskAPI, 'build_instances')
+    @mock.patch.object(manager.ComputeManager, '_build_failed')
+    @mock.patch.object(manager.ComputeManager, '_build_succeeded')
+    @mock.patch.object(manager.ComputeManager,
+                       '_validate_instance_group_policy')
+    def test_group_affinity_violation_exception_with_retry(
+        self, mock_validate_policy, mock_succeeded, mock_failed, mock_build,
+        mock_nil, mock_save, mock_start, mock_finish,
+    ):
+        """Test retry by affinity or anti-affinity validation check doesn't
+        increase failed build
+        """
+
+        self._do_build_instance_update(mock_save, reschedule_update=True)
+        mock_validate_policy.side_effect = \
+                exception.GroupAffinityViolation(
+                instance_uuid=self.instance.uuid, policy="Affinity")
+
+        orig_do_build_and_run = self.compute._do_build_and_run_instance
+
+        def _wrapped_do_build_and_run_instance(*args, **kwargs):
+            ret = orig_do_build_and_run(*args, **kwargs)
+            self.assertEqual(build_results.RESCHEDULED_BY_POLICY, ret)
+            return ret
+
+        with test.nested(
+            mock.patch.object(
+                self.compute, '_do_build_and_run_instance',
+                side_effect=_wrapped_do_build_and_run_instance,
+            ),
+            mock.patch.object(
+                self.compute.network_api, 'get_instance_nw_info',
+            ),
+        ):
+            self.compute.build_and_run_instance(
+                self.context, self.instance,
+                self.image, request_spec={},
+                filter_properties=self.filter_properties,
+                accel_uuids=self.accel_uuids,
+                injected_files=self.injected_files,
+                admin_password=self.admin_pass,
+                requested_networks=self.requested_networks,
+                security_groups=self.security_groups,
+                block_device_mapping=self.block_device_mapping, node=self.node,
+                limits=self.limits, host_list=fake_host_list)
+
+        mock_succeeded.assert_not_called()
+        mock_failed.assert_not_called()
+
+        self._instance_action_events(mock_start, mock_finish)
+        self._assert_build_instance_update(mock_save, reschedule_update=True)
+        mock_nil.assert_called_once_with(self.instance)
+        mock_build.assert_called_once_with(self.context,
+                [self.instance], self.image, self.filter_properties,
+                self.admin_pass, self.injected_files, self.requested_networks,
+                self.security_groups, self.block_device_mapping,
+                request_spec={}, host_lists=[fake_host_list])
+
+    @mock.patch('nova.compute.resource_tracker.ResourceTracker.instance_claim',
+                new=mock.MagicMock())
+    @mock.patch.object(objects.InstanceActionEvent,
+                       'event_finish_with_failure')
+    @mock.patch.object(objects.InstanceActionEvent, 'event_start')
+    @mock.patch.object(objects.Instance, 'save')
+    @mock.patch.object(manager.ComputeManager,
+                       '_nil_out_instance_obj_host_and_node')
+    @mock.patch.object(manager.ComputeManager, '_cleanup_allocated_networks')
+    @mock.patch.object(manager.ComputeManager, '_set_instance_obj_error_state')
+    @mock.patch.object(compute_utils, 'add_instance_fault_from_exc')
+    @mock.patch.object(conductor_api.ComputeTaskAPI, 'build_instances')
+    @mock.patch.object(manager.ComputeManager, '_build_failed')
+    @mock.patch.object(manager.ComputeManager, '_build_succeeded')
+    @mock.patch.object(manager.ComputeManager,
+                       '_validate_instance_group_policy')
+    def test_group_affinity_violation_exception_without_retry(
+        self, mock_validate_policy, mock_succeeded, mock_failed, mock_build,
+        mock_add, mock_set_state, mock_clean_net, mock_nil, mock_save,
+        mock_start, mock_finish,
+    ):
+        """Test failure by affinity or anti-affinity validation check doesn't
+        increase failed build
+        """
+
+        self._do_build_instance_update(mock_save)
+        mock_validate_policy.side_effect = \
+                exception.GroupAffinityViolation(
+                instance_uuid=self.instance.uuid, policy="Affinity")
+
+        orig_do_build_and_run = self.compute._do_build_and_run_instance
+
+        def _wrapped_do_build_and_run_instance(*args, **kwargs):
+            ret = orig_do_build_and_run(*args, **kwargs)
+            self.assertEqual(build_results.FAILED_BY_POLICY, ret)
+            return ret
+
+        with mock.patch.object(
+                self.compute, '_do_build_and_run_instance',
+                side_effect=_wrapped_do_build_and_run_instance,
+        ):
+            self.compute.build_and_run_instance(
+                self.context, self.instance,
+                self.image, request_spec={},
+                filter_properties={},
+                accel_uuids=[],
+                injected_files=self.injected_files,
+                admin_password=self.admin_pass,
+                requested_networks=self.requested_networks,
+                security_groups=self.security_groups,
+                block_device_mapping=self.block_device_mapping, node=self.node,
+                limits=self.limits, host_list=fake_host_list)
+
+        mock_succeeded.assert_not_called()
+        mock_failed.assert_not_called()
+
+        self._instance_action_events(mock_start, mock_finish)
+        self._assert_build_instance_update(mock_save)
+        mock_clean_net.assert_called_once_with(self.context, self.instance,
+                self.requested_networks)
+        mock_add.assert_called_once_with(self.context, self.instance,
+                mock.ANY, mock.ANY, fault_message=mock.ANY)
+        mock_nil.assert_called_once_with(self.instance)
+        mock_build.assert_not_called()
+        mock_set_state.assert_called_once_with(self.instance,
+                clean_task_state=True)
+
     @mock.patch.object(objects.InstanceActionEvent,
                        'event_finish_with_failure')
     @mock.patch.object(objects.InstanceActionEvent, 'event_start')
@@ -7661,7 +7800,7 @@ def test_validate_instance_group_policy_with_rules(
         nodes.return_value = ['nodename']
         migration_list.return_value = [objects.Migration(
             uuid=uuids.migration, instance_uuid=uuids.instance)]
-        self.assertRaises(exception.RescheduledException,
+        self.assertRaises(exception.GroupAffinityViolation,
                           self.compute._validate_instance_group_policy,
                           self.context, instance, hints)
 

From 8aa6723fa4355c9e609ab449c28ea3cbe356c839 Mon Sep 17 00:00:00 2001
From: melanie witt <melwittt@gmail.com>
Date: Wed, 17 May 2023 03:04:49 +0000
Subject: [PATCH 61/93] Add debug logging when Instance raises
 OrphanedObjectError

This logging would be helpful in debugging issues when
OrphanedObjectError is raised by an instance. Currently, there is
not a way to identify which instance is attempting to lazy-load a
field while orphaned. Being able to locate the instance in the
database could also help with recovery/cleanup when a problematic
record is disrupting operation of a deployment.

Change-Id: I093de2839c1bb7c949a0812e07b63de4cc5ed167
(cherry picked from commit e0fbb6fc06d3b08b938af2e36b11f04c57fe6954)
(cherry picked from commit f32deaa617286e4b0dc2d01585ccb5ac821a571c)
(cherry picked from commit 9e8456297681ad21680acd35718e3cb97f8458f2)
---
 nova/objects/instance.py                 |  5 +++++
 nova/tests/unit/objects/test_instance.py | 15 +++++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/nova/objects/instance.py b/nova/objects/instance.py
index e99762d2777..894b8d2a19c 100644
--- a/nova/objects/instance.py
+++ b/nova/objects/instance.py
@@ -1089,6 +1089,11 @@ def clear_numa_topology(self):
     def obj_load_attr(self, attrname):
         # NOTE(danms): We can't lazy-load anything without a context and a uuid
         if not self._context:
+            if 'uuid' in self:
+                LOG.debug(
+                    "Lazy-load of '%s' attempted by orphaned instance",
+                    attrname, instance=self
+                )
             raise exception.OrphanedObjectError(method='obj_load_attr',
                                                 objtype=self.obj_name())
         if 'uuid' not in self:
diff --git a/nova/tests/unit/objects/test_instance.py b/nova/tests/unit/objects/test_instance.py
index e187a4c251c..cbe8c3e6f68 100644
--- a/nova/tests/unit/objects/test_instance.py
+++ b/nova/tests/unit/objects/test_instance.py
@@ -1632,6 +1632,21 @@ def test_save_objectfield_reraises_if_not_instance_related(self):
         self._test_save_objectfield_fk_constraint_fails(
                 'other_foreign_key', db_exc.DBReferenceError)
 
+    @mock.patch('nova.objects.instance.LOG.debug')
+    def test_obj_load_attr_log(self, mock_log_debug):
+        # Instance with no UUID should not log.
+        instance = objects.Instance()
+        self.assertRaises(
+            exception.OrphanedObjectError, instance.obj_load_attr, 'foo')
+        mock_log_debug.assert_not_called()
+        # Instance with UUID should log.
+        instance = objects.Instance(
+            uuid='127a0d59-b88c-422b-b9a1-2dc7cc51fb9a')
+        self.assertRaises(
+            exception.OrphanedObjectError, instance.obj_load_attr, 'foo')
+        msg = "Lazy-load of '%s' attempted by orphaned instance"
+        mock_log_debug.assert_called_once_with(msg, 'foo', instance=instance)
+
 
 class TestRemoteInstanceObject(test_objects._RemoteTest,
                                _TestInstanceObject):

From 0b0da898ff5ca77a111fb545a62dc3f86987bfb3 Mon Sep 17 00:00:00 2001
From: Sean Mooney <work@seanmooney.info>
Date: Tue, 4 Jul 2023 16:42:08 +0100
Subject: [PATCH 62/93] enable validations in nova-lvm

As of I8ca059a4702471d4d30ea5a06079859eba3f5a81 validations
are now required for test_rebuild_volume_backed_server.
Validations are also required for any volume attach/detach based test
in general due to known qemu issues.

This patch just turns them back on to unblock the gate.

Closes-Bug: #2025813
Change-Id: Ia198f712e2ad277743aed08e27e480208f463ac7
(cherry picked from commit 6f56c5c9fd60ee1d53376a9100a9580cb2b38dc3)
(cherry picked from commit 976364f9e8f2ddb0e2cb5d8dc765c37ef833c837)
(cherry picked from commit e5eb65e7a0a481a30332ea06e87d3c274dc1b046)
---
 .zuul.yaml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.zuul.yaml b/.zuul.yaml
index 3866c9af9d1..f79f9630118 100644
--- a/.zuul.yaml
+++ b/.zuul.yaml
@@ -258,8 +258,6 @@
         NOVA_BACKEND: LVM
         # Do not waste time clearing volumes.
         LVM_VOLUME_CLEAR: none
-        # Disable SSH validation in tests to save time.
-        TEMPEST_RUN_VALIDATION: false
         # Increase the size of the swift loopback device to accommodate RAW
         # snapshots from the LV based instance disks.
         # See bug #1913451 for more details.

From 0a6b57a9a24a0936383aaf444c690772aacc3245 Mon Sep 17 00:00:00 2001
From: Artom Lifshitz <alifshit@redhat.com>
Date: Mon, 10 Jan 2022 13:36:36 -0500
Subject: [PATCH 63/93] libvirt: remove default cputune shares value

Previously, the libvirt driver defaulted to 1024 * (# of CPUs) for the
value of domain/cputune/shares in the libvirt XML. This value is then
passed directly by libvirt to the cgroups API. Cgroups v2 imposes a
maximum value of 10000 that can be passed in. This makes Nova
unable to launch instances with more than 9 CPUs on hosts that run
cgroups v2, like Ubuntu Jammy or RHEL 9.

Fix this by just removing the default entirely. Because there is no
longer a guarantee that domain/cputune will contain at least a shares
element, we can stop always generating the former, and only generate
it if it will actually contain something.

We can also make operators' lives easier by leveraging the fact that
we update the XML during live migration, so this patch also adds a
method to remove the shares value from the live migration XML if one
was not set as the quota:cpu_shares flavor extra spec.

For operators that *have* set this extra spec to something greater
than 10000, their flavors will have to be updated, and their
instances resized.

Partial-bug: 1978489
Change-Id: I49d757f5f261b3562ada27e6cf57284f615ca395
(cherry picked from commit f77a9fee5b736899ecc39d33e4f4e4012cee751c)
---
 doc/source/admin/resource-limits.rst          |  3 +-
 nova/tests/unit/virt/libvirt/test_driver.py   | 40 ++++++------------
 .../tests/unit/virt/libvirt/test_migration.py | 42 +++++++++++++++++--
 nova/virt/libvirt/driver.py                   |  8 +---
 nova/virt/libvirt/migration.py                | 13 ++++++
 ...putune-shares-values-85d5ddf4b8e24eaa.yaml | 15 +++++++
 6 files changed, 83 insertions(+), 38 deletions(-)
 create mode 100644 releasenotes/notes/remove-default-cputune-shares-values-85d5ddf4b8e24eaa.yaml

diff --git a/doc/source/admin/resource-limits.rst b/doc/source/admin/resource-limits.rst
index c74ad31c17b..8ef248a9a1d 100644
--- a/doc/source/admin/resource-limits.rst
+++ b/doc/source/admin/resource-limits.rst
@@ -38,7 +38,8 @@ CPU limits
 Libvirt enforces CPU limits in terms of *shares* and *quotas*, configured
 via :nova:extra-spec:`quota:cpu_shares` and :nova:extra-spec:`quota:cpu_period`
 / :nova:extra-spec:`quota:cpu_quota`, respectively. Both are implemented using
-the `cgroups v1 cpu controller`__.
+the `cgroups cpu controller`__. Note that allowed values for *shares* are
+platform dependant.
 
 CPU shares are a proportional weighted share of total CPU resources relative to
 other instances. It does not limit CPU usage if CPUs are not busy. There is no
diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py
index 22abff16d30..e41cd740dd9 100644
--- a/nova/tests/unit/virt/libvirt/test_driver.py
+++ b/nova/tests/unit/virt/libvirt/test_driver.py
@@ -2989,7 +2989,7 @@ def test_get_guest_config_numa_host_instance_fits(self):
             cfg = drvr._get_guest_config(instance_ref, [],
                                          image_meta, disk_info)
             self.assertIsNone(cfg.cpuset)
-            self.assertEqual(0, len(cfg.cputune.vcpupin))
+            self.assertIsNone(cfg.cputune)
             self.assertIsNone(cfg.cpu.numa)
 
     @mock.patch('nova.privsep.utils.supports_direct_io',
@@ -3024,7 +3024,7 @@ def test_get_guest_config_numa_host_instance_no_fit(self):
                                          image_meta, disk_info)
             self.assertFalse(choice_mock.called)
             self.assertIsNone(cfg.cpuset)
-            self.assertEqual(0, len(cfg.cputune.vcpupin))
+            self.assertIsNone(cfg.cputune)
             self.assertIsNone(cfg.cpu.numa)
 
     def _test_get_guest_memory_backing_config(
@@ -3429,7 +3429,7 @@ def test_get_guest_config_numa_host_instance_pci_no_numa_info(self):
             cfg = conn._get_guest_config(instance_ref, [],
                                          image_meta, disk_info)
             self.assertEqual(set([3]), cfg.cpuset)
-            self.assertEqual(0, len(cfg.cputune.vcpupin))
+            self.assertIsNone(cfg.cputune)
             self.assertIsNone(cfg.cpu.numa)
 
     @mock.patch('nova.privsep.utils.supports_direct_io',
@@ -3481,7 +3481,7 @@ def test_get_guest_config_numa_host_instance_2pci_no_fit(self):
                                          image_meta, disk_info)
             self.assertFalse(choice_mock.called)
             self.assertEqual(set([3]), cfg.cpuset)
-            self.assertEqual(0, len(cfg.cputune.vcpupin))
+            self.assertIsNone(cfg.cputune)
             self.assertIsNone(cfg.cpu.numa)
 
     @mock.patch.object(fakelibvirt.Connection, 'getType')
@@ -3577,7 +3577,7 @@ def test_get_guest_config_numa_host_instance_fit_w_cpu_pinset(self):
             # NOTE(ndipanov): we make sure that pin_set was taken into account
             # when choosing viable cells
             self.assertEqual(set([2, 3]), cfg.cpuset)
-            self.assertEqual(0, len(cfg.cputune.vcpupin))
+            self.assertIsNone(cfg.cputune)
             self.assertIsNone(cfg.cpu.numa)
 
     def test_get_guest_config_non_numa_host_instance_topo(self):
@@ -3617,7 +3617,7 @@ def test_get_guest_config_non_numa_host_instance_topo(self):
             cfg = drvr._get_guest_config(instance_ref, [],
                                          image_meta, disk_info)
             self.assertIsNone(cfg.cpuset)
-            self.assertEqual(0, len(cfg.cputune.vcpupin))
+            self.assertIsNone(cfg.cputune)
             self.assertIsNone(cfg.numatune)
             self.assertIsNotNone(cfg.cpu.numa)
             for instance_cell, numa_cfg_cell in zip(
@@ -7020,25 +7020,9 @@ def test_get_guest_config_with_rng_dev_not_present(self, mock_path):
                           [],
                           image_meta, disk_info)
 
-    def test_guest_cpu_shares_with_multi_vcpu(self):
-        self.flags(virt_type='kvm', group='libvirt')
-
-        drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), True)
-
-        instance_ref = objects.Instance(**self.test_instance)
-        instance_ref.flavor.vcpus = 4
-        image_meta = objects.ImageMeta.from_dict(self.test_image_meta)
-
-        disk_info = blockinfo.get_disk_info(CONF.libvirt.virt_type,
-                                            instance_ref,
-                                            image_meta)
-
-        cfg = drvr._get_guest_config(instance_ref, [],
-                                     image_meta, disk_info)
-
-        self.assertEqual(4096, cfg.cputune.shares)
-
-    def test_get_guest_config_with_cpu_quota(self):
+    @mock.patch.object(
+        host.Host, "is_cpu_control_policy_capable", return_value=True)
+    def test_get_guest_config_with_cpu_quota(self, is_able):
         self.flags(virt_type='kvm', group='libvirt')
 
         drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), True)
@@ -11608,7 +11592,7 @@ def test_live_migration_update_graphics_xml(self, mock_xml,
                                                       mock_migrateToURI3,
                                                       mock_min_version):
         self.compute = manager.ComputeManager()
-        instance_ref = self.test_instance
+        instance_ref = objects.Instance(**self.test_instance)
         target_connection = '127.0.0.2'
 
         xml_tmpl = ("<domain type='kvm'>"
@@ -12288,7 +12272,7 @@ def test_live_migration_update_serial_console_xml(self, mock_xml,
                                                       mock_get,
                                                       mock_min_version):
         self.compute = manager.ComputeManager()
-        instance_ref = self.test_instance
+        instance_ref = objects.Instance(**self.test_instance)
         target_connection = '127.0.0.2'
 
         xml_tmpl = ("<domain type='kvm'>"
@@ -12578,7 +12562,7 @@ def test_live_migration_raises_exception(self, mock_xml,
                                              mock_min_version):
         # Prepare data
         self.compute = manager.ComputeManager()
-        instance_ref = self.test_instance
+        instance_ref = objects.Instance(**self.test_instance)
         target_connection = '127.0.0.2'
 
         disk_paths = ['vda', 'vdb']
diff --git a/nova/tests/unit/virt/libvirt/test_migration.py b/nova/tests/unit/virt/libvirt/test_migration.py
index f4e64fbe53e..70488f88cf4 100644
--- a/nova/tests/unit/virt/libvirt/test_migration.py
+++ b/nova/tests/unit/virt/libvirt/test_migration.py
@@ -28,6 +28,7 @@
 from nova import test
 from nova.tests import fixtures as nova_fixtures
 from nova.tests.fixtures import libvirt as fakelibvirt
+from nova.tests.unit.virt.libvirt import test_driver
 from nova.virt.libvirt import config as vconfig
 from nova.virt.libvirt import guest as libvirt_guest
 from nova.virt.libvirt import host
@@ -80,16 +81,51 @@ def test_get_updated_guest_xml(
         get_volume_config = mock.MagicMock()
         mock_guest.get_xml_desc.return_value = '<domain></domain>'
 
-        migration.get_updated_guest_xml(
-            mock.sentinel.instance, mock_guest, data, get_volume_config)
+        instance = objects.Instance(**test_driver._create_test_instance())
+        migration.get_updated_guest_xml(instance, mock_guest, data,
+                                        get_volume_config)
         mock_graphics.assert_called_once_with(mock.ANY, data)
         mock_serial.assert_called_once_with(mock.ANY, data)
         mock_volume.assert_called_once_with(
-            mock.ANY, data, mock.sentinel.instance, get_volume_config)
+            mock.ANY, data, instance, get_volume_config)
         mock_perf_events_xml.assert_called_once_with(mock.ANY, data)
         mock_memory_backing.assert_called_once_with(mock.ANY, data)
         self.assertEqual(1, mock_tostring.called)
 
+    def test_update_quota_xml(self):
+        old_xml = """<domain>
+                         <name>fake-instance</name>
+                         <cputune>
+                             <shares>42</shares>
+                             <period>1337</period>
+                         </cputune>
+                     </domain>"""
+        instance = objects.Instance(**test_driver._create_test_instance())
+        new_xml = migration._update_quota_xml(instance,
+                                              etree.fromstring(old_xml))
+        new_xml = etree.tostring(new_xml, encoding='unicode')
+        self.assertXmlEqual(
+            """<domain>
+                   <name>fake-instance</name>
+                   <cputune>
+                       <period>1337</period>
+                   </cputune>
+               </domain>""", new_xml)
+
+    def test_update_quota_xml_empty_cputune(self):
+        old_xml = """<domain>
+                         <name>fake-instance</name>
+                         <cputune>
+                             <shares>42</shares>
+                         </cputune>
+                     </domain>"""
+        instance = objects.Instance(**test_driver._create_test_instance())
+        new_xml = migration._update_quota_xml(instance,
+                                              etree.fromstring(old_xml))
+        new_xml = etree.tostring(new_xml, encoding='unicode')
+        self.assertXmlEqual('<domain><name>fake-instance</name></domain>',
+                            new_xml)
+
     def test_update_device_resources_xml_vpmem(self):
         # original xml for vpmems, /dev/dax0.1 and /dev/dax0.2 here
         # are vpmem device path on source host
diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py
index 615a009e062..4de51ce8e31 100644
--- a/nova/virt/libvirt/driver.py
+++ b/nova/virt/libvirt/driver.py
@@ -5682,15 +5682,11 @@ def _update_guest_cputune(self, guest, flavor):
         if not is_able or CONF.libvirt.virt_type not in ('lxc', 'kvm', 'qemu'):
             return
 
-        if guest.cputune is None:
-            guest.cputune = vconfig.LibvirtConfigGuestCPUTune()
-            # Setting the default cpu.shares value to be a value
-            # dependent on the number of vcpus
-        guest.cputune.shares = 1024 * guest.vcpus
-
         for name in cputuning:
             key = "quota:cpu_" + name
             if key in flavor.extra_specs:
+                if guest.cputune is None:
+                    guest.cputune = vconfig.LibvirtConfigGuestCPUTune()
                 setattr(guest.cputune, name,
                         int(flavor.extra_specs[key]))
 
diff --git a/nova/virt/libvirt/migration.py b/nova/virt/libvirt/migration.py
index 8cea9f29831..4726111a765 100644
--- a/nova/virt/libvirt/migration.py
+++ b/nova/virt/libvirt/migration.py
@@ -62,6 +62,7 @@ def get_updated_guest_xml(instance, guest, migrate_data, get_volume_config,
         xml_doc, migrate_data, instance, get_volume_config)
     xml_doc = _update_perf_events_xml(xml_doc, migrate_data)
     xml_doc = _update_memory_backing_xml(xml_doc, migrate_data)
+    xml_doc = _update_quota_xml(instance, xml_doc)
     if get_vif_config is not None:
         xml_doc = _update_vif_xml(xml_doc, migrate_data, get_vif_config)
     if 'dst_numa_info' in migrate_data:
@@ -71,6 +72,18 @@ def get_updated_guest_xml(instance, guest, migrate_data, get_volume_config,
     return etree.tostring(xml_doc, encoding='unicode')
 
 
+def _update_quota_xml(instance, xml_doc):
+    flavor_shares = instance.flavor.extra_specs.get('quota:cpu_shares')
+    cputune = xml_doc.find('./cputune')
+    shares = xml_doc.find('./cputune/shares')
+    if shares is not None and not flavor_shares:
+        cputune.remove(shares)
+    # Remove the cputune element entirely if it has no children left.
+    if cputune is not None and not list(cputune):
+        xml_doc.remove(cputune)
+    return xml_doc
+
+
 def _update_device_resources_xml(xml_doc, new_resources):
     vpmems = []
     for resource in new_resources:
diff --git a/releasenotes/notes/remove-default-cputune-shares-values-85d5ddf4b8e24eaa.yaml b/releasenotes/notes/remove-default-cputune-shares-values-85d5ddf4b8e24eaa.yaml
new file mode 100644
index 00000000000..9dd0987bb8c
--- /dev/null
+++ b/releasenotes/notes/remove-default-cputune-shares-values-85d5ddf4b8e24eaa.yaml
@@ -0,0 +1,15 @@
+upgrade:
+  - |
+    In the libvirt driver, the default value of the ``<cputune><shares>``
+    element has been removed, and is now left to libvirt to decide. This is
+    because allowed values are platform dependent, and the previous code was
+    not guaranteed to be supported on all platforms. If any of your flavors are
+    using the quota:cpu_shares extra spec, you may need to resize to a
+    supported value before upgrading.
+
+    To facilitate the transition to no Nova default for ``<cputune><shares>``,
+    its value will be removed during live migration unless a value is set in
+    the ``quota:cpu_shares`` extra spec. This can cause temporary CPU
+    starvation for the live migrated instance if other instances on the
+    destination host still have the old default ``<cputune><shares>`` value. To
+    fix this, hard reboot, cold migrate, or live migrate the other instances.

From a1cd1ff3ec7b9d786a5a49c04389624881340c18 Mon Sep 17 00:00:00 2001
From: Elod Illes <elod.illes@est.tech>
Date: Tue, 21 Nov 2023 17:33:10 +0100
Subject: [PATCH 64/93] [stable-only] Remove nova-emulation from check pipeline
 of Yoga

Yoga will transition soon to Unmaintained and nova-emulation job is
almost constantly failing on stable/yoga now, so this patch moves it
to experimental pipeline.
Note that nova-emulation is only running on stable/yoga, and we don't
run it on newer branches. Only periodic-weekly pipeline is targeted for
this job.

Change-Id: Iff6ffab833754cf39c47483a48a5df3b749fcffd
---
 .zuul.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.zuul.yaml b/.zuul.yaml
index f79f9630118..58235618941 100644
--- a/.zuul.yaml
+++ b/.zuul.yaml
@@ -661,7 +661,6 @@
         - nova-multi-cell
         - nova-next
         - nova-ovs-hybrid-plug
-        - nova-emulation
         - nova-tox-validate-backport:
             voting: false
         - nova-tox-functional-centos8-py36
@@ -742,6 +741,7 @@
         - devstack-plugin-nfs-tempest-full:
             irrelevant-files: *nova-base-irrelevant-files
         - nova-osprofiler-redis
+        - nova-emulation
         - tempest-pg-full:
             irrelevant-files: *nova-base-irrelevant-files
         - nova-tempest-full-oslo.versionedobjects:

From bf8b69960599cf9af02ce11d1963dc920d915230 Mon Sep 17 00:00:00 2001
From: OpenStack Release Bot <infra-root@openstack.org>
Date: Tue, 6 Feb 2024 14:27:02 +0000
Subject: [PATCH 65/93] Update .gitreview for unmaintained/yoga

Depends-On: https://review.opendev.org/c/openstack/openstack-zuul-jobs/+/908190

Change-Id: If244422b6e3571c14ee086952809586f68acab5c
---
 .gitreview | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitreview b/.gitreview
index b111c5d6a83..23383d4f695 100644
--- a/.gitreview
+++ b/.gitreview
@@ -2,4 +2,4 @@
 host=review.opendev.org
 port=29418
 project=openstack/nova.git
-defaultbranch=stable/yoga
+defaultbranch=unmaintained/yoga

From 4d2b894e77dd779cbab47f95625604fa329c1aa9 Mon Sep 17 00:00:00 2001
From: Elod Illes <elod.illes@est.tech>
Date: Mon, 8 Jul 2024 09:00:39 +0200
Subject: [PATCH 66/93] [CI] Use yoga-last from tempest for ceph multistore job

nova-ceph-multistore job is broken on yoga branch. This patch sets to
use yoga-last tagged version of cinder-tempest-plugin.

Also: drop nova-tox-functional-centos8-py36 job as centos8 is end of
life, hence the job cannot be fixed.

Change-Id: Ic81d5cd4eb3b702dfe14a3c492953be0b03baab8
---
 .zuul.yaml | 38 +++++++++-----------------------------
 1 file changed, 9 insertions(+), 29 deletions(-)

diff --git a/.zuul.yaml b/.zuul.yaml
index 58235618941..846e3e23bac 100644
--- a/.zuul.yaml
+++ b/.zuul.yaml
@@ -2,13 +2,12 @@
 # for job naming conventions.
 
 - job:
-    name: nova-tox-functional-centos8-py36
-    parent: openstack-tox-functional-py36
-    nodeset: devstack-single-node-centos-8-stream
+    name: nova-tox-functional-py38
+    parent: openstack-tox-functional-py38
     description: |
       Run tox-based functional tests for the OpenStack Nova project
-      under cPython version 3.6 with Nova specific irrelevant-files list.
-      Uses tox with the ``functional-py36`` environment.
+      under cPython version 3.8 with Nova specific irrelevant-files list.
+      Uses tox with the ``functional-py38`` environment.
 
       This job also provides a parent for other projects to run the nova
       functional tests on their own changes.
@@ -22,28 +21,6 @@
       - ^doc/(source|test)/.*$
       - ^nova/locale/.*$
       - ^releasenotes/.*$
-    vars:
-      # explicitly stating the work dir makes this job reusable by other
-      # projects
-      zuul_work_dir: src/opendev.org/openstack/nova
-      bindep_profile: test py36
-    timeout: 3600
-
-- job:
-    name: nova-tox-functional-py38
-    parent: openstack-tox-functional-py38
-    description: |
-      Run tox-based functional tests for the OpenStack Nova project
-      under cPython version 3.8 with Nova specific irrelevant-files list.
-      Uses tox with the ``functional-py38`` environment.
-
-      This job also provides a parent for other projects to run the nova
-      functional tests on their own changes.
-    required-projects:
-      # including nova here makes this job reusable by other projects
-      - openstack/nova
-      - openstack/placement
-    irrelevant-files: *functional-irrelevant-files
     vars:
       # explicitly stating the work dir makes this job reusable by other
       # projects
@@ -588,9 +565,14 @@
     irrelevant-files: *nova-base-irrelevant-files
     required-projects:
       - openstack/nova
+      - name: openstack/cinder-tempest-plugin
+        override-checkout: yoga-last
     pre-run:
       - playbooks/ceph/glance-copy-policy.yaml
     vars:
+      # NOTE(elod.illes): this job is breaking with the following test case on
+      # unmaintained/yoga, so let's just exclude it to unblock the gate
+      tempest_exclude_regex: test_nova_image_snapshot_dependency
       # NOTE(danms): These tests create an empty non-raw image, which nova
       # will refuse because we set never_download_image_if_on_rbd in this job.
       # Just skip these tests for this case.
@@ -663,7 +645,6 @@
         - nova-ovs-hybrid-plug
         - nova-tox-validate-backport:
             voting: false
-        - nova-tox-functional-centos8-py36
         - nova-tox-functional-py38
         - nova-tox-functional-py39
         - tempest-integrated-compute:
@@ -708,7 +689,6 @@
       jobs:
         - nova-live-migration
         - nova-live-migration-ceph
-        - nova-tox-functional-centos8-py36
         - nova-tox-functional-py38
         - nova-tox-functional-py39
         - nova-multi-cell

From f844c8fe3ccbf5b477c007ac1d2e290c9d74f2e6 Mon Sep 17 00:00:00 2001
From: Dan Smith <dansmith@redhat.com>
Date: Mon, 1 Apr 2024 07:32:11 -0700
Subject: [PATCH 67/93] Reject qcow files with data-file attributes

Change-Id: Ic3fa16f55acc38cf6c1a4ac1dce4487225e66d04
Closes-Bug: #2059809
(cherry picked from commit ec9c55cbbc91d1f31e42ced289a7c82cf79dc2a2)
(cherry picked from commit 58d933eafb3f7164419000700a305c8f75d5cb6e)
(cherry picked from commit 736328f78fb88b6d567b94b50cd14b3ebef08a5e)
(cherry picked from commit af4d819c606d6662d0b086365a51f5220b596e48)
(cherry picked from commit d69d441cf5d82f69d8ed7d555a6af73624866400)
---
 nova/tests/unit/virt/libvirt/test_utils.py |  1 +
 nova/tests/unit/virt/test_images.py        | 31 ++++++++++++++++++++++
 nova/virt/images.py                        |  9 +++++++
 3 files changed, 41 insertions(+)

diff --git a/nova/tests/unit/virt/libvirt/test_utils.py b/nova/tests/unit/virt/libvirt/test_utils.py
index 4e73c662c57..797480bdc00 100644
--- a/nova/tests/unit/virt/libvirt/test_utils.py
+++ b/nova/tests/unit/virt/libvirt/test_utils.py
@@ -354,6 +354,7 @@ class FakeImgInfo(object):
             FakeImgInfo.file_format = file_format
             FakeImgInfo.backing_file = backing_file
             FakeImgInfo.virtual_size = 1
+            FakeImgInfo.format_specific = None if file_format == 'raw' else {}
 
             return FakeImgInfo()
 
diff --git a/nova/tests/unit/virt/test_images.py b/nova/tests/unit/virt/test_images.py
index 563330b5414..e0b08bb9f59 100644
--- a/nova/tests/unit/virt/test_images.py
+++ b/nova/tests/unit/virt/test_images.py
@@ -112,6 +112,37 @@ def test_fetch_to_raw_errors(self, convert_image, qemu_img_info, fetch):
                                images.fetch_to_raw,
                                None, 'href123', '/no/path')
 
+    @mock.patch.object(images, 'convert_image',
+                       side_effect=exception.ImageUnacceptable)
+    @mock.patch.object(images, 'qemu_img_info')
+    @mock.patch.object(images, 'fetch')
+    def test_fetch_to_raw_data_file(self, convert_image, qemu_img_info_fn,
+                                    fetch):
+        # NOTE(danms): the above test needs the following line as well, as it
+        # is broken without it.
+        qemu_img_info = qemu_img_info_fn.return_value
+        qemu_img_info.backing_file = None
+        qemu_img_info.file_format = 'qcow2'
+        qemu_img_info.virtual_size = 20
+        qemu_img_info.format_specific = {'data': {'data-file': 'somefile'}}
+        self.assertRaisesRegex(exception.ImageUnacceptable,
+                               'Image href123 is unacceptable.*somefile',
+                               images.fetch_to_raw,
+                               None, 'href123', '/no/path')
+
+    @mock.patch('os.rename')
+    @mock.patch.object(images, 'qemu_img_info')
+    @mock.patch.object(images, 'fetch')
+    def test_fetch_to_raw_from_raw(self, fetch, qemu_img_info_fn, mock_rename):
+        # Make sure we support a case where we fetch an already-raw image and
+        # qemu-img returns None for "format_specific".
+        qemu_img_info = qemu_img_info_fn.return_value
+        qemu_img_info.file_format = 'raw'
+        qemu_img_info.backing_file = None
+        qemu_img_info.format_specific = None
+        images.fetch_to_raw(None, 'href123', '/no/path')
+        mock_rename.assert_called_once_with('/no/path.part', '/no/path')
+
     @mock.patch.object(compute_utils, 'disk_ops_semaphore')
     @mock.patch('nova.privsep.utils.supports_direct_io', return_value=True)
     @mock.patch('oslo_concurrency.processutils.execute')
diff --git a/nova/virt/images.py b/nova/virt/images.py
index f13c8722909..5f80a1d0758 100644
--- a/nova/virt/images.py
+++ b/nova/virt/images.py
@@ -157,6 +157,15 @@ def fetch_to_raw(context, image_href, path, trusted_certs=None):
                 reason=(_("fmt=%(fmt)s backed by: %(backing_file)s") %
                         {'fmt': fmt, 'backing_file': backing_file}))
 
+        try:
+            data_file = data.format_specific['data']['data-file']
+        except (KeyError, TypeError, AttributeError):
+            data_file = None
+        if data_file is not None:
+            raise exception.ImageUnacceptable(image_id=image_href,
+                reason=(_("fmt=%(fmt)s has data-file: %(data_file)s") %
+                        {'fmt': fmt, 'data_file': data_file}))
+
         if fmt == 'vmdk':
             check_vmdk_image(image_href, data)
 

From b8a3d56f2e27531cc735606fbe92b648a51e8d62 Mon Sep 17 00:00:00 2001
From: Dan Smith <dansmith@redhat.com>
Date: Wed, 17 Apr 2024 07:06:13 -0700
Subject: [PATCH 68/93] Check images with format_inspector for safety

It has been asserted that we should not be calling qemu-img info
on untrusted files. That means we need to know if they have a
backing_file, data_file or other unsafe configuration *before* we use
qemu-img to probe or convert them.

This grafts glance's format_inspector module into nova/images so we
can use it to check the file early for safety. The expectation is that
this will be moved to oslo.utils (or something) later and thus we will
just delete the file from nova and change our import when that happens.

NOTE: This includes whitespace changes from the glance version of
format_inspector.py because of autopep8 demands.

Conflicts:
  nova/conf/workarounds.py

NOTE(elod.illes): conflict is due to the following patch that is only
present in zed: Iab92124b5776a799c7f90d07281d28fcf191c8fe

Change-Id: Iaefbe41b4c4bf0cf95d8f621653fdf65062aaa59
Closes-Bug: #2059809
(cherry picked from commit 9cdce715945619fc851ab3f43c97fab4bae4e35a)
(cherry picked from commit f07fa55fd86726eeafcd4c0c687bc49dd4df9f4c)
(cherry picked from commit 0acf5ee7b5dfb6ff0f9a9745f5ad2a0ed2bf65bf)
(cherry picked from commit 67e5376dd64407f5aaf1ea5f8c896e356064a2c9)
(cherry picked from commit da352edceb74dbd715268f94516503042b48cc90)
---
 nova/conf/workarounds.py                   |  10 +
 nova/image/format_inspector.py             | 889 +++++++++++++++++++++
 nova/tests/unit/virt/libvirt/test_utils.py |  48 +-
 nova/tests/unit/virt/test_images.py        | 136 +++-
 nova/virt/images.py                        |  47 +-
 5 files changed, 1121 insertions(+), 9 deletions(-)
 create mode 100644 nova/image/format_inspector.py

diff --git a/nova/conf/workarounds.py b/nova/conf/workarounds.py
index 2ec53282cdb..55d810cce89 100644
--- a/nova/conf/workarounds.py
+++ b/nova/conf/workarounds.py
@@ -416,6 +416,16 @@
         help="""
 When this is enabled, it will skip version-checking of hypervisors
 during live migration.
+"""),
+    cfg.BoolOpt(
+        'disable_deep_image_inspection',
+        default=False,
+        help="""
+This disables the additional deep image inspection that the compute node does
+when downloading from glance. This includes backing-file, data-file, and
+known-features detection *before* passing the image to qemu-img. Generally,
+this inspection should be enabled for maximum safety, but this workaround
+option allows disabling it if there is a compatibility concern.
 """),
 ]
 
diff --git a/nova/image/format_inspector.py b/nova/image/format_inspector.py
new file mode 100644
index 00000000000..268c98b99cb
--- /dev/null
+++ b/nova/image/format_inspector.py
@@ -0,0 +1,889 @@
+# Copyright 2020 Red Hat, Inc
+# All Rights Reserved.
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+
+"""
+This is a python implementation of virtual disk format inspection routines
+gathered from various public specification documents, as well as qemu disk
+driver code. It attempts to store and parse the minimum amount of data
+required, and in a streaming-friendly manner to collect metadata about
+complex-format images.
+"""
+
+import struct
+
+from oslo_log import log as logging
+
+LOG = logging.getLogger(__name__)
+
+
+def chunked_reader(fileobj, chunk_size=512):
+    while True:
+        chunk = fileobj.read(chunk_size)
+        if not chunk:
+            break
+        yield chunk
+
+
+class CaptureRegion(object):
+    """Represents a region of a file we want to capture.
+
+    A region of a file we want to capture requires a byte offset into
+    the file and a length. This is expected to be used by a data
+    processing loop, calling capture() with the most recently-read
+    chunk. This class handles the task of grabbing the desired region
+    of data across potentially multiple fractional and unaligned reads.
+
+    :param offset: Byte offset into the file starting the region
+    :param length: The length of the region
+    """
+
+    def __init__(self, offset, length):
+        self.offset = offset
+        self.length = length
+        self.data = b''
+
+    @property
+    def complete(self):
+        """Returns True when we have captured the desired data."""
+        return self.length == len(self.data)
+
+    def capture(self, chunk, current_position):
+        """Process a chunk of data.
+
+        This should be called for each chunk in the read loop, at least
+        until complete returns True.
+
+        :param chunk: A chunk of bytes in the file
+        :param current_position: The position of the file processed by the
+                                 read loop so far. Note that this will be
+                                 the position in the file *after* the chunk
+                                 being presented.
+        """
+        read_start = current_position - len(chunk)
+        if (read_start <= self.offset <= current_position or
+                self.offset <= read_start <= (self.offset + self.length)):
+            if read_start < self.offset:
+                lead_gap = self.offset - read_start
+            else:
+                lead_gap = 0
+            self.data += chunk[lead_gap:]
+            self.data = self.data[:self.length]
+
+
+class ImageFormatError(Exception):
+    """An unrecoverable image format error that aborts the process."""
+    pass
+
+
+class TraceDisabled(object):
+    """A logger-like thing that swallows tracing when we do not want it."""
+
+    def debug(self, *a, **k):
+        pass
+
+    info = debug
+    warning = debug
+    error = debug
+
+
+class FileInspector(object):
+    """A stream-based disk image inspector.
+
+    This base class works on raw images and is subclassed for more
+    complex types. It is to be presented with the file to be examined
+    one chunk at a time, during read processing and will only store
+    as much data as necessary to determine required attributes of
+    the file.
+    """
+
+    def __init__(self, tracing=False):
+        self._total_count = 0
+
+        # NOTE(danms): The logging in here is extremely verbose for a reason,
+        # but should never really be enabled at that level at runtime. To
+        # retain all that work and assist in future debug, we have a separate
+        # debug flag that can be passed from a manual tool to turn it on.
+        if tracing:
+            self._log = logging.getLogger(str(self))
+        else:
+            self._log = TraceDisabled()
+        self._capture_regions = {}
+
+    def _capture(self, chunk, only=None):
+        for name, region in self._capture_regions.items():
+            if only and name not in only:
+                continue
+            if not region.complete:
+                region.capture(chunk, self._total_count)
+
+    def eat_chunk(self, chunk):
+        """Call this to present chunks of the file to the inspector."""
+        pre_regions = set(self._capture_regions.keys())
+
+        # Increment our position-in-file counter
+        self._total_count += len(chunk)
+
+        # Run through the regions we know of to see if they want this
+        # data
+        self._capture(chunk)
+
+        # Let the format do some post-read processing of the stream
+        self.post_process()
+
+        # Check to see if the post-read processing added new regions
+        # which may require the current chunk.
+        new_regions = set(self._capture_regions.keys()) - pre_regions
+        if new_regions:
+            self._capture(chunk, only=new_regions)
+
+    def post_process(self):
+        """Post-read hook to process what has been read so far.
+
+        This will be called after each chunk is read and potentially captured
+        by the defined regions. If any regions are defined by this call,
+        those regions will be presented with the current chunk in case it
+        is within one of the new regions.
+        """
+        pass
+
+    def region(self, name):
+        """Get a CaptureRegion by name."""
+        return self._capture_regions[name]
+
+    def new_region(self, name, region):
+        """Add a new CaptureRegion by name."""
+        if self.has_region(name):
+            # This is a bug, we tried to add the same region twice
+            raise ImageFormatError('Inspector re-added region %s' % name)
+        self._capture_regions[name] = region
+
+    def has_region(self, name):
+        """Returns True if named region has been defined."""
+        return name in self._capture_regions
+
+    @property
+    def format_match(self):
+        """Returns True if the file appears to be the expected format."""
+        return True
+
+    @property
+    def virtual_size(self):
+        """Returns the virtual size of the disk image, or zero if unknown."""
+        return self._total_count
+
+    @property
+    def actual_size(self):
+        """Returns the total size of the file, usually smaller than
+        virtual_size. NOTE: this will only be accurate if the entire
+        file is read and processed.
+        """
+        return self._total_count
+
+    @property
+    def complete(self):
+        """Returns True if we have all the information needed."""
+        return all(r.complete for r in self._capture_regions.values())
+
+    def __str__(self):
+        """The string name of this file format."""
+        return 'raw'
+
+    @property
+    def context_info(self):
+        """Return info on amount of data held in memory for auditing.
+
+        This is a dict of region:sizeinbytes items that the inspector
+        uses to examine the file.
+        """
+        return {name: len(region.data) for name, region in
+                self._capture_regions.items()}
+
+    @classmethod
+    def from_file(cls, filename):
+        """Read as much of a file as necessary to complete inspection.
+
+        NOTE: Because we only read as much of the file as necessary, the
+        actual_size property will not reflect the size of the file, but the
+        amount of data we read before we satisfied the inspector.
+
+        Raises ImageFormatError if we cannot parse the file.
+        """
+        inspector = cls()
+        with open(filename, 'rb') as f:
+            for chunk in chunked_reader(f):
+                inspector.eat_chunk(chunk)
+                if inspector.complete:
+                    # No need to eat any more data
+                    break
+        if not inspector.complete or not inspector.format_match:
+            raise ImageFormatError('File is not in requested format')
+        return inspector
+
+    def safety_check(self):
+        """Perform some checks to determine if this file is safe.
+
+        Returns True if safe, False otherwise. It may raise ImageFormatError
+        if safety cannot be guaranteed because of parsing or other errors.
+        """
+        return True
+
+
+# The qcow2 format consists of a big-endian 72-byte header, of which
+# only a small portion has information we care about:
+#
+# Dec   Hex   Name
+#   0  0x00   Magic 4-bytes 'QFI\xfb'
+#   4  0x04   Version (uint32_t, should always be 2 for modern files)
+#  . . .
+#   8  0x08   Backing file offset (uint64_t)
+#  24  0x18   Size in bytes (uint64_t)
+#  . . .
+#  72  0x48   Incompatible features bitfield (8 bytes)
+#
+# https://gitlab.com/qemu-project/qemu/-/blob/master/docs/interop/qcow2.txt
+class QcowInspector(FileInspector):
+    """QEMU QCOW2 Format
+
+    This should only require about 32 bytes of the beginning of the file
+    to determine the virtual size, and 104 bytes to perform the safety check.
+    """
+
+    BF_OFFSET = 0x08
+    BF_OFFSET_LEN = 8
+    I_FEATURES = 0x48
+    I_FEATURES_LEN = 8
+    I_FEATURES_DATAFILE_BIT = 3
+    I_FEATURES_MAX_BIT = 4
+
+    def __init__(self, *a, **k):
+        super(QcowInspector, self).__init__(*a, **k)
+        self.new_region('header', CaptureRegion(0, 512))
+
+    def _qcow_header_data(self):
+        magic, version, bf_offset, bf_sz, cluster_bits, size = (
+            struct.unpack('>4sIQIIQ', self.region('header').data[:32]))
+        return magic, size
+
+    @property
+    def has_header(self):
+        return self.region('header').complete
+
+    @property
+    def virtual_size(self):
+        if not self.region('header').complete:
+            return 0
+        if not self.format_match:
+            return 0
+        magic, size = self._qcow_header_data()
+        return size
+
+    @property
+    def format_match(self):
+        if not self.region('header').complete:
+            return False
+        magic, size = self._qcow_header_data()
+        return magic == b'QFI\xFB'
+
+    @property
+    def has_backing_file(self):
+        if not self.region('header').complete:
+            return None
+        if not self.format_match:
+            return False
+        bf_offset_bytes = self.region('header').data[
+            self.BF_OFFSET:self.BF_OFFSET + self.BF_OFFSET_LEN]
+        # nonzero means "has a backing file"
+        bf_offset, = struct.unpack('>Q', bf_offset_bytes)
+        return bf_offset != 0
+
+    @property
+    def has_unknown_features(self):
+        """True if incompatible-feature bits we do not know about are set."""
+        if not self.region('header').complete:
+            return None
+        if not self.format_match:
+            return False
+        i_features = self.region('header').data[
+            self.I_FEATURES:self.I_FEATURES + self.I_FEATURES_LEN]
+        # This is the maximum byte number we should expect any bits to be set
+        max_byte = self.I_FEATURES_MAX_BIT // 8
+
+        # The flag bytes are in big-endian ordering, so if we process
+        # them in index-order, they're reversed
+        for i, byte_num in enumerate(reversed(range(self.I_FEATURES_LEN))):
+            if byte_num == max_byte:
+                # If we're in the max-allowed byte, allow any bits less than
+                # the maximum-known feature flag bit to be set
+                allow_mask = ((1 << self.I_FEATURES_MAX_BIT) - 1)
+            elif byte_num > max_byte:
+                # If we're above the byte with the maximum known feature flag
+                # bit, then we expect all zeroes
+                allow_mask = 0x0
+            else:
+                # Any earlier byte can have any of the flag bits set
+                allow_mask = 0xFF
+
+            # i (not byte_num) indexes the big-endian data; use it for both
+            # the test and the logged value so they describe the same byte
+            unknown_bits = i_features[i] & ~allow_mask
+            if unknown_bits:
+                LOG.warning('Found unknown feature bit in byte %i: %s/%s',
+                            byte_num, bin(unknown_bits), bin(allow_mask))
+                return True
+        return False
+
+    @property
+    def has_data_file(self):
+        """True if the data-file incompatible feature bit is set."""
+        if not self.region('header').complete:
+            return None
+        if not self.format_match:
+            return False
+        i_features = self.region('header').data[
+            self.I_FEATURES:self.I_FEATURES + self.I_FEATURES_LEN]
+        # I_FEATURES_DATAFILE_BIT is 1-based; the field is big-endian, so the
+        # lowest byte of the bitfield is i_features[7] and bit three is 0x04
+        bit_index = self.I_FEATURES_DATAFILE_BIT - 1
+        byte = self.I_FEATURES_LEN - 1 - bit_index // 8
+        return bool(i_features[byte] & (1 << (bit_index % 8)))
+
+    def __str__(self):
+        return 'qcow2'
+
+    def safety_check(self):
+        return (not self.has_backing_file and
+                not self.has_data_file and
+                not self.has_unknown_features)
+
+
+# The VHD (or VPC as QEMU calls it) format consists of a big-endian
+# 512-byte "footer" at the beginning of the file with various
+# information, most of which does not matter to us:
+#
+# Dec   Hex   Name
+#   0  0x00   Magic string (8-bytes, always 'conectix')
+#  40  0x28   Disk size (uint64_t)
+#
+# https://github.com/qemu/qemu/blob/master/block/vpc.c
+class VHDInspector(FileInspector):
+    """Connectix/MS VPC VHD Format
+
+    Only the first 512 bytes of the file are captured; the magic string
+    and the virtual size are both read from that region.
+    """
+
+    def __init__(self, *a, **k):
+        super(VHDInspector, self).__init__(*a, **k)
+        self.new_region('header', CaptureRegion(0, 512))
+
+    @property
+    def format_match(self):
+        """True if the captured data begins with the 'conectix' magic."""
+        return self.region('header').data.startswith(b'conectix')
+
+    @property
+    def virtual_size(self):
+        """Disk size from header bytes 40-47, or 0 if not (yet) known."""
+        header = self.region('header')
+        if not (header.complete and self.format_match):
+            return 0
+        size, = struct.unpack('>Q', header.data[40:48])
+        return size
+
+    def __str__(self):
+        return 'vhd'
+
+
+# The VHDX format consists of a complex dynamic little-endian
+# structure with multiple regions of metadata and data, linked by
+# offsets within the file (and within regions), identified by MSFT
+# GUID strings. The header is a 320KiB structure, only a few pieces of
+# which we actually need to capture and interpret:
+#
+#     Dec    Hex  Name
+#      0 0x00000  Identity (Technically 9-bytes, padded to 64KiB, the first
+#                 8 bytes of which are 'vhdxfile')
+# 196608 0x30000  The Region table (64KiB of a 16-byte header, followed
+#                 by up to 2047 32-byte region table entry structures)
+#
+# The region table header includes two items we need to read and parse,
+# which are:
+#
+# 196608 0x30000  4-byte signature ('regi')
+# 196616 0x30008  Entry count (uint32-t)
+#
+# The region table entries follow the region table header immediately
+# and are identified by a 16-byte GUID, and provide an offset of the
+# start of that region. We care about the "metadata region", identified
+# by the METAREGION class variable. The region table entry is (offsets
+# from the beginning of the entry, since it could be in multiple places):
+#
+#      0 0x00000 16-byte MSFT GUID
+#     16 0x00010 Offset of the actual metadata region (uint64_t)
+#
+# When we find the METAREGION table entry, we need to grab that offset
+# and start examining the region structure at that point. That
+# consists of a metadata table of structures, which point to places in
+# the data in an unstructured space that follows. The header is
+# (offsets relative to the region start):
+#
+#      0 0x00000 8-byte signature ('metadata')
+#      . . .
+#     10 0x0000A 2-byte entry count (up to 2047 entries max)
+#
+# This header is followed by the specified number of metadata entry
+# structures, identified by GUID:
+#
+#      0 0x00000 16-byte MSFT GUID
+#     16 0x00010 4-byte offset (uint32_t, relative to the beginning of
+#                the metadata region)
+#
+# We need to find the "Virtual Disk Size" metadata item, identified by
+# the GUID in the VIRTUAL_DISK_SIZE class variable, grab the offset,
+# add it to the offset of the metadata region, and examine that 8-byte
+# chunk of data that follows.
+#
+# The "Virtual Disk Size" is a naked uint64_t which contains the size
+# of the virtual disk, and is our ultimate target here.
+#
+# https://docs.microsoft.com/en-us/openspecs/windows_protocols/ms-vhdx/83e061f8-f6e2-4de1-91bd-5d518a43d477
+class VHDXInspector(FileInspector):
+    """MS VHDX Format
+
+    This requires some complex parsing of the stream. The first 256KiB
+    of the image is stored to get the header and region information,
+    and then we capture the first metadata region to read those
+    records, find the location of the virtual size data and parse
+    it. This needs to store the metadata table entries up until the
+    VDS record, which may consist of up to 2047 32-byte entries at
+    max.  Finally, it must store a chunk of data at the offset of the
+    actual VDS uint64.
+
+    """
+    METAREGION = '8B7CA206-4790-4B9A-B8FE-575F050F886E'
+    VIRTUAL_DISK_SIZE = '2FA54224-CD1B-4876-B211-5DBED83BF4B8'
+    VHDX_METADATA_TABLE_MAX_SIZE = 32 * 2048  # From qemu
+
+    def __init__(self, *a, **k):
+        super(VHDXInspector, self).__init__(*a, **k)
+        self.new_region('ident', CaptureRegion(0, 32))
+        self.new_region('header', CaptureRegion(192 * 1024, 64 * 1024))
+
+    def post_process(self):
+        # After reading a chunk, we may have the following conditions:
+        #
+        # 1. We may have just completed the header region, and if so,
+        #    we need to immediately read and calculate the location of
+        #    the metadata region, as it may be starting in the same
+        #    read we just did.
+        # 2. We may have just completed the metadata region, and if so,
+        #    we need to immediately calculate the location of the
+        #    "virtual disk size" record, as it may be starting in the
+        #    same read we just did.
+        if self.region('header').complete and not self.has_region('metadata'):
+            region = self._find_meta_region()
+            if region:
+                self.new_region('metadata', region)
+        elif self.has_region('metadata') and not self.has_region('vds'):
+            region = self._find_meta_entry(self.VIRTUAL_DISK_SIZE)
+            if region:
+                self.new_region('vds', region)
+
+    @property
+    def format_match(self):
+        return self.region('ident').data.startswith(b'vhdxfile')
+
+    @staticmethod
+    def _guid(buf):
+        """Format a MSFT GUID from the 16-byte input buffer."""
+        guid_format = '<IHHBBBBBBBB'
+        return '%08X-%04X-%04X-%02X%02X-%02X%02X%02X%02X%02X%02X' % (
+            struct.unpack(guid_format, buf))
+
+    def _find_meta_region(self):
+        # The region table entries start after a 16-byte table header
+        region_entry_first = 16
+
+        # Parse the region table header to find the number of regions
+        regi, cksum, count, reserved = struct.unpack(
+            '<IIII', self.region('header').data[:16])
+        if regi != 0x69676572:
+            raise ImageFormatError('Region signature not found at %x' % (
+                self.region('header').offset))
+
+        if count >= 2048:
+            raise ImageFormatError('Region count is %i (limit 2047)' % count)
+
+        # Process the regions until we find the metadata one; grab the
+        # offset and return
+        self._log.debug('Region entry first is %x', region_entry_first)
+        self._log.debug('Region entries %i', count)
+        meta_offset = 0
+        for i in range(0, count):
+            entry_start = region_entry_first + (i * 32)
+            entry_end = entry_start + 32
+            entry = self.region('header').data[entry_start:entry_end]
+            self._log.debug('Entry offset is %x', entry_start)
+
+            # GUID is the first 16 bytes
+            guid = self._guid(entry[:16])
+            if guid == self.METAREGION:
+                # This entry is the metadata region entry
+                meta_offset, meta_len, meta_req = struct.unpack(
+                    '<QII', entry[16:])
+                self._log.debug('Meta entry %i specifies offset: %x',
+                                i, meta_offset)
+                # NOTE(danms): The meta_len in the region descriptor is the
+                # entire size of the metadata table and data. This can be
+                # very large, so we should only capture the size required
+                # for the maximum length of the table, which is one 32-byte
+                # table header, plus up to 2047 32-byte entries.
+                meta_len = 2048 * 32
+                return CaptureRegion(meta_offset, meta_len)
+
+        self._log.warning('Did not find metadata region')
+        return None
+
+    def _find_meta_entry(self, desired_guid):
+        """Return a CaptureRegion for the wanted metadata item, or None."""
+        meta_buffer = self.region('metadata').data
+        if len(meta_buffer) < 32:
+            # Not enough data yet for full header
+            return None
+
+        # Make sure we found the metadata region by checking the signature
+        sig, reserved, count = struct.unpack('<8sHH', meta_buffer[:12])
+        if sig != b'metadata':
+            raise ImageFormatError(
+                'Invalid signature for metadata region: %r' % sig)
+
+        # Validate the count before waiting on it: an oversized table can
+        # never fit in the capped metadata capture, so it must fail now
+        if count >= 2048:
+            raise ImageFormatError(
+                'Metadata item count is %i (limit 2047)' % count)
+
+        entries_size = 32 + (count * 32)
+        if len(meta_buffer) < entries_size:
+            # Not enough data yet for all metadata entries; there are only
+            # 2047 32-byte entries max (~64k), so just wait for all of them
+            return None
+
+        for i in range(0, count):
+            entry_offset = 32 + (i * 32)
+            guid = self._guid(meta_buffer[entry_offset:entry_offset + 16])
+            if guid == desired_guid:
+                # Found the item we are looking for by id.
+                # Stop our region from capturing
+                item_offset, item_length, _reserved = struct.unpack(
+                    '<III', meta_buffer[entry_offset + 16:entry_offset + 28])
+                item_length = min(item_length,
+                                  self.VHDX_METADATA_TABLE_MAX_SIZE)
+                self.region('metadata').length = len(meta_buffer)
+                self._log.debug('Found entry at offset %x', item_offset)
+                # Metadata item offset is from the beginning of the metadata
+                # region, not the file.
+                return CaptureRegion(
+                    self.region('metadata').offset + item_offset,
+                    item_length)
+
+        self._log.warning('Did not find guid %s', desired_guid)
+        return None
+
+    @property
+    def virtual_size(self):
+        # Until we have found the offset and have enough metadata buffered
+        # to read it, return "unknown"
+        if not self.has_region('vds') or not self.region('vds').complete:
+            return 0
+
+        size, = struct.unpack('<Q', self.region('vds').data)
+        return size
+
+    def __str__(self):
+        return 'vhdx'
+
+
+# The VMDK format comes in a large number of variations, but the
+# single-file 'monolithicSparse' version 4 one is mostly what we care
+# about. It contains a 512-byte little-endian header, followed by a
+# variable-length "descriptor" region of text. The header looks like:
+#
+#   Dec  Hex  Name
+#     0 0x00  4-byte magic string 'KDMV'
+#     4 0x04  Version (uint32_t)
+#     8 0x08  Flags (uint32_t, unused by us)
+#    16 0x10  Number of 512 byte sectors in the disk (uint64_t)
+#    24 0x18  Granularity (uint64_t, unused by us)
+#    32 0x20  Descriptor offset in 512-byte sectors (uint64_t)
+#    40 0x28  Descriptor size in 512-byte sectors (uint64_t)
+#
+# After we have the header, we need to find the descriptor region,
+# which starts at the sector identified in the "descriptor offset"
+# field, and is "descriptor size" 512-byte sectors long. Once we have
+# that region, we need to parse it as text, looking for the
+# createType=XXX line that specifies the mechanism by which the data
+# extents are stored in this file. We only support the
+# "monolithicSparse" and "streamOptimized" formats, so we just need to
+# confirm that this file contains one of those specifiers.
+#
+# https://www.vmware.com/app/vmdk/?src=vmdk
+class VMDKInspector(FileInspector):
+    """vmware VMDK format (monolithicSparse and streamOptimized variants only)
+
+    This needs to store the 512 byte header and the descriptor region
+    which should be just after that. The descriptor region is some
+    variable number of 512 byte sectors, but is just text defining the
+    layout of the disk.
+    """
+
+    # The beginning and max size of the descriptor is also hardcoded in Qemu
+    # at 0x200 and 1MB - 1
+    DESC_OFFSET = 0x200
+    DESC_MAX_SIZE = (1 << 20) - 1
+    GD_AT_END = 0xffffffffffffffff
+
+    def __init__(self, *a, **k):
+        super(VMDKInspector, self).__init__(*a, **k)
+        self.new_region('header', CaptureRegion(0, 512))
+
+    def post_process(self):
+        # If we have just completed the header region, we need to calculate
+        # the location and length of the descriptor, which should immediately
+        # follow and may have been partially-read in this read.
+        if not self.region('header').complete:
+            return
+
+        (sig, ver, _flags, _sectors, _grain, desc_sec, desc_num,
+         _numGTEsperGT, _rgdOffset, gdOffset) = struct.unpack(
+            '<4sIIQQQQIQQ', self.region('header').data[:64])
+
+        if sig != b'KDMV':
+            raise ImageFormatError('Signature KDMV not found: %r' % sig)
+
+        if ver not in (1, 2, 3):
+            raise ImageFormatError('Unsupported format version %i' % ver)
+
+        if gdOffset == self.GD_AT_END:
+            # This means we have a footer, which takes precedence over the
+            # header, which we cannot support since we stream.
+            raise ImageFormatError('Unsupported VMDK footer')
+
+        # Since we parse both desc_sec and desc_num (the location of the
+        # VMDK's descriptor, expressed in 512 bytes sectors) we enforce a
+        # check on the bounds to create a reasonable CaptureRegion. This
+        # is similar to how it's done in qemu.
+        desc_offset = desc_sec * 512
+        desc_size = min(desc_num * 512, self.DESC_MAX_SIZE)
+        if desc_offset != self.DESC_OFFSET:
+            raise ImageFormatError("Wrong descriptor location")
+
+        if not self.has_region('descriptor'):
+            self.new_region('descriptor', CaptureRegion(
+                desc_offset, desc_size))
+
+    @property
+    def format_match(self):
+        return self.region('header').data.startswith(b'KDMV')
+
+    @property
+    def virtual_size(self):
+        """Virtual disk size in bytes, or 0 if unknown or unsupported."""
+        if (not self.has_region('descriptor') or
+                not self.region('descriptor').complete):
+            # Not enough data yet
+            return 0
+
+        # A missing createType key or missing closing quote must not raise
+        # (or log a huge slice) on a maliciously-formatted descriptor.
+        descriptor = self.region('descriptor').data
+        key = b'createType="'
+        type_idx = descriptor.find(key)
+        vmdktype = b'formatnotfound'
+        if type_idx >= 0:
+            type_idx += len(key)
+            type_end = descriptor.find(b'"', type_idx)
+            # Only accept a terminated value of sane length
+            if 0 <= type_end - type_idx < 64:
+                vmdktype = descriptor[type_idx:type_end]
+        if vmdktype not in (b'monolithicSparse', b'streamOptimized'):
+            LOG.warning('Unsupported VMDK format %s', vmdktype)
+            return 0
+
+        # If we have the descriptor, we definitely have the header
+        _sig, _ver, _flags, sectors, _grain, _desc_sec, _desc_num = (
+            struct.unpack('<IIIQQQQ', self.region('header').data[:44]))
+
+        return sectors * 512
+
+    def safety_check(self):
+        if (not self.has_region('descriptor') or
+                not self.region('descriptor').complete):
+            return False
+
+        try:
+            # Descriptor is padded to 512 bytes
+            desc_data = self.region('descriptor').data.rstrip(b'\x00')
+            # Descriptor is actually case-insensitive ASCII text
+            desc_text = desc_data.decode('ascii').lower()
+        except UnicodeDecodeError:
+            LOG.error('VMDK descriptor failed to decode as ASCII')
+            raise ImageFormatError('Invalid VMDK descriptor data')
+
+        extent_access = ('rw', 'rdonly', 'noaccess')
+        header_fields = []
+        extents = []
+        ddb = []
+
+        # NOTE(danms): Cautiously parse the VMDK descriptor. Each line must
+        # be something we understand, otherwise we refuse it.
+        for line in [x.strip() for x in desc_text.split('\n')]:
+            if line.startswith('#') or not line:
+                # Blank or comment lines are ignored
+                continue
+            elif line.startswith('ddb'):
+                # DDB lines are allowed (but not used by us)
+                ddb.append(line)
+            elif '=' in line and ' ' not in line.split('=')[0]:
+                # Header fields are a single word followed by an '=' and some
+                # value
+                header_fields.append(line)
+            elif line.split(' ')[0] in extent_access:
+                # Extent lines start with one of the three access modes
+                extents.append(line)
+            else:
+                # Anything else results in a rejection
+                LOG.error('Unsupported line %r in VMDK descriptor', line)
+                raise ImageFormatError('Invalid VMDK descriptor data')
+
+        # Check all the extent lines for concerning content
+        for extent_line in extents:
+            if '/' in extent_line:
+                LOG.error('Extent line %r contains unsafe characters',
+                          extent_line)
+                return False
+
+        if not extents:
+            LOG.error('VMDK file specified no extents')
+            return False
+
+        return True
+
+    def __str__(self):
+        return 'vmdk'
+
+
+# The VirtualBox VDI format consists of a 512-byte little-endian
+# header, some of which we care about:
+#
+#  Dec   Hex  Name
+#   64  0x40  4-byte Magic (0xbeda107f)
+#   . . .
+#  368 0x170  Size in bytes (uint64_t)
+#
+# https://github.com/qemu/qemu/blob/master/block/vdi.c
+class VDIInspector(FileInspector):
+    """VirtualBox VDI format
+
+    Everything we need lives in the first 512 bytes of the image.
+    """
+
+    def __init__(self, *a, **k):
+        super(VDIInspector, self).__init__(*a, **k)
+        self.new_region('header', CaptureRegion(0, 512))
+
+    @property
+    def format_match(self):
+        """True once the header is captured and carries the VDI magic."""
+        header = self.region('header')
+        if not header.complete:
+            return False
+        return struct.unpack('<I', header.data[0x40:0x44])[0] == 0xbeda107f
+
+    @property
+    def virtual_size(self):
+        """Size in bytes read from the header, or 0 if not (yet) known."""
+        # Same order as before: completeness first, then the magic check
+        if not self.region('header').complete or not self.format_match:
+            return 0
+
+        header_data = self.region('header').data
+        return struct.unpack('<Q', header_data[0x170:0x178])[0]
+
+    def __str__(self):
+        return 'vdi'
+
+
+class InfoWrapper(object):
+    """Proxy a data source while feeding what is read to a format inspector.
+
+    Every chunk handed to the caller is first offered to the inspector. Once
+    the inspector raises, the failure is logged and it is never called again,
+    while data keeps flowing from the source to the consumer.
+    """
+
+    def __init__(self, source, fmt):
+        self._source = source
+        self._format = fmt
+        self._error = False
+
+    def __iter__(self):
+        return self
+
+    def _process_chunk(self, chunk):
+        """Offer a chunk to the inspector unless it has already failed."""
+        if self._error:
+            return
+        try:
+            self._format.eat_chunk(chunk)
+        except Exception as e:
+            # The inspector must never interrupt streaming of the image,
+            # so on any failure we log it and stop consulting it.
+            LOG.error('Format inspector failed, aborting: %s', e)
+            self._error = True
+
+    def __next__(self):
+        """Return the next chunk; StopIteration from the source propagates."""
+        data = next(self._source)
+        self._process_chunk(data)
+        return data
+
+    def read(self, size):
+        """Return the result of source.read(size) after inspecting it."""
+        data = self._source.read(size)
+        self._process_chunk(data)
+        return data
+
+    def close(self):
+        if hasattr(self._source, 'close'):
+            self._source.close()
+
+
+def get_inspector(format_name):
+    """Returns the inspector class registered for the given format name.
+
+    :param format_name: The name of the disk_format (raw, qcow2, etc).
+    :returns: An inspector class (not an instance) or None if unsupported.
+    """
+    formats = {
+        'raw': FileInspector,
+        'qcow2': QcowInspector,
+        'vhd': VHDInspector,
+        'vhdx': VHDXInspector,
+        'vmdk': VMDKInspector,
+        'vdi': VDIInspector,
+    }
+
+    return formats.get(format_name)
diff --git a/nova/tests/unit/virt/libvirt/test_utils.py b/nova/tests/unit/virt/libvirt/test_utils.py
index 797480bdc00..d95cc488723 100644
--- a/nova/tests/unit/virt/libvirt/test_utils.py
+++ b/nova/tests/unit/virt/libvirt/test_utils.py
@@ -29,6 +29,7 @@
 from nova.compute import utils as compute_utils
 from nova import context
 from nova import exception
+from nova.image import format_inspector
 from nova import objects
 from nova.objects import fields as obj_fields
 import nova.privsep.fs
@@ -321,11 +322,13 @@ def test_fetch_initrd_image(self, mock_images):
         mock_images.assert_called_once_with(
             _context, image_id, target, trusted_certs)
 
+    @mock.patch.object(images, 'IMAGE_API')
+    @mock.patch.object(format_inspector, 'get_inspector')
     @mock.patch.object(compute_utils, 'disk_ops_semaphore')
     @mock.patch('nova.privsep.utils.supports_direct_io', return_value=True)
     @mock.patch('nova.privsep.qemu.unprivileged_convert_image')
     def test_fetch_raw_image(self, mock_convert_image, mock_direct_io,
-                             mock_disk_op_sema):
+                             mock_disk_op_sema, mock_gi, mock_glance):
 
         def fake_rename(old, new):
             self.executes.append(('mv', old, new))
@@ -336,7 +339,7 @@ def fake_unlink(path):
         def fake_rm_on_error(path, remove=None):
             self.executes.append(('rm', '-f', path))
 
-        def fake_qemu_img_info(path):
+        def fake_qemu_img_info(path, format=None):
             class FakeImgInfo(object):
                 pass
 
@@ -365,6 +368,8 @@ class FakeImgInfo(object):
         self.stub_out('oslo_utils.fileutils.delete_if_exists',
                       fake_rm_on_error)
 
+        mock_inspector = mock_gi.return_value.from_file.return_value
+
         # Since the remove param of fileutils.remove_path_on_error()
         # is initialized at load time, we must provide a wrapper
         # that explicitly resets it to our fake delete_if_exists()
@@ -375,6 +380,9 @@ class FakeImgInfo(object):
         context = 'opaque context'
         image_id = '4'
 
+        # Make sure qcow2 gets converted to raw
+        mock_inspector.safety_check.return_value = True
+        mock_glance.get.return_value = {'disk_format': 'qcow2'}
         target = 't.qcow2'
         self.executes = []
         expected_commands = [('rm', 't.qcow2.part'),
@@ -386,14 +394,44 @@ class FakeImgInfo(object):
             't.qcow2.part', 't.qcow2.converted', 'qcow2', 'raw',
             CONF.instances_path, False)
         mock_convert_image.reset_mock()
-
+        mock_inspector.safety_check.assert_called_once_with()
+        mock_gi.assert_called_once_with('qcow2')
+
+        # Make sure raw does not get converted
+        mock_gi.reset_mock()
+        mock_inspector.safety_check.reset_mock()
+        mock_inspector.safety_check.return_value = True
+        mock_glance.get.return_value = {'disk_format': 'raw'}
         target = 't.raw'
         self.executes = []
         expected_commands = [('mv', 't.raw.part', 't.raw')]
         images.fetch_to_raw(context, image_id, target)
         self.assertEqual(self.executes, expected_commands)
         mock_convert_image.assert_not_called()
-
+        mock_inspector.safety_check.assert_called_once_with()
+        mock_gi.assert_called_once_with('raw')
+
+        # Make sure safety check failure prevents us from proceeding
+        mock_gi.reset_mock()
+        mock_inspector.safety_check.reset_mock()
+        mock_inspector.safety_check.return_value = False
+        mock_glance.get.return_value = {'disk_format': 'qcow2'}
+        target = 'backing.qcow2'
+        self.executes = []
+        expected_commands = [('rm', '-f', 'backing.qcow2.part')]
+        self.assertRaises(exception.ImageUnacceptable,
+                          images.fetch_to_raw, context, image_id, target)
+        self.assertEqual(self.executes, expected_commands)
+        mock_convert_image.assert_not_called()
+        mock_inspector.safety_check.assert_called_once_with()
+        mock_gi.assert_called_once_with('qcow2')
+
+        # Make sure a format mismatch prevents us from proceeding
+        mock_gi.reset_mock()
+        mock_inspector.safety_check.reset_mock()
+        mock_inspector.safety_check.side_effect = (
+            format_inspector.ImageFormatError)
+        mock_glance.get.return_value = {'disk_format': 'qcow2'}
         target = 'backing.qcow2'
         self.executes = []
         expected_commands = [('rm', '-f', 'backing.qcow2.part')]
@@ -401,6 +439,8 @@ class FakeImgInfo(object):
                           images.fetch_to_raw, context, image_id, target)
         self.assertEqual(self.executes, expected_commands)
         mock_convert_image.assert_not_called()
+        mock_inspector.safety_check.assert_called_once_with()
+        mock_gi.assert_called_once_with('qcow2')
 
         del self.executes
 
diff --git a/nova/tests/unit/virt/test_images.py b/nova/tests/unit/virt/test_images.py
index e0b08bb9f59..55943f7f308 100644
--- a/nova/tests/unit/virt/test_images.py
+++ b/nova/tests/unit/virt/test_images.py
@@ -21,6 +21,7 @@
 
 from nova.compute import utils as compute_utils
 from nova import exception
+from nova.image import format_inspector
 from nova import test
 from nova.virt import images
 
@@ -99,11 +100,17 @@ def test_qemu_img_info_with_disk_not_found(self, exists, mocked_execute):
         exists.assert_called_once_with(path)
         mocked_execute.assert_called_once()
 
+    @mock.patch.object(images, 'IMAGE_API')
+    @mock.patch('nova.image.format_inspector.get_inspector')
     @mock.patch.object(images, 'convert_image',
                        side_effect=exception.ImageUnacceptable)
     @mock.patch.object(images, 'qemu_img_info')
     @mock.patch.object(images, 'fetch')
-    def test_fetch_to_raw_errors(self, convert_image, qemu_img_info, fetch):
+    def test_fetch_to_raw_errors(self, convert_image, qemu_img_info, fetch,
+                                 get_inspector, glance):
+        inspector = get_inspector.return_value.from_file.return_value
+        inspector.safety_check.return_value = True
+        glance.get.return_value = {'disk_format': 'qcow2'}
         qemu_img_info.backing_file = None
         qemu_img_info.file_format = 'qcow2'
         qemu_img_info.virtual_size = 20
@@ -112,12 +119,17 @@ def test_fetch_to_raw_errors(self, convert_image, qemu_img_info, fetch):
                                images.fetch_to_raw,
                                None, 'href123', '/no/path')
 
+    @mock.patch.object(images, 'IMAGE_API')
+    @mock.patch('nova.image.format_inspector.get_inspector')
     @mock.patch.object(images, 'convert_image',
                        side_effect=exception.ImageUnacceptable)
     @mock.patch.object(images, 'qemu_img_info')
     @mock.patch.object(images, 'fetch')
     def test_fetch_to_raw_data_file(self, convert_image, qemu_img_info_fn,
-                                    fetch):
+                                    fetch, mock_gi, mock_glance):
+        mock_glance.get.return_value = {'disk_format': 'qcow2'}
+        inspector = mock_gi.return_value.from_file.return_value
+        inspector.safety_check.return_value = True
         # NOTE(danms): the above test needs the following line as well, as it
         # is broken without it.
         qemu_img_info = qemu_img_info_fn.return_value
@@ -130,12 +142,16 @@ def test_fetch_to_raw_data_file(self, convert_image, qemu_img_info_fn,
                                images.fetch_to_raw,
                                None, 'href123', '/no/path')
 
+    @mock.patch('nova.image.format_inspector.get_inspector')
+    @mock.patch.object(images, 'IMAGE_API')
     @mock.patch('os.rename')
     @mock.patch.object(images, 'qemu_img_info')
     @mock.patch.object(images, 'fetch')
-    def test_fetch_to_raw_from_raw(self, fetch, qemu_img_info_fn, mock_rename):
+    def test_fetch_to_raw_from_raw(self, fetch, qemu_img_info_fn, mock_rename,
+                                   mock_glance, mock_gi):
         # Make sure we support a case where we fetch an already-raw image and
         # qemu-img returns None for "format_specific".
+        mock_glance.get.return_value = {'disk_format': 'raw'}
         qemu_img_info = qemu_img_info_fn.return_value
         qemu_img_info.file_format = 'raw'
         qemu_img_info.backing_file = None
@@ -198,9 +214,15 @@ def test_convert_image_vmdk_allowed_list_checking(self):
                           imageutils.QemuImgInfo(jsonutils.dumps(info),
                                                  format='json'))
 
+    @mock.patch.object(images, 'IMAGE_API')
+    @mock.patch('nova.image.format_inspector.get_inspector')
     @mock.patch.object(images, 'fetch')
     @mock.patch('nova.privsep.qemu.unprivileged_qemu_img_info')
-    def test_fetch_checks_vmdk_rules(self, mock_info, mock_fetch):
+    def test_fetch_checks_vmdk_rules(self, mock_info, mock_fetch, mock_gi,
+                                     mock_glance):
+        mock_glance.get.return_value = {'disk_format': 'vmdk'}
+        inspector = mock_gi.return_value.from_file.return_value
+        inspector.safety_check.return_value = True
         info = {'format': 'vmdk',
                 'format-specific': {
                     'type': 'vmdk',
@@ -212,3 +234,109 @@ def test_fetch_checks_vmdk_rules(self, mock_info, mock_fetch):
             e = self.assertRaises(exception.ImageUnacceptable,
                                   images.fetch_to_raw, None, 'foo', 'anypath')
             self.assertIn('Invalid VMDK create-type specified', str(e))
+
+    @mock.patch.object(images, 'IMAGE_API')
+    @mock.patch('nova.image.format_inspector.get_inspector')
+    @mock.patch.object(images, 'qemu_img_info')
+    @mock.patch.object(images, 'fetch')
+    def test_fetch_to_raw_inspector(self, fetch, qemu_img_info, mock_gi,
+                                    mock_glance):
+        # Image claims to be qcow2, is qcow2, but fails safety check, so we
+        # abort before qemu-img-info
+        mock_glance.get.return_value = {'disk_format': 'qcow2'}
+        inspector = mock_gi.return_value.from_file.return_value
+        inspector.safety_check.return_value = False
+        self.assertRaises(exception.ImageUnacceptable,
+                          images.fetch_to_raw, None, 'href123', '/no.path')
+        qemu_img_info.assert_not_called()
+        mock_gi.assert_called_once_with('qcow2')
+        mock_gi.return_value.from_file.assert_called_once_with('/no.path.part')
+        inspector.safety_check.assert_called_once_with()
+        mock_glance.get.assert_called_once_with(None, 'href123')
+
+        # Image claims to be qcow2, is qcow2, passes safety check, so we make
+        # it all the way to qemu-img-info
+        inspector.safety_check.return_value = True
+        qemu_img_info.side_effect = test.TestingException
+        self.assertRaises(test.TestingException,
+                          images.fetch_to_raw, None, 'href123', '/no.path')
+
+        # Image claims to be qcow2 in glance, but the image is something else,
+        # so we abort before qemu-img-info
+        qemu_img_info.reset_mock()
+        mock_gi.reset_mock()
+        inspector.safety_check.reset_mock()
+        mock_gi.return_value.from_file.side_effect = (
+            format_inspector.ImageFormatError)
+        self.assertRaises(exception.ImageUnacceptable,
+                          images.fetch_to_raw, None, 'href123', '/no.path')
+        mock_gi.assert_called_once_with('qcow2')
+        inspector.safety_check.assert_not_called()
+        qemu_img_info.assert_not_called()
+
+    @mock.patch.object(images, 'IMAGE_API')
+    @mock.patch('nova.image.format_inspector.get_inspector')
+    @mock.patch.object(images, 'qemu_img_info')
+    @mock.patch.object(images, 'fetch')
+    def test_fetch_to_raw_inspector_disabled(self, fetch, qemu_img_info,
+                                             mock_gi, mock_glance):
+        self.flags(disable_deep_image_inspection=True,
+                   group='workarounds')
+        qemu_img_info.side_effect = test.TestingException
+        self.assertRaises(test.TestingException,
+                          images.fetch_to_raw, None, 'href123', '/no.path')
+        # If deep inspection is disabled, we should never call the inspector
+        mock_gi.assert_not_called()
+        # ... and we let qemu-img detect the format itself.
+        qemu_img_info.assert_called_once_with('/no.path.part',
+                                              format=None)
+        mock_glance.get.assert_not_called()
+
+    @mock.patch.object(images, 'IMAGE_API')
+    @mock.patch.object(images, 'qemu_img_info')
+    def test_fetch_inspect_ami(self, imginfo, glance):
+        glance.get.return_value = {'disk_format': 'ami'}
+        self.assertRaises(exception.ImageUnacceptable,
+                          images.fetch_to_raw, None, 'href123', '/no.path')
+        # Make sure 'ami' was translated into 'raw' before we call qemu-img
+        imginfo.assert_called_once_with('/no.path.part', format='raw')
+
+    @mock.patch.object(images, 'IMAGE_API')
+    @mock.patch.object(images, 'qemu_img_info')
+    def test_fetch_inspect_aki(self, imginfo, glance):
+        glance.get.return_value = {'disk_format': 'aki'}
+        self.assertRaises(exception.ImageUnacceptable,
+                          images.fetch_to_raw, None, 'href123', '/no.path')
+        # Make sure 'aki' was translated into 'raw' before we call qemu-img
+        imginfo.assert_called_once_with('/no.path.part', format='raw')
+
+    @mock.patch.object(images, 'IMAGE_API')
+    @mock.patch.object(images, 'qemu_img_info')
+    def test_fetch_inspect_ari(self, imginfo, glance):
+        glance.get.return_value = {'disk_format': 'ari'}
+        self.assertRaises(exception.ImageUnacceptable,
+                          images.fetch_to_raw, None, 'href123', '/no.path')
+        # Make sure 'ari' was translated into 'raw' before we call qemu-img
+        imginfo.assert_called_once_with('/no.path.part', format='raw')
+
+    @mock.patch.object(images, 'IMAGE_API')
+    @mock.patch.object(images, 'qemu_img_info')
+    def test_fetch_inspect_unknown_format(self, imginfo, glance):
+        glance.get.return_value = {'disk_format': 'commodore-64-disk'}
+        self.assertRaises(exception.ImageUnacceptable,
+                          images.fetch_to_raw, None, 'href123', '/no.path')
+        # Unsupported formats do not make it past deep inspection
+        imginfo.assert_not_called()
+
+    @mock.patch.object(images, 'IMAGE_API')
+    @mock.patch.object(images, 'qemu_img_info')
+    @mock.patch('nova.image.format_inspector.get_inspector')
+    def test_fetch_inspect_disagrees_qemu(self, mock_gi, imginfo, glance):
+        glance.get.return_value = {'disk_format': 'qcow2'}
+        # Glance and inspector think it is a qcow2 file, but qemu-img does not
+        # agree. It was forced to interpret as a qcow2, but returned no
+        # format information as a result.
+        imginfo.return_value.data_file = None
+        self.assertRaises(exception.ImageUnacceptable,
+                          images.fetch_to_raw, None, 'href123', '/no.path')
+        imginfo.assert_called_once_with('/no.path.part', format='qcow2')
diff --git a/nova/virt/images.py b/nova/virt/images.py
index 5f80a1d0758..5ec0dc0b6ba 100644
--- a/nova/virt/images.py
+++ b/nova/virt/images.py
@@ -30,6 +30,7 @@
 import nova.conf
 from nova import exception
 from nova.i18n import _
+from nova.image import format_inspector
 from nova.image import glance
 import nova.privsep.qemu
 
@@ -138,13 +139,57 @@ def check_vmdk_image(image_id, data):
         raise exception.ImageUnacceptable(image_id=image_id, reason=msg)
 
 
+def do_image_deep_inspection(img, image_href, path):
+    disk_format = img['disk_format']
+    try:
+        # NOTE(danms): Use our own cautious inspector module to make sure
+        # the image file passes safety checks.
+        # See https://bugs.launchpad.net/nova/+bug/2059809 for details.
+        inspector_cls = format_inspector.get_inspector(disk_format)
+        if not inspector_cls.from_file(path).safety_check():
+            raise exception.ImageUnacceptable(
+                image_id=image_href,
+                reason=(_('Image does not pass safety check')))
+    except format_inspector.ImageFormatError:
+        # If the inspector we chose based on the image's metadata does not
+        # think the image is the proper format, we refuse to use it.
+        raise exception.ImageUnacceptable(
+            image_id=image_href,
+            reason=_('Image content does not match disk_format'))
+    except AttributeError:
+        # No inspector was found
+        LOG.warning('Unable to perform deep image inspection on type %r',
+                    img['disk_format'])
+        if disk_format in ('ami', 'aki', 'ari'):
+            # A lot of things can be in a UEC, although it is typically a raw
+            # filesystem. We really have nothing we can do other than treat it
+            # like a 'raw', which is what qemu-img will detect a filesystem as
+            # anyway. If someone puts a qcow2 inside, we should fail because
+            # we won't do our inspection.
+            disk_format = 'raw'
+        else:
+            raise exception.ImageUnacceptable(
+                image_id=image_href,
+                reason=_('Image not in a supported format'))
+    return disk_format
+
+
 def fetch_to_raw(context, image_href, path, trusted_certs=None):
     path_tmp = "%s.part" % path
     fetch(context, image_href, path_tmp, trusted_certs)
 
     with fileutils.remove_path_on_error(path_tmp):
-        data = qemu_img_info(path_tmp)
+        if not CONF.workarounds.disable_deep_image_inspection:
+            # If we're doing deep inspection, we take the determined format
+            # from it.
+            img = IMAGE_API.get(context, image_href)
+            force_format = do_image_deep_inspection(img, image_href, path_tmp)
+        else:
+            force_format = None
 
+        # Only run qemu-img after we have done deep inspection (if enabled).
+        # If it was not enabled, we will let it detect the format.
+        data = qemu_img_info(path_tmp, format=force_format)
         fmt = data.file_format
         if fmt is None:
             raise exception.ImageUnacceptable(

From e7bdaac1b6b14530a9eefb718f32c37f72096c2a Mon Sep 17 00:00:00 2001
From: Dan Smith <dansmith@redhat.com>
Date: Mon, 24 Jun 2024 09:09:36 -0700
Subject: [PATCH 69/93] Additional qemu safety checking on base images

There is an additional way we can be fooled into using a qcow2 file
with a data-file, which is uploading it as raw to glance and then
booting an instance from it. Because when we go to create the
ephemeral disk from a cached base image, we've lost the information
about the original source's format, we probe the image's file type
without a strict format specified. If a qcow2 file is listed in
glance as a raw, we won't notice it until it is too late.

This brings over another piece of code (proposed against glance's
format inspector) which provides a safe format detection routine. This
patch uses that routine to detect the format of the base image, and to
run a safety check on it, each time we use it to create an ephemeral
disk image.

This also detects QED files and always marks them as unsafe as we do
not support that format at all. Since we could be fooled into
downloading one and passing it to qemu-img if we don't recognize it,
we need to detect and reject it as unsafe.

Conflicts:
  nova/tests/unit/virt/libvirt/test_utils.py
  nova/virt/libvirt/utils.py

NOTE(elod.illes): conflicts are due to the patch that consolidates image
creation functions (I111cfc8a5eae27b15c6312957255fcf973038ddf) only being
introduced in zed.

Change-Id: I4881c8cbceb30c1ff2d2b859c554e0d02043f1f5
(cherry picked from commit b1b88bf001757546fbbea959f4b73cb344407dfb)
(cherry picked from commit 8a0d5f2afaf40c4554419a0b2488ce092eda7a1a)
(cherry picked from commit 0269234dc42fe2c320dc4696123cf5132642f9b7)
(cherry picked from commit 9e10ac25490e7b5353cb01e768d22eb5a1f92825)
(cherry picked from commit 303c2c9644c45d2f04461b6e9e2ef8a3273d3be8)
---
 nova/image/format_inspector.py                | 70 ++++++++++++++++---
 nova/tests/unit/virt/libvirt/test_driver.py   |  7 +-
 .../unit/virt/libvirt/test_imagebackend.py    | 45 ++++++++++--
 nova/tests/unit/virt/libvirt/test_utils.py    | 47 ++++++++++++-
 nova/virt/libvirt/imagebackend.py             | 15 ++++
 nova/virt/libvirt/utils.py                    | 29 ++++++++
 6 files changed, 195 insertions(+), 18 deletions(-)

diff --git a/nova/image/format_inspector.py b/nova/image/format_inspector.py
index 268c98b99cb..8e57d7ed2c4 100644
--- a/nova/image/format_inspector.py
+++ b/nova/image/format_inspector.py
@@ -368,6 +368,23 @@ def safety_check(self):
                 not self.has_unknown_features)
 
 
+class QEDInspector(FileInspector):
+    def __init__(self, tracing=False):
+        super().__init__(tracing)
+        self.new_region('header', CaptureRegion(0, 512))
+
+    @property
+    def format_match(self):
+        if not self.region('header').complete:
+            return False
+        return self.region('header').data.startswith(b'QED\x00')
+
+    def safety_check(self):
+        # QED format is not supported by anyone, but we want to detect it
+        # and mark it as just always unsafe.
+        return False
+
+
 # The VHD (or VPC as QEMU calls it) format consists of a big-endian
 # 512-byte "footer" at the beginning of the file with various
 # information, most of which does not matter to us:
@@ -871,19 +888,52 @@ def close(self):
             self._source.close()
 
 
+ALL_FORMATS = {
+    'raw': FileInspector,
+    'qcow2': QcowInspector,
+    'vhd': VHDInspector,
+    'vhdx': VHDXInspector,
+    'vmdk': VMDKInspector,
+    'vdi': VDIInspector,
+    'qed': QEDInspector,
+}
+
+
 def get_inspector(format_name):
     """Returns a FormatInspector class based on the given name.
 
     :param format_name: The name of the disk_format (raw, qcow2, etc).
     :returns: A FormatInspector or None if unsupported.
     """
-    formats = {
-        'raw': FileInspector,
-        'qcow2': QcowInspector,
-        'vhd': VHDInspector,
-        'vhdx': VHDXInspector,
-        'vmdk': VMDKInspector,
-        'vdi': VDIInspector,
-    }
-
-    return formats.get(format_name)
+
+    return ALL_FORMATS.get(format_name)
+
+
+def detect_file_format(filename):
+    """Attempts to detect the format of a file.
+
+    This runs through a file one time, running all the known inspectors in
+    parallel. It stops reading the file once one of them matches or all of
+    them are sure they don't match.
+
+    Returns the inspector that matched, or the 'raw' inspector if none did.
+    """
+    inspectors = {k: v() for k, v in ALL_FORMATS.items()}
+    with open(filename, 'rb') as f:
+        for chunk in chunked_reader(f):
+            for format, inspector in list(inspectors.items()):
+                try:
+                    inspector.eat_chunk(chunk)
+                except ImageFormatError:
+                    # No match, so stop considering this format
+                    inspectors.pop(format)
+                    continue
+                if (inspector.format_match and inspector.complete and
+                        format != 'raw'):
+                    # First complete match (other than raw) wins
+                    return inspector
+            if all(i.complete for i in inspectors.values()):
+                # If all the inspectors are sure they are not a match, avoid
+                # reading to the end of the file to settle on 'raw'.
+                break
+    return inspectors['raw']
diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py
index e41cd740dd9..0e053f7d531 100644
--- a/nova/tests/unit/virt/libvirt/test_driver.py
+++ b/nova/tests/unit/virt/libvirt/test_driver.py
@@ -13793,10 +13793,11 @@ def test_create_images_and_backing_images_exist(
                                             '/fake/instance/dir', disk_info)
         self.assertFalse(mock_fetch_image.called)
 
+    @mock.patch('nova.image.format_inspector.detect_file_format')
     @mock.patch('nova.privsep.path.utime')
     @mock.patch('nova.virt.libvirt.utils.create_cow_image')
     def test_create_images_and_backing_ephemeral_gets_created(
-            self, mock_create_cow_image, mock_utime):
+            self, mock_create_cow_image, mock_utime, mock_detect):
         drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
 
         base_dir = os.path.join(CONF.instances_path,
@@ -15532,11 +15533,13 @@ def test_create_ephemeral_specified_fs(self, fake_mkfs):
         fake_mkfs.assert_has_calls([mock.call('ext4', '/dev/something',
                                               'myVol')])
 
+    @mock.patch('nova.image.format_inspector.detect_file_format')
     @mock.patch('nova.privsep.path.utime')
     @mock.patch('nova.virt.libvirt.utils.fetch_image')
     @mock.patch('nova.virt.libvirt.utils.create_cow_image')
     def test_create_ephemeral_specified_fs_not_valid(
-            self, mock_create_cow_image, mock_fetch_image, mock_utime):
+            self, mock_create_cow_image, mock_fetch_image, mock_utime,
+            mock_detect):
         CONF.set_override('default_ephemeral_format', 'ext4')
         ephemerals = [{'device_type': 'disk',
                        'disk_bus': 'virtio',
diff --git a/nova/tests/unit/virt/libvirt/test_imagebackend.py b/nova/tests/unit/virt/libvirt/test_imagebackend.py
index decb27f9824..ce6cf3909a6 100644
--- a/nova/tests/unit/virt/libvirt/test_imagebackend.py
+++ b/nova/tests/unit/virt/libvirt/test_imagebackend.py
@@ -522,13 +522,15 @@ def test_cache_template_exists(self, mock_exists):
 
         mock_exists.assert_has_calls(exist_calls)
 
+    @mock.patch('nova.image.format_inspector.detect_file_format')
     @mock.patch.object(imagebackend.utils, 'synchronized')
     @mock.patch('nova.virt.libvirt.utils.create_cow_image')
     @mock.patch.object(os.path, 'exists', side_effect=[])
     @mock.patch.object(imagebackend.Image, 'verify_base_size')
     @mock.patch('nova.privsep.path.utime')
     def test_create_image(
-        self, mock_utime, mock_verify, mock_exist, mock_create, mock_sync
+        self, mock_utime, mock_verify, mock_exist, mock_create, mock_sync,
+        mock_detect_format
     ):
         mock_sync.side_effect = lambda *a, **kw: self._fake_deco
         fn = mock.MagicMock()
@@ -549,7 +551,10 @@ def test_create_image(
         mock_exist.assert_has_calls(exist_calls)
         self.assertTrue(mock_sync.called)
         mock_utime.assert_called()
+        mock_detect_format.assert_called_once()
+        mock_detect_format.return_value.safety_check.assert_called_once_with()
 
+    @mock.patch('nova.image.format_inspector.detect_file_format')
     @mock.patch.object(imagebackend.utils, 'synchronized')
     @mock.patch('nova.virt.libvirt.utils.create_cow_image')
     @mock.patch.object(imagebackend.disk, 'extend')
@@ -557,7 +562,8 @@ def test_create_image(
     @mock.patch.object(imagebackend.Qcow2, 'get_disk_size')
     @mock.patch('nova.privsep.path.utime')
     def test_create_image_too_small(self, mock_utime, mock_get, mock_exist,
-                                    mock_extend, mock_create, mock_sync):
+                                    mock_extend, mock_create, mock_sync,
+                                    mock_detect_format):
         mock_sync.side_effect = lambda *a, **kw: self._fake_deco
         mock_get.return_value = self.SIZE
         fn = mock.MagicMock()
@@ -574,7 +580,9 @@ def test_create_image_too_small(self, mock_utime, mock_get, mock_exist,
         self.assertTrue(mock_sync.called)
         self.assertFalse(mock_create.called)
         self.assertFalse(mock_extend.called)
+        mock_detect_format.assert_called_once()
 
+    @mock.patch('nova.image.format_inspector.detect_file_format')
     @mock.patch.object(imagebackend.utils, 'synchronized')
     @mock.patch('nova.virt.libvirt.utils.create_cow_image')
     @mock.patch('nova.virt.libvirt.utils.get_disk_backing_file')
@@ -586,7 +594,8 @@ def test_create_image_too_small(self, mock_utime, mock_get, mock_exist,
     def test_generate_resized_backing_files(self, mock_utime, mock_copy,
                                             mock_verify, mock_exist,
                                             mock_extend, mock_get,
-                                            mock_create, mock_sync):
+                                            mock_create, mock_sync,
+                                            mock_detect_format):
         mock_sync.side_effect = lambda *a, **kw: self._fake_deco
         mock_get.return_value = self.QCOW2_BASE
         fn = mock.MagicMock()
@@ -613,7 +622,9 @@ def test_generate_resized_backing_files(self, mock_utime, mock_copy,
         self.assertTrue(mock_sync.called)
         self.assertFalse(mock_create.called)
         mock_utime.assert_called()
+        mock_detect_format.assert_called_once()
 
+    @mock.patch('nova.image.format_inspector.detect_file_format')
     @mock.patch.object(imagebackend.utils, 'synchronized')
     @mock.patch('nova.virt.libvirt.utils.create_cow_image')
     @mock.patch('nova.virt.libvirt.utils.get_disk_backing_file')
@@ -624,7 +635,8 @@ def test_generate_resized_backing_files(self, mock_utime, mock_copy,
     def test_qcow2_exists_and_has_no_backing_file(self, mock_utime,
                                                   mock_verify, mock_exist,
                                                   mock_extend, mock_get,
-                                                  mock_create, mock_sync):
+                                                  mock_create, mock_sync,
+                                                  mock_detect_format):
         mock_sync.side_effect = lambda *a, **kw: self._fake_deco
         mock_get.return_value = None
         fn = mock.MagicMock()
@@ -645,6 +657,31 @@ def test_qcow2_exists_and_has_no_backing_file(self, mock_utime,
         self.assertTrue(mock_sync.called)
         self.assertFalse(mock_create.called)
         self.assertFalse(mock_extend.called)
+        mock_detect_format.assert_called_once()
+
+    @mock.patch('nova.image.format_inspector.detect_file_format')
+    @mock.patch.object(imagebackend.utils, 'synchronized')
+    @mock.patch('nova.virt.libvirt.utils.create_image')
+    @mock.patch('nova.virt.libvirt.utils.get_disk_backing_file')
+    @mock.patch.object(imagebackend.disk, 'extend')
+    @mock.patch.object(os.path, 'exists', side_effect=[])
+    @mock.patch.object(imagebackend.Image, 'verify_base_size')
+    def test_qcow2_exists_and_fails_safety_check(self,
+                                                 mock_verify, mock_exist,
+                                                 mock_extend, mock_get,
+                                                 mock_create, mock_sync,
+                                                 mock_detect_format):
+        mock_detect_format.return_value.safety_check.return_value = False
+        mock_sync.side_effect = lambda *a, **kw: self._fake_deco
+        mock_get.return_value = None
+        fn = mock.MagicMock()
+        mock_exist.side_effect = [False, True, False, True, True]
+        image = self.image_class(self.INSTANCE, self.NAME)
+
+        self.assertRaises(exception.InvalidDiskInfo,
+                          image.create_image, fn, self.TEMPLATE_PATH,
+                          self.SIZE)
+        mock_verify.assert_not_called()
 
     def test_resolve_driver_format(self):
         image = self.image_class(self.INSTANCE, self.NAME)
diff --git a/nova/tests/unit/virt/libvirt/test_utils.py b/nova/tests/unit/virt/libvirt/test_utils.py
index d95cc488723..a49bf723ffa 100644
--- a/nova/tests/unit/virt/libvirt/test_utils.py
+++ b/nova/tests/unit/virt/libvirt/test_utils.py
@@ -117,11 +117,27 @@ def test_create_image(self, mock_execute):
     @mock.patch('os.path.exists', return_value=True)
     @mock.patch('oslo_concurrency.processutils.execute')
     @mock.patch('nova.virt.images.qemu_img_info')
-    def test_create_cow_image(self, mock_info, mock_execute, mock_exists):
+    @mock.patch('nova.image.format_inspector.detect_file_format')
+    def _test_create_cow_image(
+        self, mock_detect, mock_info, mock_execute,
+        mock_exists, backing_file=None, safety_check=True
+    ):
+        if isinstance(backing_file, dict):
+            backing_info = backing_file
+            backing_file = backing_info.pop('file', None)
+        else:
+            backing_info = {}
+        backing_backing_file = backing_info.pop('backing_file', None)
+
         mock_execute.return_value = ('stdout', None)
         mock_info.return_value = mock.Mock(
             file_format=mock.sentinel.backing_fmt,
-            cluster_size=mock.sentinel.cluster_size)
+            cluster_size=mock.sentinel.cluster_size,
+            backing_file=backing_backing_file,
+            format_specific=backing_info)
+
+        mock_detect.return_value.safety_check.return_value = safety_check
+
         libvirt_utils.create_cow_image(mock.sentinel.backing_path,
                                        mock.sentinel.new_path)
         mock_info.assert_called_once_with(mock.sentinel.backing_path)
@@ -131,6 +147,33 @@ def test_create_cow_image(self, mock_info, mock_execute, mock_exists):
                 mock.sentinel.backing_path, mock.sentinel.backing_fmt,
                 mock.sentinel.cluster_size),
              mock.sentinel.new_path)])
+        if backing_file:
+            mock_detect.return_value.safety_check.assert_called_once_with()
+
+    def test_create_image_qcow2(self):
+        self._test_create_cow_image()
+
+    def test_create_image_backing_file(self):
+        self._test_create_cow_image(
+            backing_file=mock.sentinel.backing_file
+        )
+
+    def test_create_image_base_has_backing_file(self):
+        self.assertRaises(
+            exception.InvalidDiskInfo,
+            self._test_create_cow_image,
+            backing_file={'file': mock.sentinel.backing_file,
+                          'backing_file': mock.sentinel.backing_backing_file},
+        )
+
+    def test_create_image_base_has_data_file(self):
+        self.assertRaises(
+            exception.InvalidDiskInfo,
+            self._test_create_cow_image,
+            backing_file={'file': mock.sentinel.backing_file,
+                          'backing_file': mock.sentinel.backing_backing_file,
+                          'data': {'data-file': mock.sentinel.data_file}},
+        )
 
     @ddt.unpack
     @ddt.data({'fs_type': 'some_fs_type',
diff --git a/nova/virt/libvirt/imagebackend.py b/nova/virt/libvirt/imagebackend.py
index 617adfe0303..6a5252b11b3 100644
--- a/nova/virt/libvirt/imagebackend.py
+++ b/nova/virt/libvirt/imagebackend.py
@@ -34,6 +34,7 @@
 import nova.conf
 from nova import exception
 from nova.i18n import _
+from nova.image import format_inspector
 from nova.image import glance
 import nova.privsep.libvirt
 import nova.privsep.path
@@ -637,6 +638,20 @@ def create_qcow2_image(base, target, size):
         if not os.path.exists(base):
             prepare_template(target=base, *args, **kwargs)
 
+        # NOTE(danms): We need to perform safety checks on the base image
+        # before we inspect it for other attributes. We do this each time
+        # because additional safety checks could have been added since we
+        # downloaded the image.
+        if not CONF.workarounds.disable_deep_image_inspection:
+            inspector = format_inspector.detect_file_format(base)
+            if not inspector.safety_check():
+                LOG.warning('Base image %s failed safety check', base)
+                # NOTE(danms): This is the same exception as would be raised
+                # by qemu_img_info() if the disk format was unreadable or
+                # otherwise unsuitable.
+                raise exception.InvalidDiskInfo(
+                    reason=_('Base image failed safety check'))
+
         # NOTE(ankit): Update the mtime of the base file so the image
         # cache manager knows it is in use.
         _update_utime_ignore_eacces(base)
diff --git a/nova/virt/libvirt/utils.py b/nova/virt/libvirt/utils.py
index a1b9459b7e6..1d6f7f4e1d4 100644
--- a/nova/virt/libvirt/utils.py
+++ b/nova/virt/libvirt/utils.py
@@ -34,6 +34,7 @@
 from nova import context as nova_context
 from nova import exception
 from nova.i18n import _
+from nova.image import format_inspector
 from nova import objects
 from nova.objects import fields as obj_fields
 import nova.privsep.fs
@@ -139,7 +140,35 @@ def create_cow_image(
     base_cmd = ['qemu-img', 'create', '-f', 'qcow2']
     cow_opts = []
     if backing_file:
+        # NOTE(danms): We need to perform safety checks on the base image
+        # before we inspect it for other attributes. We do this each time
+        # because additional safety checks could have been added since we
+        # downloaded the image.
+        if not CONF.workarounds.disable_deep_image_inspection:
+            inspector = format_inspector.detect_file_format(backing_file)
+            if not inspector.safety_check():
+                LOG.warning('Base image %s failed safety check', backing_file)
+                # NOTE(danms): This is the same exception as would be raised
+                # by qemu_img_info() if the disk format was unreadable or
+                # otherwise unsuitable.
+                raise exception.InvalidDiskInfo(
+                    reason=_('Base image failed safety check'))
+
         base_details = images.qemu_img_info(backing_file)
+
+        if base_details.backing_file is not None:
+            LOG.warning('Base image %s failed safety check', backing_file)
+            raise exception.InvalidDiskInfo(
+                reason=_('Base image failed safety check'))
+        try:
+            data_file = base_details.format_specific['data']['data-file']
+        except (KeyError, TypeError, AttributeError):
+            data_file = None
+        if data_file is not None:
+            LOG.warning('Base image %s failed safety check', backing_file)
+            raise exception.InvalidDiskInfo(
+                reason=_('Base image failed safety check'))
+
         cow_opts += ['backing_file=%s' % backing_file]
         cow_opts += ['backing_fmt=%s' % base_details.file_format]
     else:

From 3ba8ee16116e6a721413a382bbd4bcb68355cdf0 Mon Sep 17 00:00:00 2001
From: Dan Smith <dansmith@redhat.com>
Date: Mon, 1 Jul 2024 09:06:40 -0700
Subject: [PATCH 70/93] Fix vmdk_allowed_types checking

This restores the vmdk_allowed_types checking in create_image()
that was unintentionally lost by tightening the
qemu-type-matches-glance code in the fetch patch recently. Since we
are still detecting the format of base images without metadata, we
would have treated a vmdk file that claims to be raw as raw in fetch,
but then read it like a vmdk once it was used as a base image for
something else.

Conflicts:
  nova/tests/unit/virt/libvirt/test_utils.py
  nova/virt/libvirt/utils.py

NOTE(elod.illes): conflicts are due to the patch that consolidates image
creation functions (I111cfc8a5eae27b15c6312957255fcf973038ddf) only being
introduced in zed.

Change-Id: I07b332a7edb814f6a91661651d9d24bfd6651ae7
Related-Bug: #2059809
(cherry picked from commit 08be7b2a0dc1d7728d8034bc2aab0428c4fb642e)
(cherry picked from commit 11301e7e3f0d81a3368632f90608e30d9c647111)
(cherry picked from commit 70a435fd519a0ebcc3ac9ad5254fefbf19c93e48)
(cherry picked from commit f732f8476851e6272d8ad9937f54b918795844e8)
(cherry picked from commit a2acb31d790e6cb41c067bfc0343bde274c9428c)
---
 nova/tests/unit/virt/libvirt/test_utils.py | 28 ++++++++++++++++++++--
 nova/virt/libvirt/utils.py                 |  3 ++-
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/nova/tests/unit/virt/libvirt/test_utils.py b/nova/tests/unit/virt/libvirt/test_utils.py
index a49bf723ffa..8a43c45f34b 100644
--- a/nova/tests/unit/virt/libvirt/test_utils.py
+++ b/nova/tests/unit/virt/libvirt/test_utils.py
@@ -128,10 +128,12 @@ def _test_create_cow_image(
         else:
             backing_info = {}
         backing_backing_file = backing_info.pop('backing_file', None)
+        backing_fmt = backing_info.pop('backing_fmt',
+                                       mock.sentinel.backing_fmt)
 
         mock_execute.return_value = ('stdout', None)
         mock_info.return_value = mock.Mock(
-            file_format=mock.sentinel.backing_fmt,
+            file_format=backing_fmt,
             cluster_size=mock.sentinel.cluster_size,
             backing_file=backing_backing_file,
             format_specific=backing_info)
@@ -144,7 +146,7 @@ def _test_create_cow_image(
         mock_execute.assert_has_calls([mock.call(
             'qemu-img', 'create', '-f', 'qcow2', '-o',
             'backing_file=%s,backing_fmt=%s,cluster_size=%s' % (
-                mock.sentinel.backing_path, mock.sentinel.backing_fmt,
+                mock.sentinel.backing_path, backing_fmt,
                 mock.sentinel.cluster_size),
              mock.sentinel.new_path)])
         if backing_file:
@@ -175,6 +177,28 @@ def test_create_image_base_has_data_file(self):
                           'data': {'data-file': mock.sentinel.data_file}},
         )
 
+    def test_create_image_size_none(self):
+        self._test_create_cow_image(
+            backing_file=mock.sentinel.backing_file,
+        )
+
+    def test_create_image_vmdk(self):
+        self._test_create_cow_image(
+            backing_file={'file': mock.sentinel.backing_file,
+                          'backing_fmt': 'vmdk',
+                          'backing_file': None,
+                          'data': {'create-type': 'monolithicSparse'}}
+        )
+
+    def test_create_image_vmdk_invalid_type(self):
+        self.assertRaises(exception.ImageUnacceptable,
+            self._test_create_cow_image,
+            backing_file={'file': mock.sentinel.backing_file,
+                          'backing_fmt': 'vmdk',
+                          'backing_file': None,
+                          'data': {'create-type': 'monolithicFlat'}}
+        )
+
     @ddt.unpack
     @ddt.data({'fs_type': 'some_fs_type',
                'default_eph_format': None,
diff --git a/nova/virt/libvirt/utils.py b/nova/virt/libvirt/utils.py
index 1d6f7f4e1d4..93c5b38cb77 100644
--- a/nova/virt/libvirt/utils.py
+++ b/nova/virt/libvirt/utils.py
@@ -155,7 +155,8 @@ def create_cow_image(
                     reason=_('Base image failed safety check'))
 
         base_details = images.qemu_img_info(backing_file)
-
+        if base_details.file_format == 'vmdk':
+            images.check_vmdk_image('base', base_details)
         if base_details.backing_file is not None:
             LOG.warning('Base image %s failed safety check', backing_file)
             raise exception.InvalidDiskInfo(

From 6a1eb596bb46bcfae2802b77b07a73c86e7ea570 Mon Sep 17 00:00:00 2001
From: Stephen Finucane <sfinucan@redhat.com>
Date: Thu, 15 Dec 2022 00:09:04 +0000
Subject: [PATCH 71/93] Remove use of removeprefix

This is not supported on Python 3.8 [1]. I have no idea why this was not
failing CI.

[1] https://docs.python.org/3.9/library/stdtypes.html#str.removeprefix

Change-Id: I225e9ced0f75c415b1d2fee05440291e3d8635c0
Signed-off-by: Stephen Finucane <sfinucan@redhat.com>
(cherry picked from commit 3ccf82ef9e2c87a1d33a0dda8929c05e80844087)
(cherry picked from commit 5ba1bd1185e09ecc2f77fe7427f9647dce0bdaea)
---
 nova/tests/unit/console/test_websocketproxy.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/nova/tests/unit/console/test_websocketproxy.py b/nova/tests/unit/console/test_websocketproxy.py
index 0c897e3e911..cd99ad53f0a 100644
--- a/nova/tests/unit/console/test_websocketproxy.py
+++ b/nova/tests/unit/console/test_websocketproxy.py
@@ -637,7 +637,9 @@ def test_reject_open_redirect(self, url='//example.com/%2F..'):
         # now the same url but with extra leading '/' characters removed.
         if expected_cpython in errmsg:
             location = result[3].decode()
-            location = location.removeprefix('Location: ').rstrip('\r\n')
+            if location.startswith('Location: '):
+                location = location[len('Location: '):]
+            location = location.rstrip('\r\n')
             self.assertTrue(
                 location.startswith('/example.com/%2F..'),
                 msg='Redirect location is not the expected sanitized URL',

From bf57a8a1a19cb5ab6ba4959b98f749448a55c73d Mon Sep 17 00:00:00 2001
From: Elod Illes <elod.illes@est.tech>
Date: Fri, 12 Jul 2024 22:53:06 +0200
Subject: [PATCH 72/93] [tools] Ignore bot generated patches

This fixes the check for whether a patch is bot generated, which did
not work as intended. The problem is that the script checks the email
address of the parent patch (HEAD~), which is probably right when the
patch is a MERGE patch, but is wrong when it is not. This fix uses the
very same pattern that is already used for the commit message parsing:
the $commit_hash variable, which is the parent's commit hash if the
patch is a MERGE patch, and an empty string otherwise (causing
'git show' to be called on HEAD).

Change-Id: I0abc72180edf34a6dd0624a40fb8682397805eca
(cherry picked from commit b8f3975d3641fad19971cc159bdb9decb6ea95f8)
(cherry picked from commit 92b781f96e076f22ef098ca7894a3eeddb647731)
(cherry picked from commit 7a914d6bfc5467e91175c55c8ea63e62e3518d86)
(cherry picked from commit 8b79f0f6a4315a89cb90ea86c7e05dfde3b1fc92)
(cherry picked from commit fe0eb7ad395bad4c3321e4efce9d77506cbf134d)
---
 tools/check-cherry-picks.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/check-cherry-picks.sh b/tools/check-cherry-picks.sh
index 3042aa16593..439f42df3af 100755
--- a/tools/check-cherry-picks.sh
+++ b/tools/check-cherry-picks.sh
@@ -14,7 +14,7 @@ if [ $parent_number -eq 2 ]; then
     commit_hash=$(git show --format='%P' --quiet | awk '{print $NF}')
 fi
 
-if git show --format='%aE' HEAD~ --quiet | grep -qi 'infra-root@openstack.org'; then
+if git show --format='%aE' --quiet $commit_hash | grep -qi 'infra-root@openstack.org'; then
     echo 'Bot generated change; ignoring'
     exit 0
 fi

From 7b7bac3e21a64bea33527a8aedf12f5db8a3dea0 Mon Sep 17 00:00:00 2001
From: Elod Illes <elod.illes@est.tech>
Date: Fri, 12 Jul 2024 23:10:26 +0200
Subject: [PATCH 73/93] [tools] Backport validator: handle unmaintained

When the script was created there were only stable/* branches, but now
there are unmaintained/* branches as well. The validator fails there
because it looks for hashes only on stable/* branches, even if the
given hash is already on an unmaintained/* branch. This patch now
matches both stable/* and unmaintained/* branches.

Change-Id: I08fcc63ab0fbe5af1be70d5fde5af98bf006101c
(cherry picked from commit e2697de8e41a566eb86aefa364906bda9bc59863)
(cherry picked from commit 602e68364c54fb54140006f38d6995b9a5b354a9)
(cherry picked from commit 56e73cc7bad51435a79584e9411f07add0d0536a)
(cherry picked from commit f53824f95bea8769a2b28c62f23e57cb8dbafae5)
(cherry picked from commit f43ceef5769f1bfbeddf062f3fd745fc3c519ace)
---
 tools/check-cherry-picks.sh | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tools/check-cherry-picks.sh b/tools/check-cherry-picks.sh
index 3042aa16593..52bf3d834ed 100755
--- a/tools/check-cherry-picks.sh
+++ b/tools/check-cherry-picks.sh
@@ -1,7 +1,8 @@
 #!/bin/sh
 #
 # A tool to check the cherry-pick hashes from the current git commit message
-# to verify that they're all on either master or stable/ branches
+# to verify that they're all on either master, stable/ or unmaintained/
+# branches
 #
 
 commit_hash=""
@@ -23,9 +24,9 @@ hashes=$(git show --format='%b' --quiet $commit_hash | sed -nr 's/^.cherry picke
 checked=0
 branches+=""
 for hash in $hashes; do
-    branch=$(git branch -a --contains "$hash" 2>/dev/null| grep -oE '(master|stable/[a-z0-9.]+)')
+    branch=$(git branch -a --contains "$hash" 2>/dev/null| grep -oE '(master|stable/[a-z0-9.]+|unmaintained/[a-z0-9.]+)')
     if [ $? -ne 0 ]; then
-        echo "Cherry pick hash $hash not on any master or stable branches"
+        echo "Cherry pick hash $hash not on any master, stable or unmaintained branches"
         exit 1
     fi
     branches+=" $branch"
@@ -33,7 +34,7 @@ for hash in $hashes; do
 done
 
 if [ $checked -eq 0 ]; then
-    if ! grep -q '^defaultbranch=stable/' .gitreview; then
+    if ! grep -qE '^defaultbranch=(stable|unmaintained)/' .gitreview; then
         echo "Checked $checked cherry-pick hashes: OK"
         exit 0
     else

From 23fc1b9206691a4b5861f49db46d48487acb8187 Mon Sep 17 00:00:00 2001
From: Elod Illes <elod.illes@est.tech>
Date: Tue, 18 Jun 2024 15:10:13 +0200
Subject: [PATCH 74/93] [CI] Replace deprecated regex and remove Centos8

Latest Zuul drops the following warnings:

  All regular expressions must conform to RE2 syntax, but an
  expression using the deprecated Perl-style syntax has been detected.
  Adjust the configuration to conform to RE2 syntax.

  The RE2 syntax error is: invalid perl operator: (?!

This patch replaces 'irrelevant-files' with 'files', explicitly
listing the patterns of the files that the tests should be run against.

NOTE(elod.illes): this patch is extended with the removal of CentOS 8
Stream based jobs as that OS is End of Life, hence removed from
available nodesets.

Change-Id: If287e800fb9ff428dbe6f9c4c046627f22afe3df
(cherry picked from commit 9b77bae8a32ff41712b96bb6a67c7eacae45a4c9)
(cherry picked from commit 8223e6a7c429441c28178316e455767a66d3e8f8)
(cherry picked from commit 510a27ba36fc47309308228bc45b5b9ea6ba695b)
(cherry picked from commit 197b14d7659252c62b7436bcdd2a9b8c8b470771)
(cherry picked from commit ffc252eeae01f4829a92a0549b47fb9e4175c2da)
---
 .zuul.yaml | 61 ++++++++++++++----------------------------------------
 1 file changed, 16 insertions(+), 45 deletions(-)

diff --git a/.zuul.yaml b/.zuul.yaml
index 846e3e23bac..92b88560f6b 100644
--- a/.zuul.yaml
+++ b/.zuul.yaml
@@ -197,24 +197,11 @@
     parent: devstack-tempest
     description: |
       Run tempest compute API tests using LVM image backend. This only runs
-      against nova/virt/libvirt/* changes.
-    # Copy irrelevant-files from nova-dsvm-multinode-base and then exclude
-    # anything that is not in nova/virt/libvirt/* or nova/privsep/*.
-    irrelevant-files:
-      - ^(?!.zuul.yaml)(?!nova/virt/libvirt/)(?!nova/privsep/).*$
-      - ^api-.*$
-      - ^(test-|)requirements.txt$
-      - ^.*\.rst$
-      - ^.git.*$
-      - ^doc/.*$
-      - ^nova/hacking/.*$
-      - ^nova/locale/.*$
-      - ^nova/tests/.*$
-      - ^nova/test.py$
-      - ^releasenotes/.*$
-      - ^setup.cfg$
-      - ^tools/.*$
-      - ^tox.ini$
+      against nova/virt/libvirt/*, nova/privsep/* and .zuul.yaml changes.
+    files:
+      - ^nova/virt/libvirt/.*$
+      - ^nova/privsep/.*$
+      - .zuul.yaml
     vars:
       # We use the "all" environment for tempest_test_regex and
       # tempest_exclude_regex.
@@ -253,22 +240,11 @@
     # NOTE(chateaulav): due to constraints with no IDE support for aarch64,
     # tests have been limited to eliminate any items that are incompatible.
     # This is to be re-evaluated as greater support is added and defined.
-    irrelevant-files:
-      - ^(?!.zuul.yaml)(?!nova/virt/libvirt/)(?!nova/objects/)(?!nova/scheduler/).*$
-      - ^api-.*$
-      - ^(test-|)requirements.txt$
-      - ^.*\.rst$
-      - ^.git.*$
-      - ^doc/.*$
-      - ^nova/hacking/.*$
-      - ^nova/locale/.*$
-      - ^nova/policies/.*$
-      - ^nova/tests/.*$
-      - ^nova/test.py$
-      - ^releasenotes/.*$
-      - ^setup.cfg$
-      - ^tools/.*$
-      - ^tox.ini$
+    files:
+      - ^nova/virt/libvirt/.*$
+      - ^nova/objects/.*$
+      - ^nova/scheduler/.*$
+      - .zuul.yaml
     vars:
       tox_envlist: all
       tempest_test_regex: ^tempest\.(api\.compute\.servers|scenario\.test_network_basic_ops)
@@ -632,11 +608,12 @@
         - nova-ceph-multistore:
             irrelevant-files: *nova-base-irrelevant-files
         - neutron-linuxbridge-tempest:
-            irrelevant-files:
+            files:
               # NOTE(mriedem): This job has its own irrelevant-files section
               # so that we only run it on changes to networking and libvirt/vif
               # code; we don't need to run this on all changes.
-              - ^(?!nova/network/.*)(?!nova/virt/libvirt/vif.py).*$
+              - ^nova/network/.*$
+              - nova/virt/libvirt/vif.py
         - nova-live-migration
         - nova-live-migration-ceph
         - nova-lvm
@@ -680,11 +657,6 @@
         - barbican-tempest-plugin-simple-crypto:
             irrelevant-files: *nova-base-irrelevant-files
             voting: false
-        - tempest-integrated-compute-centos-8-stream:
-            irrelevant-files: *nova-base-irrelevant-files
-        - tempest-centos8-stream-fips:
-            irrelevant-files: *nova-base-irrelevant-files
-            voting: false
     gate:
       jobs:
         - nova-live-migration
@@ -697,11 +669,12 @@
         - nova-ceph-multistore:
             irrelevant-files: *nova-base-irrelevant-files
         - neutron-linuxbridge-tempest:
-            irrelevant-files:
+            files:
               # NOTE(mriedem): This job has its own irrelevant-files section
               # so that we only run it on changes to networking and libvirt/vif
               # code; we don't need to run this on all changes.
-              - ^(?!nova/network/.*)(?!nova/virt/libvirt/vif.py).*$
+              - ^nova/network/.*$
+              - nova/virt/libvirt/vif.py
         - tempest-integrated-compute:
            irrelevant-files: *policies-irrelevant-files
         - nova-grenade-multinode:
@@ -710,8 +683,6 @@
             irrelevant-files: *nova-base-irrelevant-files
         - openstacksdk-functional-devstack:
             irrelevant-files: *nova-base-irrelevant-files
-        - tempest-integrated-compute-centos-8-stream:
-            irrelevant-files: *nova-base-irrelevant-files
     experimental:
       jobs:
         - ironic-tempest-bfv:

From 089604d201ec4ba6a150d1e583d5e6e20d82aca0 Mon Sep 17 00:00:00 2001
From: jskunda <jskunda@redhat.com>
Date: Wed, 21 Jun 2023 12:11:05 +0200
Subject: [PATCH 75/93] Drop Fedora support

We are about to drop Fedora support as the latest image upstream
has been transitioned to EOL. CentOS 9 Stream has evolved as the
replacement platform for new features. The patch which removes the
Fedora jobs and nodeset from devstack:
https://review.opendev.org/c/openstack/devstack/+/885467

This is needed for https://review.opendev.org/c/openstack/devstack/+/925837

Change-Id: Ib7d3dd93602c94fd801f8fe5daa26353b04f589b
(cherry picked from commit 86c542c56a1da23b1ba71cf2f6f2b76332c3b0a6)
(cherry picked from commit fde9368dd7c50c2e5601d5683ded60677c657dc8)
(cherry picked from commit 25aafcfce38711eb61a653fb0d09162ea72c5c84)
---
 .zuul.yaml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/.zuul.yaml b/.zuul.yaml
index 92b88560f6b..1afdbaa2b63 100644
--- a/.zuul.yaml
+++ b/.zuul.yaml
@@ -705,10 +705,6 @@
             irrelevant-files: *nova-base-irrelevant-files
         - os-vif-ovs:
             irrelevant-files: *nova-base-irrelevant-files
-        - devstack-platform-fedora-latest:
-            irrelevant-files: *nova-base-irrelevant-files
-        - devstack-platform-fedora-latest-virt-preview:
-            irrelevant-files: *nova-base-irrelevant-files
         - devstack-plugin-ceph-compute-local-ephemeral:
             irrelevant-files: *nova-base-irrelevant-files
         - devstack-tobiko-nova:

From 1e37e5881197bee5e50d2c2bfc2503a7bb7225fb Mon Sep 17 00:00:00 2001
From: Sean Mooney <work@seanmooney.info>
Date: Thu, 4 Jul 2024 12:38:39 +0100
Subject: [PATCH 76/93] port format inspector tests from glance

This commit is a direct port of the format inspector
unit tests from glance as of commit
0d8e79b713bc31a78f0f4eac14ee594ca8520999

the only changes to the test are as follows

"from glance.common import format_inspector" was updated to
"from nova.image import format_inspector"

"from glance.tests import utils as test_utils"
was replaced with "from nova import test"

"test_utils.BaseTestCase" was replaced with "test.NoDBTestCase"

"glance-unittest-formatinspector-" was replaced with
"nova-unittest-formatinspector-"

This makes the test functional in nova.

TestFormatInspectors requires qemu-img to be installed on the
host, which would be a new dependency for executing unit tests.
To avoid that, we skip TestFormatInspectors if qemu-img
is not installed.
TestFormatInspectorInfra and TestFormatInspectorsTargeted
do not have a qemu-img dependency so
no changes to the test assertions were required.

Note for yoga backport: With older qemu installed, one of the qemu-img
create commands fails, so let's skip it on unmaintained/yoga and
below.

Change-Id: Ia34203f246f0bc574e11476287dfb33fda7954fe
(cherry picked from commit 838daa3cad5fb3cdd10fb7aa76c647330a66939e)
(cherry picked from commit 66205be426028f8b7d16163ca6901bc181d703b6)
(cherry picked from commit 497abea5a189cc7043766273e9d17571f722190a)
(cherry picked from commit 58cd955c7d4848ed8da71f3c0352a5303cae6200)
(cherry picked from commit d7e3d722cd6c59968cbfe1d7a3bd7021c90165e5)
---
 .../tests/unit/image/test_format_inspector.py | 519 ++++++++++++++++++
 1 file changed, 519 insertions(+)
 create mode 100644 nova/tests/unit/image/test_format_inspector.py

diff --git a/nova/tests/unit/image/test_format_inspector.py b/nova/tests/unit/image/test_format_inspector.py
new file mode 100644
index 00000000000..86194ef139c
--- /dev/null
+++ b/nova/tests/unit/image/test_format_inspector.py
@@ -0,0 +1,519 @@
+# Copyright 2020 Red Hat, Inc
+# All Rights Reserved.
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+
+import io
+import os
+import re
+import struct
+import subprocess
+import tempfile
+from unittest import mock
+
+from oslo_utils import units
+
+from nova.image import format_inspector
+from nova import test
+
+
+def get_size_from_qemu_img(filename):
+    output = subprocess.check_output('qemu-img info "%s"' % filename,
+                                     shell=True)
+    for line in output.split(b'\n'):
+        m = re.search(b'^virtual size: .* .([0-9]+) bytes', line.strip())
+        if m:
+            return int(m.group(1))
+
+    raise Exception('Could not find virtual size with qemu-img')
+
+
+class TestFormatInspectors(test.NoDBTestCase):
+    def setUp(self):
+        super(TestFormatInspectors, self).setUp()
+        # these tests depend on qemu-img being installed
+        # and in the path, if it is not installed, skip
+        try:
+            subprocess.check_output('qemu-img --version', shell=True)
+        except Exception:
+            self.skipTest('qemu-img not installed')
+
+        self._created_files = []
+
+    def tearDown(self):
+        super(TestFormatInspectors, self).tearDown()
+        for fn in self._created_files:
+            try:
+                os.remove(fn)
+            except Exception:
+                pass
+
+    def _create_img(self, fmt, size, subformat=None, options=None,
+                    backing_file=None):
+        if fmt == 'vhd':
+            # QEMU calls the vhd format vpc
+            fmt = 'vpc'
+
+        if options is None:
+            options = {}
+        opt = ''
+        prefix = 'nova-unittest-formatinspector-'
+
+        if subformat:
+            options['subformat'] = subformat
+            prefix += subformat + '-'
+
+        if options:
+            opt += '-o ' + ','.join('%s=%s' % (k, v)
+                                    for k, v in options.items())
+
+        if backing_file is not None:
+            opt += ' -b %s -F raw' % backing_file
+
+        fn = tempfile.mktemp(prefix=prefix,
+                             suffix='.%s' % fmt)
+        self._created_files.append(fn)
+        subprocess.check_output(
+            'qemu-img create -f %s %s %s %i' % (fmt, opt, fn, size),
+            shell=True)
+        return fn
+
+    def _create_allocated_vmdk(self, size_mb, subformat=None):
+        # We need a "big" VMDK file to exercise some parts of the code of the
+        # format_inspector. A way to create one is to first create an empty
+        # file, and then to convert it with the -S 0 option.
+
+        if subformat is None:
+            # Matches qemu-img default, see `qemu-img convert -O vmdk -o help`
+            subformat = 'monolithicSparse'
+
+        prefix = 'nova-unittest-formatinspector-%s-' % subformat
+        fn = tempfile.mktemp(prefix=prefix, suffix='.vmdk')
+        self._created_files.append(fn)
+        raw = tempfile.mktemp(prefix=prefix, suffix='.raw')
+        self._created_files.append(raw)
+
+        # Create a file with pseudo-random data, otherwise it will get
+        # compressed in the streamOptimized format
+        subprocess.check_output(
+            'dd if=/dev/urandom of=%s bs=1M count=%i' % (raw, size_mb),
+            shell=True)
+
+        # Convert it to VMDK
+        subprocess.check_output(
+            'qemu-img convert -f raw -O vmdk -o subformat=%s -S 0 %s %s' % (
+                subformat, raw, fn),
+            shell=True)
+        return fn
+
+    def _test_format_at_block_size(self, format_name, img, block_size):
+        fmt = format_inspector.get_inspector(format_name)()
+        self.assertIsNotNone(fmt,
+                             'Did not get format inspector for %s' % (
+                                 format_name))
+        wrapper = format_inspector.InfoWrapper(open(img, 'rb'), fmt)
+
+        while True:
+            chunk = wrapper.read(block_size)
+            if not chunk:
+                break
+
+        wrapper.close()
+        return fmt
+
+    def _test_format_at_image_size(self, format_name, image_size,
+                                   subformat=None):
+        img = self._create_img(format_name, image_size, subformat=subformat)
+
+        # Some formats have internal alignment restrictions making this not
+        # always exactly like image_size, so get the real value for comparison
+        virtual_size = get_size_from_qemu_img(img)
+
+        # Read the format in various sizes, some of which will read whole
+        # sections in a single read, others will be completely unaligned, etc.
+        for block_size in (64 * units.Ki, 512, 17, 1 * units.Mi):
+            fmt = self._test_format_at_block_size(format_name, img, block_size)
+            self.assertTrue(fmt.format_match,
+                            'Failed to match %s at size %i block %i' % (
+                                format_name, image_size, block_size))
+            self.assertEqual(virtual_size, fmt.virtual_size,
+                             ('Failed to calculate size for %s at size %i '
+                              'block %i') % (format_name, image_size,
+                                             block_size))
+            memory = sum(fmt.context_info.values())
+            self.assertLess(memory, 512 * units.Ki,
+                            'Format used more than 512KiB of memory: %s' % (
+                                fmt.context_info))
+
+    def _test_format(self, format_name, subformat=None):
+        # Try a few different image sizes, including some odd and very small
+        # sizes
+        for image_size in (512, 513, 2057, 7):
+            self._test_format_at_image_size(format_name, image_size * units.Mi,
+                                            subformat=subformat)
+
+    def test_qcow2(self):
+        self._test_format('qcow2')
+
+    def test_vhd(self):
+        self._test_format('vhd')
+
+    def test_vhdx(self):
+        self._test_format('vhdx')
+
+    def test_vmdk(self):
+        self._test_format('vmdk')
+
+    def test_vmdk_stream_optimized(self):
+        self._test_format('vmdk', 'streamOptimized')
+
+    def test_from_file_reads_minimum(self):
+        img = self._create_img('qcow2', 10 * units.Mi)
+        file_size = os.stat(img).st_size
+        fmt = format_inspector.QcowInspector.from_file(img)
+        # We know everything we need from the first 512 bytes of a QCOW image,
+        # so make sure that we did not read the whole thing when we inspect
+        # a local file.
+        self.assertLess(fmt.actual_size, file_size)
+
+    def test_qed_always_unsafe(self):
+        img = self._create_img('qed', 10 * units.Mi)
+        fmt = format_inspector.get_inspector('qed').from_file(img)
+        self.assertTrue(fmt.format_match)
+        self.assertFalse(fmt.safety_check())
+
+    def _test_vmdk_bad_descriptor_offset(self, subformat=None):
+        format_name = 'vmdk'
+        image_size = 10 * units.Mi
+        descriptorOffsetAddr = 0x1c
+        BAD_ADDRESS = 0x400
+        img = self._create_img(format_name, image_size, subformat=subformat)
+
+        # Corrupt the header
+        fd = open(img, 'r+b')
+        fd.seek(descriptorOffsetAddr)
+        fd.write(struct.pack('<Q', BAD_ADDRESS // 512))
+        fd.close()
+
+        # Read the format in various sizes, some of which will read whole
+        # sections in a single read, others will be completely unaligned, etc.
+        for block_size in (64 * units.Ki, 512, 17, 1 * units.Mi):
+            fmt = self._test_format_at_block_size(format_name, img, block_size)
+            self.assertTrue(fmt.format_match,
+                            'Failed to match %s at size %i block %i' % (
+                                format_name, image_size, block_size))
+            self.assertEqual(0, fmt.virtual_size,
+                             ('Calculated a virtual size for a corrupt %s at '
+                              'size %i block %i') % (format_name, image_size,
+                                                     block_size))
+
+    def test_vmdk_bad_descriptor_offset(self):
+        self._test_vmdk_bad_descriptor_offset()
+
+    def test_vmdk_bad_descriptor_offset_stream_optimized(self):
+        self._test_vmdk_bad_descriptor_offset(subformat='streamOptimized')
+
+    def _test_vmdk_bad_descriptor_mem_limit(self, subformat=None):
+        format_name = 'vmdk'
+        image_size = 5 * units.Mi
+        virtual_size = 5 * units.Mi
+        descriptorOffsetAddr = 0x1c
+        descriptorSizeAddr = descriptorOffsetAddr + 8
+        twoMBInSectors = (2 << 20) // 512
+        # We need a big VMDK because otherwise we will not have enough data to
+        # fill-up the CaptureRegion.
+        img = self._create_allocated_vmdk(image_size // units.Mi,
+                                          subformat=subformat)
+
+        # Corrupt the end of descriptor address so it "ends" at 2MB
+        fd = open(img, 'r+b')
+        fd.seek(descriptorSizeAddr)
+        fd.write(struct.pack('<Q', twoMBInSectors))
+        fd.close()
+
+        # Read the format in various sizes, some of which will read whole
+        # sections in a single read, others will be completely unaligned, etc.
+        for block_size in (64 * units.Ki, 512, 17, 1 * units.Mi):
+            fmt = self._test_format_at_block_size(format_name, img, block_size)
+            self.assertTrue(fmt.format_match,
+                            'Failed to match %s at size %i block %i' % (
+                                format_name, image_size, block_size))
+            self.assertEqual(virtual_size, fmt.virtual_size,
+                             ('Failed to calculate size for %s at size %i '
+                              'block %i') % (format_name, image_size,
+                                             block_size))
+            memory = sum(fmt.context_info.values())
+            self.assertLess(memory, 1.5 * units.Mi,
+                            'Format used more than 1.5MiB of memory: %s' % (
+                                fmt.context_info))
+
+    def test_vmdk_bad_descriptor_mem_limit(self):
+        self._test_vmdk_bad_descriptor_mem_limit()
+
+    def test_vmdk_bad_descriptor_mem_limit_stream_optimized(self):
+        self._test_vmdk_bad_descriptor_mem_limit(subformat='streamOptimized')
+
+    def test_qcow2_safety_checks(self):
+        # Create backing and data-file names (and initialize the backing file)
+        backing_fn = tempfile.mktemp(prefix='backing')
+        self._created_files.append(backing_fn)
+        with open(backing_fn, 'w') as f:
+            f.write('foobar')
+        data_fn = tempfile.mktemp(prefix='data')
+        self._created_files.append(data_fn)
+
+        # A qcow with no backing or data file is safe
+        fn = self._create_img('qcow2', 5 * units.Mi, None)
+        inspector = format_inspector.QcowInspector.from_file(fn)
+        self.assertTrue(inspector.safety_check())
+
+        # A backing file makes it unsafe
+        fn = self._create_img('qcow2', 5 * units.Mi, None,
+                              backing_file=backing_fn)
+        inspector = format_inspector.QcowInspector.from_file(fn)
+        self.assertFalse(inspector.safety_check())
+
+        # Note(lajoskatona): This image create fails on bionic due to
+        # old qemu-img utilities, let's skip this only test from yoga
+        # A data-file makes it unsafe
+        # fn = self._create_img('qcow2', 5 * units.Mi,
+        #                       options={'data_file': data_fn,
+        #                                'data_file_raw': 'on'})
+        # inspector = format_inspector.QcowInspector.from_file(fn)
+        # self.assertFalse(inspector.safety_check())
+
+        # Trying to load a non-QCOW file is an error
+        self.assertRaises(format_inspector.ImageFormatError,
+                          format_inspector.QcowInspector.from_file,
+                          backing_fn)
+
+    def test_qcow2_feature_flag_checks(self):
+        data = bytearray(512)
+        data[0:4] = b'QFI\xFB'
+        inspector = format_inspector.QcowInspector()
+        inspector.region('header').data = data
+
+        # All zeros, no feature flags - all good
+        self.assertFalse(inspector.has_unknown_features)
+
+        # A feature flag set in the first byte (highest-order) is not
+        # something we know about, so fail.
+        data[0x48] = 0x01
+        self.assertTrue(inspector.has_unknown_features)
+
+        # The first bit in the last byte (lowest-order) is known (the dirty
+        # bit) so that should pass
+        data[0x48] = 0x00
+        data[0x4F] = 0x01
+        self.assertFalse(inspector.has_unknown_features)
+
+        # Currently (as of 2024), the high-order feature flag bit in the low-
+        # order byte is not assigned, so make sure we reject it.
+        data[0x4F] = 0x80
+        self.assertTrue(inspector.has_unknown_features)
+
+    def test_vdi(self):
+        self._test_format('vdi')
+
+    def _test_format_with_invalid_data(self, format_name):
+        fmt = format_inspector.get_inspector(format_name)()
+        wrapper = format_inspector.InfoWrapper(open(__file__, 'rb'), fmt)
+        while True:
+            chunk = wrapper.read(32)
+            if not chunk:
+                break
+
+        wrapper.close()
+        self.assertFalse(fmt.format_match)
+        self.assertEqual(0, fmt.virtual_size)
+        memory = sum(fmt.context_info.values())
+        self.assertLess(memory, 512 * units.Ki,
+                        'Format used more than 512KiB of memory: %s' % (
+                            fmt.context_info))
+
+    def test_qcow2_invalid(self):
+        self._test_format_with_invalid_data('qcow2')
+
+    def test_vhd_invalid(self):
+        self._test_format_with_invalid_data('vhd')
+
+    def test_vhdx_invalid(self):
+        self._test_format_with_invalid_data('vhdx')
+
+    def test_vmdk_invalid(self):
+        self._test_format_with_invalid_data('vmdk')
+
+    def test_vdi_invalid(self):
+        self._test_format_with_invalid_data('vdi')
+
+    def test_vmdk_invalid_type(self):
+        fmt = format_inspector.get_inspector('vmdk')()
+        wrapper = format_inspector.InfoWrapper(open(__file__, 'rb'), fmt)
+        while True:
+            chunk = wrapper.read(32)
+            if not chunk:
+                break
+
+        wrapper.close()
+
+        fake_rgn = mock.MagicMock()
+        fake_rgn.complete = True
+        fake_rgn.data = b'foocreateType="someunknownformat"bar'
+
+        with mock.patch.object(fmt, 'has_region', return_value=True):
+            with mock.patch.object(fmt, 'region', return_value=fake_rgn):
+                self.assertEqual(0, fmt.virtual_size)
+
+
+class TestFormatInspectorInfra(test.NoDBTestCase):
+    def _test_capture_region_bs(self, bs):
+        data = b''.join(chr(x).encode() for x in range(ord('A'), ord('z')))
+
+        regions = [
+            format_inspector.CaptureRegion(3, 9),
+            format_inspector.CaptureRegion(0, 256),
+            format_inspector.CaptureRegion(32, 8),
+        ]
+
+        for region in regions:
+            # None of them should be complete yet
+            self.assertFalse(region.complete)
+
+        pos = 0
+        for i in range(0, len(data), bs):
+            chunk = data[i:i + bs]
+            pos += len(chunk)
+            for region in regions:
+                region.capture(chunk, pos)
+
+        self.assertEqual(data[3:12], regions[0].data)
+        self.assertEqual(data[0:256], regions[1].data)
+        self.assertEqual(data[32:40], regions[2].data)
+
+        # The small regions should be complete
+        self.assertTrue(regions[0].complete)
+        self.assertTrue(regions[2].complete)
+
+        # This region extended past the available data, so not complete
+        self.assertFalse(regions[1].complete)
+
+    def test_capture_region(self):
+        for block_size in (1, 3, 7, 13, 32, 64):
+            self._test_capture_region_bs(block_size)
+
+    def _get_wrapper(self, data):
+        source = io.BytesIO(data)
+        fake_fmt = mock.create_autospec(format_inspector.get_inspector('raw'))
+        return format_inspector.InfoWrapper(source, fake_fmt)
+
+    def test_info_wrapper_file_like(self):
+        data = b''.join(chr(x).encode() for x in range(ord('A'), ord('z')))
+        wrapper = self._get_wrapper(data)
+
+        read_data = b''
+        while True:
+            chunk = wrapper.read(8)
+            if not chunk:
+                break
+            read_data += chunk
+
+        self.assertEqual(data, read_data)
+
+    def test_info_wrapper_iter_like(self):
+        data = b''.join(chr(x).encode() for x in range(ord('A'), ord('z')))
+        wrapper = self._get_wrapper(data)
+
+        read_data = b''
+        for chunk in wrapper:
+            read_data += chunk
+
+        self.assertEqual(data, read_data)
+
+    def test_info_wrapper_file_like_eats_error(self):
+        wrapper = self._get_wrapper(b'123456')
+        wrapper._format.eat_chunk.side_effect = Exception('fail')
+
+        data = b''
+        while True:
+            chunk = wrapper.read(3)
+            if not chunk:
+                break
+            data += chunk
+
+        # Make sure we got all the data despite the error
+        self.assertEqual(b'123456', data)
+
+        # Make sure we only called this once and never again after
+        # the error was raised
+        wrapper._format.eat_chunk.assert_called_once_with(b'123')
+
+    def test_info_wrapper_iter_like_eats_error(self):
+        fake_fmt = mock.create_autospec(format_inspector.get_inspector('raw'))
+        wrapper = format_inspector.InfoWrapper(iter([b'123', b'456']),
+                                               fake_fmt)
+        fake_fmt.eat_chunk.side_effect = Exception('fail')
+
+        data = b''
+        for chunk in wrapper:
+            data += chunk
+
+        # Make sure we got all the data despite the error
+        self.assertEqual(b'123456', data)
+
+        # Make sure we only called this once and never again after
+        # the error was raised
+        fake_fmt.eat_chunk.assert_called_once_with(b'123')
+
+    def test_get_inspector(self):
+        self.assertEqual(format_inspector.QcowInspector,
+                         format_inspector.get_inspector('qcow2'))
+        self.assertIsNone(format_inspector.get_inspector('foo'))
+
+
+class TestFormatInspectorsTargeted(test.NoDBTestCase):
+    def _make_vhd_meta(self, guid_raw, item_length):
+        # Meta region header, padded to 32 bytes
+        data = struct.pack('<8sHH', b'metadata', 0, 1)
+        data += b'0' * 20
+
+        # Metadata table entry, 16-byte GUID, 12-byte information,
+        # padded to 32-bytes
+        data += guid_raw
+        data += struct.pack('<III', 256, item_length, 0)
+        data += b'0' * 6
+
+        return data
+
+    def test_vhd_table_over_limit(self):
+        ins = format_inspector.VHDXInspector()
+        meta = format_inspector.CaptureRegion(0, 0)
+        desired = b'012345678ABCDEF0'
+        # This is a poorly-crafted image that specifies a larger table size
+        # than is allowed
+        meta.data = self._make_vhd_meta(desired, 33 * 2048)
+        ins.new_region('metadata', meta)
+        new_region = ins._find_meta_entry(ins._guid(desired))
+        # Make sure we clamp to our limit of 32 * 2048
+        self.assertEqual(
+            format_inspector.VHDXInspector.VHDX_METADATA_TABLE_MAX_SIZE,
+            new_region.length)
+
+    def test_vhd_table_under_limit(self):
+        ins = format_inspector.VHDXInspector()
+        meta = format_inspector.CaptureRegion(0, 0)
+        desired = b'012345678ABCDEF0'
+        meta.data = self._make_vhd_meta(desired, 16 * 2048)
+        ins.new_region('metadata', meta)
+        new_region = ins._find_meta_entry(ins._guid(desired))
+        # Table size was under the limit, make sure we get it back
+        self.assertEqual(16 * 2048, new_region.length)

From d016c79d127e7f16ad04da5cdb4870eb6508f364 Mon Sep 17 00:00:00 2001
From: Sean Mooney <work@seanmooney.info>
Date: Thu, 4 Jul 2024 13:55:41 +0100
Subject: [PATCH 77/93] Reproduce iso regression with deep format inspection

This change adds a reproducer for the regression in iso
file support when
workarounds.disable_deep_image_inspection = False

Change-Id: I56d8b9980b4871941ba5de91e60a7df6a40106a8
(cherry picked from commit b5a1d3b4b2d0aaa351479b1d7e41a3895c28fab0)
(cherry picked from commit 3a6d9a038fad2bd58bdf4fb87af04158301a6929)
(cherry picked from commit 000b435a44e905122a45d3b137a576c60bf42a58)
(cherry picked from commit 1233d7b935c018e79728c5691216fa2569affe08)
(cherry picked from commit fb86ca6cf02d93810cc5503765c3b707f70bd5c0)
---
 .../tests/unit/image/test_format_inspector.py | 72 ++++++++++++++++---
 1 file changed, 63 insertions(+), 9 deletions(-)

diff --git a/nova/tests/unit/image/test_format_inspector.py b/nova/tests/unit/image/test_format_inspector.py
index 86194ef139c..e232bf82e4c 100644
--- a/nova/tests/unit/image/test_format_inspector.py
+++ b/nova/tests/unit/image/test_format_inspector.py
@@ -27,6 +27,9 @@
 from nova import test
 
 
+TEST_IMAGE_PREFIX = 'nova-unittest-formatinspector-'
+
+
 def get_size_from_qemu_img(filename):
     output = subprocess.check_output('qemu-img info "%s"' % filename,
                                      shell=True)
@@ -41,13 +44,6 @@ def get_size_from_qemu_img(filename):
 class TestFormatInspectors(test.NoDBTestCase):
     def setUp(self):
         super(TestFormatInspectors, self).setUp()
-        # these tests depend on qemu-img being installed
-        # and in the path, if it is not installed, skip
-        try:
-            subprocess.check_output('qemu-img --version', shell=True)
-        except Exception:
-            self.skipTest('qemu-img not installed')
-
         self._created_files = []
 
     def tearDown(self):
@@ -58,8 +54,55 @@ def tearDown(self):
             except Exception:
                 pass
 
+    def _create_iso(self, image_size, subformat='iso-9660'):
+        # these tests depend on mkisofs
+        # being installed and in the path,
+        # if it is not installed, skip
+        try:
+            subprocess.check_output('mkisofs --version', shell=True)
+        except Exception:
+            self.skipTest('mkisofs not installed')
+
+        size = image_size // units.Mi
+        base_cmd = "mkisofs"
+        if subformat == 'udf':
+            # depending on the distribution mkisofs may not support udf
+            # and may be provided by genisoimage instead. As a result we
+            # need to check if the command supports udf via help
+            # instead of checking the installed version.
+            # mkisofs --help outputs to stderr so we need to
+            # redirect it to stdout to use grep.
+            try:
+                subprocess.check_output(
+                    'mkisofs --help 2>&1 | grep udf', shell=True)
+            except Exception:
+                self.skipTest('mkisofs does not support udf format')
+            base_cmd += " -udf"
+        prefix = TEST_IMAGE_PREFIX
+        prefix += '-%s-' % subformat
+        fn = tempfile.mktemp(prefix=prefix, suffix='.iso')
+        self._created_files.append(fn)
+        subprocess.check_output(
+            'dd if=/dev/zero of=%s bs=1M count=%i' % (fn, size),
+            shell=True)
+        subprocess.check_output(
+            '%s -o %s -V "TEST" -J -r %s' % (base_cmd, fn, fn),
+            shell=True)
+        return fn
+
     def _create_img(self, fmt, size, subformat=None, options=None,
                     backing_file=None):
+        if fmt == 'iso':
+            return self._create_iso(size, subformat)
+
+        # these tests depend on qemu-img
+        # being installed and in the path,
+        # if it is not installed, skip
+        try:
+            subprocess.check_output('qemu-img --version', shell=True)
+        except Exception:
+            self.skipTest('qemu-img not installed')
+
         if fmt == 'vhd':
             # QEMU calls the vhd format vpc
             fmt = 'vpc'
@@ -67,7 +110,7 @@ def _create_img(self, fmt, size, subformat=None, options=None,
         if options is None:
             options = {}
         opt = ''
-        prefix = 'nova-unittest-formatinspector-'
+        prefix = TEST_IMAGE_PREFIX
 
         if subformat:
             options['subformat'] = subformat
@@ -97,7 +140,8 @@ def _create_allocated_vmdk(self, size_mb, subformat=None):
             # Matches qemu-img default, see `qemu-img convert -O vmdk -o help`
             subformat = 'monolithicSparse'
 
-        prefix = 'nova-unittest-formatinspector-%s-' % subformat
+        prefix = TEST_IMAGE_PREFIX
+        prefix += '-%s-' % subformat
         fn = tempfile.mktemp(prefix=prefix, suffix='.vmdk')
         self._created_files.append(fn)
         raw = tempfile.mktemp(prefix=prefix, suffix='.raw')
@@ -165,6 +209,16 @@ def _test_format(self, format_name, subformat=None):
     def test_qcow2(self):
         self._test_format('qcow2')
 
+    def test_iso_9660(self):
+        # reproduce iso-9660 format regression
+        self.assertRaises(
+            TypeError, self._test_format, 'iso', subformat='iso-9660')
+
+    def test_udf(self):
+        # reproduce udf format regression
+        self.assertRaises(
+            TypeError, self._test_format, 'iso', subformat='udf')
+
     def test_vhd(self):
         self._test_format('vhd')
 

From e7c2281b03a9eb52281ada9273938ad0d5c52202 Mon Sep 17 00:00:00 2001
From: Sean Mooney <work@seanmooney.info>
Date: Thu, 4 Jul 2024 20:09:31 +0100
Subject: [PATCH 78/93] Add iso file format inspector

This change includes unit tests for the ISO
format inspector using mkisofs to generate
the iso files.

A test for stashing qcow content in the system_area
of an iso file is also included.

This change modifies format_inspector.detect_file_format
to evaluate all inspectors until they are complete and
raise an ImageFormatError exception if multiple formats
match.

Related-Bug: #2059809
Change-Id: I7e12718fb3e1f77eb8d1cfcb9fa64e8ddeb9e712
(cherry picked from commit b1cc39848ebe9b9cb63141a647bda52a2842ee4b)
(cherry picked from commit eeda7c333c773216c216159926673874ce4843ba)
(cherry picked from commit 24628ecbbe9d5fdd4fe6767ca92395f0d3da9e48)
(cherry picked from commit 65f0789df05e2ba7f11c0eaf2c6959367acbced2)
(cherry picked from commit e8f00617ed319aa37f6946cf10883eef6d180612)
---
 nova/image/format_inspector.py                | 109 +++++++++++++++++-
 .../tests/unit/image/test_format_inspector.py | 106 ++++++++++++++---
 nova/tests/unit/virt/test_images.py           |  28 +++++
 nova/virt/images.py                           |   5 +
 4 files changed, 230 insertions(+), 18 deletions(-)

diff --git a/nova/image/format_inspector.py b/nova/image/format_inspector.py
index 8e57d7ed2c4..49cb75930a9 100644
--- a/nova/image/format_inspector.py
+++ b/nova/image/format_inspector.py
@@ -24,6 +24,7 @@
 import struct
 
 from oslo_log import log as logging
+from oslo_utils import units
 
 LOG = logging.getLogger(__name__)
 
@@ -843,6 +844,93 @@ def __str__(self):
         return 'vdi'
 
 
+class ISOInspector(FileInspector):
+    """ISO 9660 and UDF format
+
+    we need to check the first 32KB + descriptor size
+    to look for the ISO 9660 or UDF signature.
+
+    http://wiki.osdev.org/ISO_9660
+    http://wiki.osdev.org/UDF
+    mkisofs --help  | grep udf
+
+    The Universal Disc Format or UDF is the filesystem used on DVDs and
+    Blu-Ray discs.UDF is an extension of ISO 9660 and shares the same
+    header structure and initial layout.
+
+    Like the CDFS(ISO 9660) file system,
+    the UDF file system uses a 2048 byte sector size,
+    and it designates that the first 16 sectors can be used by the OS
+    to store proprietary data or boot logic.
+
+    That means we need to check the first 32KB + descriptor size
+    to look for the ISO 9660 or UDF signature.
+    both formats have an extent based layout, so we can't determine
+    ahead of time where the descriptor will be located.
+
+    fortunately, the ISO 9660 and UDF formats have a Primary Volume Descriptor
+    located at the beginning of the image, which contains the volume size.
+
+    """
+
+    def __init__(self, *a, **k):
+        super(ISOInspector, self).__init__(*a, **k)
+        self.new_region('system_area', CaptureRegion(0, 32 * units.Ki))
+        self.new_region('header', CaptureRegion(32 * units.Ki, 2 * units.Ki))
+
+    @property
+    def format_match(self):
+        if not self.complete:
+            return False
+        signature = self.region('header').data[1:6]
+        assert len(signature) == 5
+        return signature in (b'CD001', b'NSR02', b'NSR03')
+
+    @property
+    def virtual_size(self):
+        if not self.complete:
+            return 0
+        if not self.format_match:
+            return 0
+
+        # the header size is 2KB or 1 sector
+        # the first header field is the descriptor type which is 1 byte
+        # the second field is the standard identifier which is 5 bytes
+        # the third field is the version which is 1 byte
+        # the rest of the header contains type specific data is 2041 bytes
+        # see http://wiki.osdev.org/ISO_9660#The_Primary_Volume_Descriptor
+
+        # we need to check that the descriptor type is 1
+        # to ensure that this is a primary volume descriptor
+        descriptor_type = self.region('header').data[0]
+        if descriptor_type != 1:
+            return 0
+        # The size in bytes of a logical block is stored at offset 128
+        # and is 2 bytes long encoded in both little and big endian
+        # int16_LSB-MSB so the field is 4 bytes long
+        logical_block_size_data = self.region('header').data[128:132]
+        assert len(logical_block_size_data) == 4
+        # given the encoding we only need to read half the field so we
+        # can use the first 2 bytes which are the little endian part
+        # this is normally 2048 or 2KB but we need to check as it can be
+        # different according to the ISO 9660 standard.
+        logical_block_size, = struct.unpack('<H', logical_block_size_data[:2])
+        # The volume space size is the total number of logical blocks
+        # and is stored at offset 80 and is 8 bytes long
+        # as with the logical block size the field is encoded in both
+        # little and big endian as an int32_LSB-MSB
+        volume_space_size_data = self.region('header').data[80:88]
+        assert len(volume_space_size_data) == 8
+        # given the encoding we only need to read half the field so we
+        # can use the first 4 bytes which are the little endian part
+        volume_space_size, = struct.unpack('<L', volume_space_size_data[:4])
+        # the virtual size is the volume space size * logical block size
+        return volume_space_size * logical_block_size
+
+    def __str__(self):
+        return 'iso'
+
+
 class InfoWrapper(object):
     """A file-like object that wraps another and updates a format inspector.
 
@@ -896,6 +984,7 @@ def close(self):
     'vmdk': VMDKInspector,
     'vdi': VDIInspector,
     'qed': QEDInspector,
+    'iso': ISOInspector,
 }
 
 
@@ -913,12 +1002,15 @@ def detect_file_format(filename):
     """Attempts to detect the format of a file.
 
     This runs through a file one time, running all the known inspectors in
-    parallel. It stops reading the file once one of them matches or all of
+    parallel. It stops reading the file once all of them matches or all of
     them are sure they don't match.
 
-    Returns the FileInspector that matched, if any. None if 'raw'.
+    :param filename: The path to the file to inspect.
+    :returns: A FormatInspector instance matching the file.
+    :raises: ImageFormatError if multiple formats are detected.
     """
     inspectors = {k: v() for k, v in ALL_FORMATS.items()}
+    detections = []
     with open(filename, 'rb') as f:
         for chunk in chunked_reader(f):
             for format, inspector in list(inspectors.items()):
@@ -930,10 +1022,17 @@ def detect_file_format(filename):
                     continue
                 if (inspector.format_match and inspector.complete and
                         format != 'raw'):
-                    # First complete match (other than raw) wins
-                    return inspector
+                    # record all match (other than raw)
+                    detections.append(inspector)
+                    inspectors.pop(format)
             if all(i.complete for i in inspectors.values()):
                 # If all the inspectors are sure they are not a match, avoid
                 # reading to the end of the file to settle on 'raw'.
                 break
-    return inspectors['raw']
+
+    if len(detections) > 1:
+        all_formats = [str(inspector) for inspector in detections]
+        raise ImageFormatError(
+            'Multiple formats detected: %s' % ', '.join(all_formats))
+
+    return inspectors['raw'] if not detections else detections[0]
diff --git a/nova/tests/unit/image/test_format_inspector.py b/nova/tests/unit/image/test_format_inspector.py
index e232bf82e4c..cbf000f596a 100644
--- a/nova/tests/unit/image/test_format_inspector.py
+++ b/nova/tests/unit/image/test_format_inspector.py
@@ -54,7 +54,13 @@ def tearDown(self):
             except Exception:
                 pass
 
-    def _create_iso(self, image_size, subformat='iso-9660'):
+    def _create_iso(self, image_size, subformat='9660'):
+        """Create an ISO file of the given size.
+
+        :param image_size: The size of the image to create in bytes
+        :param subformat: The subformat to use, if any
+        """
+
         # these tests depend on mkisofs
         # being installed and in the path,
         # if it is not installed, skip
@@ -86,12 +92,22 @@ def _create_iso(self, image_size, subformat='iso-9660'):
             'dd if=/dev/zero of=%s bs=1M count=%i' % (fn, size),
             shell=True)
         subprocess.check_output(
-            '%s -o %s -V "TEST" -J -r %s' % (base_cmd, fn, fn),
+            '%s -V "TEST" -o %s  %s' % (base_cmd, fn, fn),
             shell=True)
         return fn
 
-    def _create_img(self, fmt, size, subformat=None, options=None,
-                    backing_file=None):
+    def _create_img(
+            self, fmt, size, subformat=None, options=None,
+            backing_file=None):
+        """Create an image file of the given format and size.
+
+        :param fmt: The format to create
+        :param size: The size of the image to create in bytes
+        :param subformat: The subformat to use, if any
+        :param options: A dictionary of options to pass to the format
+        :param backing_file: The backing file to use, if any
+        """
+
         if fmt == 'iso':
             return self._create_iso(size, subformat)
 
@@ -177,6 +193,13 @@ def _test_format_at_block_size(self, format_name, img, block_size):
 
     def _test_format_at_image_size(self, format_name, image_size,
                                    subformat=None):
+        """Test the format inspector for the given format at the
+        given image size.
+
+        :param format_name: The format to test
+        :param image_size: The size of the image to create in bytes
+        :param subformat: The subformat to use, if any
+        """
         img = self._create_img(format_name, image_size, subformat=subformat)
 
         # Some formats have internal alignment restrictions making this not
@@ -185,7 +208,15 @@ def _test_format_at_image_size(self, format_name, image_size,
 
         # Read the format in various sizes, some of which will read whole
         # sections in a single read, others will be completely unaligned, etc.
-        for block_size in (64 * units.Ki, 512, 17, 1 * units.Mi):
+        block_sizes = [64 * units.Ki, 1 * units.Mi]
+        # ISO images have a 32KB system area at the beginning of the image
+        # as a result reading that in 17 or 512 byte blocks takes too long,
+        # causing the test to fail. The 64KiB block size is enough to read
+        # the system area and header in a single read. the 1MiB block size
+        # adds very little time to the test so we include it.
+        if format_name != 'iso':
+            block_sizes.extend([17, 512])
+        for block_size in block_sizes:
             fmt = self._test_format_at_block_size(format_name, img, block_size)
             self.assertTrue(fmt.format_match,
                             'Failed to match %s at size %i block %i' % (
@@ -210,14 +241,63 @@ def test_qcow2(self):
         self._test_format('qcow2')
 
     def test_iso_9660(self):
-        # reproduce iso-9660 format regression
-        self.assertRaises(
-            TypeError, self._test_format, 'iso', subformat='iso-9660')
-
-    def test_udf(self):
-        # reproduce udf format regression
-        self.assertRaises(
-            TypeError, self._test_format, 'iso', subformat='udf')
+        self._test_format('iso', subformat='9660')
+
+    def test_iso_udf(self):
+        self._test_format('iso', subformat='udf')
+
+    def _generate_bad_iso(self):
+        # we want to emulate a malicious user who uploads a an
+        # ISO file has a qcow2 header in the system area
+        # of the ISO file
+        # we will create a qcow2 image and an ISO file
+        # and then copy the qcow2 header to the ISO file
+        # e.g.
+        #   mkisofs -o orig.iso /etc/resolv.conf
+        #   qemu-img create orig.qcow2 -f qcow2 64M
+        #   dd if=orig.qcow2 of=outcome bs=32K count=1
+        #   dd if=orig.iso of=outcome bs=32K skip=1 seek=1
+
+        qcow = self._create_img('qcow2', 10 * units.Mi)
+        iso = self._create_iso(64 * units.Mi, subformat='9660')
+        # first ensure the files are valid
+        iso_fmt = self._test_format_at_block_size('iso', iso, 4 * units.Ki)
+        self.assertTrue(iso_fmt.format_match)
+        qcow_fmt = self._test_format_at_block_size('qcow2', qcow, 4 * units.Ki)
+        self.assertTrue(qcow_fmt.format_match)
+        # now copy the qcow2 header to an ISO file
+        prefix = TEST_IMAGE_PREFIX
+        prefix += '-bad-'
+        fn = tempfile.mktemp(prefix=prefix, suffix='.iso')
+        self._created_files.append(fn)
+        subprocess.check_output(
+            'dd if=%s of=%s bs=32K count=1' % (qcow, fn),
+            shell=True)
+        subprocess.check_output(
+            'dd if=%s of=%s bs=32K skip=1 seek=1' % (iso, fn),
+            shell=True)
+        return qcow, iso, fn
+
+    def test_bad_iso_qcow2(self):
+
+        _, _, fn = self._generate_bad_iso()
+
+        iso_check = self._test_format_at_block_size('iso', fn, 4 * units.Ki)
+        qcow_check = self._test_format_at_block_size('qcow2', fn, 4 * units.Ki)
+        # this system area of the ISO file is not considered part of the format
+        # the qcow2 header is in the system area of the ISO file
+        # so the ISO file is still valid
+        self.assertTrue(iso_check.format_match)
+        # the qcow2 header is in the system area of the ISO file
+        # but that will be parsed by the qcow2 format inspector
+        # and it will match
+        self.assertTrue(qcow_check.format_match)
+        # if we call format_inspector.detect_file_format it should detect
+        # and raise an exception because both match internally.
+        e = self.assertRaises(
+            format_inspector.ImageFormatError,
+            format_inspector.detect_file_format, fn)
+        self.assertIn('Multiple formats detected', str(e))
 
     def test_vhd(self):
         self._test_format('vhd')
diff --git a/nova/tests/unit/virt/test_images.py b/nova/tests/unit/virt/test_images.py
index 55943f7f308..8649644eb5f 100644
--- a/nova/tests/unit/virt/test_images.py
+++ b/nova/tests/unit/virt/test_images.py
@@ -235,6 +235,34 @@ def test_fetch_checks_vmdk_rules(self, mock_info, mock_fetch, mock_gi,
                                   images.fetch_to_raw, None, 'foo', 'anypath')
             self.assertIn('Invalid VMDK create-type specified', str(e))
 
+    @mock.patch('os.rename')
+    @mock.patch.object(images, 'IMAGE_API')
+    @mock.patch('nova.image.format_inspector.get_inspector')
+    @mock.patch.object(images, 'fetch')
+    @mock.patch('nova.privsep.qemu.unprivileged_qemu_img_info')
+    def test_fetch_iso_is_raw(self, mock_info, mock_fetch, mock_gi,
+                              mock_glance, mock_rename):
+        mock_glance.get.return_value = {'disk_format': 'iso'}
+        inspector = mock_gi.return_value.from_file.return_value
+        inspector.safety_check.return_value = True
+        # qemu-img does not have a parser for iso so it is treated as raw
+        info = {
+            "virtual-size": 356352,
+            "filename": "foo.iso",
+            "format": "raw",
+            "actual-size": 356352,
+            "dirty-flag": False
+        }
+        mock_info.return_value = jsonutils.dumps(info)
+        with mock.patch('os.path.exists', return_value=True):
+            images.fetch_to_raw(None, 'foo', 'anypath')
+        # Make sure we called info with -f raw for an iso, since qemu-img does
+        # not support iso
+        mock_info.assert_called_once_with('anypath.part', format='raw')
+        # Make sure that since we considered this to be a raw file, we did the
+        # just-rename-don't-convert path
+        mock_rename.assert_called_once_with('anypath.part', 'anypath')
+
     @mock.patch.object(images, 'IMAGE_API')
     @mock.patch('nova.image.format_inspector.get_inspector')
     @mock.patch.object(images, 'qemu_img_info')
diff --git a/nova/virt/images.py b/nova/virt/images.py
index 5ec0dc0b6ba..813696ed7d7 100644
--- a/nova/virt/images.py
+++ b/nova/virt/images.py
@@ -171,6 +171,11 @@ def do_image_deep_inspection(img, image_href, path):
             raise exception.ImageUnacceptable(
                 image_id=image_href,
                 reason=_('Image not in a supported format'))
+
+    if disk_format == 'iso':
+        # ISO image passed safety check; qemu will treat this as raw from here
+        disk_format = 'raw'
+
     return disk_format
 
 

From bc912493601c5df12dc1f4b7844fbd1d8e64cf8c Mon Sep 17 00:00:00 2001
From: Sean Mooney <work@seanmooney.info>
Date: Tue, 9 Jul 2024 15:09:09 +0100
Subject: [PATCH 79/93] fix qemu-img version dependent tests

while backporting Ia34203f246f0bc574e11476287dfb33fda7954fe

We observed that several of the tests showed distro-specific
behavior depending on whether qemu was installed in the test env,
which version is installed and how it was compiled.

This change ensures that, if qemu is present, it
supports the required formats; otherwise it skips the test.

Change-Id: I131996cdd7aaf1f52d4caac33b153753ff6db869
(cherry picked from commit cc2514d02e0b0ebaf60a46d02732f7f8facc3191)
(cherry picked from commit ae10fde55b113bc0a34bc69ff63bab809bc98ef3)
(cherry picked from commit bb2645e92c98da0e02d650dab5ab90cafcbb824b)
(cherry picked from commit 673103fd63a516dad3f6da14b95d34f9dd605c21)
(cherry picked from commit dae4230fcc1c5539ecab52eb5f7755cc844420cd)
---
 .../tests/unit/image/test_format_inspector.py | 20 +++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/nova/tests/unit/image/test_format_inspector.py b/nova/tests/unit/image/test_format_inspector.py
index cbf000f596a..210ac3a5477 100644
--- a/nova/tests/unit/image/test_format_inspector.py
+++ b/nova/tests/unit/image/test_format_inspector.py
@@ -111,18 +111,22 @@ def _create_img(
         if fmt == 'iso':
             return self._create_iso(size, subformat)
 
-        # these tests depend on qemu-img
-        # being installed and in the path,
-        # if it is not installed, skip
-        try:
-            subprocess.check_output('qemu-img --version', shell=True)
-        except Exception:
-            self.skipTest('qemu-img not installed')
-
         if fmt == 'vhd':
             # QEMU calls the vhd format vpc
             fmt = 'vpc'
 
+        # these tests depend on qemu-img being installed and in the path,
+        # if it is not installed, skip. we also need to ensure that the
+        # format is supported by qemu-img, this can vary depending on the
+        # distribution so we need to check if the format is supported via
+        # the help output.
+        try:
+            subprocess.check_output(
+                'qemu-img --help | grep %s' % fmt, shell=True)
+        except Exception:
+            self.skipTest(
+                'qemu-img not installed or does not support %s format' % fmt)
+
         if options is None:
             options = {}
         opt = ''

From 76c43c4c56221cb836cf07b2df749cd94ec276e1 Mon Sep 17 00:00:00 2001
From: Balazs Gibizer <gibi@redhat.com>
Date: Thu, 11 Jul 2024 07:29:40 +0200
Subject: [PATCH 80/93] Stabilize iso format unit tests

Some versions of mkisofs do not properly handle the case where both the
input and the output file of the command are the same. So this commit
changes the unit tests depending on that binary to use different files.

Related-Bug: #2059809
Change-Id: I6924eb23ff5804c22a48ec6fabcec25f061906bb
(cherry picked from commit c6d8c6972d52845774b36acb84cd08a4b2e4dcde)
(cherry picked from commit a8783a767551df3dd943bd862cdba35c51cdb7a6)
(cherry picked from commit 02147b36d35e1e462e1405c36a2e67a33de806de)
(cherry picked from commit 47428f6caf503b94583dac614b59971f60a0ba9c)
(cherry picked from commit 11613e7b3244958fa8d0b5253a185287d1ade2d8)
---
 nova/tests/unit/image/test_format_inspector.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/nova/tests/unit/image/test_format_inspector.py b/nova/tests/unit/image/test_format_inspector.py
index 210ac3a5477..f74731f22df 100644
--- a/nova/tests/unit/image/test_format_inspector.py
+++ b/nova/tests/unit/image/test_format_inspector.py
@@ -91,10 +91,15 @@ def _create_iso(self, image_size, subformat='9660'):
         subprocess.check_output(
             'dd if=/dev/zero of=%s bs=1M count=%i' % (fn, size),
             shell=True)
+        # We need to use different file as input and output as the behavior
+        # of mkisofs is version dependent if both the input and the output
+        # are the same and can cause test failures
+        out_fn = "%s.iso" % fn
         subprocess.check_output(
-            '%s -V "TEST" -o %s  %s' % (base_cmd, fn, fn),
+            '%s -V "TEST" -o %s  %s' % (base_cmd, out_fn, fn),
             shell=True)
-        return fn
+        self._created_files.append(out_fn)
+        return out_fn
 
     def _create_img(
             self, fmt, size, subformat=None, options=None,

From 13a6768af4e9400a6af35f7b130dd6b66c5a02dd Mon Sep 17 00:00:00 2001
From: Dan Smith <dansmith@redhat.com>
Date: Wed, 10 Jul 2024 14:23:33 +0100
Subject: [PATCH 81/93] Change force_format strategy to catch mismatches

When we moved the qemu-img command in fetch_to_raw() to force the
format to what we expect, we lost the ability to identify and react
to situations where qemu-img detected a file as a format that is not
supported by us (i.e. identified and safety-checked by
format_inspector). In the case of some of the other VMDK variants
that we don't support, we need to be sure to catch any case where
qemu-img thinks it's something other than raw when we think it is,
which will be the case for those formats we don't support.

Note this also moves us from explicitly using the format_inspector
that we're told by glance is appropriate, to using our own detection.
We assert that we agree with glance and as above, qemu agrees with
us. This helps us avoid cases where the uploader lies about the
image format, causing us to not run the appropriate safety check.
AMI formats are a liability here since we have a very hard time
asserting what they are and what they will be detected as later in
the pipeline, so there is still special-casing for those.

Closes-Bug: #2071734
Change-Id: I4b792c5bc959a904854c21565682ed3a687baa1a
(cherry picked from commit 8b4c522f6699514e7d1f20ac25cf426af6ea588f)
(cherry picked from commit 8ef5ec9716c9edbd662ca27b6e39b7848b14f492)
(cherry picked from commit 45d948938314997ba400a5fc2bb48bc821c260ab)
(cherry picked from commit fbe429051e1fbcd494c71525870651e92e121449)
(cherry picked from commit a54125212ff923d055e8efc7d4f2992b0f934679)
---
 nova/tests/unit/virt/libvirt/test_utils.py | 23 +++---
 nova/tests/unit/virt/test_images.py        | 96 +++++++++++++---------
 nova/virt/images.py                        | 62 +++++++++-----
 3 files changed, 108 insertions(+), 73 deletions(-)

diff --git a/nova/tests/unit/virt/libvirt/test_utils.py b/nova/tests/unit/virt/libvirt/test_utils.py
index 8a43c45f34b..0d0fad30b33 100644
--- a/nova/tests/unit/virt/libvirt/test_utils.py
+++ b/nova/tests/unit/virt/libvirt/test_utils.py
@@ -390,12 +390,12 @@ def test_fetch_initrd_image(self, mock_images):
             _context, image_id, target, trusted_certs)
 
     @mock.patch.object(images, 'IMAGE_API')
-    @mock.patch.object(format_inspector, 'get_inspector')
+    @mock.patch.object(format_inspector, 'detect_file_format')
     @mock.patch.object(compute_utils, 'disk_ops_semaphore')
     @mock.patch('nova.privsep.utils.supports_direct_io', return_value=True)
     @mock.patch('nova.privsep.qemu.unprivileged_convert_image')
     def test_fetch_raw_image(self, mock_convert_image, mock_direct_io,
-                             mock_disk_op_sema, mock_gi, mock_glance):
+                             mock_disk_op_sema, mock_detect, mock_glance):
 
         def fake_rename(old, new):
             self.executes.append(('mv', old, new))
@@ -435,7 +435,7 @@ class FakeImgInfo(object):
         self.stub_out('oslo_utils.fileutils.delete_if_exists',
                       fake_rm_on_error)
 
-        mock_inspector = mock_gi.return_value.from_file.return_value
+        mock_inspector = mock_detect.return_value
 
         # Since the remove param of fileutils.remove_path_on_error()
         # is initialized at load time, we must provide a wrapper
@@ -449,6 +449,7 @@ class FakeImgInfo(object):
 
         # Make sure qcow2 gets converted to raw
         mock_inspector.safety_check.return_value = True
+        mock_inspector.__str__.return_value = 'qcow2'
         mock_glance.get.return_value = {'disk_format': 'qcow2'}
         target = 't.qcow2'
         self.executes = []
@@ -462,12 +463,13 @@ class FakeImgInfo(object):
             CONF.instances_path, False)
         mock_convert_image.reset_mock()
         mock_inspector.safety_check.assert_called_once_with()
-        mock_gi.assert_called_once_with('qcow2')
+        mock_detect.assert_called_once_with('t.qcow2.part')
 
         # Make sure raw does not get converted
-        mock_gi.reset_mock()
+        mock_detect.reset_mock()
         mock_inspector.safety_check.reset_mock()
         mock_inspector.safety_check.return_value = True
+        mock_inspector.__str__.return_value = 'raw'
         mock_glance.get.return_value = {'disk_format': 'raw'}
         target = 't.raw'
         self.executes = []
@@ -476,12 +478,13 @@ class FakeImgInfo(object):
         self.assertEqual(self.executes, expected_commands)
         mock_convert_image.assert_not_called()
         mock_inspector.safety_check.assert_called_once_with()
-        mock_gi.assert_called_once_with('raw')
+        mock_detect.assert_called_once_with('t.raw.part')
 
         # Make sure safety check failure prevents us from proceeding
-        mock_gi.reset_mock()
+        mock_detect.reset_mock()
         mock_inspector.safety_check.reset_mock()
         mock_inspector.safety_check.return_value = False
+        mock_inspector.__str__.return_value = 'qcow2'
         mock_glance.get.return_value = {'disk_format': 'qcow2'}
         target = 'backing.qcow2'
         self.executes = []
@@ -491,10 +494,10 @@ class FakeImgInfo(object):
         self.assertEqual(self.executes, expected_commands)
         mock_convert_image.assert_not_called()
         mock_inspector.safety_check.assert_called_once_with()
-        mock_gi.assert_called_once_with('qcow2')
+        mock_detect.assert_called_once_with('backing.qcow2.part')
 
         # Make sure a format mismatch prevents us from proceeding
-        mock_gi.reset_mock()
+        mock_detect.reset_mock()
         mock_inspector.safety_check.reset_mock()
         mock_inspector.safety_check.side_effect = (
             format_inspector.ImageFormatError)
@@ -507,7 +510,7 @@ class FakeImgInfo(object):
         self.assertEqual(self.executes, expected_commands)
         mock_convert_image.assert_not_called()
         mock_inspector.safety_check.assert_called_once_with()
-        mock_gi.assert_called_once_with('qcow2')
+        mock_detect.assert_called_once_with('backing.qcow2.part')
 
         del self.executes
 
diff --git a/nova/tests/unit/virt/test_images.py b/nova/tests/unit/virt/test_images.py
index 8649644eb5f..0f5cb7c1928 100644
--- a/nova/tests/unit/virt/test_images.py
+++ b/nova/tests/unit/virt/test_images.py
@@ -21,7 +21,6 @@
 
 from nova.compute import utils as compute_utils
 from nova import exception
-from nova.image import format_inspector
 from nova import test
 from nova.virt import images
 
@@ -101,15 +100,16 @@ def test_qemu_img_info_with_disk_not_found(self, exists, mocked_execute):
         mocked_execute.assert_called_once()
 
     @mock.patch.object(images, 'IMAGE_API')
-    @mock.patch('nova.image.format_inspector.get_inspector')
+    @mock.patch('nova.image.format_inspector.detect_file_format')
     @mock.patch.object(images, 'convert_image',
                        side_effect=exception.ImageUnacceptable)
     @mock.patch.object(images, 'qemu_img_info')
     @mock.patch.object(images, 'fetch')
     def test_fetch_to_raw_errors(self, convert_image, qemu_img_info, fetch,
-                                 get_inspector, glance):
-        inspector = get_inspector.return_value.from_file.return_value
+                                 mock_detect, glance):
+        inspector = mock_detect.return_value
         inspector.safety_check.return_value = True
+        inspector.__str__.return_value = 'qcow2'
         glance.get.return_value = {'disk_format': 'qcow2'}
         qemu_img_info.backing_file = None
         qemu_img_info.file_format = 'qcow2'
@@ -120,16 +120,17 @@ def test_fetch_to_raw_errors(self, convert_image, qemu_img_info, fetch,
                                None, 'href123', '/no/path')
 
     @mock.patch.object(images, 'IMAGE_API')
-    @mock.patch('nova.image.format_inspector.get_inspector')
+    @mock.patch('nova.image.format_inspector.detect_file_format')
     @mock.patch.object(images, 'convert_image',
                        side_effect=exception.ImageUnacceptable)
     @mock.patch.object(images, 'qemu_img_info')
     @mock.patch.object(images, 'fetch')
     def test_fetch_to_raw_data_file(self, convert_image, qemu_img_info_fn,
-                                    fetch, mock_gi, mock_glance):
+                                    fetch, mock_detect, mock_glance):
         mock_glance.get.return_value = {'disk_format': 'qcow2'}
-        inspector = mock_gi.return_value.from_file.return_value
+        inspector = mock_detect.return_value
         inspector.safety_check.return_value = True
+        inspector.__str__.return_value = 'qcow2'
         # NOTE(danms): the above test needs the following line as well, as it
         # is broken without it.
         qemu_img_info = qemu_img_info_fn.return_value
@@ -142,16 +143,17 @@ def test_fetch_to_raw_data_file(self, convert_image, qemu_img_info_fn,
                                images.fetch_to_raw,
                                None, 'href123', '/no/path')
 
-    @mock.patch('nova.image.format_inspector.get_inspector')
+    @mock.patch('nova.image.format_inspector.detect_file_format')
     @mock.patch.object(images, 'IMAGE_API')
     @mock.patch('os.rename')
     @mock.patch.object(images, 'qemu_img_info')
     @mock.patch.object(images, 'fetch')
     def test_fetch_to_raw_from_raw(self, fetch, qemu_img_info_fn, mock_rename,
-                                   mock_glance, mock_gi):
+                                   mock_glance, mock_detect):
         # Make sure we support a case where we fetch an already-raw image and
         # qemu-img returns None for "format_specific".
         mock_glance.get.return_value = {'disk_format': 'raw'}
+        mock_detect.return_value.__str__.return_value = 'raw'
         qemu_img_info = qemu_img_info_fn.return_value
         qemu_img_info.file_format = 'raw'
         qemu_img_info.backing_file = None
@@ -215,14 +217,15 @@ def test_convert_image_vmdk_allowed_list_checking(self):
                                                  format='json'))
 
     @mock.patch.object(images, 'IMAGE_API')
-    @mock.patch('nova.image.format_inspector.get_inspector')
+    @mock.patch('nova.image.format_inspector.detect_file_format')
     @mock.patch.object(images, 'fetch')
     @mock.patch('nova.privsep.qemu.unprivileged_qemu_img_info')
-    def test_fetch_checks_vmdk_rules(self, mock_info, mock_fetch, mock_gi,
+    def test_fetch_checks_vmdk_rules(self, mock_info, mock_fetch, mock_detect,
                                      mock_glance):
         mock_glance.get.return_value = {'disk_format': 'vmdk'}
-        inspector = mock_gi.return_value.from_file.return_value
+        inspector = mock_detect.return_value
         inspector.safety_check.return_value = True
+        inspector.__str__.return_value = 'vmdk'
         info = {'format': 'vmdk',
                 'format-specific': {
                     'type': 'vmdk',
@@ -238,13 +241,17 @@ def test_fetch_checks_vmdk_rules(self, mock_info, mock_fetch, mock_gi,
     @mock.patch('os.rename')
     @mock.patch.object(images, 'IMAGE_API')
     @mock.patch('nova.image.format_inspector.get_inspector')
+    @mock.patch('nova.image.format_inspector.detect_file_format')
     @mock.patch.object(images, 'fetch')
     @mock.patch('nova.privsep.qemu.unprivileged_qemu_img_info')
-    def test_fetch_iso_is_raw(self, mock_info, mock_fetch, mock_gi,
-                              mock_glance, mock_rename):
+    def test_fetch_iso_is_raw(
+        self, mock_info, mock_fetch, mock_detect_file_format, mock_gi,
+        mock_glance, mock_rename):
         mock_glance.get.return_value = {'disk_format': 'iso'}
         inspector = mock_gi.return_value.from_file.return_value
         inspector.safety_check.return_value = True
+        inspector.__str__.return_value = 'iso'
+        mock_detect_file_format.return_value = inspector
         # qemu-img does not have a parser for iso so it is treated as raw
         info = {
             "virtual-size": 356352,
@@ -258,27 +265,27 @@ def test_fetch_iso_is_raw(self, mock_info, mock_fetch, mock_gi,
             images.fetch_to_raw(None, 'foo', 'anypath')
         # Make sure we called info with -f raw for an iso, since qemu-img does
         # not support iso
-        mock_info.assert_called_once_with('anypath.part', format='raw')
+        mock_info.assert_called_once_with('anypath.part', format=None)
         # Make sure that since we considered this to be a raw file, we did the
         # just-rename-don't-convert path
         mock_rename.assert_called_once_with('anypath.part', 'anypath')
 
     @mock.patch.object(images, 'IMAGE_API')
-    @mock.patch('nova.image.format_inspector.get_inspector')
+    @mock.patch('nova.image.format_inspector.detect_file_format')
     @mock.patch.object(images, 'qemu_img_info')
     @mock.patch.object(images, 'fetch')
-    def test_fetch_to_raw_inspector(self, fetch, qemu_img_info, mock_gi,
+    def test_fetch_to_raw_inspector(self, fetch, qemu_img_info, mock_detect,
                                     mock_glance):
         # Image claims to be qcow2, is qcow2, but fails safety check, so we
         # abort before qemu-img-info
         mock_glance.get.return_value = {'disk_format': 'qcow2'}
-        inspector = mock_gi.return_value.from_file.return_value
+        inspector = mock_detect.return_value
         inspector.safety_check.return_value = False
+        inspector.__str__.return_value = 'qcow2'
         self.assertRaises(exception.ImageUnacceptable,
                           images.fetch_to_raw, None, 'href123', '/no.path')
         qemu_img_info.assert_not_called()
-        mock_gi.assert_called_once_with('qcow2')
-        mock_gi.return_value.from_file.assert_called_once_with('/no.path.part')
+        mock_detect.assert_called_once_with('/no.path.part')
         inspector.safety_check.assert_called_once_with()
         mock_glance.get.assert_called_once_with(None, 'href123')
 
@@ -292,18 +299,17 @@ def test_fetch_to_raw_inspector(self, fetch, qemu_img_info, mock_gi,
         # Image claims to be qcow2 in glance, but the image is something else,
         # so we abort before qemu-img-info
         qemu_img_info.reset_mock()
-        mock_gi.reset_mock()
+        mock_detect.reset_mock()
         inspector.safety_check.reset_mock()
-        mock_gi.return_value.from_file.side_effect = (
-            format_inspector.ImageFormatError)
+        mock_detect.return_value.__str__.return_value = 'vmdk'
         self.assertRaises(exception.ImageUnacceptable,
                           images.fetch_to_raw, None, 'href123', '/no.path')
-        mock_gi.assert_called_once_with('qcow2')
-        inspector.safety_check.assert_not_called()
+        mock_detect.assert_called_once_with('/no.path.part')
+        inspector.safety_check.assert_called_once_with()
         qemu_img_info.assert_not_called()
 
     @mock.patch.object(images, 'IMAGE_API')
-    @mock.patch('nova.image.format_inspector.get_inspector')
+    @mock.patch('nova.image.format_inspector.detect_file_format')
     @mock.patch.object(images, 'qemu_img_info')
     @mock.patch.object(images, 'fetch')
     def test_fetch_to_raw_inspector_disabled(self, fetch, qemu_img_info,
@@ -316,36 +322,41 @@ def test_fetch_to_raw_inspector_disabled(self, fetch, qemu_img_info,
         # If deep inspection is disabled, we should never call the inspector
         mock_gi.assert_not_called()
         # ... and we let qemu-img detect the format itself.
-        qemu_img_info.assert_called_once_with('/no.path.part',
-                                              format=None)
+        qemu_img_info.assert_called_once_with('/no.path.part')
         mock_glance.get.assert_not_called()
 
     @mock.patch.object(images, 'IMAGE_API')
     @mock.patch.object(images, 'qemu_img_info')
-    def test_fetch_inspect_ami(self, imginfo, glance):
+    @mock.patch('nova.image.format_inspector.detect_file_format')
+    def test_fetch_inspect_ami(self, detect, imginfo, glance):
         glance.get.return_value = {'disk_format': 'ami'}
+        detect.return_value.__str__.return_value = 'raw'
         self.assertRaises(exception.ImageUnacceptable,
                           images.fetch_to_raw, None, 'href123', '/no.path')
         # Make sure 'ami was translated into 'raw' before we call qemu-img
-        imginfo.assert_called_once_with('/no.path.part', format='raw')
+        imginfo.assert_called_once_with('/no.path.part')
 
     @mock.patch.object(images, 'IMAGE_API')
     @mock.patch.object(images, 'qemu_img_info')
-    def test_fetch_inspect_aki(self, imginfo, glance):
+    @mock.patch('nova.image.format_inspector.detect_file_format')
+    def test_fetch_inspect_aki(self, detect, imginfo, glance):
         glance.get.return_value = {'disk_format': 'aki'}
+        detect.return_value.__str__.return_value = 'raw'
         self.assertRaises(exception.ImageUnacceptable,
                           images.fetch_to_raw, None, 'href123', '/no.path')
         # Make sure 'aki was translated into 'raw' before we call qemu-img
-        imginfo.assert_called_once_with('/no.path.part', format='raw')
+        imginfo.assert_called_once_with('/no.path.part')
 
     @mock.patch.object(images, 'IMAGE_API')
     @mock.patch.object(images, 'qemu_img_info')
-    def test_fetch_inspect_ari(self, imginfo, glance):
+    @mock.patch('nova.image.format_inspector.detect_file_format')
+    def test_fetch_inspect_ari(self, detect, imginfo, glance):
         glance.get.return_value = {'disk_format': 'ari'}
+        detect.return_value.__str__.return_value = 'raw'
         self.assertRaises(exception.ImageUnacceptable,
                           images.fetch_to_raw, None, 'href123', '/no.path')
         # Make sure 'aki was translated into 'raw' before we call qemu-img
-        imginfo.assert_called_once_with('/no.path.part', format='raw')
+        imginfo.assert_called_once_with('/no.path.part')
 
     @mock.patch.object(images, 'IMAGE_API')
     @mock.patch.object(images, 'qemu_img_info')
@@ -358,13 +369,16 @@ def test_fetch_inspect_unknown_format(self, imginfo, glance):
 
     @mock.patch.object(images, 'IMAGE_API')
     @mock.patch.object(images, 'qemu_img_info')
-    @mock.patch('nova.image.format_inspector.get_inspector')
-    def test_fetch_inspect_disagrees_qemu(self, mock_gi, imginfo, glance):
+    @mock.patch('nova.image.format_inspector.detect_file_format')
+    def test_fetch_inspect_disagrees_qemu(self, mock_detect, imginfo, glance):
         glance.get.return_value = {'disk_format': 'qcow2'}
+        mock_detect.return_value.__str__.return_value = 'qcow2'
         # Glance and inspector think it is a qcow2 file, but qemu-img does not
-        # agree. It was forced to interpret as a qcow2, but returned no
-        # format information as a result.
+        # agree.
         imginfo.return_value.data_file = None
-        self.assertRaises(exception.ImageUnacceptable,
-                          images.fetch_to_raw, None, 'href123', '/no.path')
-        imginfo.assert_called_once_with('/no.path.part', format='qcow2')
+        imginfo.return_value.file_format = 'vmdk'
+        ex = self.assertRaises(exception.ImageUnacceptable,
+                               images.fetch_to_raw,
+                               None, 'href123', '/no.path')
+        self.assertIn('content does not match disk_format', str(ex))
+        imginfo.assert_called_once_with('/no.path.part')
diff --git a/nova/virt/images.py b/nova/virt/images.py
index 813696ed7d7..193c80fb636 100644
--- a/nova/virt/images.py
+++ b/nova/virt/images.py
@@ -140,42 +140,50 @@ def check_vmdk_image(image_id, data):
 
 
 def do_image_deep_inspection(img, image_href, path):
+    ami_formats = ('ami', 'aki', 'ari')
     disk_format = img['disk_format']
     try:
         # NOTE(danms): Use our own cautious inspector module to make sure
         # the image file passes safety checks.
         # See https://bugs.launchpad.net/nova/+bug/2059809 for details.
-        inspector_cls = format_inspector.get_inspector(disk_format)
-        if not inspector_cls.from_file(path).safety_check():
+
+        # Make sure we have a format inspector for the claimed format, else
+        # it is something we do not support and must reject. AMI is excluded.
+        if (disk_format not in ami_formats and
+                not format_inspector.get_inspector(disk_format)):
+            raise exception.ImageUnacceptable(
+                image_id=image_href,
+                reason=_('Image not in a supported format'))
+
+        inspector = format_inspector.detect_file_format(path)
+        if not inspector.safety_check():
             raise exception.ImageUnacceptable(
                 image_id=image_href,
                 reason=(_('Image does not pass safety check')))
+
+        # AMI formats can be other things, so don't obsess over this
+        # requirement for them. Otherwise, make sure our detection agrees
+        # with glance.
+        if disk_format not in ami_formats and str(inspector) != disk_format:
+            # If we detected the image as something other than glance claimed,
+            # we abort.
+            raise exception.ImageUnacceptable(
+                image_id=image_href,
+                reason=_('Image content does not match disk_format'))
     except format_inspector.ImageFormatError:
         # If the inspector we chose based on the image's metadata does not
         # think the image is the proper format, we refuse to use it.
         raise exception.ImageUnacceptable(
             image_id=image_href,
             reason=_('Image content does not match disk_format'))
-    except AttributeError:
-        # No inspector was found
-        LOG.warning('Unable to perform deep image inspection on type %r',
-                    img['disk_format'])
-        if disk_format in ('ami', 'aki', 'ari'):
-            # A lot of things can be in a UEC, although it is typically a raw
-            # filesystem. We really have nothing we can do other than treat it
-            # like a 'raw', which is what qemu-img will detect a filesystem as
-            # anyway. If someone puts a qcow2 inside, we should fail because
-            # we won't do our inspection.
-            disk_format = 'raw'
-        else:
-            raise exception.ImageUnacceptable(
-                image_id=image_href,
-                reason=_('Image not in a supported format'))
-
-    if disk_format == 'iso':
-        # ISO image passed safety check; qemu will treat this as raw from here
+    except Exception:
+        raise exception.ImageUnacceptable(
+            image_id=image_href,
+            reason=_('Image not in a supported format'))
+    if disk_format in ('iso',) + ami_formats:
+        # ISO or AMI image passed safety check; qemu will treat this as raw
+        # from here so return the expected formats it will find.
         disk_format = 'raw'
-
     return disk_format
 
 
@@ -194,12 +202,22 @@ def fetch_to_raw(context, image_href, path, trusted_certs=None):
 
         # Only run qemu-img after we have done deep inspection (if enabled).
         # If it was not enabled, we will let it detect the format.
-        data = qemu_img_info(path_tmp, format=force_format)
+        data = qemu_img_info(path_tmp)
         fmt = data.file_format
         if fmt is None:
             raise exception.ImageUnacceptable(
                 reason=_("'qemu-img info' parsing failed."),
                 image_id=image_href)
+        elif force_format is not None and fmt != force_format:
+            # Format inspector and qemu-img must agree on the format, else
+            # we reject. This will catch VMDK some variants that we don't
+            # explicitly support because qemu will identify them as such
+            # and we will not.
+            LOG.warning('Image %s detected by qemu as %s but we expected %s',
+                        image_href, fmt, force_format)
+            raise exception.ImageUnacceptable(
+                image_id=image_href,
+                reason=_('Image content does not match disk_format'))
 
         backing_file = data.backing_file
         if backing_file is not None:

From fc02bdf830054268df814adc269443e470972719 Mon Sep 17 00:00:00 2001
From: Bence Romsics <bence.romsics@gmail.com>
Date: Mon, 14 Aug 2023 13:03:13 +0200
Subject: [PATCH 82/93] Reproduce bug #2025480 in a functional test

Written by gibi, I just cleaned it up.

Change-Id: I8386a846b3685b8d03c59334ccfb2efbd4afe427
Co-Authored-By: Balazs Gibizer <gibizer@gmail.com>
Related-Bug: #2025480
(cherry picked from commit 62300d4885549368f874b3e07b756017ff96c659)
(cherry picked from commit 477ff2667d7ecd218fa5163d86d2719979dcdcd3)
(cherry picked from commit 23c190a35839b396418d3e98af1e67587f9e9296)
(cherry picked from commit 7422642dd6e222959470b73dc6ba2450a792b9c7)
---
 .../regressions/test_bug_2025480.py           | 87 +++++++++++++++++++
 1 file changed, 87 insertions(+)
 create mode 100644 nova/tests/functional/regressions/test_bug_2025480.py

diff --git a/nova/tests/functional/regressions/test_bug_2025480.py b/nova/tests/functional/regressions/test_bug_2025480.py
new file mode 100644
index 00000000000..f6c87109f79
--- /dev/null
+++ b/nova/tests/functional/regressions/test_bug_2025480.py
@@ -0,0 +1,87 @@
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+from unittest import mock
+
+from nova import context
+from nova.objects import compute_node
+from nova import test
+from nova.tests import fixtures as nova_fixtures
+from nova.tests.functional import fixtures as func_fixtures
+from nova.tests.functional import integrated_helpers
+
+
+class UnshelveUpdateAvailableResourcesPeriodicRace(
+        test.TestCase, integrated_helpers.InstanceHelperMixin):
+    def setUp(self):
+        super(UnshelveUpdateAvailableResourcesPeriodicRace, self).setUp()
+
+        placement = func_fixtures.PlacementFixture()
+        self.useFixture(placement)
+        self.placement = placement.api
+        self.neutron = nova_fixtures.NeutronFixture(self)
+        self.useFixture(self.neutron)
+        self.useFixture(nova_fixtures.GlanceFixture(self))
+        # Start nova services.
+        self.api = self.useFixture(nova_fixtures.OSAPIFixture(
+            api_version='v2.1')).admin_api
+        self.api.microversion = 'latest'
+        self.notifier = self.useFixture(
+            nova_fixtures.NotificationFixture(self))
+
+        self.start_service('conductor')
+        self.start_service('scheduler')
+
+    def test_unshelve_spawning_update_available_resources(self):
+        compute = self._start_compute('compute1')
+
+        server = self._create_server(
+            networks=[{'port': self.neutron.port_1['id']}])
+
+        node = compute_node.ComputeNode.get_by_nodename(
+            context.get_admin_context(), 'compute1')
+        self.assertEqual(1, node.vcpus_used)
+
+        # with default config shelve means immediate offload as well
+        req = {
+            'shelve': {}
+        }
+        self.api.post_server_action(server['id'], req)
+        self._wait_for_server_parameter(
+            server, {'status': 'SHELVED_OFFLOADED',
+                     'OS-EXT-SRV-ATTR:host': None})
+
+        node = compute_node.ComputeNode.get_by_nodename(
+            context.get_admin_context(), 'compute1')
+        self.assertEqual(0, node.vcpus_used)
+
+        def fake_spawn(*args, **kwargs):
+            self._run_periodics()
+
+        with mock.patch.object(
+                compute.driver, 'spawn', side_effect=fake_spawn):
+            req = {'unshelve': None}
+            self.api.post_server_action(server['id'], req)
+            self.notifier.wait_for_versioned_notifications(
+                'instance.unshelve.start')
+            self._wait_for_server_parameter(
+                server,
+                {
+                    'status': 'ACTIVE',
+                    'OS-EXT-STS:task_state': None,
+                    'OS-EXT-SRV-ATTR:host': 'compute1',
+                })
+
+        node = compute_node.ComputeNode.get_by_nodename(
+            context.get_admin_context(), 'compute1')
+        # This is the bug, the instance should have resources claimed
+        # self.assertEqual(1, node.vcpus_used)
+        self.assertEqual(0, node.vcpus_used)

From 22dfd1fa42aaff6bd0a63b61d47b569422a5be14 Mon Sep 17 00:00:00 2001
From: Bence Romsics <bence.romsics@gmail.com>
Date: Wed, 2 Aug 2023 16:22:55 +0200
Subject: [PATCH 83/93] Do not untrack resources of a server being unshelved

This patch concerns the time when a VM is being unshelved and the
compute manager has set the task_state to spawning, claimed resources
for the VM and then called driver.spawn(). So the instance is in
vm_state SHELVED_OFFLOADED, task_state spawning.

At this point a new update_available_resource periodic job may be
started that collects all the instances assigned to the node to
calculate resource usage. However, the calculation assumed that a
VM in SHELVED_OFFLOADED state does not need resource allocation on
the node (probably being removed from the node as it is offloaded)
and deleted the resource claim.

Given all this we ended up with the VM spawned successfully but having
lost the resource claim on the node.

This patch changes what we do in vm_state SHELVED_OFFLOADED, task_state
spawning. We no longer delete the resource claim in this state and
keep tracking the resource in stats.

Change-Id: I8c9944810c09d501a6d3f60f095d9817b756872d
Closes-Bug: #2025480
(cherry picked from commit f1dc4ec39bcfda1bd4b97e233a9da498b6378c4f)
(cherry picked from commit 4239d1fec2814c074482b740a2fd38a5d5ce6942)
(cherry picked from commit 683ecc060e3bca818b9fb514d297e323bc8cb220)
(cherry picked from commit e41962f5fa59c47e63468945e82c2e7164c24c38)
---
 nova/compute/manager.py                          |  6 +++---
 nova/compute/resource_tracker.py                 |  7 +++++--
 nova/compute/stats.py                            |  3 ++-
 nova/compute/vm_states.py                        | 11 +++++++++--
 .../functional/regressions/test_bug_2025480.py   |  5 ++---
 nova/tests/unit/compute/test_stats.py            | 16 ++++++++++++++++
 6 files changed, 37 insertions(+), 11 deletions(-)

diff --git a/nova/compute/manager.py b/nova/compute/manager.py
index 44185ef667a..427cb3aac35 100644
--- a/nova/compute/manager.py
+++ b/nova/compute/manager.py
@@ -6646,9 +6646,9 @@ def _shelve_offload_instance(self, context, instance, clean_shutdown,
 
         instance.power_state = current_power_state
         # NOTE(mriedem): The vm_state has to be set before updating the
-        # resource tracker, see vm_states.ALLOW_RESOURCE_REMOVAL. The host/node
-        # values cannot be nulled out until after updating the resource tracker
-        # though.
+        # resource tracker, see vm_states.allow_resource_removal(). The
+        # host/node values cannot be nulled out until after updating the
+        # resource tracker though.
         instance.vm_state = vm_states.SHELVED_OFFLOADED
         instance.task_state = None
         instance.save(expected_task_state=[task_states.SHELVING,
diff --git a/nova/compute/resource_tracker.py b/nova/compute/resource_tracker.py
index 058777d1ed0..d195044a57c 100644
--- a/nova/compute/resource_tracker.py
+++ b/nova/compute/resource_tracker.py
@@ -1495,7 +1495,8 @@ def _update_usage_from_instance(self, context, instance, nodename,
         # NOTE(sfinucan): Both brand new instances as well as instances that
         # are being unshelved will have is_new_instance == True
         is_removed_instance = not is_new_instance and (is_removed or
-            instance['vm_state'] in vm_states.ALLOW_RESOURCE_REMOVAL)
+            vm_states.allow_resource_removal(
+                vm_state=instance['vm_state'], task_state=instance.task_state))
 
         if is_new_instance:
             self.tracked_instances.add(uuid)
@@ -1554,7 +1555,9 @@ def _update_usage_from_instances(self, context, instances, nodename):
 
         instance_by_uuid = {}
         for instance in instances:
-            if instance.vm_state not in vm_states.ALLOW_RESOURCE_REMOVAL:
+            if not vm_states.allow_resource_removal(
+                    vm_state=instance['vm_state'],
+                    task_state=instance.task_state):
                 self._update_usage_from_instance(context, instance, nodename)
             instance_by_uuid[instance.uuid] = instance
         return instance_by_uuid
diff --git a/nova/compute/stats.py b/nova/compute/stats.py
index cfbee2e6bc1..e9180ec6d6d 100644
--- a/nova/compute/stats.py
+++ b/nova/compute/stats.py
@@ -105,7 +105,8 @@ def update_stats_for_instance(self, instance, is_removed=False):
         (vm_state, task_state, os_type, project_id) = \
                 self._extract_state_from_instance(instance)
 
-        if is_removed or vm_state in vm_states.ALLOW_RESOURCE_REMOVAL:
+        if is_removed or vm_states.allow_resource_removal(
+                vm_state=vm_state, task_state=task_state):
             self._decrement("num_instances")
             self.states.pop(uuid)
         else:
diff --git a/nova/compute/vm_states.py b/nova/compute/vm_states.py
index 633894c1ea4..1c4da06d155 100644
--- a/nova/compute/vm_states.py
+++ b/nova/compute/vm_states.py
@@ -27,6 +27,7 @@
 See http://wiki.openstack.org/VMState
 """
 
+from nova.compute import task_states
 from nova.objects import fields
 
 
@@ -74,5 +75,11 @@
 # states we allow to trigger crash dump
 ALLOW_TRIGGER_CRASH_DUMP = [ACTIVE, PAUSED, RESCUED, RESIZED, ERROR]
 
-# states we allow resources to be freed in
-ALLOW_RESOURCE_REMOVAL = [DELETED, SHELVED_OFFLOADED]
+
+def allow_resource_removal(vm_state, task_state=None):
+    """(vm_state, task_state) combinations we allow resources to be freed in"""
+
+    return (
+        vm_state == DELETED or
+        vm_state == SHELVED_OFFLOADED and task_state != task_states.SPAWNING
+    )
diff --git a/nova/tests/functional/regressions/test_bug_2025480.py b/nova/tests/functional/regressions/test_bug_2025480.py
index f6c87109f79..c707a40a846 100644
--- a/nova/tests/functional/regressions/test_bug_2025480.py
+++ b/nova/tests/functional/regressions/test_bug_2025480.py
@@ -82,6 +82,5 @@ def fake_spawn(*args, **kwargs):
 
         node = compute_node.ComputeNode.get_by_nodename(
             context.get_admin_context(), 'compute1')
-        # This is the bug, the instance should have resources claimed
-        # self.assertEqual(1, node.vcpus_used)
-        self.assertEqual(0, node.vcpus_used)
+        # After the fix, the instance should have resources claimed
+        self.assertEqual(1, node.vcpus_used)
diff --git a/nova/tests/unit/compute/test_stats.py b/nova/tests/unit/compute/test_stats.py
index e713794a19a..b95475f09db 100644
--- a/nova/tests/unit/compute/test_stats.py
+++ b/nova/tests/unit/compute/test_stats.py
@@ -208,6 +208,22 @@ def test_update_stats_for_instance_offloaded(self):
         self.assertEqual(0, self.stats.num_os_type("Linux"))
         self.assertEqual(0, self.stats["num_vm_" + vm_states.BUILDING])
 
+    def test_update_stats_for_instance_being_unshelved(self):
+        instance = self._create_instance()
+        self.stats.update_stats_for_instance(instance)
+        self.assertEqual(1, self.stats.num_instances_for_project("1234"))
+
+        instance["vm_state"] = vm_states.SHELVED_OFFLOADED
+        instance["task_state"] = task_states.SPAWNING
+        self.stats.update_stats_for_instance(instance)
+
+        self.assertEqual(1, self.stats.num_instances)
+        self.assertEqual(1, self.stats.num_instances_for_project(1234))
+        self.assertEqual(1, self.stats["num_os_type_Linux"])
+        self.assertEqual(1, self.stats["num_vm_%s" %
+                                       vm_states.SHELVED_OFFLOADED])
+        self.assertEqual(1, self.stats["num_task_%s" % task_states.SPAWNING])
+
     def test_io_workload(self):
         vms = [vm_states.ACTIVE, vm_states.BUILDING, vm_states.PAUSED]
         tasks = [task_states.RESIZE_MIGRATING, task_states.REBUILDING,

From c5dae90923fdda6e406afc623d77c36a58837ba5 Mon Sep 17 00:00:00 2001
From: Elod Illes <elod.illes@est.tech>
Date: Tue, 10 Sep 2024 13:04:20 +0200
Subject: [PATCH 84/93] [stable-only] Cap setuptools <20.26.4

py39 jobs (on ubuntu-focal) started to fail on Yoga due to the recent
setuptools release bundled with virtualenv 20.26.4, because we have
'packaging==21.3' in this branch, which is not compatible with the new
setuptools [1].

setuptools is bundled in virtualenv, so it has to be capped via the
virtualenv package. tox also needs to be capped (<4): the gate uses
tox 3.28.0, but capping virtualenv would otherwise pull in the latest
tox as well, which would cause other errors.

[1] https://github.com/pypa/setuptools/issues/4483

Change-Id: Ie3fe3060d05d2553c5eeb7fd75b41e8f04bf11a7
---
 tox.ini | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tox.ini b/tox.ini
index 92edcd7b469..8df61ef43bb 100644
--- a/tox.ini
+++ b/tox.ini
@@ -5,6 +5,11 @@ envlist = py39,functional,pep8
 # env and ignore basepython inherited from [testenv] if we set
 # ignore_basepython_conflict.
 ignore_basepython_conflict = True
+# Cap setuptools via virtualenv to prevent compatibility issue with yoga
+# branch's upper constraint of 'packaging' package (21.3).
+requires =
+  virtualenv<20.26.4
+  tox<4
 
 [testenv]
 basepython = python3

From d10f41e1ef6761b155a2cf8339c3e3cfb936a9dd Mon Sep 17 00:00:00 2001
From: Dan Smith <dansmith@redhat.com>
Date: Fri, 11 Mar 2022 14:52:42 -0800
Subject: [PATCH 85/93] Attempt to thin out nova-ceph-multistore

This job is pretty heavy and has been triggering OOMs that take out
mysqld lately. This disables swift (and c-bak as a result) to try to
reduce the runtime footprint. Losing coverage of these services
should not be a problem for the goal of this job.

Change-Id: Icc18ddd847465069aea34b226851afaeb94594fc
(cherry picked from commit 16a463a68ff24cfe95abd3cf9e305c60ab625fa5)
---
 .zuul.yaml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/.zuul.yaml b/.zuul.yaml
index 92b88560f6b..07c05b782da 100644
--- a/.zuul.yaml
+++ b/.zuul.yaml
@@ -556,6 +556,15 @@
         GLANCE_STANDALONE: True
         GLANCE_USE_IMPORT_WORKFLOW: True
         DEVSTACK_PARALLEL: True
+      # NOTE(danms): This job is pretty heavy as it is, so we disable some
+      # services that are not relevant to the nova-glance-ceph scenario
+      # that this job is intended to validate.
+      devstack_services:
+        c-bak: false
+        s-account: false
+        s-container: false
+        s-object: false
+        s-proxy: false
       devstack_local_conf:
         post-config:
           $NOVA_CONF:

From b721c04bfcba4689d1497174f118c90865a0d3b4 Mon Sep 17 00:00:00 2001
From: Dan Smith <dansmith@redhat.com>
Date: Tue, 21 Feb 2023 08:43:13 -0800
Subject: [PATCH 86/93] Use mysql memory reduction flags for ceph job

This makes the ceph-multistore job use the MYSQL_REDUCE_MEMORY
flag in devstack to try to address the frequent OOMs we see in that
job.

Conflicts:
  .zuul.yaml

Change-Id: Ibc203bd10dcb530027c2c9f58eb840ccc088280d
Closes-Bug: #1961068
(cherry picked from commit 84d1f25446731e4e51beb83a017cdf7bfda8c5d5)
(cherry picked from commit 6c9ebea5906d961e31c26d2cc035ce48671911a7)
---
 .zuul.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.zuul.yaml b/.zuul.yaml
index 07c05b782da..ea488401cf2 100644
--- a/.zuul.yaml
+++ b/.zuul.yaml
@@ -556,6 +556,7 @@
         GLANCE_STANDALONE: True
         GLANCE_USE_IMPORT_WORKFLOW: True
         DEVSTACK_PARALLEL: True
+        MYSQL_REDUCE_MEMORY: True
       # NOTE(danms): This job is pretty heavy as it is, so we disable some
       # services that are not relevant to the nova-glance-ceph scenario
       # that this job is intended to validate.

From d86bb1001e9d81c76b528bf24a1d710d6a8d9401 Mon Sep 17 00:00:00 2001
From: Dan Smith <dansmith@redhat.com>
Date: Wed, 24 Jul 2024 09:01:31 -0700
Subject: [PATCH 87/93] Remove AMI snapshot format special case

Note that this includes seemingly-unrelated test changes because we
were actually skipping the snapshot_running test for libvirt, which
has been a bug for years. In that test case, when we went to look
for image_meta.disk_format, that attribute was not set on the o.vo
object, which raised a NotImplementedError. That error is also checked
by the test to skip the test for drivers that do not support snapshot,
which meant that for libvirt, we haven't been running that case
beyond the point at which we create snapshot metadata and trip that
exception. Thus, once that is removed, other mocks that are required
for the test to actually run are not in place. So, this adds
mocks for qemu_img_info() calls that actually try to read the file on
disk, as well as the privsep chown() that attempts to run after.

Change-Id: Ie731045629f0899840a4680d21793a16ade9b98e
(cherry picked from commit d5a631ba7791b37e49213707e4ea650a56d2ed9e)
(cherry picked from commit 8c5929ff5156d5409d41872f1b8ee0abb04f35a8)
(cherry picked from commit d2d3b2c9e87fe2247a34a776310221c8b12be515)
(cherry picked from commit 77dfa4f6f3c39048b5d3bb9eb2b14dd6998b406b)
(cherry picked from commit e6f4503fe3e20c3fb8afef0aa944d4994665786b)
---
 nova/tests/unit/virt/libvirt/test_driver.py | 11 +++++++----
 nova/tests/unit/virt/test_virt_drivers.py   |  5 +++++
 nova/virt/libvirt/driver.py                 |  6 +-----
 3 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py
index 0e053f7d531..bdb8bc18aa8 100644
--- a/nova/tests/unit/virt/libvirt/test_driver.py
+++ b/nova/tests/unit/virt/libvirt/test_driver.py
@@ -8848,7 +8848,7 @@ def test_unquiesce(self, mock_has_min_version):
 
     def test_create_snapshot_metadata(self):
         base = objects.ImageMeta.from_dict(
-            {'disk_format': 'raw'})
+            {'disk_format': 'qcow2'})
         instance_data = {'kernel_id': 'kernel',
                     'project_id': 'prj_id',
                     'ramdisk_id': 'ram_id',
@@ -8880,10 +8880,12 @@ def test_create_snapshot_metadata(self):
             {'disk_format': 'ami',
              'container_format': 'test_container'})
         expected['properties']['os_type'] = instance['os_type']
-        expected['disk_format'] = base.disk_format
+        # The disk_format of the snapshot should be the *actual* format of the
+        # thing we upload, regardless of what type of image we booted from.
+        expected['disk_format'] = img_fmt
         expected['container_format'] = base.container_format
         ret = drvr._create_snapshot_metadata(base, instance, img_fmt, snp_name)
-        self.assertEqual(ret, expected)
+        self.assertEqual(expected, ret)
 
     def test_get_volume_driver(self):
         conn = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
@@ -28225,7 +28227,8 @@ def test_ami(self):
           utils.get_system_metadata_from_image(
             {'disk_format': 'ami'})
 
-        self._test_snapshot(disk_format='ami')
+        # If we're uploading a qcow2, we must set the disk_format as such
+        self._test_snapshot(disk_format='qcow2')
 
     @mock.patch('nova.virt.libvirt.utils.get_disk_type_from_path',
                 new=mock.Mock(return_value=None))
diff --git a/nova/tests/unit/virt/test_virt_drivers.py b/nova/tests/unit/virt/test_virt_drivers.py
index e275cd3e3aa..3ca1f5dcb5e 100644
--- a/nova/tests/unit/virt/test_virt_drivers.py
+++ b/nova/tests/unit/virt/test_virt_drivers.py
@@ -838,6 +838,11 @@ def setUp(self):
         # since we don't care about it.
         self.stub_out('os_vif.unplug', lambda a, kw: None)
         self.stub_out('nova.compute.utils.get_machine_ips', lambda: [])
+        self.stub_out('nova.virt.libvirt.utils.get_disk_size',
+                      lambda *a, **k: 123456)
+        self.stub_out('nova.virt.libvirt.utils.get_disk_backing_file',
+                      lambda *a, **k: None)
+        self.stub_out('nova.privsep.path.chown', lambda *a, **k: None)
 
     def test_init_host_image_type_rbd_force_raw_images_true(self):
         CONF.set_override('images_type', 'rbd', group='libvirt')
diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py
index 4de51ce8e31..3353d712b39 100644
--- a/nova/virt/libvirt/driver.py
+++ b/nova/virt/libvirt/driver.py
@@ -2929,11 +2929,7 @@ def _create_snapshot_metadata(self, image_meta, instance,
         if instance.os_type:
             metadata['properties']['os_type'] = instance.os_type
 
-        # NOTE(vish): glance forces ami disk format to be ami
-        if image_meta.disk_format == 'ami':
-            metadata['disk_format'] = 'ami'
-        else:
-            metadata['disk_format'] = img_fmt
+        metadata['disk_format'] = img_fmt
 
         if image_meta.obj_attr_is_set("container_format"):
             metadata['container_format'] = image_meta.container_format

From e31c2a5c7df19e5b8fb357f46581199f9f3fa978 Mon Sep 17 00:00:00 2001
From: "zhong.zhou" <zhong.zhou@easystack.cn>
Date: Wed, 17 Jul 2024 18:29:46 +0800
Subject: [PATCH 88/93] nova-manage: modify image properties in request_spec

At present, we can modify the image properties in the instance
system_metadata through the image_property subcommand of
nova-manage, but the updated values may then be inconsistent
with those stored in request_specs.

Since migration is based on request_specs, this change also writes
the same image properties to request_specs.

Closes-Bug: 2078999
Change-Id: Id36ecd022cb6f7f9a0fb131b0d202b79715870a9
(cherry picked from commit 2a1fad41453ca7ce15b1cd9b517055c4ccdd12cf)
(cherry picked from commit ebae97c62f1af6b3b9f6da2abfa920d6528ddb1b)
(cherry picked from commit ee30457accabcea10a62652d14d2cf08a6d57ac0)
(cherry picked from commit 3fe5c69b73f01a95fa6df017ea0557298fd6126c)
(cherry picked from commit 44995b430a604ec24e56f49f9dde27c65ed8cc45)
---
 nova/cmd/manage.py                                 | 10 ++++++++--
 nova/tests/unit/cmd/test_manage.py                 | 14 ++++++++++++--
 ...mage-property-bug-2078999-c493fc259d316c24.yaml |  8 ++++++++
 3 files changed, 28 insertions(+), 4 deletions(-)
 create mode 100644 releasenotes/notes/nova-manage-image-property-bug-2078999-c493fc259d316c24.yaml

diff --git a/nova/cmd/manage.py b/nova/cmd/manage.py
index 7067facde70..f13f6fddfc2 100644
--- a/nova/cmd/manage.py
+++ b/nova/cmd/manage.py
@@ -3266,9 +3266,10 @@ def _validate_image_properties(self, image_properties):
         # Return the dict so we can update the instance system_metadata
         return image_properties
 
-    def _update_image_properties(self, instance, image_properties):
+    def _update_image_properties(self, ctxt, instance, image_properties):
         """Update instance image properties
 
+        :param ctxt: nova.context.RequestContext
         :param instance: The instance to update
         :param image_properties: List of image properties and values to update
         """
@@ -3292,8 +3293,13 @@ def _update_image_properties(self, instance, image_properties):
         for image_property, value in image_properties.items():
             instance.system_metadata[f'image_{image_property}'] = value
 
+        request_spec = objects.RequestSpec.get_by_instance_uuid(
+            ctxt, instance.uuid)
+        request_spec.image = instance.image_meta
+
         # Save and return 0
         instance.save()
+        request_spec.save()
         return 0
 
     @action_description(_(
@@ -3328,7 +3334,7 @@ def set(self, instance_uuid=None, image_properties=None):
                 instance = objects.Instance.get_by_uuid(
                     cctxt, instance_uuid, expected_attrs=['system_metadata'])
                 return self._update_image_properties(
-                    instance, image_properties)
+                    ctxt, instance, image_properties)
         except ValueError as e:
             print(str(e))
             return 6
diff --git a/nova/tests/unit/cmd/test_manage.py b/nova/tests/unit/cmd/test_manage.py
index 82c3d3c84ad..3775a0b2c96 100644
--- a/nova/tests/unit/cmd/test_manage.py
+++ b/nova/tests/unit/cmd/test_manage.py
@@ -4052,6 +4052,8 @@ def test_show_image_properties_unknown_failure(
             image_property='hw_disk_bus')
         self.assertEqual(1, ret, 'return code')
 
+    @mock.patch('nova.objects.RequestSpec.save')
+    @mock.patch('nova.objects.RequestSpec.get_by_instance_uuid')
     @mock.patch('nova.objects.Instance.get_by_uuid')
     @mock.patch('nova.context.target_cell')
     @mock.patch('nova.objects.Instance.save')
@@ -4060,7 +4062,8 @@ def test_show_image_properties_unknown_failure(
     @mock.patch('nova.context.get_admin_context',
                 new=mock.Mock(return_value=mock.sentinel.ctxt))
     def test_set_image_properties(
-        self, mock_instance_save, mock_target_cell, mock_get_instance
+        self, mock_instance_save, mock_target_cell, mock_get_instance,
+            mock_get_request_spec, mock_request_spec_save
     ):
         mock_target_cell.return_value.__enter__.return_value = \
             mock.sentinel.cctxt
@@ -4069,9 +4072,11 @@ def test_set_image_properties(
             vm_state=obj_fields.InstanceState.STOPPED,
             system_metadata={
                 'image_hw_disk_bus': 'virtio',
-            }
+            },
+            image_ref=''
         )
         mock_get_instance.return_value = instance
+        mock_get_request_spec.return_value = objects.RequestSpec()
         ret = self.commands.set(
             instance_uuid=uuidsentinel.instance,
             image_properties=['hw_cdrom_bus=sata']
@@ -4088,7 +4093,12 @@ def test_set_image_properties(
             instance.system_metadata.get('image_hw_disk_bus'),
             'image_hw_disk_bus'
         )
+        image_props = mock_get_request_spec.return_value.image.properties
+        self.assertEqual('sata', image_props.get('hw_cdrom_bus'))
+        self.assertEqual('virtio', image_props.get('hw_disk_bus'))
+
         mock_instance_save.assert_called_once()
+        mock_request_spec_save.assert_called_once()
 
     @mock.patch('nova.objects.Instance.get_by_uuid')
     @mock.patch('nova.objects.InstanceMapping.get_by_instance_uuid',
diff --git a/releasenotes/notes/nova-manage-image-property-bug-2078999-c493fc259d316c24.yaml b/releasenotes/notes/nova-manage-image-property-bug-2078999-c493fc259d316c24.yaml
new file mode 100644
index 00000000000..03123855e0e
--- /dev/null
+++ b/releasenotes/notes/nova-manage-image-property-bug-2078999-c493fc259d316c24.yaml
@@ -0,0 +1,8 @@
+---
+fixes:
+  - |
+    Before the `Bug 2078999 <https://bugs.launchpad.net/nova/+bug/2078999>`_ was fixed,
+    the ``nova-manage image_property set`` command would update the image properties
+    embedded in the instance but would not update the ones in the request specs. This
+    led to an unexpected rollback of the image properties that were updated by the
+    command after an instance migration.

From bde1adca16ac8fd4377320222f7037af957dc6cd Mon Sep 17 00:00:00 2001
From: Elod Illes <elod.illes@est.tech>
Date: Tue, 13 May 2025 15:06:27 +0200
Subject: [PATCH 89/93] [tool] Fix backport validator for non-SLURP

Non-SLURP branches are EOL'd when they reach the end of their
maintained phase. This can produce a situation where a patch was
merged in a non-SLURP branch that has been deleted in the meantime, and
its further backports fail in the gate on the backport validator because
the hash of the non-SLURP version of the patch is not on any branch.

This patch fixes the above issue as follows: if a hash is not found on
any branch, the validator checks whether it can be found under any
*-eol tag and only fails if it is not found there either.

Change-Id: I56705bce8ee4354cd5cb1577a520c2d1c525f57b
(cherry picked from commit e383b465458969ec9271013f2b9e9f24b8225418)
(cherry picked from commit 8b0ae7243f8d581e1e73f0b9dcccf710666d931f)
(cherry picked from commit 88e49dd65c58536ba8dd39ab7cfde669a433f3f6)
(cherry picked from commit db438e55e62599faf2931d0992a5c7689ade3610)
(cherry picked from commit 0fdd21fb4ba4d8c0f5ad45cb8bf1d2698c382c6d)
(cherry picked from commit 75497b0ba61e85afed3f4a0660c20b2d13cb2ae0)
---
 tools/check-cherry-picks.sh | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tools/check-cherry-picks.sh b/tools/check-cherry-picks.sh
index 74887a9178b..fe75867e59f 100755
--- a/tools/check-cherry-picks.sh
+++ b/tools/check-cherry-picks.sh
@@ -26,8 +26,11 @@ branches+=""
 for hash in $hashes; do
     branch=$(git branch -a --contains "$hash" 2>/dev/null| grep -oE '(master|stable/[a-z0-9.]+|unmaintained/[a-z0-9.]+)')
     if [ $? -ne 0 ]; then
-        echo "Cherry pick hash $hash not on any master, stable or unmaintained branches"
-        exit 1
+        branch=$(git tag --contains "$hash" 2>/dev/null| grep -oE '([0-9.]+-eol)')
+        if [ $? -ne 0 ]; then
+            echo "Cherry pick hash $hash not on any master, stable, unmaintained or EOL'd branches"
+            exit 1
+        fi
     fi
     branches+=" $branch"
     checked=$(($checked + 1))

From 371892dbcf507dbaf466f4b4ab2906cb2d9084c3 Mon Sep 17 00:00:00 2001
From: Elod Illes <elod.illes@est.tech>
Date: Thu, 24 Jul 2025 15:37:20 +0200
Subject: [PATCH 90/93] [CI][stable-only] Remove fedora based jobs

The devstack-platform-fedora-latest and
devstack-platform-fedora-latest-virt-preview jobs (in the experimental
queue) are not defined anymore, so zuul reports a config error because
of them; thus they have to be removed.

Change-Id: I29e4c3640c6187b2c97596ec09b96c2c6feb74de
Signed-off-by: Elod Illes <elod.illes@est.tech>
---
 .zuul.yaml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/.zuul.yaml b/.zuul.yaml
index ea488401cf2..4a0d9e03281 100644
--- a/.zuul.yaml
+++ b/.zuul.yaml
@@ -715,10 +715,6 @@
             irrelevant-files: *nova-base-irrelevant-files
         - os-vif-ovs:
             irrelevant-files: *nova-base-irrelevant-files
-        - devstack-platform-fedora-latest:
-            irrelevant-files: *nova-base-irrelevant-files
-        - devstack-platform-fedora-latest-virt-preview:
-            irrelevant-files: *nova-base-irrelevant-files
         - devstack-plugin-ceph-compute-local-ephemeral:
             irrelevant-files: *nova-base-irrelevant-files
         - devstack-tobiko-nova:

From 55c19aca651f7dd0d2f1e4c628df98baa4e1c6f1 Mon Sep 17 00:00:00 2001
From: Takashi Kajinami <kajinamit@oss.nttdata.com>
Date: Mon, 5 Jan 2026 21:19:54 +0900
Subject: [PATCH 91/93] Replace removed os-vif-ovs job and remove
 grenade-skip-level

It was replaced by os-vif-ovn job.

Changes:
  .zuul.yaml

NOTE(elod.illes): the change is needed because the version of this
patch on unmaintained/2024.1 included the removal of the grenade job
from 2023.2, which went End of Life, so the job could not run anymore.
So this is basically a partial backport from 2024.1, but technically
the same patch as on the stable/* branches.

This patch also includes the following patch to fix the gate:

[stable-only] Remove grenade-skip-level from yoga

unmaintained/yoga is the last available open branch; besides,
grenade-skip-level is only relevant between SLURP branches, so here it
was only experimental. So this patch simply removes it to unblock
the gate.

Depends-on: https://review.opendev.org/c/openstack/os-vif/+/798038
Change-Id: I4fc595eb51c05c4875bc94e0e812f117a35df7cf
Signed-off-by: Takashi Kajinami <kajinamit@oss.nttdata.com>
(cherry picked from commit ad911932ff90a25af1abc8fa4a95b07d03f55705)
(cherry picked from commit 1783a30680410ebac553e422b0e287f91d101c9e)
(cherry picked from commit ccf3e18af9ed0162553b5b6524e90445b162a6db)
(cherry picked from commit 95c43bcf7c1ad852bd77e57232acec9f06b9f34f)
(cherry picked from commit d7ca3d90ab97547a1ea21fe953b61d7b421592b7)
(cherry picked from commit eaa65f0b85123a4ee3432466a15534b4dd19d3eb)
(cherry picked from commit 422d33ecc1682f8880b08ec1b302a1be88bfb39e)
---
 .zuul.yaml | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/.zuul.yaml b/.zuul.yaml
index 4a0d9e03281..c76f430707a 100644
--- a/.zuul.yaml
+++ b/.zuul.yaml
@@ -653,8 +653,6 @@
              - ^setup.cfg$
              - ^tools/.*$
              - ^tox.ini$
-        - grenade-skip-level:
-            irrelevant-files: *policies-irrelevant-files
         - nova-grenade-multinode:
             irrelevant-files: *policies-irrelevant-files
         - tempest-ipv6-only:
@@ -713,7 +711,7 @@
             irrelevant-files: *nova-base-irrelevant-files
         - neutron-ovs-tempest-iptables_hybrid:
             irrelevant-files: *nova-base-irrelevant-files
-        - os-vif-ovs:
+        - os-vif-ovn:
             irrelevant-files: *nova-base-irrelevant-files
         - devstack-plugin-ceph-compute-local-ephemeral:
             irrelevant-files: *nova-base-irrelevant-files

From cc8940f661244a6a539816197d4aad11d230dbf0 Mon Sep 17 00:00:00 2001
From: Dan Smith <dansmith@redhat.com>
Date: Tue, 17 Feb 2026 06:39:04 -0800
Subject: [PATCH 92/93] Make disk.extend() pass format to qemu-img

This fixes an instance of us passing a disk image to qemu-img for
resize where we don't constrain the format. As has previously been
identified, it is never safe to do that when the image itself is not
trusted. In this case, an instance with a previously-raw disk image
being used by imagebackend.Flat is susceptible to the user writing a
qcow2 (or other) header to their disk causing the unconstrained
qemu-img resize operation to interpret it as a qcow2 file.

Since Flat maintains the intended disk format in the disk.info file,
and since we would have safety-checked images we got from glance,
we should be able to trust the image.format specifier, which comes
from driver_format in imagebackend, which is read from disk.info.
Since only raw or qcow2 files should be resized anyway, we can further
constrain it to those.

Notes:
 1. qemu-img refuses to resize some types of VMDK files, but it may
    be able to resize others (there are many subformats). Technically,
    Flat will allow running an instance directly from a VMDK file,
    and so this change _could_ be limiting existing "unintentionally
    works" behavior.
 2. This assumes that disk.info is correct, present, etc. The code to
    handle disk.info will regenerate the file if it's missing or
    unreadable by probing the image without a safety check, which
    would be unsafe. However, that is a much more sophisticated attack,
    requiring either access to the system to delete the file or an
    errant operator action in the first place.

Change-Id: I07cbe90b7a7a0a416ef13fbc3a1b7e2272c90951
Closes-Bug: #2137507
(cherry picked from commit 3eba22ff09c81a61750fbb4882e5f1f01a20fdf5)
(cherry picked from commit f448173e3c531f3b298ed2f6f02ff9b47981fbc1)
(cherry picked from commit 992646e49b4b4d96f3258dc154d6f00a43d18d01)
Signed-off-by: Dan Smith <dansmith@redhat.com>
(cherry picked from commit 92d5d741e4018435d84d2f0886953031c33c3e4d)
(cherry picked from commit 06d1077186e215f15f38ef62d1704ee3379b7fe7)
(cherry picked from commit cfef09ac2286fad57ad4e40088091376598c1ea8)
(cherry picked from commit ece9853c56b506431a9ae1b25f1268f2fc76719b)
---
 nova/tests/unit/virt/disk/test_api.py | 35 ++++++++++++++++++++++++---
 nova/virt/disk/api.py                 | 16 +++++++++++-
 2 files changed, 46 insertions(+), 5 deletions(-)

diff --git a/nova/tests/unit/virt/disk/test_api.py b/nova/tests/unit/virt/disk/test_api.py
index 5b90fd186e4..23426c081db 100644
--- a/nova/tests/unit/virt/disk/test_api.py
+++ b/nova/tests/unit/virt/disk/test_api.py
@@ -19,6 +19,7 @@
 from oslo_concurrency import processutils
 from oslo_utils import units
 
+from nova import exception
 from nova import test
 from nova.virt.disk import api
 from nova.virt.disk.mount import api as mount
@@ -127,7 +128,7 @@ def test_extend_qcow_success(self, mock_exec, mock_inst, mock_resize,
 
             mock_can_resize.assert_called_once_with(imgfile, imgsize)
             mock_exec.assert_called_once_with('qemu-img', 'resize',
-                                              imgfile, imgsize)
+                                              '-f', 'qcow2', imgfile, imgsize)
             mock_extendable.assert_called_once_with(image)
             mock_inst.assert_called_once_with(image, None, None)
             mock_resize.assert_called_once_with(mounter.device,
@@ -153,8 +154,8 @@ def test_extend_qcow_no_resize(self, mock_execute, mock_extendable,
         api.extend(image, imgsize)
 
         mock_can_resize_image.assert_called_once_with(imgfile, imgsize)
-        mock_execute.assert_called_once_with('qemu-img', 'resize', imgfile,
-                                             imgsize)
+        mock_execute.assert_called_once_with('qemu-img', 'resize', '-f',
+                                             'qcow2', imgfile, imgsize)
         self.assertFalse(mock_extendable.called)
 
     @mock.patch.object(api, 'can_resize_image', autospec=True,
@@ -185,8 +186,34 @@ def test_extend_raw_success(self, mock_exec, mock_resize,
         api.extend(image, imgsize)
 
         mock_exec.assert_has_calls(
-            [mock.call('qemu-img', 'resize', imgfile, imgsize),
+            [mock.call('qemu-img', 'resize', '-f', 'raw', imgfile, imgsize),
              mock.call('e2label', image.path)])
         mock_resize.assert_called_once_with(imgfile, run_as_root=False,
                                             check_exit_code=[0])
         mock_can_resize.assert_called_once_with(imgfile, imgsize)
+
+    @mock.patch.object(api, 'can_resize_image', autospec=True,
+                       return_value=True)
+    @mock.patch.object(api, 'resize2fs', autospec=True)
+    @mock.patch('oslo_concurrency.processutils.execute', autospec=True)
+    def test_extend_vmdk_failure(self, mock_exec, mock_resize,
+                                 mock_can_resize):
+
+        imgfile = tempfile.NamedTemporaryFile()
+        self.addCleanup(imgfile.close)
+        imgsize = 10
+        # NOTE(danms): There is no image.model.FORMAT_VMDK, but since the
+        # code initializes this directly from Image.disk_format without using
+        # the constant (tsk), this can actually happen at runtime.
+        self.assertRaises(exception.InvalidImageFormat,
+                          imgmodel.LocalFileImage, imgfile, 'vmdk')
+
+        # Patch ALL_FORMATS to include vmdk as if it got added at some point
+        with mock.patch('nova.virt.image.model.ALL_FORMATS',
+                        new=['vmdk']):
+            image = imgmodel.LocalFileImage(imgfile, 'vmdk')
+
+        # Make sure that we still don't call qemu-img resize on the image
+        self.assertRaises(exception.InvalidDiskFormat,
+                          api.extend, image, imgsize)
+        mock_exec.assert_not_called()
diff --git a/nova/virt/disk/api.py b/nova/virt/disk/api.py
index 9902c0608ba..580e4daf1f1 100644
--- a/nova/virt/disk/api.py
+++ b/nova/virt/disk/api.py
@@ -125,7 +125,21 @@ def extend(image, size):
         nova.privsep.libvirt.ploop_resize(image.path, size)
         return
 
-    processutils.execute('qemu-img', 'resize', image.path, size)
+    # NOTE(danms): We should not call qemu-img without a format, and
+    # only qcow2 and raw are supported. So check which one we're being
+    # told this is supposed to be and pass that to qemu-img. Also note
+    # that we need to pass the qemu format string to this command, which
+    # may or may not be the same as the FORMAT_* constant, so be
+    # explicit here.
+    if image.format == imgmodel.FORMAT_RAW:
+        format = 'raw'
+    elif image.format == imgmodel.FORMAT_QCOW2:
+        format = 'qcow2'
+    else:
+        LOG.warning('Attempting to resize image %s with format %s, '
+        'which is not supported', image.path, image.format)
+        raise exception.InvalidDiskFormat(disk_format=image.format)
+    processutils.execute('qemu-img', 'resize', '-f', format, image.path, size)
 
     if (image.format != imgmodel.FORMAT_RAW and
         not CONF.resize_fs_using_block_device):

From f2c65e36fa4b34006459d335bc769ddcd45fe664 Mon Sep 17 00:00:00 2001
From: melanie witt <melwittt@gmail.com>
Date: Wed, 16 Apr 2025 15:20:23 -0700
Subject: [PATCH 93/93] libvirt: Use common naming convention for ephemeral
 disk labels

The _create_ephemeral() method is responsible for creating ephemeral
disks with image type "raw" and formatting them with mkfs. In the case
of [libvirt]images_type "qcow2", _create_ephemeral() will create
backing files.

Currently we are not using a consistent naming convention for choosing
the filesystem label for ephemeral disks. When we create a server, for
example, we go through the disks and label them "ephemeral0",
"ephemeral1", "ephemeral2", etc.

When we hard reboot a server, there is a check for missing backing
files. If one is missing, a new backing file will be created, but
instead of labeling it "ephemeralN" the code attempts to label it with
the name of the backing file itself, for example "ephemeral_1_40d1d2c".
This will fail if the filesystem used for ephemeral disks has
limitations on the length of filesystem label names (VFAT, XFS, ...).
For example:

  mkfs.vfat: Label can be no longer than 11 characters

This adds a helper method for obtaining filesystem label names for
ephemeral disks and uses it consistently in the few places where
fs_label is specified.

Closes-Bug: #2061701

Change-Id: Id033a5760272e4fb06dee2342414b26aa16ffe24
(cherry picked from commit 82856f95c69bb07bd2a61decae9abe827a2a1567)
(cherry picked from commit 09fc2fae424493ff9580c6d38e63f207b916529c)
(cherry picked from commit 2fd65bd14ac33b58e247bf9d3c8066fa5dac2215)
Signed-off-by: Pierre Riteau <pierre@stackhpc.com>
(cherry picked from commit d6cdd73c980b31080710f9a8cf10d7c779780c43)
(cherry picked from commit 911cc31b8cbbee9088a5c7b30ed34f3f2a327b3d)
(cherry picked from commit a8c53ef4c33e355b7e15251aeb862a42a434b8bf)
---
 nova/tests/unit/virt/libvirt/test_driver.py         |  5 ++++-
 nova/virt/libvirt/driver.py                         | 13 ++++++++++---
 ...01-ephemeral-disk-fs-label-504484c4522e6d6a.yaml |  6 ++++++
 3 files changed, 20 insertions(+), 4 deletions(-)
 create mode 100644 releasenotes/notes/bug-2061701-ephemeral-disk-fs-label-504484c4522e6d6a.yaml

diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py
index bdb8bc18aa8..85ea8f0283f 100644
--- a/nova/tests/unit/virt/libvirt/test_driver.py
+++ b/nova/tests/unit/virt/libvirt/test_driver.py
@@ -13830,8 +13830,11 @@ def test_create_images_and_backing_ephemeral_gets_created(
                              'ephemeral_foo')
             ]
 
+            # This also asserts that the filesystem label name is generated
+            # correctly as 'ephemeral0' to help prevent regression of the
+            # related bug fix from https://launchpad.net/bugs/2061701
             create_ephemeral_mock.assert_called_once_with(
-                ephemeral_size=1, fs_label='ephemeral_foo',
+                ephemeral_size=1, fs_label='ephemeral0',
                 os_type='linux', target=ephemeral_backing)
 
             fetch_image_mock.assert_called_once_with(
diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py
index 3353d712b39..da2da25ea9f 100644
--- a/nova/virt/libvirt/driver.py
+++ b/nova/virt/libvirt/driver.py
@@ -4609,6 +4609,13 @@ def _inject_data(self, disk, instance, injection_info):
                               {'img_id': img_id, 'e': e},
                               instance=instance)
 
+    @staticmethod
+    def _get_fs_label_ephemeral(index: int) -> str:
+        # Use a consistent naming convention for FS labels. We need to be
+        # mindful of various filesystems label name length limitations.
+        # See for example: https://bugs.launchpad.net/nova/+bug/2061701
+        return f'ephemeral{index}'
+
     # NOTE(sileht): many callers of this method assume that this
     # method doesn't fail if an image already exists but instead
     # think that it will be reused (ie: (live)-migration/resize)
@@ -4715,7 +4722,7 @@ def raw(fname):
             created_disks = created_disks or not disk_image.exists()
 
             fn = functools.partial(self._create_ephemeral,
-                                   fs_label='ephemeral0',
+                                   fs_label=self._get_fs_label_ephemeral(0),
                                    os_type=instance.os_type,
                                    is_block_dev=disk_image.is_block_dev,
                                    vm_mode=vm_mode)
@@ -4739,7 +4746,7 @@ def raw(fname):
                 raise exception.InvalidBDMFormat(details=msg)
 
             fn = functools.partial(self._create_ephemeral,
-                                   fs_label='ephemeral%d' % idx,
+                                   fs_label=self._get_fs_label_ephemeral(idx),
                                    os_type=instance.os_type,
                                    is_block_dev=disk_image.is_block_dev,
                                    vm_mode=vm_mode)
@@ -10766,7 +10773,7 @@ def _create_images_and_backing(self, context, instance, instance_dir,
                     # cached.
                     disk.cache(
                         fetch_func=self._create_ephemeral,
-                        fs_label=cache_name,
+                        fs_label=self._get_fs_label_ephemeral(0),
                         os_type=instance.os_type,
                         filename=cache_name,
                         size=info['virt_disk_size'],
diff --git a/releasenotes/notes/bug-2061701-ephemeral-disk-fs-label-504484c4522e6d6a.yaml b/releasenotes/notes/bug-2061701-ephemeral-disk-fs-label-504484c4522e6d6a.yaml
new file mode 100644
index 00000000000..5f4c22ca248
--- /dev/null
+++ b/releasenotes/notes/bug-2061701-ephemeral-disk-fs-label-504484c4522e6d6a.yaml
@@ -0,0 +1,6 @@
+fixes:
+  - |
+    Fixed an issue where certain server actions could fail for servers with
+    ephemeral disks due to filesystem label name length limitations
+    (VFAT, XFS, ...). Filesystem label name generation has been fixed for these
+    cases. See https://launchpad.net/bugs/2061701 for more details.