author     Henrik Riomar <henrik.riomar@gmail.com>    2020-01-03 18:53:26 +0100
committer  Leonardo Arena <rnalrd@alpinelinux.org>    2020-01-06 07:12:21 +0000
commit     4e803597c91a0cf312e53ce62458e81f5294c6e9 (patch)
tree       ad879bd1c1e5e8e9729ab14a2d45857b6febea93
parent     9ae93588cf9f0b3f7543a75152e40e0fa4f3dcee (diff)
download   aports-4e803597c91a0cf312e53ce62458e81f5294c6e9.tar.bz2
           aports-4e803597c91a0cf312e53ce62458e81f5294c6e9.tar.xz
main/xen: security upgrade to 4.11.3
Fixes CVE-2019-19579 / XSA-306
-rw-r--r--  main/xen/APKBUILD | 61
-rw-r--r--  main/xen/xsa298-4.11.patch | 87
-rw-r--r--  main/xen/xsa299-0001-x86-mm-L1TF-checks-don-t-leave-a-partial-entry.patch | 94
-rw-r--r--  main/xen/xsa299-0002-x86-mm-Don-t-re-set-PGT_pinned-on-a-partially-de-val.patch | 99
-rw-r--r--  main/xen/xsa299-0003-x86-mm-Separate-out-partial_pte-tristate-into-indivi.patch | 609
-rw-r--r--  main/xen/xsa299-0004-x86-mm-Use-flags-for-_put_page_type-rather-than-a-bo.patch | 140
-rw-r--r--  main/xen/xsa299-0005-x86-mm-Rework-get_page_and_type_from_mfn-conditional.patch | 79
-rw-r--r--  main/xen/xsa299-0006-x86-mm-Have-alloc_l-23-_table-clear-partial_flags-wh.patch | 101
-rw-r--r--  main/xen/xsa299-0007-x86-mm-Always-retain-a-general-ref-on-partial.patch | 374
-rw-r--r--  main/xen/xsa299-0008-x86-mm-Collapse-PTF_partial_set-and-PTF_partial_gene.patch | 227
-rw-r--r--  main/xen/xsa299-0009-x86-mm-Properly-handle-linear-pagetable-promotion-fa.patch | 106
-rw-r--r--  main/xen/xsa299-0010-x86-mm-Fix-nested-de-validation-on-error.patch | 169
-rw-r--r--  main/xen/xsa299-0011-x86-mm-Don-t-drop-a-type-ref-unless-you-held-a-ref-t.patch | 413
-rw-r--r--  main/xen/xsa301-4.11-1.patch | 80
-rw-r--r--  main/xen/xsa301-4.11-2.patch | 92
-rw-r--r--  main/xen/xsa301-4.11-3.patch | 49
-rw-r--r--  main/xen/xsa302-0001-IOMMU-add-missing-HVM-check.patch | 37
-rw-r--r--  main/xen/xsa302-0002-passthrough-quarantine-PCI-devices.patch | 498
-rw-r--r--  main/xen/xsa303-0001-xen-arm32-entry-Split-__DEFINE_ENTRY_TRAP-in-two.patch | 74
-rw-r--r--  main/xen/xsa303-0002-xen-arm32-entry-Fold-the-macro-SAVE_ALL-in-the-macro.patch | 97
-rw-r--r--  main/xen/xsa303-0003-xen-arm32-Don-t-blindly-unmask-interrupts-on-trap-wi.patch | 226
-rw-r--r--  main/xen/xsa303-0004-xen-arm64-Don-t-blindly-unmask-interrupts-on-trap-wi.patch | 114
-rw-r--r--  main/xen/xsa304-4.11-1.patch | 71
-rw-r--r--  main/xen/xsa304-4.11-2.patch | 298
-rw-r--r--  main/xen/xsa304-4.11-3.patch | 110
-rw-r--r--  main/xen/xsa305-4.11-1.patch | 288
-rw-r--r--  main/xen/xsa305-4.11-2.patch | 192
27 files changed, 5 insertions, 4780 deletions
diff --git a/main/xen/APKBUILD b/main/xen/APKBUILD
index 49f301542e..9987b66222 100644
--- a/main/xen/APKBUILD
+++ b/main/xen/APKBUILD
@@ -2,8 +2,8 @@
# Contributor: Roger Pau Monne <roger.pau@entel.upc.edu>
# Maintainer: William Pitcock <nenolod@dereferenced.org>
pkgname=xen
-pkgver=4.11.2
-pkgrel=1
+pkgver=4.11.3
+pkgrel=0
pkgdesc="Xen hypervisor"
url="https://www.xenproject.org/"
arch="x86_64 armhf aarch64" # enable armv7 when builds with gcc8
@@ -165,6 +165,8 @@ options="!strip"
# - CVE-2019-18422 XSA-303
# - CVE-2018-12207 XSA-304
# - CVE-2019-11135 XSA-305
+# 4.11.3-r0:
+# - CVE-2019-19579 XSA-306
case "$CARCH" in
x86*)
@@ -232,33 +234,6 @@ source="https://downloads.xenproject.org/release/$pkgname/$pkgver/$pkgname-$pkgv
hotplug-Linux-iscsi-block-handle-lun-1.patch
- xsa298-4.11.patch
- xsa299-0001-x86-mm-L1TF-checks-don-t-leave-a-partial-entry.patch
- xsa299-0002-x86-mm-Don-t-re-set-PGT_pinned-on-a-partially-de-val.patch
- xsa299-0003-x86-mm-Separate-out-partial_pte-tristate-into-indivi.patch
- xsa299-0004-x86-mm-Use-flags-for-_put_page_type-rather-than-a-bo.patch
- xsa299-0005-x86-mm-Rework-get_page_and_type_from_mfn-conditional.patch
- xsa299-0006-x86-mm-Have-alloc_l-23-_table-clear-partial_flags-wh.patch
- xsa299-0007-x86-mm-Always-retain-a-general-ref-on-partial.patch
- xsa299-0008-x86-mm-Collapse-PTF_partial_set-and-PTF_partial_gene.patch
- xsa299-0009-x86-mm-Properly-handle-linear-pagetable-promotion-fa.patch
- xsa299-0010-x86-mm-Fix-nested-de-validation-on-error.patch
- xsa299-0011-x86-mm-Don-t-drop-a-type-ref-unless-you-held-a-ref-t.patch
- xsa301-4.11-1.patch
- xsa301-4.11-2.patch
- xsa301-4.11-3.patch
- xsa302-0001-IOMMU-add-missing-HVM-check.patch
- xsa302-0002-passthrough-quarantine-PCI-devices.patch
- xsa303-0001-xen-arm32-entry-Split-__DEFINE_ENTRY_TRAP-in-two.patch
- xsa303-0002-xen-arm32-entry-Fold-the-macro-SAVE_ALL-in-the-macro.patch
- xsa303-0003-xen-arm32-Don-t-blindly-unmask-interrupts-on-trap-wi.patch
- xsa303-0004-xen-arm64-Don-t-blindly-unmask-interrupts-on-trap-wi.patch
- xsa304-4.11-1.patch
- xsa304-4.11-2.patch
- xsa304-4.11-3.patch
- xsa305-4.11-1.patch
- xsa305-4.11-2.patch
-
xenstored.initd
xenstored.confd
xenconsoled.initd
@@ -492,7 +467,7 @@ EOF
}
-sha512sums="48d3d926d35eb56c79c06d0abc6e6be2564fadb43367cc7f46881c669a75016707672179c2cca1c4cfb14af2cefd46e2e7f99470cddf7df2886d8435a2de814e xen-4.11.2.tar.gz
+sha512sums="2204e490e9fc357a05983a9bf4e7345e1d364fe00400ce473988dcb9ca7d4e2b921fe10f095cbbc64248130a92d22c6f0d154dcae250a57a7f915df32e3dc436 xen-4.11.3.tar.gz
2e0b0fd23e6f10742a5517981e5171c6e88b0a93c83da701b296f5c0861d72c19782daab589a7eac3f9032152a0fc7eff7f5362db8fccc4859564a9aa82329cf gmp-4.3.2.tar.bz2
c2bc9ffc8583aeae71cee9ddcc4418969768d4e3764d47307da54f93981c0109fb07d84b061b3a3628bd00ba4d14a54742bc04848110eb3ae8ca25dbfbaabadb grub-0.97.tar.gz
1465b58279af1647f909450e394fe002ca165f0ff4a0254bfa9fe0e64316f50facdde2729d79a4e632565b4500cf4d6c74192ac0dd3bc9fe09129bbd67ba089d lwip-1.3.0.tar.gz
@@ -515,32 +490,6 @@ e76816c6ad0e91dc5f81947f266da3429b20e6d976c3e8c41202c6179532eec878a3f0913921ef3a
69dfa60628ca838678862383528654ecbdf4269cbb5c9cfb6b84d976202a8dea85d711aa65a52fa1b477fb0b30604ca70cf1337192d6fb9388a08bbe7fe56077 xenstore_client_transaction_fix.patch
2094ea964fa610b2bf72fd2c7ede7e954899a75c0f5b08030cf1d74460fb759ade84866176e32f8fe29c921dfdc6dafd2b31e23ab9b0a3874d3dceeabdd1913b xenqemu-xattr-size-max.patch
8c9cfc6afca325df1d8026e21ed03fa8cd2c7e1a21a56cc1968301c5ab634bfe849951899e75d328951d7a41273d1e49a2448edbadec0029ed410c43c0549812 hotplug-Linux-iscsi-block-handle-lun-1.patch
-a4e1a0c080c51dad386b0c276eb1b9a5fc2acbca5886e1c390325cbe81102a2f5c570e7d4e9c18a8214036e80504ada0216a19e0f63a69926558fae7680d765b xsa298-4.11.patch
-12b1dd1c320c0fbf5cc651e018b7051875d826014c993f04a0002ed9658d84969e413e2baaaeb8fc42d57dc0ae4b904bcb8baa75a38bff43484b94790a34eb92 xsa299-0001-x86-mm-L1TF-checks-don-t-leave-a-partial-entry.patch
-19385d9e713b806c237d7505b8a5e9bb5b286d3cf0f0a452a2638735e4204a8eeb14a66f761e78f6ed00098e1e773d865e0aa2c388620961f83aac9d149d8e2c xsa299-0002-x86-mm-Don-t-re-set-PGT_pinned-on-a-partially-de-val.patch
-9652e3c20448bbb1a13e801b451c13077699e32626f544ec125fd5cb6f06fae7ed0308a950ec1783f7d91fc2d101133bbcf5a3545fec5aa9575e7e38078a176e xsa299-0003-x86-mm-Separate-out-partial_pte-tristate-into-indivi.patch
-22adf9c5fd4a2a85ccaeb0552a67aabab97259f8419b62082922fcadb41c754cf70bbd9d1746589074f601b820290eb6f0ca7eda4d512dceaabff71d86ea1ad3 xsa299-0004-x86-mm-Use-flags-for-_put_page_type-rather-than-a-bo.patch
-2b58855bcd5fa8c15c99a5e3e5ab57c8a1053ff5f5b6fa75d3352c913af56dc9f12e85bd0bee3b79ff1d808f7f893b3c3a5bc2c5edd1539a5d6e1a9faa1053d9 xsa299-0005-x86-mm-Rework-get_page_and_type_from_mfn-conditional.patch
-776898c266bdd3c2c20947ae048a5c83cc78ae719407d55037f396fea6c52339c5c8859adaa8626fe9c70498a11ae0becc33e12dd92684492b7d37fd0846d0a0 xsa299-0006-x86-mm-Have-alloc_l-23-_table-clear-partial_flags-wh.patch
-f2feef23419530e059cdddf4e76990183b365171647dba89a2b7f59c7c7a084da21f3ea834148b0b3d328dbec3d960023ca4f807094bb746ca3be3b0e4098aae xsa299-0007-x86-mm-Always-retain-a-general-ref-on-partial.patch
-856cd9bb37c1253423f1d117fa2421e2296a08f0b7716c39cff8db0959f94f90588ab978611b670b630ccc7b6776325afc366460d899c9932d881a25526c813b xsa299-0008-x86-mm-Collapse-PTF_partial_set-and-PTF_partial_gene.patch
-fa9b497941a063fc378849f9a93f1a1500c05299fade739a322d2686041542f1a2d54e46c979b9a5bae342e8eba6226c606d38cb5ad52787c5f541890d2906b6 xsa299-0009-x86-mm-Properly-handle-linear-pagetable-promotion-fa.patch
-b6af837ba78964605a8640cffcf2295c0d92df621864bfc4f7999aee0c8c68e502d6e4ffe98bcba77216090335ed40f6ae4bfe506221373d35139905cd94288a xsa299-0010-x86-mm-Fix-nested-de-validation-on-error.patch
-4c7458f90f31c4755e5dd3b4c01f2d72f8a8fdc94822e1d70a03b28b769461c1be4098ceeb91c66e69211d74034bf955ac29a60c008f870c52228de258ee285c xsa299-0011-x86-mm-Don-t-drop-a-type-ref-unless-you-held-a-ref-t.patch
-916dc53eddb225a5c118630553baaa784c14a2a2ddc1f031ea83dbbb0241f0b103e664d3429414236951f6de689ff234be3fb900b83d1e1a4a4227385b32d496 xsa301-4.11-1.patch
-555d6586543f4b8661d1104851073277290ccce17d05279531c685966186948f933706d834ac1dd748678340df0aaa0423d18ea88be9a35bec67685aeb6258ac xsa301-4.11-2.patch
-5cf43aeb65988b7395a2d87bef0587cc172456ebebc2b9b67c231802ebcfb6bc1bdccccaaa09be463f1a79158234cb4003c0cd478990b99f020d812b90acc011 xsa301-4.11-3.patch
-c44b8b5b6d64bed624fa045efa88a3d34af1c14d3186fe9ef316eee288ea9ebb0d55414fb612d3873f9a6d76bd316176d68e284d4fdc1b59205c6b5ebd03365e xsa302-0001-IOMMU-add-missing-HVM-check.patch
-997c45c485cd382d9d3919cdedf4c9c50a579e21a98503408a2f9854437c8af6e7eafb83639261e1fdd8c4438fb1ff8ea196cfde00e8db82f26244086be31537 xsa302-0002-passthrough-quarantine-PCI-devices.patch
-b65de69f7c0097177652fc6fe7c0c12ab44c6bb0a8823b19ee315a574b04f9151a572d518d684fec467b995c9c9756bd5b2d88f7546199c0b807155c5dca43b5 xsa303-0001-xen-arm32-entry-Split-__DEFINE_ENTRY_TRAP-in-two.patch
-440869c1d7212820ba0c7d4b35681483897d1dcc4aa2f833af1370ac5bd8995b3d2712c598e6309488b90f37e36ca36db232e5de06242afa017d1c991f5d6af6 xsa303-0002-xen-arm32-entry-Fold-the-macro-SAVE_ALL-in-the-macro.patch
-7d56d0576fcd90ce4296e59cd2eae35929ecae6a7fa40e88c2f66f54234083b09be92630f299e5bb24d23b32949e58d49bafa1bed1e73719b73a4c640b86206f xsa303-0003-xen-arm32-Don-t-blindly-unmask-interrupts-on-trap-wi.patch
-49b540f2585f43685c9f37ea1b6f166a98e71d85e6e0fbf2807b5788b3e71cb35dd71f71b7ad5a6d230ba60706cd11ef4bcecec7c2f250f28fd95dbd50fffc2b xsa303-0004-xen-arm64-Don-t-blindly-unmask-interrupts-on-trap-wi.patch
-35a047ba201c5ef718a961e4bd6f5a9760e736ee9d28a87e2de91fa2475d1883c2acb71817c4bcd1bb582ef986c2defd230030783a56dda3e9125d66189d5898 xsa304-4.11-1.patch
-947d016de1356782ed9f44ce30649c3a1898a230d492ef17a8d1724686ac93e069fb15a8110ea7fe4fe34ce409f067d3558e53a8ae7ba988c2c6390cccff04c3 xsa304-4.11-2.patch
-cd5d1967a38e74555465f71eb99501ef0c39d4a22e4e2f737b7a478e414caf5992b2a7d8257d68e6c8f48f260742931df22ff5dbc4d4cc35eb2d8075d1c0798b xsa304-4.11-3.patch
-9b7730142f63d87d962b1ec148a440a0da3d331053df34c92d535084e3044adc4cb7f1004e9a7028c9cce8f11150714d86f03f067ab2a631a15ef0eca8300581 xsa305-4.11-1.patch
-58a73ae86b37d59aad469cdadd8d325f6daa91d0af068ea27f022cfcfa6770c52a29a12a15867a5e95d5e2e23a8ee442070dac6e0b4d5b389543c8769de63f12 xsa305-4.11-2.patch
52c43beb2596d645934d0f909f2d21f7587b6898ed5e5e7046799a8ed6d58f7a09c5809e1634fa26152f3fd4f3e7cfa07da7076f01b4a20cc8f5df8b9cb77e50 xenstored.initd
093f7fbd43faf0a16a226486a0776bade5dc1681d281c5946a3191c32d74f9699c6bf5d0ab8de9d1195a2461165d1660788e92a3156c9b3c7054d7b2d52d7ff0 xenstored.confd
3c86ed48fbee0af4051c65c4a3893f131fa66e47bf083caf20c9b6aa4b63fdead8832f84a58d0e27964bc49ec8397251b34e5be5c212c139f556916dc8da9523 xenconsoled.initd
diff --git a/main/xen/xsa298-4.11.patch b/main/xen/xsa298-4.11.patch
deleted file mode 100644
index 9f649e307d..0000000000
--- a/main/xen/xsa298-4.11.patch
+++ /dev/null
@@ -1,87 +0,0 @@
-From: Jan Beulich <jbeulich@suse.com>
-Subject: x86/PV: check GDT/LDT limits during emulation
-
-Accesses beyond the LDT limit originating from emulation would trigger
-the ASSERT() in pv_map_ldt_shadow_page(). On production builds such
-accesses would cause an attempt to promote the touched page (offset from
-the present LDT base address) to a segment descriptor one. If this
-happens to succeed, guest user mode would be able to elevate its
-privileges to that of the guest kernel. This is particularly easy when
-there's no LDT at all, in which case the LDT base stored internally to
-Xen is simply zero.
-
-Also adjust the ASSERT() that was triggering: It was off by one to
-begin with, and for production builds we also better use
-ASSERT_UNREACHABLE() instead with suitable recovery code afterwards.
-
-This is XSA-298.
-
-Reported-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Signed-off-by: Jan Beulich <jbeulich@suse.com>
-Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>
-
---- a/xen/arch/x86/pv/emul-gate-op.c
-+++ b/xen/arch/x86/pv/emul-gate-op.c
-@@ -51,7 +51,13 @@ static int read_gate_descriptor(unsigned
- const struct desc_struct *pdesc = gdt_ldt_desc_ptr(gate_sel);
-
- if ( (gate_sel < 4) ||
-- ((gate_sel >= FIRST_RESERVED_GDT_BYTE) && !(gate_sel & 4)) ||
-+ /*
-+ * We're interested in call gates only, which occupy a single
-+ * seg_desc_t for 32-bit and a consecutive pair of them for 64-bit.
-+ */
-+ ((gate_sel >> 3) + !is_pv_32bit_vcpu(v) >=
-+ (gate_sel & 4 ? v->arch.pv_vcpu.ldt_ents
-+ : v->arch.pv_vcpu.gdt_ents)) ||
- __get_user(desc, pdesc) )
- return 0;
-
-@@ -70,7 +76,7 @@ static int read_gate_descriptor(unsigned
- if ( !is_pv_32bit_vcpu(v) )
- {
- if ( (*ar & 0x1f00) != 0x0c00 ||
-- (gate_sel >= FIRST_RESERVED_GDT_BYTE - 8 && !(gate_sel & 4)) ||
-+ /* Limit check done above already. */
- __get_user(desc, pdesc + 1) ||
- (desc.b & 0x1f00) )
- return 0;
---- a/xen/arch/x86/pv/emulate.c
-+++ b/xen/arch/x86/pv/emulate.c
-@@ -31,7 +31,14 @@ int pv_emul_read_descriptor(unsigned int
- {
- struct desc_struct desc;
-
-- if ( sel < 4)
-+ if ( sel < 4 ||
-+ /*
-+ * Don't apply the GDT limit here, as the selector may be a Xen
-+ * provided one. __get_user() will fail (without taking further
-+ * action) for ones falling in the gap between guest populated
-+ * and Xen ones.
-+ */
-+ ((sel & 4) && (sel >> 3) >= v->arch.pv_vcpu.ldt_ents) )
- desc.b = desc.a = 0;
- else if ( __get_user(desc, gdt_ldt_desc_ptr(sel)) )
- return 0;
---- a/xen/arch/x86/pv/mm.c
-+++ b/xen/arch/x86/pv/mm.c
-@@ -92,12 +92,16 @@ bool pv_map_ldt_shadow_page(unsigned int
- BUG_ON(unlikely(in_irq()));
-
- /*
-- * Hardware limit checking should guarantee this property. NB. This is
-+ * Prior limit checking should guarantee this property. NB. This is
- * safe as updates to the LDT can only be made by MMUEXT_SET_LDT to the
- * current vcpu, and vcpu_reset() will block until this vcpu has been
- * descheduled before continuing.
- */
-- ASSERT((offset >> 3) <= curr->arch.pv_vcpu.ldt_ents);
-+ if ( unlikely((offset >> 3) >= curr->arch.pv_vcpu.ldt_ents) )
-+ {
-+ ASSERT_UNREACHABLE();
-+ return false;
-+ }
-
- if ( is_pv_32bit_domain(currd) )
- linear = (uint32_t)linear;
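
The essence of the removed fix, for readers skimming: before a descriptor is
dereferenced, the selector's table index must be checked against the
guest-populated GDT or LDT limit. A minimal standalone sketch of that check
follows; all names are illustrative, not the Xen code (which is in the hunks
above).

    #include <stdbool.h>
    #include <stdio.h>

    struct vcpu_limits {
        unsigned int gdt_ents; /* guest-populated GDT entries */
        unsigned int ldt_ents; /* guest-populated LDT entries */
        bool is_32bit;         /* 32-bit gates use one descriptor, 64-bit two */
    };

    /* Bit 2 of a selector picks the table; the index is sel >> 3. */
    static bool gate_sel_within_limit(unsigned int gate_sel,
                                      const struct vcpu_limits *v)
    {
        unsigned int limit = (gate_sel & 4) ? v->ldt_ents : v->gdt_ents;

        return gate_sel >= 4 &&
               (gate_sel >> 3) + !v->is_32bit < limit;
    }

    int main(void)
    {
        /* With no LDT at all (ldt_ents == 0), every LDT selector is
         * rejected -- the easy-to-hit case the advisory describes. */
        struct vcpu_limits v = { .gdt_ents = 16, .ldt_ents = 0 };

        printf("%d\n", gate_sel_within_limit(0x0c, &v)); /* 0: LDT selector */
        printf("%d\n", gate_sel_within_limit(0x10, &v)); /* 1: GDT selector */
        return 0;
    }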
diff --git a/main/xen/xsa299-0001-x86-mm-L1TF-checks-don-t-leave-a-partial-entry.patch b/main/xen/xsa299-0001-x86-mm-L1TF-checks-don-t-leave-a-partial-entry.patch
deleted file mode 100644
index 64753287f0..0000000000
--- a/main/xen/xsa299-0001-x86-mm-L1TF-checks-don-t-leave-a-partial-entry.patch
+++ /dev/null
@@ -1,94 +0,0 @@
-From 852df269d247e177d5f2e9b8f3a4301a6fdd76bd Mon Sep 17 00:00:00 2001
-From: George Dunlap <george.dunlap@citrix.com>
-Date: Thu, 10 Oct 2019 17:57:49 +0100
-Subject: [PATCH 01/11] x86/mm: L1TF checks don't leave a partial entry
-
-On detection of a potential L1TF issue, most validation code returns
--ERESTART to allow the switch to shadow mode to happen and cause the
-original operation to be restarted.
-
-However, in the validation code, the return value -ERESTART has been
-repurposed to indicate 1) the function has partially completed
-something which needs to be undone, and 2) calling put_page_type()
-should cleanly undo it. This causes problems in several places.
-
-For L1 tables, on receiving an -ERESTART return from alloc_l1_table(),
-alloc_page_type() will set PGT_partial on the page. If for some
-reason the original operation never restarts, then on domain
-destruction, relinquish_memory() will call free_page_type() on the
-page.
-
-Unfortunately, alloc_ and free_l1_table() aren't set up to deal with
-PGT_partial. When returning a failure, alloc_l1_table() always
-de-validates whatever it's validated so far, and free_l1_table()
-always devalidates the whole page. This means that if
-relinquish_memory() calls free_page_type() on an L1 that didn't
-complete due to an L1TF, it will call put_page_from_l1e() on "page
-entries" that have never been validated.
-
-For L2+ tables, setting rc to ERESTART causes the rest of the
-alloc_lN_table() function to *think* that the entry in question will
-have PGT_partial set. This will cause it to set partial_pte = 1. If
-relinquish_memory() then calls free_page_type() on one of those pages,
-then free_lN_table() will call put_page_from_lNe() on the entry when
-it shouldn't.
-
-Rather than indicating -ERESTART, indicate -EINTR. This is the code
-to indicate that nothing has changed from when you started the call
-(which is effectively how alloc_l1_table() handles errors).
-
-mod_lN_entry() shouldn't have any of these types of problems, so leave
-potential changes there for a clean-up patch later.
-
-This is part of XSA-299.
-
-Reported-by: George Dunlap <george.dunlap@citrix.com>
-Signed-off-by: George Dunlap <george.dunlap@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
----
- xen/arch/x86/mm.c | 8 ++++----
- 1 file changed, 4 insertions(+), 4 deletions(-)
-
-diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
-index e6a4cb28f8..8ced185b49 100644
---- a/xen/arch/x86/mm.c
-+++ b/xen/arch/x86/mm.c
-@@ -1110,7 +1110,7 @@ get_page_from_l2e(
- int rc;
-
- if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
-- return pv_l1tf_check_l2e(d, l2e) ? -ERESTART : 1;
-+ return pv_l1tf_check_l2e(d, l2e) ? -EINTR : 1;
-
- if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
- {
-@@ -1142,7 +1142,7 @@ get_page_from_l3e(
- int rc;
-
- if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
-- return pv_l1tf_check_l3e(d, l3e) ? -ERESTART : 1;
-+ return pv_l1tf_check_l3e(d, l3e) ? -EINTR : 1;
-
- if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
- {
-@@ -1175,7 +1175,7 @@ get_page_from_l4e(
- int rc;
-
- if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
-- return pv_l1tf_check_l4e(d, l4e) ? -ERESTART : 1;
-+ return pv_l1tf_check_l4e(d, l4e) ? -EINTR : 1;
-
- if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
- {
-@@ -1404,7 +1404,7 @@ static int alloc_l1_table(struct page_info *page)
- {
- if ( !(l1e_get_flags(pl1e[i]) & _PAGE_PRESENT) )
- {
-- ret = pv_l1tf_check_l1e(d, pl1e[i]) ? -ERESTART : 0;
-+ ret = pv_l1tf_check_l1e(d, pl1e[i]) ? -EINTR : 0;
- if ( ret )
- goto out;
- }
---
-2.23.0
-
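
The patch above relies on an error-code contract: -EINTR means nothing was
changed (a plain retry is safe), while -ERESTART means partial work exists
that the caller must unwind via put_page_type(). A toy illustration of that
contract, with made-up names (not the Xen code):

    #include <errno.h>

    #ifndef ERESTART
    #define ERESTART 85 /* not in ISO C; Linux value, for illustration */
    #endif

    /* Validate one entry; *progress records partially completed work. */
    static int validate_entry(int entry_present, int preempted, int *progress)
    {
        if ( !entry_present )
            return -EINTR;    /* no state touched: restarting is trivially safe */

        if ( preempted )
        {
            *progress = 1;    /* partial work recorded ...               */
            return -ERESTART; /* ... caller must undo it before retrying */
        }

        return 0;             /* fully validated */
    }

The bug fixed above was exactly a path that reported -ERESTART (implying
work to undo) when in fact nothing had been done yet.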
diff --git a/main/xen/xsa299-0002-x86-mm-Don-t-re-set-PGT_pinned-on-a-partially-de-val.patch b/main/xen/xsa299-0002-x86-mm-Don-t-re-set-PGT_pinned-on-a-partially-de-val.patch
deleted file mode 100644
index a369f93750..0000000000
--- a/main/xen/xsa299-0002-x86-mm-Don-t-re-set-PGT_pinned-on-a-partially-de-val.patch
+++ /dev/null
@@ -1,99 +0,0 @@
-From 6bdddd7980eac0cc883945d823986f24682ca47a Mon Sep 17 00:00:00 2001
-From: George Dunlap <george.dunlap@citrix.com>
-Date: Thu, 10 Oct 2019 17:57:49 +0100
-Subject: [PATCH 02/11] x86/mm: Don't re-set PGT_pinned on a partially
- de-validated page
-
-When unpinning pagetables, if an operation is interrupted,
-relinquish_memory() re-sets PGT_pinned so that the un-pin will be
-picked up again when the hypercall restarts.
-
-This is appropriate when put_page_and_type_preemptible() returns
--EINTR, which indicates that the page is back in its initial state
-(i.e., completely validated). However, for -ERESTART, this leads to a
-state where a page has both PGT_pinned and PGT_partial set.
-
-This happens to work at the moment, although it's not really a
-"canonical" state; but in subsequent patches, where we need to make a
-distinction in handling between PGT_validated and PGT_partial pages,
-this causes issues.
-
-Move to a "canonical" state by:
-- Only re-setting PGT_pinned on -EINTR
-- Re-dropping the refcount held by PGT_pinned on -ERESTART
-
-In the latter case, the PGT_partial bit will be cleared further down
-with the rest of the other PGT_partial pages.
-
-While here, clean up some trailing whitespace.
-
-This is part of XSA-299.
-
-Reported-by: George Dunlap <george.dunlap@citrix.com>
-Signed-off-by: George Dunlap <george.dunlap@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
----
- xen/arch/x86/domain.c | 31 ++++++++++++++++++++++++++++---
- 1 file changed, 28 insertions(+), 3 deletions(-)
-
-diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
-index 29f892c04c..8fbecbb169 100644
---- a/xen/arch/x86/domain.c
-+++ b/xen/arch/x86/domain.c
-@@ -112,7 +112,7 @@ static void play_dead(void)
- * this case, heap corruption or #PF can occur (when heap debugging is
- * enabled). For example, even printk() can involve tasklet scheduling,
- * which touches per-cpu vars.
-- *
-+ *
- * Consider very carefully when adding code to *dead_idle. Most hypervisor
- * subsystems are unsafe to call.
- */
-@@ -1838,9 +1838,34 @@ static int relinquish_memory(
- break;
- case -ERESTART:
- case -EINTR:
-+ /*
-+ * -EINTR means PGT_validated has been re-set; re-set
-+ * PGT_pinned again so that it gets picked up next time
-+ * around.
-+ *
-+ * -ERESTART, OTOH, means PGT_partial is set instead. Put
-+ * it back on the list, but don't set PGT_pinned; the
-+ * section below will finish off de-validation. But we do
-+ * need to drop the general ref associated with
-+ * PGT_pinned, since put_page_and_type_preemptible()
-+ * didn't do it.
-+ *
-+ * NB we can do an ASSERT for PGT_validated, since we
-+ * "own" the type ref; but theoretically, the PGT_partial
-+ * could be cleared by someone else.
-+ */
-+ if ( ret == -EINTR )
-+ {
-+ ASSERT(page->u.inuse.type_info & PGT_validated);
-+ set_bit(_PGT_pinned, &page->u.inuse.type_info);
-+ }
-+ else
-+ put_page(page);
-+
- ret = -ERESTART;
-+
-+ /* Put the page back on the list and drop the ref we grabbed above */
- page_list_add(page, list);
-- set_bit(_PGT_pinned, &page->u.inuse.type_info);
- put_page(page);
- goto out;
- default:
-@@ -2062,7 +2087,7 @@ void vcpu_kick(struct vcpu *v)
- * pending flag. These values may fluctuate (after all, we hold no
- * locks) but the key insight is that each change will cause
- * evtchn_upcall_pending to be polled.
-- *
-+ *
- * NB2. We save the running flag across the unblock to avoid a needless
- * IPI for domains that we IPI'd to unblock.
- */
---
-2.23.0
-
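
In miniature, the error handling this patch installs in relinquish_memory()
is the following (illustrative flags and return convention; the real code is
in the hunk above):

    #include <errno.h>

    #define PGT_pinned    (1u << 0)
    #define PGT_partial   (1u << 1)
    #define PGT_validated (1u << 2)

    /* Returns 1 if the caller must now drop the general ref that
     * PGT_pinned was holding (the -ERESTART case); 0 if the page was
     * simply re-pinned for the retry (the -EINTR case). */
    static int fixup_interrupted_unpin(int rc, unsigned int *type_info)
    {
        if ( rc == -EINTR )
        {
            /* Fully validated again: re-set PGT_pinned so the retry
             * redoes the unpin from the start. */
            *type_info |= PGT_pinned;
            return 0;
        }

        /* -ERESTART: PGT_partial is set; later de-validation code
         * finishes the job, but the PGT_pinned ref is dropped now. */
        return 1;
    }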
diff --git a/main/xen/xsa299-0003-x86-mm-Separate-out-partial_pte-tristate-into-indivi.patch b/main/xen/xsa299-0003-x86-mm-Separate-out-partial_pte-tristate-into-indivi.patch
deleted file mode 100644
index fa6914b91e..0000000000
--- a/main/xen/xsa299-0003-x86-mm-Separate-out-partial_pte-tristate-into-indivi.patch
+++ /dev/null
@@ -1,609 +0,0 @@
-From 7c0a37005f52d10903ce22851b52ae9b6f4f0ee2 Mon Sep 17 00:00:00 2001
-From: George Dunlap <george.dunlap@citrix.com>
-Date: Thu, 10 Oct 2019 17:57:49 +0100
-Subject: [PATCH 03/11] x86/mm: Separate out partial_pte tristate into
- individual flags
-
-At the moment, partial_pte is a tri-state that contains two distinct bits
-of information:
-
-1. If zero, the pte at index [nr_validated_ptes] is un-validated. If
- non-zero, the pte was last seen with PGT_partial set.
-
-2. If positive, the pte at index [nr_validated_ptes] does not hold a
- general reference count. If negative, it does.
-
-To make future patches more clear, separate out this functionality
-into two distinct, named bits: PTF_partial_set (for #1) and
-PTF_partial_general_ref (for #2).
-
-Additionally, a number of functions which need this information also
-take other flags to control behavior (such as `preemptible` and
-`defer`). These are hard to read in the caller (since you only see
-'true' or 'false'), and ugly when many are added together. In
-preparation for adding yet another flag in a future patch, collapse
-all of these into a single `flag` variable.
-
-NB that this does make checking for what was previously the '-1'
-condition a bit uglier in the put_page_from_lNe functions (since
-you have to check for both partial_set and general ref); but this
-clause will go away in a future patch.
-
-Also note that the original comment had an off-by-one error:
-partial_flags (like partial_pte before it) concerns
-plNe[nr_validated_ptes], not plNe[nr_validated_ptes+1].
-
-No functional change intended.
-
-This is part of XSA-299.
-
-Reported-by: George Dunlap <george.dunlap@citrix.com>
-Signed-off-by: George Dunlap <george.dunlap@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
----
- xen/arch/x86/mm.c | 164 +++++++++++++++++++++++----------------
- xen/include/asm-x86/mm.h | 41 ++++++----
- 2 files changed, 127 insertions(+), 78 deletions(-)
-
-diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
-index 8ced185b49..1c4f54e328 100644
---- a/xen/arch/x86/mm.c
-+++ b/xen/arch/x86/mm.c
-@@ -610,20 +610,34 @@ static int alloc_segdesc_page(struct page_info *page)
- static int _get_page_type(struct page_info *page, unsigned long type,
- bool preemptible);
-
-+/*
-+ * The following flags are used to specify behavior of various get and
-+ * put commands. The first two are also stored in page->partial_flags
-+ * to indicate the state of the page pointed to by
-+ * page->pte[page->nr_validated_entries]. See the comment in mm.h for
-+ * more information.
-+ */
-+#define PTF_partial_set (1 << 0)
-+#define PTF_partial_general_ref (1 << 1)
-+#define PTF_preemptible (1 << 2)
-+#define PTF_defer (1 << 3)
-+
- static int get_page_and_type_from_mfn(
- mfn_t mfn, unsigned long type, struct domain *d,
-- int partial, int preemptible)
-+ unsigned int flags)
- {
- struct page_info *page = mfn_to_page(mfn);
- int rc;
-+ bool preemptible = flags & PTF_preemptible,
-+ partial_ref = flags & PTF_partial_general_ref;
-
-- if ( likely(partial >= 0) &&
-+ if ( likely(!partial_ref) &&
- unlikely(!get_page_from_mfn(mfn, d)) )
- return -EINVAL;
-
- rc = _get_page_type(page, type, preemptible);
-
-- if ( unlikely(rc) && partial >= 0 &&
-+ if ( unlikely(rc) && !partial_ref &&
- (!preemptible || page != current->arch.old_guest_table) )
- put_page(page);
-
-@@ -1104,7 +1118,7 @@ get_page_from_l1e(
- define_get_linear_pagetable(l2);
- static int
- get_page_from_l2e(
-- l2_pgentry_t l2e, unsigned long pfn, struct domain *d, int partial)
-+ l2_pgentry_t l2e, unsigned long pfn, struct domain *d, unsigned int flags)
- {
- unsigned long mfn = l2e_get_pfn(l2e);
- int rc;
-@@ -1119,8 +1133,9 @@ get_page_from_l2e(
- return -EINVAL;
- }
-
-- rc = get_page_and_type_from_mfn(_mfn(mfn), PGT_l1_page_table, d,
-- partial, false);
-+ ASSERT(!(flags & PTF_preemptible));
-+
-+ rc = get_page_and_type_from_mfn(_mfn(mfn), PGT_l1_page_table, d, flags);
- if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
- rc = 0;
-
-@@ -1137,7 +1152,7 @@ get_page_from_l2e(
- define_get_linear_pagetable(l3);
- static int
- get_page_from_l3e(
-- l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial)
-+ l3_pgentry_t l3e, unsigned long pfn, struct domain *d, unsigned int flags)
- {
- int rc;
-
-@@ -1152,7 +1167,7 @@ get_page_from_l3e(
- }
-
- rc = get_page_and_type_from_mfn(
-- l3e_get_mfn(l3e), PGT_l2_page_table, d, partial, 1);
-+ l3e_get_mfn(l3e), PGT_l2_page_table, d, flags | PTF_preemptible);
- if ( unlikely(rc == -EINVAL) &&
- !is_pv_32bit_domain(d) &&
- get_l3_linear_pagetable(l3e, pfn, d) )
-@@ -1170,7 +1185,7 @@ get_page_from_l3e(
- define_get_linear_pagetable(l4);
- static int
- get_page_from_l4e(
-- l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial)
-+ l4_pgentry_t l4e, unsigned long pfn, struct domain *d, unsigned int flags)
- {
- int rc;
-
-@@ -1185,7 +1200,7 @@ get_page_from_l4e(
- }
-
- rc = get_page_and_type_from_mfn(
-- l4e_get_mfn(l4e), PGT_l3_page_table, d, partial, 1);
-+ l4e_get_mfn(l4e), PGT_l3_page_table, d, flags | PTF_preemptible);
- if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
- rc = 0;
-
-@@ -1275,7 +1290,7 @@ void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner)
- * Note also that this automatically deals correctly with linear p.t.'s.
- */
- static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
-- int partial, bool defer)
-+ unsigned int flags)
- {
- int rc = 0;
-
-@@ -1295,12 +1310,13 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
- struct page_info *pg = l2e_get_page(l2e);
- struct page_info *ptpg = mfn_to_page(_mfn(pfn));
-
-- if ( unlikely(partial > 0) )
-+ if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
-+ PTF_partial_set )
- {
-- ASSERT(!defer);
-+ ASSERT(!(flags & PTF_defer));
- rc = _put_page_type(pg, true, ptpg);
- }
-- else if ( defer )
-+ else if ( flags & PTF_defer )
- {
- current->arch.old_guest_ptpg = ptpg;
- current->arch.old_guest_table = pg;
-@@ -1317,7 +1333,7 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
- }
-
- static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
-- int partial, bool defer)
-+ unsigned int flags)
- {
- struct page_info *pg;
- int rc;
-@@ -1340,13 +1356,14 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
-
- pg = l3e_get_page(l3e);
-
-- if ( unlikely(partial > 0) )
-+ if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
-+ PTF_partial_set )
- {
-- ASSERT(!defer);
-+ ASSERT(!(flags & PTF_defer));
- return _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
- }
-
-- if ( defer )
-+ if ( flags & PTF_defer )
- {
- current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
- current->arch.old_guest_table = pg;
-@@ -1361,7 +1378,7 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
- }
-
- static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
-- int partial, bool defer)
-+ unsigned int flags)
- {
- int rc = 1;
-
-@@ -1370,13 +1387,14 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
- {
- struct page_info *pg = l4e_get_page(l4e);
-
-- if ( unlikely(partial > 0) )
-+ if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
-+ PTF_partial_set )
- {
-- ASSERT(!defer);
-+ ASSERT(!(flags & PTF_defer));
- return _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
- }
-
-- if ( defer )
-+ if ( flags & PTF_defer )
- {
- current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
- current->arch.old_guest_table = pg;
-@@ -1483,12 +1501,13 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
- unsigned long pfn = mfn_x(page_to_mfn(page));
- l2_pgentry_t *pl2e;
- unsigned int i;
-- int rc = 0, partial = page->partial_pte;
-+ int rc = 0;
-+ unsigned int partial_flags = page->partial_flags;
-
- pl2e = map_domain_page(_mfn(pfn));
-
- for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES;
-- i++, partial = 0 )
-+ i++, partial_flags = 0 )
- {
- if ( i > page->nr_validated_ptes && hypercall_preempt_check() )
- {
-@@ -1498,18 +1517,19 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
- }
-
- if ( !is_guest_l2_slot(d, type, i) ||
-- (rc = get_page_from_l2e(pl2e[i], pfn, d, partial)) > 0 )
-+ (rc = get_page_from_l2e(pl2e[i], pfn, d, partial_flags)) > 0 )
- continue;
-
- if ( rc == -ERESTART )
- {
- page->nr_validated_ptes = i;
-- page->partial_pte = partial ?: 1;
-+ /* Set 'set', retain 'general ref' */
-+ page->partial_flags = partial_flags | PTF_partial_set;
- }
- else if ( rc == -EINTR && i )
- {
- page->nr_validated_ptes = i;
-- page->partial_pte = 0;
-+ page->partial_flags = 0;
- rc = -ERESTART;
- }
- else if ( rc < 0 && rc != -EINTR )
-@@ -1518,7 +1538,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
- if ( i )
- {
- page->nr_validated_ptes = i;
-- page->partial_pte = 0;
-+ page->partial_flags = 0;
- current->arch.old_guest_ptpg = NULL;
- current->arch.old_guest_table = page;
- }
-@@ -1542,7 +1562,8 @@ static int alloc_l3_table(struct page_info *page)
- unsigned long pfn = mfn_x(page_to_mfn(page));
- l3_pgentry_t *pl3e;
- unsigned int i;
-- int rc = 0, partial = page->partial_pte;
-+ int rc = 0;
-+ unsigned int partial_flags = page->partial_flags;
-
- pl3e = map_domain_page(_mfn(pfn));
-
-@@ -1557,7 +1578,7 @@ static int alloc_l3_table(struct page_info *page)
- memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
-
- for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES;
-- i++, partial = 0 )
-+ i++, partial_flags = 0 )
- {
- if ( i > page->nr_validated_ptes && hypercall_preempt_check() )
- {
-@@ -1574,20 +1595,22 @@ static int alloc_l3_table(struct page_info *page)
- else
- rc = get_page_and_type_from_mfn(
- l3e_get_mfn(pl3e[i]),
-- PGT_l2_page_table | PGT_pae_xen_l2, d, partial, 1);
-+ PGT_l2_page_table | PGT_pae_xen_l2, d,
-+ partial_flags | PTF_preemptible);
- }
-- else if ( (rc = get_page_from_l3e(pl3e[i], pfn, d, partial)) > 0 )
-+ else if ( (rc = get_page_from_l3e(pl3e[i], pfn, d, partial_flags)) > 0 )
- continue;
-
- if ( rc == -ERESTART )
- {
- page->nr_validated_ptes = i;
-- page->partial_pte = partial ?: 1;
-+ /* Set 'set', leave 'general ref' set if this entry was set */
-+ page->partial_flags = partial_flags | PTF_partial_set;
- }
- else if ( rc == -EINTR && i )
- {
- page->nr_validated_ptes = i;
-- page->partial_pte = 0;
-+ page->partial_flags = 0;
- rc = -ERESTART;
- }
- if ( rc < 0 )
-@@ -1604,7 +1627,7 @@ static int alloc_l3_table(struct page_info *page)
- if ( i )
- {
- page->nr_validated_ptes = i;
-- page->partial_pte = 0;
-+ page->partial_flags = 0;
- current->arch.old_guest_ptpg = NULL;
- current->arch.old_guest_table = page;
- }
-@@ -1736,19 +1759,21 @@ static int alloc_l4_table(struct page_info *page)
- unsigned long pfn = mfn_x(page_to_mfn(page));
- l4_pgentry_t *pl4e = map_domain_page(_mfn(pfn));
- unsigned int i;
-- int rc = 0, partial = page->partial_pte;
-+ int rc = 0;
-+ unsigned int partial_flags = page->partial_flags;
-
- for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES;
-- i++, partial = 0 )
-+ i++, partial_flags = 0 )
- {
- if ( !is_guest_l4_slot(d, i) ||
-- (rc = get_page_from_l4e(pl4e[i], pfn, d, partial)) > 0 )
-+ (rc = get_page_from_l4e(pl4e[i], pfn, d, partial_flags)) > 0 )
- continue;
-
- if ( rc == -ERESTART )
- {
- page->nr_validated_ptes = i;
-- page->partial_pte = partial ?: 1;
-+ /* Set 'set', leave 'general ref' set if this entry was set */
-+ page->partial_flags = partial_flags | PTF_partial_set;
- }
- else if ( rc < 0 )
- {
-@@ -1758,7 +1783,7 @@ static int alloc_l4_table(struct page_info *page)
- if ( i )
- {
- page->nr_validated_ptes = i;
-- page->partial_pte = 0;
-+ page->partial_flags = 0;
- if ( rc == -EINTR )
- rc = -ERESTART;
- else
-@@ -1811,19 +1836,20 @@ static int free_l2_table(struct page_info *page)
- struct domain *d = page_get_owner(page);
- unsigned long pfn = mfn_x(page_to_mfn(page));
- l2_pgentry_t *pl2e;
-- int rc = 0, partial = page->partial_pte;
-- unsigned int i = page->nr_validated_ptes - !partial;
-+ int rc = 0;
-+ unsigned int partial_flags = page->partial_flags,
-+ i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set);
-
- pl2e = map_domain_page(_mfn(pfn));
-
- for ( ; ; )
- {
- if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) )
-- rc = put_page_from_l2e(pl2e[i], pfn, partial, false);
-+ rc = put_page_from_l2e(pl2e[i], pfn, partial_flags);
- if ( rc < 0 )
- break;
-
-- partial = 0;
-+ partial_flags = 0;
-
- if ( !i-- )
- break;
-@@ -1845,12 +1871,14 @@ static int free_l2_table(struct page_info *page)
- else if ( rc == -ERESTART )
- {
- page->nr_validated_ptes = i;
-- page->partial_pte = partial ?: -1;
-+ page->partial_flags = (partial_flags & PTF_partial_set) ?
-+ partial_flags :
-+ (PTF_partial_set | PTF_partial_general_ref);
- }
- else if ( rc == -EINTR && i < L2_PAGETABLE_ENTRIES - 1 )
- {
- page->nr_validated_ptes = i + 1;
-- page->partial_pte = 0;
-+ page->partial_flags = 0;
- rc = -ERESTART;
- }
-
-@@ -1862,18 +1890,19 @@ static int free_l3_table(struct page_info *page)
- struct domain *d = page_get_owner(page);
- unsigned long pfn = mfn_x(page_to_mfn(page));
- l3_pgentry_t *pl3e;
-- int rc = 0, partial = page->partial_pte;
-- unsigned int i = page->nr_validated_ptes - !partial;
-+ int rc = 0;
-+ unsigned int partial_flags = page->partial_flags,
-+ i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set);
-
- pl3e = map_domain_page(_mfn(pfn));
-
- for ( ; ; )
- {
-- rc = put_page_from_l3e(pl3e[i], pfn, partial, 0);
-+ rc = put_page_from_l3e(pl3e[i], pfn, partial_flags);
- if ( rc < 0 )
- break;
-
-- partial = 0;
-+ partial_flags = 0;
- if ( rc == 0 )
- pl3e[i] = unadjust_guest_l3e(pl3e[i], d);
-
-@@ -1892,12 +1921,14 @@ static int free_l3_table(struct page_info *page)
- if ( rc == -ERESTART )
- {
- page->nr_validated_ptes = i;
-- page->partial_pte = partial ?: -1;
-+ page->partial_flags = (partial_flags & PTF_partial_set) ?
-+ partial_flags :
-+ (PTF_partial_set | PTF_partial_general_ref);
- }
- else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
- {
- page->nr_validated_ptes = i + 1;
-- page->partial_pte = 0;
-+ page->partial_flags = 0;
- rc = -ERESTART;
- }
- return rc > 0 ? 0 : rc;
-@@ -1908,26 +1939,29 @@ static int free_l4_table(struct page_info *page)
- struct domain *d = page_get_owner(page);
- unsigned long pfn = mfn_x(page_to_mfn(page));
- l4_pgentry_t *pl4e = map_domain_page(_mfn(pfn));
-- int rc = 0, partial = page->partial_pte;
-- unsigned int i = page->nr_validated_ptes - !partial;
-+ int rc = 0;
-+ unsigned partial_flags = page->partial_flags,
-+ i = page->nr_validated_ptes - !(partial_flags & PTF_partial_set);
-
- do {
- if ( is_guest_l4_slot(d, i) )
-- rc = put_page_from_l4e(pl4e[i], pfn, partial, 0);
-+ rc = put_page_from_l4e(pl4e[i], pfn, partial_flags);
- if ( rc < 0 )
- break;
-- partial = 0;
-+ partial_flags = 0;
- } while ( i-- );
-
- if ( rc == -ERESTART )
- {
- page->nr_validated_ptes = i;
-- page->partial_pte = partial ?: -1;
-+ page->partial_flags = (partial_flags & PTF_partial_set) ?
-+ partial_flags :
-+ (PTF_partial_set | PTF_partial_general_ref);
- }
- else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
- {
- page->nr_validated_ptes = i + 1;
-- page->partial_pte = 0;
-+ page->partial_flags = 0;
- rc = -ERESTART;
- }
-
-@@ -2203,7 +2237,7 @@ static int mod_l2_entry(l2_pgentry_t *pl2e,
- return -EBUSY;
- }
-
-- put_page_from_l2e(ol2e, pfn, 0, true);
-+ put_page_from_l2e(ol2e, pfn, PTF_defer);
-
- return rc;
- }
-@@ -2271,7 +2305,7 @@ static int mod_l3_entry(l3_pgentry_t *pl3e,
- if ( !create_pae_xen_mappings(d, pl3e) )
- BUG();
-
-- put_page_from_l3e(ol3e, pfn, 0, 1);
-+ put_page_from_l3e(ol3e, pfn, PTF_defer);
- return rc;
- }
-
-@@ -2334,7 +2368,7 @@ static int mod_l4_entry(l4_pgentry_t *pl4e,
- return -EFAULT;
- }
-
-- put_page_from_l4e(ol4e, pfn, 0, 1);
-+ put_page_from_l4e(ol4e, pfn, PTF_defer);
- return rc;
- }
-
-@@ -2598,7 +2632,7 @@ int free_page_type(struct page_info *page, unsigned long type,
- if ( !(type & PGT_partial) )
- {
- page->nr_validated_ptes = 1U << PAGETABLE_ORDER;
-- page->partial_pte = 0;
-+ page->partial_flags = 0;
- }
-
- switch ( type & PGT_type_mask )
-@@ -2889,7 +2923,7 @@ static int _get_page_type(struct page_info *page, unsigned long type,
- if ( !(x & PGT_partial) )
- {
- page->nr_validated_ptes = 0;
-- page->partial_pte = 0;
-+ page->partial_flags = 0;
- }
- page->linear_pt_count = 0;
- rc = alloc_page_type(page, type, preemptible);
-@@ -3064,7 +3098,7 @@ int new_guest_cr3(mfn_t mfn)
- return 0;
- }
-
-- rc = get_page_and_type_from_mfn(mfn, PGT_root_page_table, d, 0, 1);
-+ rc = get_page_and_type_from_mfn(mfn, PGT_root_page_table, d, PTF_preemptible);
- switch ( rc )
- {
- case 0:
-@@ -3452,7 +3486,7 @@ long do_mmuext_op(
- if ( op.arg1.mfn != 0 )
- {
- rc = get_page_and_type_from_mfn(
-- _mfn(op.arg1.mfn), PGT_root_page_table, currd, 0, 1);
-+ _mfn(op.arg1.mfn), PGT_root_page_table, currd, PTF_preemptible);
-
- if ( unlikely(rc) )
- {
-diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
-index 1ea173c555..46cba52941 100644
---- a/xen/include/asm-x86/mm.h
-+++ b/xen/include/asm-x86/mm.h
-@@ -228,19 +228,34 @@ struct page_info
- * setting the flag must not drop that reference, whereas the instance
- * clearing it will have to.
- *
-- * If @partial_pte is positive then PTE at @nr_validated_ptes+1 has
-- * been partially validated. This implies that the general reference
-- * to the page (acquired from get_page_from_lNe()) would be dropped
-- * (again due to the apparent failure) and hence must be re-acquired
-- * when resuming the validation, but must not be dropped when picking
-- * up the page for invalidation.
-+ * If partial_flags & PTF_partial_set is set, then the page
-+ * at @nr_validated_ptes had PGT_partial set as a result of an
-+ * operation on the current page. (That page may or may not
-+ * still have PGT_partial set.)
- *
-- * If @partial_pte is negative then PTE at @nr_validated_ptes+1 has
-- * been partially invalidated. This is basically the opposite case of
-- * above, i.e. the general reference to the page was not dropped in
-- * put_page_from_lNe() (due to the apparent failure), and hence it
-- * must be dropped when the put operation is resumed (and completes),
-- * but it must not be acquired if picking up the page for validation.
-+ * If PTF_partial_general_ref is set, then the PTE at
-+ * @nr_validated_ptes holds a general reference count for the
-+ * page.
-+ *
-+ * This happens:
-+ * - During de-validation, if de-validation of the page was
-+ * interrupted
-+ * - During validation, if an invalid entry is encountered and
-+ * validation is preemptible
-+ * - During validation, if PTF_partial_general_ref was set on
-+ * this entry to begin with (perhaps because we're picking
-+ * up from a partial de-validation).
-+ *
-+ * When resuming validation, if PTF_partial_general_ref is clear,
-+ * then a general reference must be re-acquired; if it is set, no
-+ * reference should be acquired.
-+ *
-+ * When resuming de-validation, if PTF_partial_general_ref is
-+ * clear, no reference should be dropped; if it is set, a
-+ * reference should be dropped.
-+ *
-+ * NB that PTF_partial_set and PTF_partial_general_ref are
-+ * defined in mm.c, the only place where they are used.
- *
- * The 3rd field, @linear_pt_count, indicates
- * - by a positive value, how many same-level page table entries a page
-@@ -251,7 +266,7 @@ struct page_info
- struct {
- u16 nr_validated_ptes:PAGETABLE_ORDER + 1;
- u16 :16 - PAGETABLE_ORDER - 1 - 2;
-- s16 partial_pte:2;
-+ u16 partial_flags:2;
- s16 linear_pt_count;
- };
-
---
-2.23.0
-
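
The tristate-to-flags conversion this patch performs can be summarised in a
few lines (assumed encoding, mirroring the hunks above):

    #define PTF_partial_set         (1u << 0)
    #define PTF_partial_general_ref (1u << 1)

    /* Old partial_pte tristate -> new partial_flags word:
     *    0 (entry not validated, no ref)       -> 0
     *   +1 (partially validated, no ref held)  -> PTF_partial_set
     *   -1 (partially devalidated, ref held)   -> both flags set
     */
    static unsigned int tristate_to_flags(int partial_pte)
    {
        if ( partial_pte == 0 )
            return 0;
        return partial_pte > 0
               ? PTF_partial_set
               : PTF_partial_set | PTF_partial_general_ref;
    }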
diff --git a/main/xen/xsa299-0004-x86-mm-Use-flags-for-_put_page_type-rather-than-a-bo.patch b/main/xen/xsa299-0004-x86-mm-Use-flags-for-_put_page_type-rather-than-a-bo.patch
deleted file mode 100644
index 767ebc1c03..0000000000
--- a/main/xen/xsa299-0004-x86-mm-Use-flags-for-_put_page_type-rather-than-a-bo.patch
+++ /dev/null
@@ -1,140 +0,0 @@
-From 20b8a6702c6839bafd252789396b443d4b5c5474 Mon Sep 17 00:00:00 2001
-From: George Dunlap <george.dunlap@citrix.com>
-Date: Thu, 10 Oct 2019 17:57:49 +0100
-Subject: [PATCH 04/11] x86/mm: Use flags for _put_page_type rather than a
- boolean
-
-This is in mainly in preparation for _put_page_type taking the
-partial_flags value in the future. It also makes it easier to read in
-the caller (since you see a flag name rather than `true` or `false`).
-
-No functional change intended.
-
-This is part of XSA-299.
-
-Reported-by: George Dunlap <george.dunlap@citrix.com>
-Signed-off-by: George Dunlap <george.dunlap@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
----
- xen/arch/x86/mm.c | 25 +++++++++++++------------
- 1 file changed, 13 insertions(+), 12 deletions(-)
-
-diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
-index 1c4f54e328..e2fba15d86 100644
---- a/xen/arch/x86/mm.c
-+++ b/xen/arch/x86/mm.c
-@@ -1207,7 +1207,7 @@ get_page_from_l4e(
- return rc;
- }
-
--static int _put_page_type(struct page_info *page, bool preemptible,
-+static int _put_page_type(struct page_info *page, unsigned int flags,
- struct page_info *ptpg);
-
- void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner)
-@@ -1314,7 +1314,7 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
- PTF_partial_set )
- {
- ASSERT(!(flags & PTF_defer));
-- rc = _put_page_type(pg, true, ptpg);
-+ rc = _put_page_type(pg, PTF_preemptible, ptpg);
- }
- else if ( flags & PTF_defer )
- {
-@@ -1323,7 +1323,7 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
- }
- else
- {
-- rc = _put_page_type(pg, true, ptpg);
-+ rc = _put_page_type(pg, PTF_preemptible, ptpg);
- if ( likely(!rc) )
- put_page(pg);
- }
-@@ -1360,7 +1360,7 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
- PTF_partial_set )
- {
- ASSERT(!(flags & PTF_defer));
-- return _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
-+ return _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
- }
-
- if ( flags & PTF_defer )
-@@ -1370,7 +1370,7 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
- return 0;
- }
-
-- rc = _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
-+ rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
- if ( likely(!rc) )
- put_page(pg);
-
-@@ -1391,7 +1391,7 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
- PTF_partial_set )
- {
- ASSERT(!(flags & PTF_defer));
-- return _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
-+ return _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
- }
-
- if ( flags & PTF_defer )
-@@ -1401,7 +1401,7 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
- return 0;
- }
-
-- rc = _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
-+ rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
- if ( likely(!rc) )
- put_page(pg);
- }
-@@ -2701,10 +2701,11 @@ static int _put_final_page_type(struct page_info *page, unsigned long type,
- }
-
-
--static int _put_page_type(struct page_info *page, bool preemptible,
-+static int _put_page_type(struct page_info *page, unsigned int flags,
- struct page_info *ptpg)
- {
- unsigned long nx, x, y = page->u.inuse.type_info;
-+ bool preemptible = flags & PTF_preemptible;
-
- ASSERT(current_locked_page_ne_check(page));
-
-@@ -2911,7 +2912,7 @@ static int _get_page_type(struct page_info *page, unsigned long type,
-
- if ( unlikely(iommu_ret) )
- {
-- _put_page_type(page, false, NULL);
-+ _put_page_type(page, 0, NULL);
- rc = iommu_ret;
- goto out;
- }
-@@ -2938,7 +2939,7 @@ static int _get_page_type(struct page_info *page, unsigned long type,
-
- void put_page_type(struct page_info *page)
- {
-- int rc = _put_page_type(page, false, NULL);
-+ int rc = _put_page_type(page, 0, NULL);
- ASSERT(rc == 0);
- (void)rc;
- }
-@@ -2955,7 +2956,7 @@ int get_page_type(struct page_info *page, unsigned long type)
-
- int put_page_type_preemptible(struct page_info *page)
- {
-- return _put_page_type(page, true, NULL);
-+ return _put_page_type(page, PTF_preemptible, NULL);
- }
-
- int get_page_type_preemptible(struct page_info *page, unsigned long type)
-@@ -2972,7 +2973,7 @@ int put_old_guest_table(struct vcpu *v)
- if ( !v->arch.old_guest_table )
- return 0;
-
-- switch ( rc = _put_page_type(v->arch.old_guest_table, true,
-+ switch ( rc = _put_page_type(v->arch.old_guest_table, PTF_preemptible,
- v->arch.old_guest_ptpg) )
- {
- case -EINTR:
---
-2.23.0
-
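
The refactor itself is a common C idiom: replace a bool parameter with a
flags word so that call sites read as named intent. Reduced to its essence
(illustrative, not the Xen code):

    #include <stdbool.h>

    #define PTF_preemptible (1u << 2)

    static int put_type(unsigned int flags /* was: bool preemptible */)
    {
        bool preemptible = flags & PTF_preemptible;

        if ( !preemptible )
        {
            /* non-preemptible path; the body is otherwise unchanged */
        }
        return 0;
    }

    /* Call sites: put_type(true)  becomes put_type(PTF_preemptible);
     *             put_type(false) becomes put_type(0).               */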
diff --git a/main/xen/xsa299-0005-x86-mm-Rework-get_page_and_type_from_mfn-conditional.patch b/main/xen/xsa299-0005-x86-mm-Rework-get_page_and_type_from_mfn-conditional.patch
deleted file mode 100644
index 523ea25948..0000000000
--- a/main/xen/xsa299-0005-x86-mm-Rework-get_page_and_type_from_mfn-conditional.patch
+++ /dev/null
@@ -1,79 +0,0 @@
-From 7b3f9f9a797459902bebba962e31be5cbfe7b515 Mon Sep 17 00:00:00 2001
-From: George Dunlap <george.dunlap@citrix.com>
-Date: Thu, 10 Oct 2019 17:57:49 +0100
-Subject: [PATCH 05/11] x86/mm: Rework get_page_and_type_from_mfn conditional
-
-Make it easier to read by declaring the conditions in which we will
-retain the ref, rather than the conditions under which we release it.
-
-The only way (page == current->arch.old_guest_table) can be true is if
-preemptible is true; so remove this from the query itself, and add an
-ASSERT() to that effect on the opposite path.
-
-No functional change intended.
-
-NB that alloc_lN_table() mishandle the "linear pt failure" situation
-described in the comment; this will be addressed in a future patch.
-
-This is part of XSA-299.
-
-Reported-by: George Dunlap <george.dunlap@citrix.com>
-Signed-off-by: George Dunlap <george.dunlap@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
----
- xen/arch/x86/mm.c | 39 +++++++++++++++++++++++++++++++++++++--
- 1 file changed, 37 insertions(+), 2 deletions(-)
-
-diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
-index e2fba15d86..eaf7b14245 100644
---- a/xen/arch/x86/mm.c
-+++ b/xen/arch/x86/mm.c
-@@ -637,8 +637,43 @@ static int get_page_and_type_from_mfn(
-
- rc = _get_page_type(page, type, preemptible);
-
-- if ( unlikely(rc) && !partial_ref &&
-- (!preemptible || page != current->arch.old_guest_table) )
-+ /*
-+ * Retain the refcount if:
-+ * - page is fully validated (rc == 0)
-+ * - page is not validated (rc < 0) but:
-+ * - We came in with a reference (partial_ref)
-+ * - page is partially validated but there's been an error
-+ * (page == current->arch.old_guest_table)
-+ *
-+ * The partial_ref-on-error clause is worth an explanation. There
-+ * are two scenarios where partial_ref might be true coming in:
-+ * - mfn has been partially demoted as type `type`; i.e. has
-+ * PGT_partial set
-+ * - mfn has been partially demoted as L(type+1) (i.e., a linear
-+ * page; e.g. we're being called from get_page_from_l2e with
-+ * type == PGT_l1_table, but the mfn is PGT_l2_table)
-+ *
-+ * If there's an error, in the first case, _get_page_type will
-+ * either return -ERESTART, in which case we want to retain the
-+ * ref (as the caller will consider it retained), or -EINVAL, in
-+ * which case old_guest_table will be set; in both cases, we need
-+ * to retain the ref.
-+ *
-+ * In the second case, if there's an error, _get_page_type() can
-+ * *only* return -EINVAL, and *never* set old_guest_table. In
-+ * that case we also want to retain the reference, to allow the
-+ * page to continue to be torn down (i.e., PGT_partial cleared)
-+ * safely.
-+ *
-+ * Also note that we shouldn't be able to leave with the reference
-+ * count retained unless we succeeded, or the operation was
-+ * preemptible.
-+ */
-+ if ( likely(!rc) || partial_ref )
-+ /* nothing */;
-+ else if ( page == current->arch.old_guest_table )
-+ ASSERT(preemptible);
-+ else
- put_page(page);
-
- return rc;
---
-2.23.0
-
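
Restated as a predicate, the retention rule this patch documents is the
following (a sketch only; the real logic and its long comment are in the
hunk above):

    #include <assert.h>
    #include <stdbool.h>

    static bool retain_ref(int rc, bool partial_ref,
                           bool is_old_guest_table, bool preemptible)
    {
        if ( rc == 0 || partial_ref )
            return true;         /* success, or the caller came in with a ref */

        if ( is_old_guest_table )
        {
            assert(preemptible); /* only reachable for preemptible operations */
            return true;         /* error mid-validation: keep the ref */
        }

        return false;            /* otherwise: put_page(page) */
    }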
diff --git a/main/xen/xsa299-0006-x86-mm-Have-alloc_l-23-_table-clear-partial_flags-wh.patch b/main/xen/xsa299-0006-x86-mm-Have-alloc_l-23-_table-clear-partial_flags-wh.patch
deleted file mode 100644
index c6118010e5..0000000000
--- a/main/xen/xsa299-0006-x86-mm-Have-alloc_l-23-_table-clear-partial_flags-wh.patch
+++ /dev/null
@@ -1,101 +0,0 @@
-From d28893777be56ef51562ed32502377974f738fd3 Mon Sep 17 00:00:00 2001
-From: George Dunlap <george.dunlap@citrix.com>
-Date: Thu, 10 Oct 2019 17:57:49 +0100
-Subject: [PATCH 06/11] x86/mm: Have alloc_l[23]_table clear partial_flags when
- preempting
-
-In order to allow recursive pagetable promotions and demotions to be
-interrupted, Xen must keep track of the state of the sub-pages
-promoted or demoted. This is stored in two elements in the page
-struct: nr_entries_validated and partial_flags.
-
-The rule is that entries [0, nr_entries_validated) should always be
-validated and hold a general reference count. If partial_flags is
-zero, then [nr_entries_validated] is not validated and no reference
-count is held. If PTF_partial_set is set, then [nr_entries_validated]
-is partially validated.
-
-At the moment, a distinction is made between promotion and demotion
-with regard to whether the entry itself "holds" a general reference
-count: when entry promotion is interrupted (i.e., returns -ERESTART),
-the entry is not considered to hold a reference; when entry demotion
-is interrupted, the entry is still considered to hold a general
-reference.
-
-PTF_partial_general_ref is used to distinguish between these cases.
-If clear, it's a partial promotion => no general reference count held
-by the entry; if set, it's partial demotion, so a general reference
-count held. Because promotions and demotions can be interleaved, this
-value is passed to get_page_and_type_from_mfn and put_page_from_l*e,
-to be able to properly handle reference counts.
-
-Unfortunately, when alloc_l[23]_table check hypercall_preempt_check()
-and return -ERESTART, they set nr_entries_validated, but don't clear
-partial_flags.
-
-If we were picking up from a previously-interrupted promotion, that
-means that PTF_partial_set would be set even though
-[nr_entries_validated] was not partially validated. This means that
-if the page in this state were de-validated, put_page_type() would
-erroneously be called on that entry.
-
-Perhaps worse, if we were racing with a de-validation, then we might
-leave both PTF_partial_set and PTF_partial_general_ref; and when
-de-validation picked up again, both the type and the general ref would
-be erroneously dropped from [nr_entries_validated].
-
-In a sense, the real issue here is code duplication. Rather than
-duplicate the interruption code, set rc to -EINTR and fall through to
-the code which already handles that case correctly.
-
-Given the logic at this point, it should be impossible for
-partial_flags to be non-zero; add an ASSERT() to catch any changes.
-
-This is part of XSA-299.
-
-Reported-by: George Dunlap <george.dunlap@citrix.com>
-Signed-off-by: George Dunlap <george.dunlap@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
----
- xen/arch/x86/mm.c | 18 ++++--------------
- 1 file changed, 4 insertions(+), 14 deletions(-)
-
-diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
-index eaf7b14245..053465cb7c 100644
---- a/xen/arch/x86/mm.c
-+++ b/xen/arch/x86/mm.c
-@@ -1545,13 +1545,8 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
- i++, partial_flags = 0 )
- {
- if ( i > page->nr_validated_ptes && hypercall_preempt_check() )
-- {
-- page->nr_validated_ptes = i;
-- rc = -ERESTART;
-- break;
-- }
--
-- if ( !is_guest_l2_slot(d, type, i) ||
-+ rc = -EINTR;
-+ else if ( !is_guest_l2_slot(d, type, i) ||
- (rc = get_page_from_l2e(pl2e[i], pfn, d, partial_flags)) > 0 )
- continue;
-
-@@ -1616,13 +1611,8 @@ static int alloc_l3_table(struct page_info *page)
- i++, partial_flags = 0 )
- {
- if ( i > page->nr_validated_ptes && hypercall_preempt_check() )
-- {
-- page->nr_validated_ptes = i;
-- rc = -ERESTART;
-- break;
-- }
--
-- if ( is_pv_32bit_domain(d) && (i == 3) )
-+ rc = -EINTR;
-+ else if ( is_pv_32bit_domain(d) && (i == 3) )
- {
- if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
- (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) )
---
-2.23.0
-
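
The bookkeeping invariant this patch defends can be written down as a pair
of checks over the two fields involved (field names as in the hunks; the
struct itself is illustrative, not Xen's struct page_info):

    #include <stdbool.h>

    #define PTF_partial_set         (1u << 0)
    #define PTF_partial_general_ref (1u << 1)

    struct pt_progress {
        unsigned int nr_validated_ptes; /* [0, n) validated, refs held */
        unsigned int partial_flags;     /* state of entry [n] only */
    };

    /* After preemption via the -EINTR path, entry [n] is untouched, so
     * partial_flags must be clear; a stale PTF_partial_set here is
     * exactly the bug this patch removes. */
    static bool canonical_after_eintr(const struct pt_progress *p)
    {
        return p->partial_flags == 0;
    }

    /* A general ref on entry [n] is only ever held together with
     * PTF_partial_set, never on its own. */
    static bool flags_consistent(const struct pt_progress *p)
    {
        return !(p->partial_flags & PTF_partial_general_ref) ||
               (p->partial_flags & PTF_partial_set);
    }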
diff --git a/main/xen/xsa299-0007-x86-mm-Always-retain-a-general-ref-on-partial.patch b/main/xen/xsa299-0007-x86-mm-Always-retain-a-general-ref-on-partial.patch
deleted file mode 100644
index 0d54cc535d..0000000000
--- a/main/xen/xsa299-0007-x86-mm-Always-retain-a-general-ref-on-partial.patch
+++ /dev/null
@@ -1,374 +0,0 @@
-From f608a53c25806a7a4318cbe225bc5f5bbf154d69 Mon Sep 17 00:00:00 2001
-From: George Dunlap <george.dunlap@citrix.com>
-Date: Thu, 10 Oct 2019 17:57:49 +0100
-Subject: [PATCH 07/11] x86/mm: Always retain a general ref on partial
-
-In order to allow recursive pagetable promotions and demotions to be
-interrupted, Xen must keep track of the state of the sub-pages
-promoted or demoted. This is stored in two elements in the page struct:
-nr_entries_validated and partial_flags.
-
-The rule is that entries [0, nr_entries_validated) should always be
-validated and hold a general reference count. If partial_flags is
-zero, then [nr_entries_validated] is not validated and no reference
-count is held. If PTF_partial_set is set, then [nr_entries_validated]
-is partially validated.
-
-At the moment, a distinction is made between promotion and demotion
-with regard to whether the entry itself "holds" a general reference
-count: when entry promotion is interrupted (i.e., returns -ERESTART),
-the entry is not considered to hold a reference; when entry demotion
-is interrupted, the entry is still considered to hold a general
-reference.
-
-PTF_partial_general_ref is used to distinguish between these cases.
-If clear, it's a partial promotion => no general reference count held
-by the entry; if set, it's partial demotion, so a general reference
-count held. Because promotions and demotions can be interleaved, this
-value is passed to get_page_and_type_from_mfn and put_page_from_l*e,
-to be able to properly handle reference counts.
-
-Unfortunately, because a refcount is not held, it is possible to
-engineer a situation where PTF_partial_set is set but the page in
-question has been assigned to another domain. A sketch is provided in
-the appendix.
-
-Fix this by having the parent page table entry hold a general
-reference count whenever PTF_partial_set is set. (For clarity of
-change, keep two separate flags. These will be collapsed in a
-subsequent changeset.)
-
-This has two basic implications. On the put_page_from_lNe() side,
-this means that the (partial_set && !partial_ref) case can never happen,
-and no longer needs to be special-cased.
-
-Secondly, because both flags are set together, there's no need to carry over
-existing bits from partial_pte.
-
-(NB there is still another issue with calling _put_page_type() on a
-page which had PGT_partial set; that will be handled in a subsequent
-patch.)
-
-On the get_page_and_type_from_mfn() side, we need to distinguish
-between callers which hold a reference on partial (i.e.,
-alloc_lN_table()), and those which do not (new_cr3, PIN_LN_TABLE, and
-so on): pass a flag if the type should be retained on interruption.
-
-NB that since l1 promotion can't be preempted, get_page_from_l2e
-can't return -ERESTART.
-
-This is part of XSA-299.
-
-Reported-by: George Dunlap <george.dunlap@citrix.com>
-Signed-off-by: George Dunlap <george.dunlap@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
------
-* Appendix: Engineering PTF_partial_set while a page belongs to a
- foreign domain
-
-Suppose A is a page which can be promoted to an l3, and B is a page
-which can be promoted to an l2, and A[x] points to B. B has
-PGC_allocated set but no other general references.
-
-V1: PIN_L3 A.
- A is validated, B is validated.
- A.type_count = 1 | PGT_validated | PGT_pinned
- B.type_count = 1 | PGT_validated
- B.count = 2 | PGC_allocated (A[x] holds a general ref)
-
-V1: UNPIN A.
- A begins de-validation.
- Arrange to be interrupted when i < x
- V1->old_guest_table = A
- V1->old_guest_table_ref_held = false
- A.type_count = 1 | PGT_partial
- A.nr_validated_entries = i < x
- B.type_count = 0
- B.count = 1 | PGC_allocated
-
-V2: MOD_L4_ENTRY to point some l4e to A.
- Picks up re-validation of A.
- Arrange to be interrupted halfway through B's validation
- B.type_count = 1 | PGT_partial
- B.count = 2 | PGC_allocated (PGT_partial holds a general ref)
- A.type_count = 1 | PGT_partial
- A.nr_validated_entries = x
- A.partial_pte = PTF_partial_set
-
-V3: MOD_L3_ENTRY to point some other l3e (not in A) to B.
- Validates B.
- B.type_count = 1 | PGT_validated
- B.count = 2 | PGC_allocated ("other l3e" holds a general ref)
-
-V3: MOD_L3_ENTRY to clear l3e pointing to B.
- Devalidates B.
- B.type_count = 0
- B.count = 1 | PGC_allocated
-
-V3: decrease_reservation(B)
- Clears PGC_allocated
- B.count = 0 => B is freed
-
-B gets assigned to a different domain
-
-V1: Restarts UNPIN of A
- put_old_guest_table(A)
- ...
- free_l3_table(A)
-
-Now since A.partial_flags has PTF_partial_set, free_l3_table() will
-call put_page_from_l3e() on A[x], which points to B, while B is owned
-by another domain.
-
-If A[x] held a general refcount for B on partial validation, as it does
-for partial de-validation, then B would still have a reference count of
-1 after PGC_allocated was cleared; so B wouldn't be freed until after
-put_page_from_l3e() had happened on A[x].
----
- xen/arch/x86/mm.c | 84 +++++++++++++++++++++++-----------------
- xen/include/asm-x86/mm.h | 15 ++++---
- 2 files changed, 58 insertions(+), 41 deletions(-)
-
-diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
-index 053465cb7c..68a9e74002 100644
---- a/xen/arch/x86/mm.c
-+++ b/xen/arch/x86/mm.c
-@@ -617,10 +617,11 @@ static int _get_page_type(struct page_info *page, unsigned long type,
- * page->pte[page->nr_validated_entries]. See the comment in mm.h for
- * more information.
- */
--#define PTF_partial_set (1 << 0)
--#define PTF_partial_general_ref (1 << 1)
--#define PTF_preemptible (1 << 2)
--#define PTF_defer (1 << 3)
-+#define PTF_partial_set (1 << 0)
-+#define PTF_partial_general_ref (1 << 1)
-+#define PTF_preemptible (1 << 2)
-+#define PTF_defer (1 << 3)
-+#define PTF_retain_ref_on_restart (1 << 4)
-
- static int get_page_and_type_from_mfn(
- mfn_t mfn, unsigned long type, struct domain *d,
-@@ -629,7 +630,11 @@ static int get_page_and_type_from_mfn(
- struct page_info *page = mfn_to_page(mfn);
- int rc;
- bool preemptible = flags & PTF_preemptible,
-- partial_ref = flags & PTF_partial_general_ref;
-+ partial_ref = flags & PTF_partial_general_ref,
-+ partial_set = flags & PTF_partial_set,
-+ retain_ref = flags & PTF_retain_ref_on_restart;
-+
-+ ASSERT(partial_ref == partial_set);
-
- if ( likely(!partial_ref) &&
- unlikely(!get_page_from_mfn(mfn, d)) )
-@@ -642,13 +647,15 @@ static int get_page_and_type_from_mfn(
- * - page is fully validated (rc == 0)
- * - page is not validated (rc < 0) but:
- * - We came in with a reference (partial_ref)
-+ * - page is partially validated (rc == -ERESTART), and the
-+ * caller has asked the ref to be retained in that case
- * - page is partially validated but there's been an error
- * (page == current->arch.old_guest_table)
- *
- * The partial_ref-on-error clause is worth an explanation. There
- * are two scenarios where partial_ref might be true coming in:
-- * - mfn has been partially demoted as type `type`; i.e. has
-- * PGT_partial set
-+ * - mfn has been partially promoted / demoted as type `type`;
-+ * i.e. has PGT_partial set
- * - mfn has been partially demoted as L(type+1) (i.e., a linear
- * page; e.g. we're being called from get_page_from_l2e with
- * type == PGT_l1_table, but the mfn is PGT_l2_table)
-@@ -671,7 +678,8 @@ static int get_page_and_type_from_mfn(
- */
- if ( likely(!rc) || partial_ref )
- /* nothing */;
-- else if ( page == current->arch.old_guest_table )
-+ else if ( page == current->arch.old_guest_table ||
-+ (retain_ref && rc == -ERESTART) )
- ASSERT(preemptible);
- else
- put_page(page);
-@@ -1348,8 +1356,8 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
- if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
- PTF_partial_set )
- {
-- ASSERT(!(flags & PTF_defer));
-- rc = _put_page_type(pg, PTF_preemptible, ptpg);
-+ /* partial_set should always imply partial_ref */
-+ BUG();
- }
- else if ( flags & PTF_defer )
- {
-@@ -1394,8 +1402,8 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
- if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
- PTF_partial_set )
- {
-- ASSERT(!(flags & PTF_defer));
-- return _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
-+ /* partial_set should always imply partial_ref */
-+ BUG();
- }
-
- if ( flags & PTF_defer )
-@@ -1425,8 +1433,8 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
- if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
- PTF_partial_set )
- {
-- ASSERT(!(flags & PTF_defer));
-- return _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
-+ /* partial_set should always imply partial_ref */
-+ BUG();
- }
-
- if ( flags & PTF_defer )
-@@ -1550,13 +1558,22 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
- (rc = get_page_from_l2e(pl2e[i], pfn, d, partial_flags)) > 0 )
- continue;
-
-- if ( rc == -ERESTART )
-- {
-- page->nr_validated_ptes = i;
-- /* Set 'set', retain 'general ref' */
-- page->partial_flags = partial_flags | PTF_partial_set;
-- }
-- else if ( rc == -EINTR && i )
-+ /*
-+ * It shouldn't be possible for get_page_from_l2e to return
-+ * -ERESTART, since we never call this with PTF_preemptible.
-+ * (alloc_l1_table may return -EINTR on an L1TF-vulnerable
-+ * entry.)
-+ *
-+ * NB that while on a "clean" promotion, we can never get
-+ * PGT_partial. It is possible to arrange for an l2e to
-+ * contain a partially-devalidated l2; but in that case, both
-+ * of the following functions will fail anyway (the first
-+ * because the page in question is not an l1; the second
-+ * because the page is not fully validated).
-+ */
-+ ASSERT(rc != -ERESTART);
-+
-+ if ( rc == -EINTR && i )
- {
- page->nr_validated_ptes = i;
- page->partial_flags = 0;
-@@ -1565,6 +1582,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
- else if ( rc < 0 && rc != -EINTR )
- {
- gdprintk(XENLOG_WARNING, "Failure in alloc_l2_table: slot %#x\n", i);
-+ ASSERT(current->arch.old_guest_table == NULL);
- if ( i )
- {
- page->nr_validated_ptes = i;
-@@ -1621,16 +1639,17 @@ static int alloc_l3_table(struct page_info *page)
- rc = get_page_and_type_from_mfn(
- l3e_get_mfn(pl3e[i]),
- PGT_l2_page_table | PGT_pae_xen_l2, d,
-- partial_flags | PTF_preemptible);
-+ partial_flags | PTF_preemptible | PTF_retain_ref_on_restart);
- }
-- else if ( (rc = get_page_from_l3e(pl3e[i], pfn, d, partial_flags)) > 0 )
-+ else if ( (rc = get_page_from_l3e(pl3e[i], pfn, d,
-+ partial_flags | PTF_retain_ref_on_restart)) > 0 )
- continue;
-
- if ( rc == -ERESTART )
- {
- page->nr_validated_ptes = i;
- /* Set 'set', leave 'general ref' set if this entry was set */
-- page->partial_flags = partial_flags | PTF_partial_set;
-+ page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
- }
- else if ( rc == -EINTR && i )
- {
-@@ -1791,14 +1810,15 @@ static int alloc_l4_table(struct page_info *page)
- i++, partial_flags = 0 )
- {
- if ( !is_guest_l4_slot(d, i) ||
-- (rc = get_page_from_l4e(pl4e[i], pfn, d, partial_flags)) > 0 )
-+ (rc = get_page_from_l4e(pl4e[i], pfn, d,
-+ partial_flags | PTF_retain_ref_on_restart)) > 0 )
- continue;
-
- if ( rc == -ERESTART )
- {
- page->nr_validated_ptes = i;
- /* Set 'set', leave 'general ref' set if this entry was set */
-- page->partial_flags = partial_flags | PTF_partial_set;
-+ page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
- }
- else if ( rc < 0 )
- {
-@@ -1896,9 +1916,7 @@ static int free_l2_table(struct page_info *page)
- else if ( rc == -ERESTART )
- {
- page->nr_validated_ptes = i;
-- page->partial_flags = (partial_flags & PTF_partial_set) ?
-- partial_flags :
-- (PTF_partial_set | PTF_partial_general_ref);
-+ page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
- }
- else if ( rc == -EINTR && i < L2_PAGETABLE_ENTRIES - 1 )
- {
-@@ -1946,9 +1964,7 @@ static int free_l3_table(struct page_info *page)
- if ( rc == -ERESTART )
- {
- page->nr_validated_ptes = i;
-- page->partial_flags = (partial_flags & PTF_partial_set) ?
-- partial_flags :
-- (PTF_partial_set | PTF_partial_general_ref);
-+ page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
- }
- else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
- {
-@@ -1979,9 +1995,7 @@ static int free_l4_table(struct page_info *page)
- if ( rc == -ERESTART )
- {
- page->nr_validated_ptes = i;
-- page->partial_flags = (partial_flags & PTF_partial_set) ?
-- partial_flags :
-- (PTF_partial_set | PTF_partial_general_ref);
-+ page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
- }
- else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
- {
-diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
-index 46cba52941..dc9cb869dd 100644
---- a/xen/include/asm-x86/mm.h
-+++ b/xen/include/asm-x86/mm.h
-@@ -238,22 +238,25 @@ struct page_info
- * page.
- *
- * This happens:
-- * - During de-validation, if de-validation of the page was
-+ * - During validation or de-validation, if the operation was
- * interrupted
- * - During validation, if an invalid entry is encountered and
- * validation is preemptible
- * - During validation, if PTF_partial_general_ref was set on
-- * this entry to begin with (perhaps because we're picking
-- * up from a partial de-validation).
-+ * this entry to begin with (perhaps because it picked up a
-+ * previous operation)
- *
-- * When resuming validation, if PTF_partial_general_ref is clear,
-- * then a general reference must be re-acquired; if it is set, no
-- * reference should be acquired.
-+ * When resuming validation, if PTF_partial_general_ref is
-+ * clear, then a general reference must be re-acquired; if it
-+ * is set, no reference should be acquired.
- *
- * When resuming de-validation, if PTF_partial_general_ref is
- * clear, no reference should be dropped; if it is set, a
- * reference should be dropped.
- *
-+ * NB at the moment, PTF_partial_set should be set if and only if
-+ * PTF_partial_general_ref is set.
-+ *
- * NB that PTF_partial_set and PTF_partial_general_ref are
- * defined in mm.c, the only place where they are used.
- *
---
-2.23.0
-
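The promoted/demoted-entry invariant described above lends itself to a small illustration. The following is a self-contained C sketch, not Xen code; struct pt_state and its helpers are invented stand-ins for the page-struct fields named in the message:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

#define PTF_partial_set         (1 << 0)
#define PTF_partial_general_ref (1 << 1)

struct pt_state {
    unsigned int nr_validated_ptes;  /* entries [0, n) fully validated */
    unsigned int partial_flags;      /* state of entry [n], if any */
};

/* After this patch, an interrupted entry always keeps its general ref,
 * so the two flags are set and cleared together. */
static void mark_partial(struct pt_state *pt, unsigned int i)
{
    pt->nr_validated_ptes = i;
    pt->partial_flags = PTF_partial_set | PTF_partial_general_ref;
}

static bool invariant_holds(const struct pt_state *pt)
{
    return !!(pt->partial_flags & PTF_partial_set) ==
           !!(pt->partial_flags & PTF_partial_general_ref);
}

int main(void)
{
    struct pt_state pt = { 0, 0 };

    assert(invariant_holds(&pt));    /* nothing partial yet */
    mark_partial(&pt, 42);
    assert(invariant_holds(&pt));    /* partial entry holds its ref */
    printf("entry %u partial, flags %#x\n",
           pt.nr_validated_ptes, pt.partial_flags);
    return 0;
}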
diff --git a/main/xen/xsa299-0008-x86-mm-Collapse-PTF_partial_set-and-PTF_partial_gene.patch b/main/xen/xsa299-0008-x86-mm-Collapse-PTF_partial_set-and-PTF_partial_gene.patch
deleted file mode 100644
index dd847e8ef7..0000000000
--- a/main/xen/xsa299-0008-x86-mm-Collapse-PTF_partial_set-and-PTF_partial_gene.patch
+++ /dev/null
@@ -1,227 +0,0 @@
-From 6811df7fb7a1d4bb5a75fec9cf41519b5c86c605 Mon Sep 17 00:00:00 2001
-From: George Dunlap <george.dunlap@citrix.com>
-Date: Thu, 10 Oct 2019 17:57:49 +0100
-Subject: [PATCH 08/11] x86/mm: Collapse PTF_partial_set and
- PTF_partial_general_ref into one
-
-...now that they are equivalent. No functional change intended.
-
-Reported-by: George Dunlap <george.dunlap@citrix.com>
-Signed-off-by: George Dunlap <george.dunlap@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
----
- xen/arch/x86/mm.c | 50 +++++++++++-----------------------------
- xen/include/asm-x86/mm.h | 29 +++++++++++------------
- 2 files changed, 26 insertions(+), 53 deletions(-)
-
-diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
-index 68a9e74002..4970b19aff 100644
---- a/xen/arch/x86/mm.c
-+++ b/xen/arch/x86/mm.c
-@@ -612,13 +612,12 @@ static int _get_page_type(struct page_info *page, unsigned long type,
-
- /*
- * The following flags are used to specify behavior of various get and
-- * put commands. The first two are also stored in page->partial_flags
-- * to indicate the state of the page pointed to by
-+ * put commands. The first is also stored in page->partial_flags to
-+ * indicate the state of the page pointed to by
- * page->pte[page->nr_validated_entries]. See the comment in mm.h for
- * more information.
- */
- #define PTF_partial_set (1 << 0)
--#define PTF_partial_general_ref (1 << 1)
- #define PTF_preemptible (1 << 2)
- #define PTF_defer (1 << 3)
- #define PTF_retain_ref_on_restart (1 << 4)
-@@ -630,13 +629,10 @@ static int get_page_and_type_from_mfn(
- struct page_info *page = mfn_to_page(mfn);
- int rc;
- bool preemptible = flags & PTF_preemptible,
-- partial_ref = flags & PTF_partial_general_ref,
- partial_set = flags & PTF_partial_set,
- retain_ref = flags & PTF_retain_ref_on_restart;
-
-- ASSERT(partial_ref == partial_set);
--
-- if ( likely(!partial_ref) &&
-+ if ( likely(!partial_set) &&
- unlikely(!get_page_from_mfn(mfn, d)) )
- return -EINVAL;
-
-@@ -646,14 +642,14 @@ static int get_page_and_type_from_mfn(
- * Retain the refcount if:
- * - page is fully validated (rc == 0)
- * - page is not validated (rc < 0) but:
-- * - We came in with a reference (partial_ref)
-+ * - We came in with a reference (partial_set)
- * - page is partially validated (rc == -ERESTART), and the
- * caller has asked the ref to be retained in that case
- * - page is partially validated but there's been an error
- * (page == current->arch.old_guest_table)
- *
-- * The partial_ref-on-error clause is worth an explanation. There
-- * are two scenarios where partial_ref might be true coming in:
-+ * The partial_set-on-error clause is worth an explanation. There
-+ * are two scenarios where partial_set might be true coming in:
- * - mfn has been partially promoted / demoted as type `type`;
- * i.e. has PGT_partial set
- * - mfn has been partially demoted as L(type+1) (i.e., a linear
-@@ -676,7 +672,7 @@ static int get_page_and_type_from_mfn(
- * count retained unless we succeeded, or the operation was
- * preemptible.
- */
-- if ( likely(!rc) || partial_ref )
-+ if ( likely(!rc) || partial_set )
- /* nothing */;
- else if ( page == current->arch.old_guest_table ||
- (retain_ref && rc == -ERESTART) )
-@@ -1353,13 +1349,7 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
- struct page_info *pg = l2e_get_page(l2e);
- struct page_info *ptpg = mfn_to_page(_mfn(pfn));
-
-- if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
-- PTF_partial_set )
-- {
-- /* partial_set should always imply partial_ref */
-- BUG();
-- }
-- else if ( flags & PTF_defer )
-+ if ( flags & PTF_defer )
- {
- current->arch.old_guest_ptpg = ptpg;
- current->arch.old_guest_table = pg;
-@@ -1399,13 +1389,6 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
-
- pg = l3e_get_page(l3e);
-
-- if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
-- PTF_partial_set )
-- {
-- /* partial_set should always imply partial_ref */
-- BUG();
-- }
--
- if ( flags & PTF_defer )
- {
- current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
-@@ -1430,13 +1413,6 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
- {
- struct page_info *pg = l4e_get_page(l4e);
-
-- if ( (flags & (PTF_partial_set | PTF_partial_general_ref)) ==
-- PTF_partial_set )
-- {
-- /* partial_set should always imply partial_ref */
-- BUG();
-- }
--
- if ( flags & PTF_defer )
- {
- current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
-@@ -1649,7 +1625,7 @@ static int alloc_l3_table(struct page_info *page)
- {
- page->nr_validated_ptes = i;
- /* Set 'set', leave 'general ref' set if this entry was set */
-- page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
-+ page->partial_flags = PTF_partial_set;
- }
- else if ( rc == -EINTR && i )
- {
-@@ -1818,7 +1794,7 @@ static int alloc_l4_table(struct page_info *page)
- {
- page->nr_validated_ptes = i;
- /* Set 'set', leave 'general ref' set if this entry was set */
-- page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
-+ page->partial_flags = PTF_partial_set;
- }
- else if ( rc < 0 )
- {
-@@ -1916,7 +1892,7 @@ static int free_l2_table(struct page_info *page)
- else if ( rc == -ERESTART )
- {
- page->nr_validated_ptes = i;
-- page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
-+ page->partial_flags = PTF_partial_set;
- }
- else if ( rc == -EINTR && i < L2_PAGETABLE_ENTRIES - 1 )
- {
-@@ -1964,7 +1940,7 @@ static int free_l3_table(struct page_info *page)
- if ( rc == -ERESTART )
- {
- page->nr_validated_ptes = i;
-- page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
-+ page->partial_flags = PTF_partial_set;
- }
- else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
- {
-@@ -1995,7 +1971,7 @@ static int free_l4_table(struct page_info *page)
- if ( rc == -ERESTART )
- {
- page->nr_validated_ptes = i;
-- page->partial_flags = PTF_partial_set | PTF_partial_general_ref;
-+ page->partial_flags = PTF_partial_set;
- }
- else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
- {
-diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
-index dc9cb869dd..c6ba9e4d73 100644
---- a/xen/include/asm-x86/mm.h
-+++ b/xen/include/asm-x86/mm.h
-@@ -233,7 +233,7 @@ struct page_info
- * operation on the current page. (That page may or may not
- * still have PGT_partial set.)
- *
-- * If PTF_partial_general_ref is set, then the PTE at
-+ * Additionally, if PTF_partial_set is set, then the PTE at
- * @nr_validated_ptef holds a general reference count for the
- * page.
- *
-@@ -242,23 +242,20 @@ struct page_info
- * interrupted
- * - During validation, if an invalid entry is encountered and
- * validation is preemptible
-- * - During validation, if PTF_partial_general_ref was set on
-- * this entry to begin with (perhaps because it picked up a
-+ * - During validation, if PTF_partial_set was set on this
-+ * entry to begin with (perhaps because it picked up a
- * previous operation)
- *
-- * When resuming validation, if PTF_partial_general_ref is
-- * clear, then a general reference must be re-acquired; if it
-- * is set, no reference should be acquired.
-+ * When resuming validation, if PTF_partial_set is clear, then
-+ * a general reference must be re-acquired; if it is set, no
-+ * reference should be acquired.
- *
-- * When resuming de-validation, if PTF_partial_general_ref is
-- * clear, no reference should be dropped; if it is set, a
-- * reference should be dropped.
-+ * When resuming de-validation, if PTF_partial_set is clear,
-+ * no reference should be dropped; if it is set, a reference
-+ * should be dropped.
- *
-- * NB at the moment, PTF_partial_set should be set if and only if
-- * PTF_partial_general_ref is set.
-- *
-- * NB that PTF_partial_set and PTF_partial_general_ref are
-- * defined in mm.c, the only place where they are used.
-+ * NB that PTF_partial_set is defined in mm.c, the only place
-+ * where it is used.
- *
- * The 3rd field, @linear_pt_count, indicates
- * - by a positive value, how many same-level page table entries a page
-@@ -268,8 +265,8 @@ struct page_info
- */
- struct {
- u16 nr_validated_ptes:PAGETABLE_ORDER + 1;
-- u16 :16 - PAGETABLE_ORDER - 1 - 2;
-- u16 partial_flags:2;
-+ u16 :16 - PAGETABLE_ORDER - 1 - 1;
-+ u16 partial_flags:1;
- s16 linear_pt_count;
- };
-
---
-2.23.0
-
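The effect of the collapse on the page-struct bitfield can be seen in a compilable sketch; the PAGETABLE_ORDER value and struct layout here are assumptions mirroring the hunk above, not the real definition:

#include <stdint.h>
#include <stdio.h>

#define PAGETABLE_ORDER 9   /* assumption for the sketch */

/* With only one partial flag left, partial_flags shrinks from a 2-bit
 * to a 1-bit field and the anonymous padding grows by one bit; the
 * fields still pack into 16 bits. */
struct pt_info {
    uint16_t nr_validated_ptes : PAGETABLE_ORDER + 1;
    uint16_t                   : 16 - PAGETABLE_ORDER - 1 - 1;
    uint16_t partial_flags     : 1;
};

int main(void)
{
    struct pt_info pi = { .nr_validated_ptes = 511, .partial_flags = 1 };

    printf("packed into %zu bytes: ptes=%u partial=%u\n",
           sizeof(pi), pi.nr_validated_ptes, pi.partial_flags);
    return 0;
}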
diff --git a/main/xen/xsa299-0009-x86-mm-Properly-handle-linear-pagetable-promotion-fa.patch b/main/xen/xsa299-0009-x86-mm-Properly-handle-linear-pagetable-promotion-fa.patch
deleted file mode 100644
index f62d7747a8..0000000000
--- a/main/xen/xsa299-0009-x86-mm-Properly-handle-linear-pagetable-promotion-fa.patch
+++ /dev/null
@@ -1,106 +0,0 @@
-From a6098b8920b02149220641cb13358e9012b5fc4d Mon Sep 17 00:00:00 2001
-From: George Dunlap <george.dunlap@citrix.com>
-Date: Thu, 10 Oct 2019 17:57:49 +0100
-Subject: [PATCH 09/11] x86/mm: Properly handle linear pagetable promotion
- failures
-
-In order to allow recursive pagetable promotions and demotions to be
-interrupted, Xen must keep track of the state of the sub-pages
-promoted or demoted. This is stored in two elements in the page
-struct: nr_entries_validated and partial_flags.
-
-The rule is that entries [0, nr_entries_validated) should always be
-validated and hold a general reference count. If partial_flags is
-zero, then [nr_entries_validated] is not validated and no reference
-count is held. If PTF_partial_set is set, then [nr_entries_validated]
-is partially validated, and a general reference count is held.
-
-Unfortunately, in cases where an entry began with PTF_partial_set set,
-and get_page_from_lNe() returns -EINVAL, the PTF_partial_set bit is
-erroneously dropped. (This scenario can be engineered mainly by
-interleaving promotion and demotion of a page which has "linear
-pagetable" entries; see the appendix for a sketch.) This means that
-we will "leak" a general reference count on the page in question,
-preventing the page from being freed.
-
-Fix this by setting page->partial_flags to the partial_flags local
-variable.
-
-This is part of XSA-299.
-
-Reported-by: George Dunlap <george.dunlap@citrix.com>
-Signed-off-by: George Dunlap <george.dunlap@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
------
-Appendix
-
-Suppose A and B can both be promoted to L2 pages, and A[x] points to B.
-
-V1: PIN_L2 B.
- B.type_count = 1 | PGT_validated
- B.count = 2 | PGC_allocated
-
-V1: MOD_L3_ENTRY pointing something to A.
- In the process of validating A[x], grab an extra type / ref on B:
- B.type_count = 2 | PGT_validated
- B.count = 3 | PGC_allocated
- A.type_count = 1 | PGT_validated
- A.count = 2 | PGC_allocated
-
-V1: UNPIN B.
- B.type_count = 1 | PGT_validated
- B.count = 2 | PGC_allocated
-
-V1: MOD_L3_ENTRY removing the reference to A.
- De-validate A, down to A[x], which points to B.
- Drop the final type on B. Arrange to be interrupted.
- B.type_count = 1 | PGT_partial
- B.count = 2 | PGC_allocated
- A.type_count = 1 | PGT_partial
- A.nr_validated_entries = x
- A.partial_pte = -1
-
-V2: MOD_L3_ENTRY adds a reference to A.
-
-At this point, get_page_from_l2e(A[x]) tries
-get_page_and_type_from_mfn(), which fails because it's the wrong type;
-and get_l2_linear_pagetable() also fails, because B isn't validated as
-an l2 anymore.
----
- xen/arch/x86/mm.c | 6 +++---
- 1 file changed, 3 insertions(+), 3 deletions(-)
-
-diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
-index 4970b19aff..cfb7538403 100644
---- a/xen/arch/x86/mm.c
-+++ b/xen/arch/x86/mm.c
-@@ -1562,7 +1562,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
- if ( i )
- {
- page->nr_validated_ptes = i;
-- page->partial_flags = 0;
-+ page->partial_flags = partial_flags;
- current->arch.old_guest_ptpg = NULL;
- current->arch.old_guest_table = page;
- }
-@@ -1647,7 +1647,7 @@ static int alloc_l3_table(struct page_info *page)
- if ( i )
- {
- page->nr_validated_ptes = i;
-- page->partial_flags = 0;
-+ page->partial_flags = partial_flags;
- current->arch.old_guest_ptpg = NULL;
- current->arch.old_guest_table = page;
- }
-@@ -1804,7 +1804,7 @@ static int alloc_l4_table(struct page_info *page)
- if ( i )
- {
- page->nr_validated_ptes = i;
-- page->partial_flags = 0;
-+ page->partial_flags = partial_flags;
- if ( rc == -EINTR )
- rc = -ERESTART;
- else
---
-2.23.0
-
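A minimal sketch of the one-line fix above, with invented stand-ins for the page struct; the point is that the error path records the entry's incoming partial state instead of zeroing it:

#include <stdio.h>

#define PTF_partial_set (1 << 0)

struct page { unsigned int nr_validated_ptes, partial_flags; };

/* On a validation failure at entry i, record the entry's *incoming*
 * partial state rather than clobbering it with zero; otherwise a
 * general ref held by a previously-partial entry is never dropped. */
static void record_failure(struct page *pg, unsigned int i,
                           unsigned int incoming_partial_flags)
{
    pg->nr_validated_ptes = i;
    pg->partial_flags = incoming_partial_flags;  /* the fix */
    /* pre-fix bug: pg->partial_flags = 0;  -- leaks the general ref */
}

int main(void)
{
    struct page pg = { 0, 0 };

    /* entry 7 was already partially de-validated when we failed */
    record_failure(&pg, 7, PTF_partial_set);
    printf("resume at %u, flags %#x\n",
           pg.nr_validated_ptes, pg.partial_flags);
    return 0;
}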
diff --git a/main/xen/xsa299-0010-x86-mm-Fix-nested-de-validation-on-error.patch b/main/xen/xsa299-0010-x86-mm-Fix-nested-de-validation-on-error.patch
deleted file mode 100644
index 643ef5314a..0000000000
--- a/main/xen/xsa299-0010-x86-mm-Fix-nested-de-validation-on-error.patch
+++ /dev/null
@@ -1,169 +0,0 @@
-From eabd77b59f4006128501d6e15f9e620dfb349420 Mon Sep 17 00:00:00 2001
-From: George Dunlap <george.dunlap@citrix.com>
-Date: Thu, 10 Oct 2019 17:57:49 +0100
-Subject: [PATCH 10/11] x86/mm: Fix nested de-validation on error
-
-If an invalid entry is discovered when validating a page-table tree,
-the entire tree which has so far been validated must be de-validated.
-Since this may take a long time, alloc_l[2-4]_table() set current
-vcpu's old_guest_table immediately; put_old_guest_table() will make
-sure that put_page_type() will be called to finish off the
-de-validation before any other MMU operations can happen on the vcpu.
-
-The invariant for partial pages should be:
-
-* Entries [0, nr_validated_ptes) should be completely validated;
- put_page_type() will de-validate these.
-
-* If [nr_validated_ptes] is partially validated, partial_flags should
- have PTF_partial_set set. put_page_type() will be called on this page
- to finish off de-validation, and the appropriate refcount adjustments
- will be done.
-
-alloc_l[2-3]_table() indicates partial validation to its callers by
-setting current->old_guest_table.
-
-Unfortunately, this is mishandled.
-
-Take the case where validating lNe[x] returns an error.
-
-First, alloc_l3_table() doesn't check old_guest_table at all; as a
-result, partial_flags is not set when it should be. nr_validated_ptes
-is set to x; and since PTF_partial_set is clear, de-validation resumes at
-nr_validated_ptes-1. This means that the l2 page at pl3e[x] will not
-have put_page_type() called on it when de-validating the rest of the
-l3: it will be stuck in the PGT_partial state until the domain is
-destroyed, or until it is re-used as an l2. (Any other page type will
-fail.)
-
-Worse, alloc_l4_table(), rather than setting PTF_partial_set as it
-should, sets nr_validated_ptes to x+1. When de-validating, since
-partial is 0, this will correctly resume calling put_page_type at [x];
-but if put_page_type() is never called and instead
-get_page_type() is called, validation will pick up at [x+1],
-neglecting to validate [x]. If the rest of the validation succeeds,
-the l4 will be validated even though [x] is invalid.
-
-Fix this in both cases by setting PTF_partial_set if old_guest_table
-is set.
-
-While here, add some safety catches:
-- old_guest_table must point to the page contained in
- [nr_validated_ptes].
-- alloc_l1_table shouldn't set old_guest_table
-
-If we experience one of these situations in production builds, it's
-safer to avoid calling put_page_type for the pages in question. If
-they have PGT_partial set, they will be cleaned up on domain
-destruction; if not, we have no idea whether a type count is safe to
-drop. Retaining an extra type ref that should have been dropped may
-trigger a BUG() on the free_domain_page() path, but dropping a type
-count that shouldn't be dropped may cause a privilege escalation.
-
-This is part of XSA-299.
-
-Reported-by: George Dunlap <george.dunlap@citrix.com>
-Signed-off-by: George Dunlap <george.dunlap@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
----
- xen/arch/x86/mm.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++-
- 1 file changed, 54 insertions(+), 1 deletion(-)
-
-diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
-index cfb7538403..aa03cb8b40 100644
---- a/xen/arch/x86/mm.c
-+++ b/xen/arch/x86/mm.c
-@@ -1561,6 +1561,20 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
- ASSERT(current->arch.old_guest_table == NULL);
- if ( i )
- {
-+ /*
-+ * alloc_l1_table() doesn't set old_guest_table; it does
-+ * its own tear-down immediately on failure. If it
-+ * did we'd need to check it and set partial_flags as we
-+ * do in alloc_l[34]_table().
-+ *
-+ * Note on the use of ASSERT: if it's non-null and
-+ * hasn't been cleaned up yet, it should have
-+ * PGT_partial set; and so the type will be cleaned up
-+ * on domain destruction. Unfortunately, we would
-+ * leak the general ref held by old_guest_table; but
-+ * leaking a page is less bad than a host crash.
-+ */
-+ ASSERT(current->arch.old_guest_table == NULL);
- page->nr_validated_ptes = i;
- page->partial_flags = partial_flags;
- current->arch.old_guest_ptpg = NULL;
-@@ -1588,6 +1602,7 @@ static int alloc_l3_table(struct page_info *page)
- unsigned int i;
- int rc = 0;
- unsigned int partial_flags = page->partial_flags;
-+ l3_pgentry_t l3e = l3e_empty();
-
- pl3e = map_domain_page(_mfn(pfn));
-
-@@ -1634,7 +1649,11 @@ static int alloc_l3_table(struct page_info *page)
- rc = -ERESTART;
- }
- if ( rc < 0 )
-+ {
-+ /* XSA-299 Backport: Copy l3e for checking */
-+ l3e = pl3e[i];
- break;
-+ }
-
- pl3e[i] = adjust_guest_l3e(pl3e[i], d);
- }
-@@ -1648,6 +1667,24 @@ static int alloc_l3_table(struct page_info *page)
- {
- page->nr_validated_ptes = i;
- page->partial_flags = partial_flags;
-+ if ( current->arch.old_guest_table )
-+ {
-+ /*
-+ * We've experienced a validation failure. If
-+ * old_guest_table is set, "transfer" the general
-+ * reference count to pl3e[nr_validated_ptes] by
-+ * setting PTF_partial_set.
-+ *
-+ * As a precaution, check that old_guest_table is the
-+ * page pointed to by pl3e[nr_validated_ptes]. If
-+ * not, it's safer to leak a type ref on production
-+ * builds.
-+ */
-+ if ( current->arch.old_guest_table == l3e_get_page(l3e) )
-+ page->partial_flags = PTF_partial_set;
-+ else
-+ ASSERT_UNREACHABLE();
-+ }
- current->arch.old_guest_ptpg = NULL;
- current->arch.old_guest_table = page;
- }
-@@ -1810,7 +1847,23 @@ static int alloc_l4_table(struct page_info *page)
- else
- {
- if ( current->arch.old_guest_table )
-- page->nr_validated_ptes++;
-+ {
-+ /*
-+ * We've experienced a validation failure. If
-+ * old_guest_table is set, "transfer" the general
-+ * reference count to pl3e[nr_validated_ptes] by
-+ * setting PTF_partial_set.
-+ *
-+ * As a precaution, check that old_guest_table is the
-+ * page pointed to by pl4e[nr_validated_ptes]. If
-+ * not, it's safer to leak a type ref on production
-+ * builds.
-+ */
-+ if ( current->arch.old_guest_table == l4e_get_page(pl4e[i]) )
-+ page->partial_flags = PTF_partial_set;
-+ else
-+ ASSERT_UNREACHABLE();
-+ }
- current->arch.old_guest_ptpg = NULL;
- current->arch.old_guest_table = page;
- }
---
-2.23.0
-
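The "transfer the general ref, but verify the pointer first" logic added above can be sketched in isolation; everything below is a hypothetical model, with assert() standing in for ASSERT_UNREACHABLE():

#include <assert.h>
#include <stdio.h>

#define PTF_partial_set (1 << 0)

struct page { unsigned int partial_flags; };

/* If validation failed and old_guest_table names the page under the
 * failing slot, "transfer" its general ref to that slot by setting
 * PTF_partial_set; if the pointers disagree, leaking a type ref is
 * safer than a wrong drop, so do nothing (and assert in debug builds). */
static void transfer_ref(struct page *pt, const struct page *old_guest_table,
                         const struct page *slot_page)
{
    if ( old_guest_table == slot_page )
        pt->partial_flags = PTF_partial_set;
    else
        assert(!"unreachable");   /* stand-in for ASSERT_UNREACHABLE() */
}

int main(void)
{
    struct page pt = { 0 }, sub = { 0 };

    transfer_ref(&pt, &sub, &sub);    /* pointers agree: transfer */
    printf("flags %#x\n", pt.partial_flags);
    return 0;
}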
diff --git a/main/xen/xsa299-0011-x86-mm-Don-t-drop-a-type-ref-unless-you-held-a-ref-t.patch b/main/xen/xsa299-0011-x86-mm-Don-t-drop-a-type-ref-unless-you-held-a-ref-t.patch
deleted file mode 100644
index 24970da251..0000000000
--- a/main/xen/xsa299-0011-x86-mm-Don-t-drop-a-type-ref-unless-you-held-a-ref-t.patch
+++ /dev/null
@@ -1,413 +0,0 @@
-From f0086e3ac65c8bcabb84c1c29ab00b0c8a187555 Mon Sep 17 00:00:00 2001
-From: George Dunlap <george.dunlap@citrix.com>
-Date: Thu, 10 Oct 2019 17:57:50 +0100
-Subject: [PATCH 11/11] x86/mm: Don't drop a type ref unless you held a ref to
- begin with
-
-Validation and de-validation of pagetable trees may take arbitrarily
-large amounts of time, and so must be preemptible. This is indicated
-by setting the PGT_partial bit in the type_info, and setting
-nr_validated_entries and partial_flags appropriately. Specifically,
-if the entry at [nr_validated_entries] is partially validated,
-partial_flags should have the PGT_partial_set bit set, and the entry
-should hold a general reference count. During de-validation,
-put_page_type() is called on partially validated entries.
-
-Unfortunately, there are a number of issues with the current algorithm.
-
-First, doing a "normal" put_page_type() is not safe when no type ref
-is held: there is nothing to stop another vcpu from coming along and
-picking up validation again: at which point the put_page_type may drop
-the only page ref on an in-use page. Some examples are listed in the
-appendix.
-
-The core issue is that put_page_type() is being called both to clean
-up PGT_partial, and to drop a type count; and has no way of knowing
-which is which; and so if in between, PGT_partial is cleared,
-put_page_type() will drop the type ref erroneously.
-
-What is needed is to distinguish between two states:
-- Dropping a type ref which is held
-- Cleaning up a page which has been partially de/validated
-
-Fix this by telling put_page_type() which of the two activities you
-intend.
-
-When cleaning up a partial de/validation, take no action unless you
-find a page partially validated.
-
-If put_page_type() is called without PTF_partial_set, and finds the
-page in a PGT_partial state anyway, then there's certainly been a
-misaccounting somewhere, and carrying on would almost certainly cause
-a security issue, so crash the host instead.
-
-In put_page_from_lNe, pass partial_flags on to _put_page_type().
-
-old_guest_table may be set either with a fully validated page (when
-using the "deferred put" pattern), or with a partially validated page
-(when a normal "de-validation" is interrupted, or when a validation
-fails part-way through due to invalid entries). Add a flag,
-old_guest_table_partial, to indicate which of these it is, and use
-that to pass the appropriate flag to _put_page_type().
-
-While here, delete stray trailing whitespace.
-
-This is part of XSA-299.
-
-Reported-by: George Dunlap <george.dunlap@citrix.com>
-Signed-off-by: George Dunlap <george.dunlap@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
------
-Appendix:
-
-Suppose page A, when interpreted as an l3 pagetable, contains all
-valid entries; and suppose A[x] points to page B, which when
-interpreted as an l2 pagetable, contains all valid entries.
-
-P1: PIN_L3_TABLE
- A -> PGT_l3_table | 1 | valid
- B -> PGT_l2_table | 1 | valid
-
-P1: UNPIN_TABLE
- > Arrange to interrupt after B has been de-validated
- B:
- type_info -> PGT_l2_table | 0
- A:
- type_info -> PGT_l3_table | 1 | partial
- nr_validated_enties -> (less than x)
-
-P2: mod_l4_entry to point to A
- > Arrange for this to be interrupted while B is being validated
- B:
- type_info -> PGT_l2_table | 1 | partial
- (nr_validated_entries &c set as appropriate)
- A:
- type_info -> PGT_l3_table | 1 | partial
- nr_validated_entries -> x
- partial_pte = 1
-
-P3: mod_l3_entry some other unrelated l3 to point to B:
- B:
- type_info -> PGT_l2_table | 1
-
-P1: Restart UNPIN_TABLE
-
-At this point, since A.nr_validated_entries == x and A.partial_pte !=
-0, free_l3_table() will call put_page_from_l3e() on pl3e[x], dropping
-its type count to 0 while it's still being pointed to by some other l3.
-
-A similar issue arises with old_guest_table. Consider the following
-scenario:
-
-Suppose A is a page which, when interpreted as an l2, has valid entries
-until entry x, which is invalid.
-
-V1: PIN_L2_TABLE(A)
- <Validate until we try to validate [x], get -EINVAL>
- A -> PGT_l2_table | 1 | PGT_partial
- V1 -> old_guest_table = A
- <delayed>
-
-V2: PIN_L2_TABLE(A)
- <Pick up where V1 left off, try to re-validate [x], get -EINVAL>
- A -> PGT_l2_table | 1 | PGT_partial
- V2 -> old_guest_table = A
- <restart>
- put_old_guest_table()
- _put_page_type(A)
- A -> PGT_l2_table | 0
-
-V1: <restart>
- put_old_guest_table()
- _put_page_type(A) # UNDERFLOW
-
-Indeed, it is possible to engineer things such that old_guest_table
-for every vcpu a guest has points to the same page.
----
- xen/arch/x86/domain.c | 6 +++
- xen/arch/x86/mm.c | 99 +++++++++++++++++++++++++++++++-----
- xen/include/asm-x86/domain.h | 4 +-
- 3 files changed, 95 insertions(+), 14 deletions(-)
-
-diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
-index 8fbecbb169..c880568dd4 100644
---- a/xen/arch/x86/domain.c
-+++ b/xen/arch/x86/domain.c
-@@ -1074,9 +1074,15 @@ int arch_set_info_guest(
- rc = -ERESTART;
- /* Fallthrough */
- case -ERESTART:
-+ /*
-+ * NB that we're putting the kernel-mode table
-+ * here, which we've already successfully
-+ * validated above; hence partial = false;
-+ */
- v->arch.old_guest_ptpg = NULL;
- v->arch.old_guest_table =
- pagetable_get_page(v->arch.guest_table);
-+ v->arch.old_guest_table_partial = false;
- v->arch.guest_table = pagetable_null();
- break;
- default:
-diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
-index aa03cb8b40..c701c7ef14 100644
---- a/xen/arch/x86/mm.c
-+++ b/xen/arch/x86/mm.c
-@@ -1353,10 +1353,11 @@ static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn,
- {
- current->arch.old_guest_ptpg = ptpg;
- current->arch.old_guest_table = pg;
-+ current->arch.old_guest_table_partial = false;
- }
- else
- {
-- rc = _put_page_type(pg, PTF_preemptible, ptpg);
-+ rc = _put_page_type(pg, flags | PTF_preemptible, ptpg);
- if ( likely(!rc) )
- put_page(pg);
- }
-@@ -1379,6 +1380,7 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
- unsigned long mfn = l3e_get_pfn(l3e);
- int writeable = l3e_get_flags(l3e) & _PAGE_RW;
-
-+ ASSERT(!(flags & PTF_partial_set));
- ASSERT(!(mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)));
- do {
- put_data_page(mfn_to_page(_mfn(mfn)), writeable);
-@@ -1391,12 +1393,14 @@ static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
-
- if ( flags & PTF_defer )
- {
-+ ASSERT(!(flags & PTF_partial_set));
- current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
- current->arch.old_guest_table = pg;
-+ current->arch.old_guest_table_partial = false;
- return 0;
- }
-
-- rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
-+ rc = _put_page_type(pg, flags | PTF_preemptible, mfn_to_page(_mfn(pfn)));
- if ( likely(!rc) )
- put_page(pg);
-
-@@ -1415,12 +1419,15 @@ static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
-
- if ( flags & PTF_defer )
- {
-+ ASSERT(!(flags & PTF_partial_set));
- current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
- current->arch.old_guest_table = pg;
-+ current->arch.old_guest_table_partial = false;
- return 0;
- }
-
-- rc = _put_page_type(pg, PTF_preemptible, mfn_to_page(_mfn(pfn)));
-+ rc = _put_page_type(pg, flags | PTF_preemptible,
-+ mfn_to_page(_mfn(pfn)));
- if ( likely(!rc) )
- put_page(pg);
- }
-@@ -1525,6 +1532,14 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
-
- pl2e = map_domain_page(_mfn(pfn));
-
-+ /*
-+ * NB that alloc_l2_table will never set partial_pte on an l2; but
-+ * free_l2_table might if a linear_pagetable entry is interrupted
-+ * partway through de-validation. In that circumstance,
-+ * get_page_from_l2e() will always return -EINVAL; and we must
-+ * retain the type ref by doing the normal partial_flags tracking.
-+ */
-+
- for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES;
- i++, partial_flags = 0 )
- {
-@@ -1579,6 +1594,7 @@ static int alloc_l2_table(struct page_info *page, unsigned long type)
- page->partial_flags = partial_flags;
- current->arch.old_guest_ptpg = NULL;
- current->arch.old_guest_table = page;
-+ current->arch.old_guest_table_partial = true;
- }
- }
- if ( rc < 0 )
-@@ -1681,12 +1697,16 @@ static int alloc_l3_table(struct page_info *page)
- * builds.
- */
- if ( current->arch.old_guest_table == l3e_get_page(l3e) )
-+ {
-+ ASSERT(current->arch.old_guest_table_partial);
- page->partial_flags = PTF_partial_set;
-+ }
- else
- ASSERT_UNREACHABLE();
- }
- current->arch.old_guest_ptpg = NULL;
- current->arch.old_guest_table = page;
-+ current->arch.old_guest_table_partial = true;
- }
- while ( i-- > 0 )
- pl3e[i] = unadjust_guest_l3e(pl3e[i], d);
-@@ -1860,12 +1880,16 @@ static int alloc_l4_table(struct page_info *page)
- * builds.
- */
- if ( current->arch.old_guest_table == l4e_get_page(pl4e[i]) )
-+ {
-+ ASSERT(current->arch.old_guest_table_partial);
- page->partial_flags = PTF_partial_set;
-+ }
- else
- ASSERT_UNREACHABLE();
- }
- current->arch.old_guest_ptpg = NULL;
- current->arch.old_guest_table = page;
-+ current->arch.old_guest_table_partial = true;
- }
- }
- }
-@@ -2782,6 +2806,28 @@ static int _put_page_type(struct page_info *page, unsigned int flags,
- x = y;
- nx = x - 1;
-
-+ /*
-+ * Is this expected to do a full reference drop, or only
-+ * cleanup partial validation / devalidation?
-+ *
-+ * If the former, the caller must hold a "full" type ref;
-+ * which means the page must be validated. If the page is
-+ * *not* fully validated, continuing would almost certainly
-+ * open up a security hole. An exception to this is during
-+ * domain destruction, where PGT_validated can be dropped
-+ * without dropping a type ref.
-+ *
-+ * If the latter, do nothing unless type PGT_partial is set.
-+ * If it is set, the type count must be 1.
-+ */
-+ if ( !(flags & PTF_partial_set) )
-+ BUG_ON((x & PGT_partial) ||
-+ !((x & PGT_validated) || page_get_owner(page)->is_dying));
-+ else if ( !(x & PGT_partial) )
-+ return 0;
-+ else
-+ BUG_ON((x & PGT_count_mask) != 1);
-+
- ASSERT((x & PGT_count_mask) != 0);
-
- switch ( nx & (PGT_locked | PGT_count_mask) )
-@@ -3041,17 +3087,34 @@ int put_old_guest_table(struct vcpu *v)
- if ( !v->arch.old_guest_table )
- return 0;
-
-- switch ( rc = _put_page_type(v->arch.old_guest_table, PTF_preemptible,
-- v->arch.old_guest_ptpg) )
-+ rc = _put_page_type(v->arch.old_guest_table,
-+ PTF_preemptible |
-+ ( v->arch.old_guest_table_partial ?
-+ PTF_partial_set : 0 ),
-+ v->arch.old_guest_ptpg);
-+
-+ if ( rc == -ERESTART || rc == -EINTR )
- {
-- case -EINTR:
-- case -ERESTART:
-+ v->arch.old_guest_table_partial = (rc == -ERESTART);
- return -ERESTART;
-- case 0:
-- put_page(v->arch.old_guest_table);
- }
-
-+ /*
-+ * It shouldn't be possible for _put_page_type() to return
-+ * anything else at the moment; but if it does happen in
-+ * production, leaking the type ref is probably the best thing to
-+ * do. Either way, drop the general ref held by old_guest_table.
-+ */
-+ ASSERT(rc == 0);
-+
-+ put_page(v->arch.old_guest_table);
- v->arch.old_guest_table = NULL;
-+ v->arch.old_guest_ptpg = NULL;
-+ /*
-+ * Safest default if someone sets old_guest_table without
-+ * explicitly setting old_guest_table_partial.
-+ */
-+ v->arch.old_guest_table_partial = true;
-
- return rc;
- }
-@@ -3201,11 +3264,11 @@ int new_guest_cr3(mfn_t mfn)
- switch ( rc = put_page_and_type_preemptible(page) )
- {
- case -EINTR:
-- rc = -ERESTART;
-- /* fallthrough */
- case -ERESTART:
- curr->arch.old_guest_ptpg = NULL;
- curr->arch.old_guest_table = page;
-+ curr->arch.old_guest_table_partial = (rc == -ERESTART);
-+ rc = -ERESTART;
- break;
- default:
- BUG_ON(rc);
-@@ -3479,6 +3542,7 @@ long do_mmuext_op(
- {
- curr->arch.old_guest_ptpg = NULL;
- curr->arch.old_guest_table = page;
-+ curr->arch.old_guest_table_partial = false;
- }
- }
- }
-@@ -3513,6 +3577,11 @@ long do_mmuext_op(
- case -ERESTART:
- curr->arch.old_guest_ptpg = NULL;
- curr->arch.old_guest_table = page;
-+ /*
-+ * EINTR means we still hold the type ref; ERESTART
-+ * means PGT_partial holds the type ref
-+ */
-+ curr->arch.old_guest_table_partial = (rc == -ERESTART);
- rc = 0;
- break;
- default:
-@@ -3581,11 +3650,15 @@ long do_mmuext_op(
- switch ( rc = put_page_and_type_preemptible(page) )
- {
- case -EINTR:
-- rc = -ERESTART;
-- /* fallthrough */
- case -ERESTART:
- curr->arch.old_guest_ptpg = NULL;
- curr->arch.old_guest_table = page;
-+ /*
-+ * EINTR means we still hold the type ref;
-+ * ERESTART means PGT_partial holds the ref
-+ */
-+ curr->arch.old_guest_table_partial = (rc == -ERESTART);
-+ rc = -ERESTART;
- break;
- default:
- BUG_ON(rc);
-diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
-index 1ac5a96c08..360c38bd83 100644
---- a/xen/include/asm-x86/domain.h
-+++ b/xen/include/asm-x86/domain.h
-@@ -309,7 +309,7 @@ struct arch_domain
-
- struct paging_domain paging;
- struct p2m_domain *p2m;
-- /* To enforce lock ordering in the pod code wrt the
-+ /* To enforce lock ordering in the pod code wrt the
- * page_alloc lock */
- int page_alloc_unlock_level;
-
-@@ -542,6 +542,8 @@ struct arch_vcpu
- struct page_info *old_guest_table; /* partially destructed pagetable */
- struct page_info *old_guest_ptpg; /* containing page table of the */
- /* former, if any */
-+ bool old_guest_table_partial; /* Are we dropping a type ref, or just
-+ * finishing up a partial de-validation? */
- /* guest_table holds a ref to the page, and also a type-count unless
- * shadow refcounts are in use */
- pagetable_t shadow_table[4]; /* (MFN) shadow(s) of guest */
---
-2.23.0
-
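The two put_page_type() intents distinguished above can be modelled in a few lines of self-contained C; struct page and put_type() are simplifications, not the Xen definitions:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct page {
    unsigned int type_count;
    bool partial;     /* models PGT_partial */
    bool validated;   /* models PGT_validated */
};

/* cleanup_partial mirrors passing PTF_partial_set: the caller is only
 * finishing an interrupted de/validation, not dropping a held ref. */
static int put_type(struct page *pg, bool cleanup_partial)
{
    if ( !cleanup_partial )
        /* A held ref implies a (fully) validated, non-partial page. */
        assert(!pg->partial && pg->validated);
    else if ( !pg->partial )
        return 0;                     /* someone finished it already */
    else
        assert(pg->type_count == 1);  /* only the partial ref remains */

    return --pg->type_count;
}

int main(void)
{
    struct page pg = { .type_count = 1, .partial = true };

    printf("count after cleanup: %d\n", put_type(&pg, true));
    return 0;
}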
diff --git a/main/xen/xsa301-4.11-1.patch b/main/xen/xsa301-4.11-1.patch
deleted file mode 100644
index 4d528fe13b..0000000000
--- a/main/xen/xsa301-4.11-1.patch
+++ /dev/null
@@ -1,80 +0,0 @@
-From 21dfe8f707febd62869d4ebbaa155736870bebec Mon Sep 17 00:00:00 2001
-From: Julien Grall <julien.grall@arm.com>
-Date: Wed, 2 Oct 2019 12:06:50 +0100
-Subject: [PATCH 1/3] xen/arm: p2m: Avoid aliasing guest physical frame
-
-The P2M helpers implementation is quite lax and will end up ignoring
-the unused top bits of a guest physical frame.
-
-This effectively means that p2m_set_entry() will create a mapping for a
-different frame (it is always equal to gfn & (mask unused bits)). Yet
-p2m->max_mapped_gfn will be updated using the original frame.
-
-At the moment, p2m_get_entry() and p2m_resolve_translation_fault()
-assume that p2m_get_root_pointer() will always return a non-NULL pointer
-when the GFN is smaller than p2m->max_mapped_gfn.
-
-Unfortunately, because of the aliasing described above, it would be
-possible to set p2m->max_mapped_gfn high enough so it covers frames that
-would lead p2m_get_root_pointer() to return NULL.
-
-As we don't sanity check the guest physical frame provided by a guest, a
-malicious guest could craft a series of hypercalls that will hit the
-BUG_ON() and therefore DoS Xen.
-
-To prevent aliasing, the function p2m_get_root_pointer() is now reworked
-to return NULL if any of the unused top bits are not zero. The caller
-can then decide what the appropriate action is. Since the two paths
-(i.e. P2M_ROOT_PAGES == 1 and P2M_ROOT_PAGES != 1) are now very
-similar, take the opportunity to consolidate them, making the code a
-bit simpler.
-
-With this change, p2m_get_entry() will not try to insert a mapping as
-the root pointer is invalid.
-
-Note that root_table is now switched to unsigned long as unsigned int is
-not enough to hold part of a GFN.
-
-This is part of XSA-301.
-
-Reported-by: Julien Grall <Julien.Grall@arm.com>
-Signed-off-by: Julien Grall <julien.grall@arm.com>
-Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
----
- xen/arch/arm/p2m.c | 17 +++++------------
- 1 file changed, 5 insertions(+), 12 deletions(-)
-
-diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
-index d43c3aa896..3967ee7306 100644
---- a/xen/arch/arm/p2m.c
-+++ b/xen/arch/arm/p2m.c
-@@ -177,21 +177,14 @@ void p2m_tlb_flush_sync(struct p2m_domain *p2m)
- static lpae_t *p2m_get_root_pointer(struct p2m_domain *p2m,
- gfn_t gfn)
- {
-- unsigned int root_table;
--
-- if ( P2M_ROOT_PAGES == 1 )
-- return __map_domain_page(p2m->root);
-+ unsigned long root_table;
-
- /*
-- * Concatenated root-level tables. The table number will be the
-- * offset at the previous level. It is not possible to
-- * concatenate a level-0 root.
-+ * While the root table index is the offset from the previous level,
-+ * we can't use (P2M_ROOT_LEVEL - 1) because the root level might be
-+ * 0. Yet we still want to check if all the unused bits are zeroed.
- */
-- ASSERT(P2M_ROOT_LEVEL > 0);
--
-- root_table = gfn_x(gfn) >> (level_orders[P2M_ROOT_LEVEL - 1]);
-- root_table &= LPAE_ENTRY_MASK;
--
-+ root_table = gfn_x(gfn) >> (level_orders[P2M_ROOT_LEVEL] + LPAE_SHIFT);
- if ( root_table >= P2M_ROOT_PAGES )
- return NULL;
-
---
-2.11.0
-
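A compilable sketch of the aliasing check described above; the shift constants are assumptions for illustration, not the real LPAE values:

#include <stdio.h>

#define LPAE_SHIFT        9
#define ROOT_LEVEL_ORDER  18   /* assumption: GFN bits resolved below root */
#define P2M_ROOT_PAGES    2

/* Pre-fix code masked the index (idx &= LPAE_ENTRY_MASK), silently
 * aliasing high GFNs onto low root slots; the fix keeps every unused
 * top bit in the comparison so out-of-range GFNs fail the lookup. */
static long root_index(unsigned long gfn)
{
    unsigned long idx = gfn >> (ROOT_LEVEL_ORDER + LPAE_SHIFT);

    return idx >= P2M_ROOT_PAGES ? -1 : (long)idx;
}

int main(void)
{
    printf("%ld\n", root_index(0x1234UL));    /* in range -> slot 0 */
    printf("%ld\n", root_index(1UL << 40));   /* aliasing GFN -> -1 */
    return 0;
}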
diff --git a/main/xen/xsa301-4.11-2.patch b/main/xen/xsa301-4.11-2.patch
deleted file mode 100644
index 33b6150370..0000000000
--- a/main/xen/xsa301-4.11-2.patch
+++ /dev/null
@@ -1,92 +0,0 @@
-From 4426d993b7ee0966fb39531dc5a269ce8493ca97 Mon Sep 17 00:00:00 2001
-From: Julien Grall <julien.grall@arm.com>
-Date: Wed, 2 Oct 2019 12:35:59 +0100
-Subject: [PATCH 2/3] xen/arm: p2m: Avoid off-by-one check on
- p2m->max_mapped_gfn
-
-The code base uses the field p2m->max_mapped_gfn inconsistently.
-Some of the users expect that p2m->max_mapped_gfn contains the highest
-mapped GFN while others expect highest + 1.
-
-p2m->max_mapped_gfn is set as highest + 1; because of that, the sanity
-check on the GFN in p2m_resolve_translation_fault() and
-p2m_get_entry() can be bypassed when GFN == p2m->max_mapped_gfn.
-
-p2m_get_root_pointer(p2m->max_mapped_gfn) may return NULL if it is
-outside of the supported address range and therefore the BUG_ON() could
-be hit.
-
-The current value held in p2m->max_mapped_gfn is inconsistent with the
-expectation of the common code (see domain_get_maximum_gpfn()) and also
-the documentation of the field.
-
-Rather than changing the check in p2m_resolve_translation_fault() and
-p2m_get_entry(), p2m->max_mapped_gfn now contains the highest
-mapped GFN, and the callers assuming "highest + 1" are now adjusted.
-
-Take the opportunity to use 1UL rather than 1 as page_order could
-theoretically be big enough to overflow a 32-bit integer.
-
-Lastly, the documentation of the field max_mapped_gfn is updated to
-reflect how it is computed.
-
-This is part of XSA-301.
-
-Reported-by: Julien Grall <Julien.Grall@arm.com>
-Signed-off-by: Julien Grall <julien.grall@arm.com>
-Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
----
- xen/arch/arm/p2m.c | 6 +++---
- xen/include/asm-arm/p2m.h | 5 +----
- 2 files changed, 4 insertions(+), 7 deletions(-)
-
-diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
-index 3967ee7306..c7e049901d 100644
---- a/xen/arch/arm/p2m.c
-+++ b/xen/arch/arm/p2m.c
-@@ -931,7 +931,7 @@ static int __p2m_set_entry(struct p2m_domain *p2m,
- p2m_write_pte(entry, pte, p2m->clean_pte);
-
- p2m->max_mapped_gfn = gfn_max(p2m->max_mapped_gfn,
-- gfn_add(sgfn, 1 << page_order));
-+ gfn_add(sgfn, (1UL << page_order) - 1));
- p2m->lowest_mapped_gfn = gfn_min(p2m->lowest_mapped_gfn, sgfn);
- }
-
-@@ -1291,7 +1291,7 @@ int relinquish_p2m_mapping(struct domain *d)
- p2m_write_lock(p2m);
-
- start = p2m->lowest_mapped_gfn;
-- end = p2m->max_mapped_gfn;
-+ end = gfn_add(p2m->max_mapped_gfn, 1);
-
- for ( ; gfn_x(start) < gfn_x(end);
- start = gfn_next_boundary(start, order) )
-@@ -1356,7 +1356,7 @@ int p2m_cache_flush(struct domain *d, gfn_t start, unsigned long nr)
- p2m_read_lock(p2m);
-
- start = gfn_max(start, p2m->lowest_mapped_gfn);
-- end = gfn_min(end, p2m->max_mapped_gfn);
-+ end = gfn_min(end, gfn_add(p2m->max_mapped_gfn, 1));
-
- for ( ; gfn_x(start) < gfn_x(end); start = next_gfn )
- {
-diff --git a/xen/include/asm-arm/p2m.h b/xen/include/asm-arm/p2m.h
-index 8823707c17..7f1f7e9109 100644
---- a/xen/include/asm-arm/p2m.h
-+++ b/xen/include/asm-arm/p2m.h
-@@ -38,10 +38,7 @@ struct p2m_domain {
- /* Current Translation Table Base Register for the p2m */
- uint64_t vttbr;
-
-- /*
-- * Highest guest frame that's ever been mapped in the p2m
-- * Only takes into account ram and foreign mapping
-- */
-+ /* Highest guest frame that's ever been mapped in the p2m */
- gfn_t max_mapped_gfn;
-
- /*
---
-2.11.0
-
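The inclusive-bound convention adopted above, as a self-contained sketch (invented helper names, not the Xen API):

#include <stdio.h>

static unsigned long max_mapped_gfn;   /* highest mapped GFN, inclusive */

/* Mapping 2^order frames at sgfn: the last frame mapped is
 * sgfn + (1UL << order) - 1, so store that (not one past it). */
static void record_mapping(unsigned long sgfn, unsigned int order)
{
    unsigned long last = sgfn + (1UL << order) - 1;

    if ( last > max_mapped_gfn )
        max_mapped_gfn = last;
}

int main(void)
{
    record_mapping(0x1000, 9);   /* a 2MB superpage worth of frames */
    /* Walkers wanting an exclusive end add the 1 back explicitly. */
    printf("inclusive max %#lx, exclusive end %#lx\n",
           max_mapped_gfn, max_mapped_gfn + 1);
    return 0;
}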
diff --git a/main/xen/xsa301-4.11-3.patch b/main/xen/xsa301-4.11-3.patch
deleted file mode 100644
index 55a701a5c7..0000000000
--- a/main/xen/xsa301-4.11-3.patch
+++ /dev/null
@@ -1,49 +0,0 @@
-From 61c73af08b4ede1fc8cfd2cf72661e6c7cfdbeaa Mon Sep 17 00:00:00 2001
-From: Julien Grall <julien.grall@arm.com>
-Date: Wed, 2 Oct 2019 10:55:07 +0100
-Subject: [PATCH 3/3] xen/arm: p2m: Don't check the return of
- p2m_get_root_pointer() with BUG_ON()
-
-It turns out that the BUG_ON() was actually reachable with well-crafted
-hypercalls. The BUG_ON() is here to catch logical errors, so
-crashing Xen is a bit over the top.
-
-While all the holes should now be fixed, it would be better to downgrade
-the BUG_ON() to something less fatal to prevent any more DoS.
-
-The BUG_ON() in p2m_get_entry() is now replaced by ASSERT_UNREACHABLE()
-to catch mistakes in debug builds and return INVALID_MFN for production
-builds. The interface also requires page_order to be set to give an
-idea of the size of the "hole". So 'level' is now set so we report a
-hole the size of an entry of the root page-table. This stays in line
-with what happens when the GFN is higher than p2m->max_mapped_gfn.
-
-This is part of XSA-301.
-
-Reported-by: Julien Grall <Julien.Grall@arm.com>
-Signed-off-by: Julien Grall <julien.grall@arm.com>
----
- xen/arch/arm/p2m.c | 7 ++++++-
- 1 file changed, 6 insertions(+), 1 deletion(-)
-
-diff --git a/xen/arch/arm/p2m.c b/xen/arch/arm/p2m.c
-index c7e049901d..af3515df42 100644
---- a/xen/arch/arm/p2m.c
-+++ b/xen/arch/arm/p2m.c
-@@ -318,7 +318,12 @@ mfn_t p2m_get_entry(struct p2m_domain *p2m, gfn_t gfn,
- * the table should always be non-NULL because the gfn is below
- * p2m->max_mapped_gfn and the root table pages are always present.
- */
-- BUG_ON(table == NULL);
-+ if ( !table )
-+ {
-+ ASSERT_UNREACHABLE();
-+ level = P2M_ROOT_LEVEL;
-+ goto out;
-+ }
-
- for ( level = P2M_ROOT_LEVEL; level < 3; level++ )
- {
---
-2.11.0
-
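The downgrade pattern above, sketched in standalone C; the ASSERT_UNREACHABLE() macro here is a plain-assert approximation of Xen's:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

#ifdef NDEBUG
#define ASSERT_UNREACHABLE() ((void)0)
#else
#define ASSERT_UNREACHABLE() assert(!"unreachable")
#endif

/* A "can't happen" condition is asserted in debug builds, but handled
 * gracefully in release builds instead of crashing the host. */
static int lookup(const int *table, int *out)
{
    if ( table == NULL )
    {
        ASSERT_UNREACHABLE();
        return -1;             /* report a hole; don't BUG() */
    }
    *out = *table;
    return 0;
}

int main(void)
{
    int v, t = 42;

    printf("%d\n", lookup(&t, &v));   /* 0: normal path */
    return 0;
}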
diff --git a/main/xen/xsa302-0001-IOMMU-add-missing-HVM-check.patch b/main/xen/xsa302-0001-IOMMU-add-missing-HVM-check.patch
deleted file mode 100644
index c3d44351a0..0000000000
--- a/main/xen/xsa302-0001-IOMMU-add-missing-HVM-check.patch
+++ /dev/null
@@ -1,37 +0,0 @@
-From bbca29f88d9ad9c7e91125a3b5d5f13a23e5801f Mon Sep 17 00:00:00 2001
-From: Jan Beulich <jbeulich@suse.com>
-Date: Wed, 2 Oct 2019 13:36:59 +0200
-Subject: [PATCH 1/2] IOMMU: add missing HVM check
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-Fix an unguarded d->arch.hvm access in assign_device().
-
-Signed-off-by: Jan Beulich <jbeulich@suse.com>
-Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
-Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>
-
-(cherry picked from commit 41fd1009cd7416b73d745a77c24b4e8d1a296fe6)
-Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com>
----
- xen/drivers/passthrough/pci.c | 3 ++-
- 1 file changed, 2 insertions(+), 1 deletion(-)
-
-diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
-index f51cae7f4e..037aba7c94 100644
---- a/xen/drivers/passthrough/pci.c
-+++ b/xen/drivers/passthrough/pci.c
-@@ -1416,7 +1416,8 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
- /* Prevent device assign if mem paging or mem sharing have been
- * enabled for this domain */
- if ( unlikely(!need_iommu(d) &&
-- (d->arch.hvm_domain.mem_sharing_enabled ||
-+ ((is_hvm_domain(d) &&
-+ d->arch.hvm_domain.mem_sharing_enabled) ||
- vm_event_check_ring(d->vm_event_paging) ||
- p2m_get_hostp2m(d)->global_logdirty)) )
- return -EXDEV;
---
-2.11.0
-
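The guard added above follows a general rule for per-type unions; here is a hypothetical, self-contained sketch (not the real struct domain):

#include <stdbool.h>
#include <stdio.h>

/* Fields living in a per-type union are only meaningful for the
 * matching domain type, so every access must be gated on a predicate. */
struct domain {
    bool is_hvm;
    union {
        struct { bool mem_sharing_enabled; } hvm;
        struct { bool some_pv_state; } pv;
    } arch;
};

static bool mem_sharing_enabled(const struct domain *d)
{
    return d->is_hvm && d->arch.hvm.mem_sharing_enabled;
}

int main(void)
{
    struct domain pv = { .is_hvm = false };

    /* Without the guard this would read a PV field as HVM state. */
    printf("%d\n", mem_sharing_enabled(&pv));
    return 0;
}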
diff --git a/main/xen/xsa302-0002-passthrough-quarantine-PCI-devices.patch b/main/xen/xsa302-0002-passthrough-quarantine-PCI-devices.patch
deleted file mode 100644
index 5204c9f756..0000000000
--- a/main/xen/xsa302-0002-passthrough-quarantine-PCI-devices.patch
+++ /dev/null
@@ -1,498 +0,0 @@
-From ec99857f59f7f06236f11ca8b0b2303e5e745cc4 Mon Sep 17 00:00:00 2001
-From: Paul Durrant <paul.durrant@citrix.com>
-Date: Mon, 14 Oct 2019 17:52:59 +0100
-Subject: [PATCH 2/2] passthrough: quarantine PCI devices
-
-When a PCI device is assigned to an untrusted domain, it is possible for
-that domain to program the device to DMA to an arbitrary address. The
-IOMMU is used to protect the host from malicious DMA by making sure that
-the device addresses can only target memory assigned to the guest. However,
-when the guest domain is torn down the device is assigned back to dom0,
-thus allowing any in-flight DMA to potentially target critical host data.
-
-This patch introduces a 'quarantine' for PCI devices using dom_io. When
-the toolstack makes a device assignable (by binding it to pciback), it
-will now also assign it to DOMID_IO and the device will only be assigned
-back to dom0 when the device is made unassignable again. Whilst the device
-is assignable it will only ever transfer between dom_io and guest domains.
-dom_io is actually only used as a sentinel domain for quarantining purposes;
-it is not configured with any IOMMU mappings. Assignment to dom_io simply
-means that the device's initiator (requestor) identifier is not present in
-the IOMMU's device table and thus any DMA transactions issued will be
-terminated with a fault condition.
-
-In addition, a fix to assignment handling is made for VT-d. Failure
-during the assignment step should not lead to a device still being
-associated with its prior owner. Hand the device to DomIO temporarily,
-until the assignment step has completed successfully. The PI hooks are
-also removed from the source domain earlier.
-
-Failure of the recovery reassign_device_ownership() must not go unnoticed:
-there may, for example, still be leftover RMRR mappings in the domain
-whose assignment failed, and hence we can't allow that domain to continue
-executing.
-
-NOTE: This patch also includes one printk() cleanup; the
- "XEN_DOMCTL_assign_device: " tag is dropped in iommu_do_pci_domctl(),
- since similar printk()-s elsewhere also don't log such a tag.
-
-This is XSA-302.
-
-Signed-off-by: Paul Durrant <paul.durrant@citrix.com>
-Signed-off-by: Jan Beulich <jbeulich@suse.com>
-Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com>
----
- tools/libxl/libxl_pci.c | 25 +++++++++++-
- xen/arch/x86/mm.c | 2 +
- xen/common/domctl.c | 14 ++++++-
- xen/drivers/passthrough/amd/pci_amd_iommu.c | 10 ++++-
- xen/drivers/passthrough/iommu.c | 9 +++++
- xen/drivers/passthrough/pci.c | 59 ++++++++++++++++++++++-------
- xen/drivers/passthrough/vtd/iommu.c | 40 ++++++++++++++++---
- xen/include/xen/pci.h | 3 ++
- 8 files changed, 138 insertions(+), 24 deletions(-)
-
-diff --git a/tools/libxl/libxl_pci.c b/tools/libxl/libxl_pci.c
-index 4755a0c93c..81890a91ac 100644
---- a/tools/libxl/libxl_pci.c
-+++ b/tools/libxl/libxl_pci.c
-@@ -754,6 +754,7 @@ static int libxl__device_pci_assignable_add(libxl__gc *gc,
- libxl_device_pci *pcidev,
- int rebind)
- {
-+ libxl_ctx *ctx = libxl__gc_owner(gc);
- unsigned dom, bus, dev, func;
- char *spath, *driver_path = NULL;
- int rc;
-@@ -779,7 +780,7 @@ static int libxl__device_pci_assignable_add(libxl__gc *gc,
- }
- if ( rc ) {
- LOG(WARN, PCI_BDF" already assigned to pciback", dom, bus, dev, func);
-- return 0;
-+ goto quarantine;
- }
-
- /* Check to see if there's already a driver that we need to unbind from */
-@@ -810,6 +811,19 @@ static int libxl__device_pci_assignable_add(libxl__gc *gc,
- return ERROR_FAIL;
- }
-
-+quarantine:
-+ /*
-+ * DOMID_IO is just a sentinel domain, without any actual mappings,
-+ * so always pass XEN_DOMCTL_DEV_RDM_RELAXED to avoid assignment being
-+ * unnecessarily denied.
-+ */
-+ rc = xc_assign_device(ctx->xch, DOMID_IO, pcidev_encode_bdf(pcidev),
-+ XEN_DOMCTL_DEV_RDM_RELAXED);
-+ if ( rc < 0 ) {
-+ LOG(ERROR, "failed to quarantine "PCI_BDF, dom, bus, dev, func);
-+ return ERROR_FAIL;
-+ }
-+
- return 0;
- }
-
-@@ -817,9 +831,18 @@ static int libxl__device_pci_assignable_remove(libxl__gc *gc,
- libxl_device_pci *pcidev,
- int rebind)
- {
-+ libxl_ctx *ctx = libxl__gc_owner(gc);
- int rc;
- char *driver_path;
-
-+ /* De-quarantine */
-+ rc = xc_deassign_device(ctx->xch, DOMID_IO, pcidev_encode_bdf(pcidev));
-+ if ( rc < 0 ) {
-+ LOG(ERROR, "failed to de-quarantine "PCI_BDF, pcidev->domain, pcidev->bus,
-+ pcidev->dev, pcidev->func);
-+ return ERROR_FAIL;
-+ }
-+
- /* Unbind from pciback */
- if ( (rc=pciback_dev_is_assigned(gc, pcidev)) < 0 ) {
- return ERROR_FAIL;
-diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
-index e6a4cb28f8..c1ab57f9a5 100644
---- a/xen/arch/x86/mm.c
-+++ b/xen/arch/x86/mm.c
-@@ -295,9 +295,11 @@ void __init arch_init_memory(void)
- * Initialise our DOMID_IO domain.
- * This domain owns I/O pages that are within the range of the page_info
- * array. Mappings occur at the priv of the caller.
-+ * Quarantined PCI devices will be associated with this domain.
- */
- dom_io = domain_create(DOMID_IO, NULL);
- BUG_ON(IS_ERR(dom_io));
-+ INIT_LIST_HEAD(&dom_io->arch.pdev_list);
-
- /*
- * Initialise our COW domain.
-diff --git a/xen/common/domctl.c b/xen/common/domctl.c
-index 9b7bc083ee..741d774cd1 100644
---- a/xen/common/domctl.c
-+++ b/xen/common/domctl.c
-@@ -392,6 +392,16 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
-
- switch ( op->cmd )
- {
-+ case XEN_DOMCTL_assign_device:
-+ case XEN_DOMCTL_deassign_device:
-+ if ( op->domain == DOMID_IO )
-+ {
-+ d = dom_io;
-+ break;
-+ }
-+ else if ( op->domain == DOMID_INVALID )
-+ return -ESRCH;
-+ /* fall through */
- case XEN_DOMCTL_test_assign_device:
- if ( op->domain == DOMID_INVALID )
- {
-@@ -413,7 +423,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
-
- if ( !domctl_lock_acquire() )
- {
-- if ( d )
-+ if ( d && d != dom_io )
- rcu_unlock_domain(d);
- return hypercall_create_continuation(
- __HYPERVISOR_domctl, "h", u_domctl);
-@@ -1148,7 +1158,7 @@ long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
- domctl_lock_release();
-
- domctl_out_unlock_domonly:
-- if ( d )
-+ if ( d && d != dom_io )
- rcu_unlock_domain(d);
-
- if ( copyback && __copy_to_guest(u_domctl, op, 1) )
-diff --git a/xen/drivers/passthrough/amd/pci_amd_iommu.c b/xen/drivers/passthrough/amd/pci_amd_iommu.c
-index 12d2695b89..ec8baae717 100644
---- a/xen/drivers/passthrough/amd/pci_amd_iommu.c
-+++ b/xen/drivers/passthrough/amd/pci_amd_iommu.c
-@@ -118,6 +118,10 @@ static void amd_iommu_setup_domain_device(
- u8 bus = pdev->bus;
- const struct domain_iommu *hd = dom_iommu(domain);
-
-+ /* dom_io is used as a sentinel for quarantined devices */
-+ if ( domain == dom_io )
-+ return;
-+
- BUG_ON( !hd->arch.root_table || !hd->arch.paging_mode ||
- !iommu->dev_table.buffer );
-
-@@ -305,6 +309,10 @@ void amd_iommu_disable_domain_device(struct domain *domain,
- int req_id;
- u8 bus = pdev->bus;
-
-+ /* dom_io is used as a sentinel for quarantined devices */
-+ if ( domain == dom_io )
-+ return;
-+
- BUG_ON ( iommu->dev_table.buffer == NULL );
- req_id = get_dma_requestor_id(iommu->seg, PCI_BDF2(bus, devfn));
- dte = iommu->dev_table.buffer + (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE);
-@@ -391,7 +399,7 @@ static int amd_iommu_assign_device(struct domain *d, u8 devfn,
- ivrs_mappings[req_id].read_permission);
- }
-
-- return reassign_device(hardware_domain, d, devfn, pdev);
-+ return reassign_device(pdev->domain, d, devfn, pdev);
- }
-
- static void deallocate_next_page_table(struct page_info *pg, int level)
-diff --git a/xen/drivers/passthrough/iommu.c b/xen/drivers/passthrough/iommu.c
-index 04b0be37d3..8027d96f1c 100644
---- a/xen/drivers/passthrough/iommu.c
-+++ b/xen/drivers/passthrough/iommu.c
-@@ -219,6 +219,9 @@ void iommu_teardown(struct domain *d)
- {
- const struct domain_iommu *hd = dom_iommu(d);
-
-+ if ( d == dom_io )
-+ return;
-+
- d->need_iommu = 0;
- hd->platform_ops->teardown(d);
- tasklet_schedule(&iommu_pt_cleanup_tasklet);
-@@ -229,6 +232,9 @@ int iommu_construct(struct domain *d)
- if ( need_iommu(d) > 0 )
- return 0;
-
-+ if ( d == dom_io )
-+ return 0;
-+
- if ( !iommu_use_hap_pt(d) )
- {
- int rc;
-@@ -404,6 +410,9 @@ int __init iommu_setup(void)
- printk("I/O virtualisation %sabled\n", iommu_enabled ? "en" : "dis");
- if ( iommu_enabled )
- {
-+ if ( iommu_domain_init(dom_io) )
-+ panic("Could not set up quarantine\n");
-+
- printk(" - Dom0 mode: %s\n",
- iommu_passthrough ? "Passthrough" :
- iommu_dom0_strict ? "Strict" : "Relaxed");
-diff --git a/xen/drivers/passthrough/pci.c b/xen/drivers/passthrough/pci.c
-index 037aba7c94..fb010a547b 100644
---- a/xen/drivers/passthrough/pci.c
-+++ b/xen/drivers/passthrough/pci.c
-@@ -1389,19 +1389,29 @@ static int iommu_remove_device(struct pci_dev *pdev)
- return hd->platform_ops->remove_device(pdev->devfn, pci_to_dev(pdev));
- }
-
--/*
-- * If the device isn't owned by the hardware domain, it means it already
-- * has been assigned to other domain, or it doesn't exist.
-- */
- static int device_assigned(u16 seg, u8 bus, u8 devfn)
- {
- struct pci_dev *pdev;
-+ int rc = 0;
-
- pcidevs_lock();
-- pdev = pci_get_pdev_by_domain(hardware_domain, seg, bus, devfn);
-+
-+ pdev = pci_get_pdev(seg, bus, devfn);
-+
-+ if ( !pdev )
-+ rc = -ENODEV;
-+ /*
-+ * If the device exists and it is not owned by either the hardware
-+ * domain or dom_io then it must be assigned to a guest, or be
-+ * hidden (owned by dom_xen).
-+ */
-+ else if ( pdev->domain != hardware_domain &&
-+ pdev->domain != dom_io )
-+ rc = -EBUSY;
-+
- pcidevs_unlock();
-
-- return pdev ? 0 : -EBUSY;
-+ return rc;
- }
-
- static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
-@@ -1415,7 +1425,8 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
-
- /* Prevent device assign if mem paging or mem sharing have been
- * enabled for this domain */
-- if ( unlikely(!need_iommu(d) &&
-+ if ( d != dom_io &&
-+ unlikely(!need_iommu(d) &&
- ((is_hvm_domain(d) &&
- d->arch.hvm_domain.mem_sharing_enabled) ||
- vm_event_check_ring(d->vm_event_paging) ||
-@@ -1432,12 +1443,20 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
- return rc;
- }
-
-- pdev = pci_get_pdev_by_domain(hardware_domain, seg, bus, devfn);
-+ pdev = pci_get_pdev(seg, bus, devfn);
-+
-+ rc = -ENODEV;
- if ( !pdev )
-- {
-- rc = pci_get_pdev(seg, bus, devfn) ? -EBUSY : -ENODEV;
- goto done;
-- }
-+
-+ rc = 0;
-+ if ( d == pdev->domain )
-+ goto done;
-+
-+ rc = -EBUSY;
-+ if ( pdev->domain != hardware_domain &&
-+ pdev->domain != dom_io )
-+ goto done;
-
- if ( pdev->msix )
- msixtbl_init(d);
-@@ -1460,6 +1479,10 @@ static int assign_device(struct domain *d, u16 seg, u8 bus, u8 devfn, u32 flag)
- }
-
- done:
-+ /* The device is assigned to dom_io so mark it as quarantined */
-+ if ( !rc && d == dom_io )
-+ pdev->quarantine = true;
-+
- if ( !has_arch_pdevs(d) && need_iommu(d) )
- iommu_teardown(d);
- pcidevs_unlock();
-@@ -1472,6 +1495,7 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
- {
- const struct domain_iommu *hd = dom_iommu(d);
- struct pci_dev *pdev = NULL;
-+ struct domain *target;
- int ret = 0;
-
- if ( !iommu_enabled || !hd->platform_ops )
-@@ -1482,12 +1506,16 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
- if ( !pdev )
- return -ENODEV;
-
-+ /* De-assignment from dom_io should de-quarantine the device */
-+ target = (pdev->quarantine && pdev->domain != dom_io) ?
-+ dom_io : hardware_domain;
-+
- while ( pdev->phantom_stride )
- {
- devfn += pdev->phantom_stride;
- if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) )
- break;
-- ret = hd->platform_ops->reassign_device(d, hardware_domain, devfn,
-+ ret = hd->platform_ops->reassign_device(d, target, devfn,
- pci_to_dev(pdev));
- if ( !ret )
- continue;
-@@ -1498,7 +1526,7 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
- }
-
- devfn = pdev->devfn;
-- ret = hd->platform_ops->reassign_device(d, hardware_domain, devfn,
-+ ret = hd->platform_ops->reassign_device(d, target, devfn,
- pci_to_dev(pdev));
- if ( ret )
- {
-@@ -1508,6 +1536,9 @@ int deassign_device(struct domain *d, u16 seg, u8 bus, u8 devfn)
- return ret;
- }
-
-+ if ( pdev->domain == hardware_domain )
-+ pdev->quarantine = false;
-+
- pdev->fault.count = 0;
-
- if ( !has_arch_pdevs(d) && need_iommu(d) )
-@@ -1686,7 +1717,7 @@ int iommu_do_pci_domctl(
- ret = hypercall_create_continuation(__HYPERVISOR_domctl,
- "h", u_domctl);
- else if ( ret )
-- printk(XENLOG_G_ERR "XEN_DOMCTL_assign_device: "
-+ printk(XENLOG_G_ERR
- "assign %04x:%02x:%02x.%u to dom%d failed (%d)\n",
- seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
- d->domain_id, ret);
-diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
-index 4c719d4ee7..19f7d13013 100644
---- a/xen/drivers/passthrough/vtd/iommu.c
-+++ b/xen/drivers/passthrough/vtd/iommu.c
-@@ -1338,6 +1338,10 @@ int domain_context_mapping_one(
- int agaw, rc, ret;
- bool_t flush_dev_iotlb;
-
-+ /* dom_io is used as a sentinel for quarantined devices */
-+ if ( domain == dom_io )
-+ return 0;
-+
- ASSERT(pcidevs_locked());
- spin_lock(&iommu->lock);
- maddr = bus_to_context_maddr(iommu, bus);
-@@ -1573,6 +1577,10 @@ int domain_context_unmap_one(
- int iommu_domid, rc, ret;
- bool_t flush_dev_iotlb;
-
-+ /* dom_io is used as a sentinel for quarantined devices */
-+ if ( domain == dom_io )
-+ return 0;
-+
- ASSERT(pcidevs_locked());
- spin_lock(&iommu->lock);
-
-@@ -1705,6 +1713,10 @@ static int domain_context_unmap(struct domain *domain, u8 devfn,
- goto out;
- }
-
-+ /* dom_io is used as a sentinel for quarantined devices */
-+ if ( domain == dom_io )
-+ goto out;
-+
- /*
- * if no other devices under the same iommu owned by this domain,
- * clear iommu in iommu_bitmap and clear domain_id in domid_bitmp
-@@ -2389,6 +2401,15 @@ static int reassign_device_ownership(
- if ( ret )
- return ret;
-
-+ if ( devfn == pdev->devfn )
-+ {
-+ list_move(&pdev->domain_list, &dom_io->arch.pdev_list);
-+ pdev->domain = dom_io;
-+ }
-+
-+ if ( !has_arch_pdevs(source) )
-+ vmx_pi_hooks_deassign(source);
-+
- if ( !has_arch_pdevs(target) )
- vmx_pi_hooks_assign(target);
-
-@@ -2407,15 +2428,13 @@ static int reassign_device_ownership(
- pdev->domain = target;
- }
-
-- if ( !has_arch_pdevs(source) )
-- vmx_pi_hooks_deassign(source);
--
- return ret;
- }
-
- static int intel_iommu_assign_device(
- struct domain *d, u8 devfn, struct pci_dev *pdev, u32 flag)
- {
-+ struct domain *s = pdev->domain;
- struct acpi_rmrr_unit *rmrr;
- int ret = 0, i;
- u16 bdf, seg;
-@@ -2458,8 +2477,8 @@ static int intel_iommu_assign_device(
- }
- }
-
-- ret = reassign_device_ownership(hardware_domain, d, devfn, pdev);
-- if ( ret )
-+ ret = reassign_device_ownership(s, d, devfn, pdev);
-+ if ( ret || d == dom_io )
- return ret;
-
- /* Setup rmrr identity mapping */
-@@ -2472,11 +2491,20 @@ static int intel_iommu_assign_device(
- ret = rmrr_identity_mapping(d, 1, rmrr, flag);
- if ( ret )
- {
-- reassign_device_ownership(d, hardware_domain, devfn, pdev);
-+ int rc;
-+
-+ rc = reassign_device_ownership(d, s, devfn, pdev);
- printk(XENLOG_G_ERR VTDPREFIX
- " cannot map reserved region (%"PRIx64",%"PRIx64"] for Dom%d (%d)\n",
- rmrr->base_address, rmrr->end_address,
- d->domain_id, ret);
-+ if ( rc )
-+ {
-+ printk(XENLOG_ERR VTDPREFIX
-+ " failed to reclaim %04x:%02x:%02x.%u from %pd (%d)\n",
-+ seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), d, rc);
-+ domain_crash(d);
-+ }
- break;
- }
- }
-diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
-index 4cfa774615..066364bdef 100644
---- a/xen/include/xen/pci.h
-+++ b/xen/include/xen/pci.h
-@@ -88,6 +88,9 @@ struct pci_dev {
-
- nodeid_t node; /* NUMA node */
-
-+ /* Device to be quarantined, don't automatically re-assign to dom0 */
-+ bool quarantine;
-+
- enum pdev_type {
- DEV_TYPE_PCI_UNKNOWN,
- DEV_TYPE_PCIe_ENDPOINT,
---
-2.11.0
-
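The decisive piece of the patch above is the target selection in
deassign_device(): a quarantined device bounces to dom_io first (parking any
in-flight DMA behind an empty IOMMU context) and only reaches dom0 once it is
deliberately de-quarantined. A standalone C model of just that decision, with
the enum and struct names invented:

    #include <stdbool.h>
    #include <stdio.h>

    enum owner { DOM0, DOM_IO, GUEST };

    struct pci_dev {
        enum owner domain;
        bool quarantine;
    };

    /* Mirrors: target = (pdev->quarantine && pdev->domain != dom_io)
     *                 ? dom_io : hardware_domain; */
    static enum owner deassign_target(const struct pci_dev *pdev)
    {
        if ( pdev->quarantine && pdev->domain != DOM_IO )
            return DOM_IO;      /* park in-flight DMA behind the IOMMU */
        return DOM0;            /* explicit de-quarantine: back to dom0 */
    }

    int main(void)
    {
        struct pci_dev from_guest = { .domain = GUEST,  .quarantine = true };
        struct pci_dev parked     = { .domain = DOM_IO, .quarantine = true };

        printf("guest teardown -> %s\n",
               deassign_target(&from_guest) == DOM_IO ? "dom_io" : "dom0");
        printf("de-quarantine  -> %s\n",
               deassign_target(&parked) == DOM_IO ? "dom_io" : "dom0");
        return 0;
    }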
diff --git a/main/xen/xsa303-0001-xen-arm32-entry-Split-__DEFINE_ENTRY_TRAP-in-two.patch b/main/xen/xsa303-0001-xen-arm32-entry-Split-__DEFINE_ENTRY_TRAP-in-two.patch
deleted file mode 100644
index afb1096c1d..0000000000
--- a/main/xen/xsa303-0001-xen-arm32-entry-Split-__DEFINE_ENTRY_TRAP-in-two.patch
+++ /dev/null
@@ -1,74 +0,0 @@
-From c8cb33fa64c9ccbfa2a494a9dad2e0a763c09176 Mon Sep 17 00:00:00 2001
-From: Julien Grall <julien.grall@arm.com>
-Date: Tue, 1 Oct 2019 13:07:53 +0100
-Subject: [PATCH 1/4] xen/arm32: entry: Split __DEFINE_ENTRY_TRAP in two
-
-The preprocessing macro __DEFINE_ENTRY_TRAP is used to generate trap
-entry functions. While the macro is fairly small today, follow-up patches
-will increase its size significantly.
-
-In general, assembly macros are more readable as they allow you to name
-parameters and avoid '\'. So the actual implementation of the trap is
-now switched to an assembly macro.
-
-This is part of XSA-303.
-
-Reported-by: Julien Grall <Julien.Grall@arm.com>
-Signed-off-by: Julien Grall <julien.grall@arm.com>
-Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
-Reviewed-by: Andre Przywara <andre.przywara@arm.com>
----
- xen/arch/arm/arm32/entry.S | 34 +++++++++++++++++++---------------
- 1 file changed, 19 insertions(+), 15 deletions(-)
-
-diff --git a/xen/arch/arm/arm32/entry.S b/xen/arch/arm/arm32/entry.S
-index 0b4cd19abd..4a762e04f1 100644
---- a/xen/arch/arm/arm32/entry.S
-+++ b/xen/arch/arm/arm32/entry.S
-@@ -126,24 +126,28 @@ abort_guest_exit_end:
- skip_check:
- mov pc, lr
-
--/*
-- * Macro to define trap entry. The iflags corresponds to the list of
-- * interrupts (Asynchronous Abort, IRQ, FIQ) to unmask.
-- */
-+ /*
-+ * Macro to define trap entry. The iflags corresponds to the list of
-+ * interrupts (Asynchronous Abort, IRQ, FIQ) to unmask.
-+ */
-+ .macro vector trap, iflags
-+ SAVE_ALL
-+ cpsie \iflags
-+ adr lr, return_from_trap
-+ mov r0, sp
-+ /*
-+ * Save the stack pointer in r11. It will be restored after the
-+ * trap has been handled (see return_from_trap).
-+ */
-+ mov r11, sp
-+ bic sp, #7 /* Align the stack pointer (noop on guest trap) */
-+ b do_trap_\trap
-+ .endm
-+
- #define __DEFINE_TRAP_ENTRY(trap, iflags) \
- ALIGN; \
- trap_##trap: \
-- SAVE_ALL; \
-- cpsie iflags; \
-- adr lr, return_from_trap; \
-- mov r0, sp; \
-- /* \
-- * Save the stack pointer in r11. It will be restored after the \
-- * trap has been handled (see return_from_trap). \
-- */ \
-- mov r11, sp; \
-- bic sp, #7; /* Align the stack pointer (noop on guest trap) */ \
-- b do_trap_##trap
-+ vector trap, iflags
-
- /* Trap handler which unmask IRQ/Abort, keep FIQ masked */
- #define DEFINE_TRAP_ENTRY(trap) __DEFINE_TRAP_ENTRY(trap, ai)
---
-2.11.0
-
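For contrast with the assembler .macro introduced above, this is roughly
what the retired C-preprocessor style looks like in plain C: token pasting
(##) stamps out one trap_<name> entry per invocation, with every line glued
together by '\' continuations and positional rather than named parameters.
The handler names and flag values here are invented:

    #include <stdio.h>

    /* One entry point generated per trap type, named trap_<type>. */
    #define DEFINE_TRAP_ENTRY(type, flags)                              \
        static void trap_##type(void)                                   \
        {                                                               \
            printf("trap %s, unmask %#x\n", #type, (unsigned)(flags));  \
        }

    DEFINE_TRAP_ENTRY(irq, 0x2)
    DEFINE_TRAP_ENTRY(data_abort, 0x4)

    int main(void)
    {
        trap_irq();
        trap_data_abort();
        return 0;
    }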
diff --git a/main/xen/xsa303-0002-xen-arm32-entry-Fold-the-macro-SAVE_ALL-in-the-macro.patch b/main/xen/xsa303-0002-xen-arm32-entry-Fold-the-macro-SAVE_ALL-in-the-macro.patch
deleted file mode 100644
index 35f9c0475e..0000000000
--- a/main/xen/xsa303-0002-xen-arm32-entry-Fold-the-macro-SAVE_ALL-in-the-macro.patch
+++ /dev/null
@@ -1,97 +0,0 @@
-From be7379207c83fa74f8a6c22a8ea213f02714776f Mon Sep 17 00:00:00 2001
-From: Julien Grall <julien.grall@arm.com>
-Date: Tue, 1 Oct 2019 13:15:48 +0100
-Subject: [PATCH 2/4] xen/arm32: entry: Fold the macro SAVE_ALL in the macro
- vector
-
-Follow-up rework will require the macro vector to distinguish between
-a trap taken from the guest and one taken while in the hypervisor.
-
-The macro SAVE_ALL already has code to distinguish between the two and
-it is only called by the vector macro. So fold the former into the
-latter. This will help to avoid duplicating the check.
-
-This is part of XSA-303.
-
-Reported-by: Julien Grall <Julien.Grall@arm.com>
-Signed-off-by: Julien Grall <julien.grall@arm.com>
-Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
-Reviewed-by: Andre Przywara <andre.przywara@arm.com>
----
- xen/arch/arm/arm32/entry.S | 46 +++++++++++++++++++++++-----------------------
- 1 file changed, 23 insertions(+), 23 deletions(-)
-
-diff --git a/xen/arch/arm/arm32/entry.S b/xen/arch/arm/arm32/entry.S
-index 4a762e04f1..150cbc0b4b 100644
---- a/xen/arch/arm/arm32/entry.S
-+++ b/xen/arch/arm/arm32/entry.S
-@@ -13,27 +13,6 @@
- #define RESTORE_BANKED(mode) \
- RESTORE_ONE_BANKED(SP_##mode) ; RESTORE_ONE_BANKED(LR_##mode) ; RESTORE_ONE_BANKED(SPSR_##mode)
-
--#define SAVE_ALL \
-- sub sp, #(UREGS_SP_usr - UREGS_sp); /* SP, LR, SPSR, PC */ \
-- push {r0-r12}; /* Save R0-R12 */ \
-- \
-- mrs r11, ELR_hyp; /* ELR_hyp is return address. */\
-- str r11, [sp, #UREGS_pc]; \
-- \
-- str lr, [sp, #UREGS_lr]; \
-- \
-- add r11, sp, #UREGS_kernel_sizeof+4; \
-- str r11, [sp, #UREGS_sp]; \
-- \
-- mrc CP32(r11, HSR); /* Save exception syndrome */ \
-- str r11, [sp, #UREGS_hsr]; \
-- \
-- mrs r11, SPSR_hyp; \
-- str r11, [sp, #UREGS_cpsr]; \
-- and r11, #PSR_MODE_MASK; \
-- cmp r11, #PSR_MODE_HYP; \
-- blne save_guest_regs
--
- save_guest_regs:
- #ifdef CONFIG_ARM32_HARDEN_BRANCH_PREDICTOR
- /*
-@@ -52,7 +31,7 @@ save_guest_regs:
- ldr r11, =0xffffffff /* Clobber SP which is only valid for hypervisor frames. */
- str r11, [sp, #UREGS_sp]
- SAVE_ONE_BANKED(SP_usr)
-- /* LR_usr is the same physical register as lr and is saved in SAVE_ALL */
-+ /* LR_usr is the same physical register as lr and is saved by the caller */
- SAVE_BANKED(svc)
- SAVE_BANKED(abt)
- SAVE_BANKED(und)
-@@ -131,7 +110,28 @@ skip_check:
- * interrupts (Asynchronous Abort, IRQ, FIQ) to unmask.
- */
- .macro vector trap, iflags
-- SAVE_ALL
-+ /* Save registers in the stack */
-+ sub sp, #(UREGS_SP_usr - UREGS_sp) /* SP, LR, SPSR, PC */
-+ push {r0-r12} /* Save R0-R12 */
-+ mrs r11, ELR_hyp /* ELR_hyp is return address */
-+ str r11, [sp, #UREGS_pc]
-+
-+ str lr, [sp, #UREGS_lr]
-+
-+ add r11, sp, #(UREGS_kernel_sizeof + 4)
-+
-+ str r11, [sp, #UREGS_sp]
-+
-+ mrc CP32(r11, HSR) /* Save exception syndrome */
-+ str r11, [sp, #UREGS_hsr]
-+
-+ mrs r11, SPSR_hyp
-+ str r11, [sp, #UREGS_cpsr]
-+ and r11, #PSR_MODE_MASK
-+ cmp r11, #PSR_MODE_HYP
-+ blne save_guest_regs
-+
-+ /* We are ready to handle the trap, setup the registers and jump. */
- cpsie \iflags
- adr lr, return_from_trap
- mov r0, sp
---
-2.11.0
-
diff --git a/main/xen/xsa303-0003-xen-arm32-Don-t-blindly-unmask-interrupts-on-trap-wi.patch b/main/xen/xsa303-0003-xen-arm32-Don-t-blindly-unmask-interrupts-on-trap-wi.patch
deleted file mode 100644
index 5168452148..0000000000
--- a/main/xen/xsa303-0003-xen-arm32-Don-t-blindly-unmask-interrupts-on-trap-wi.patch
+++ /dev/null
@@ -1,226 +0,0 @@
-From 098fe877967870ffda2dfd9629a5fd272f6aacdc Mon Sep 17 00:00:00 2001
-From: Julien Grall <julien.grall@arm.com>
-Date: Fri, 11 Oct 2019 17:49:28 +0100
-Subject: [PATCH 3/4] xen/arm32: Don't blindly unmask interrupts on trap
- without a change of level
-
-Exception vectors will unmask interrupts regardless of their state in
-the interrupted context.
-
-One of the consequences is that IRQ will be unmasked when receiving an
-undefined instruction exception (used by WARN*) from the hypervisor.
-This could result in unexpected behavior such as deadlock (if a lock was
-shared with interrupts).
-
-In a nutshell, interrupts should only be unmasked when it is safe to do so.
-Xen only unmasks IRQ and Abort interrupts, so the logic can stay simple.
-
-As vectors exceptions may be shared between guest and hypervisor, we now
-need to have a different policy for the interrupts.
-
-On exception from hypervisor, each vector will select the list of
-interrupts to inherit from the interrupted context. Any interrupts not
-listed will be kept masked.
-
-On exception from the guest, the Abort and IRQ will be unmasked
-depending on the exact vector.
-
-The interrupts will be kept masked when the vector cannot be used by
-either guest or hypervisor.
-
-Note that each vector is no longer preceded by ALIGN. This is fine
-because the alignment is already larger than what we need.
-
-This is part of XSA-303.
-
-Reported-by: Julien Grall <Julien.Grall@arm.com>
-Signed-off-by: Julien Grall <julien.grall@arm.com>
-Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
-Reviewed-by: Andre Przywara <andre.przywara@arm.com>
----
- xen/arch/arm/arm32/entry.S | 138 +++++++++++++++++++++++++++++++++++----------
- 1 file changed, 109 insertions(+), 29 deletions(-)
-
-diff --git a/xen/arch/arm/arm32/entry.S b/xen/arch/arm/arm32/entry.S
-index 150cbc0b4b..ec90cca093 100644
---- a/xen/arch/arm/arm32/entry.S
-+++ b/xen/arch/arm/arm32/entry.S
-@@ -4,6 +4,17 @@
- #include <asm/alternative.h>
- #include <public/xen.h>
-
-+/*
-+ * Short-hands to define the interrupts (A, I, F)
-+ *
-+ * _ means the interrupt state will not change
-+ * X means the state of interrupt X will change
-+ *
-+ * To be used with msr cpsr_* only
-+ */
-+#define IFLAGS_AIF PSR_ABT_MASK | PSR_IRQ_MASK | PSR_FIQ_MASK
-+#define IFLAGS_A_F PSR_ABT_MASK | PSR_FIQ_MASK
-+
- #define SAVE_ONE_BANKED(reg) mrs r11, reg; str r11, [sp, #UREGS_##reg]
- #define RESTORE_ONE_BANKED(reg) ldr r11, [sp, #UREGS_##reg]; msr reg, r11
-
-@@ -106,10 +117,18 @@ skip_check:
- mov pc, lr
-
- /*
-- * Macro to define trap entry. The iflags corresponds to the list of
-- * interrupts (Asynchronous Abort, IRQ, FIQ) to unmask.
-+ * Macro to define a trap entry.
-+ *
-+ * @guest_iflags: Optional list of interrupts to unmask when
-+ * entering from guest context. As this is used with cpsie,
-+ * the letter (a, i, f) should be used.
-+ *
-+ * @hyp_iflags: Optional list of interrupts to inherit when
-+ * entering from hypervisor context. Any interrupts not
-+ * listed will be kept unchanged. As this is used with cpsr_*,
-+ * IFLAGS_* short-hands should be used.
- */
-- .macro vector trap, iflags
-+ .macro vector trap, guest_iflags=n, hyp_iflags=0
- /* Save registers in the stack */
- sub sp, #(UREGS_SP_usr - UREGS_sp) /* SP, LR, SPSR, PC */
- push {r0-r12} /* Save R0-R12 */
-@@ -127,12 +146,39 @@ skip_check:
-
- mrs r11, SPSR_hyp
- str r11, [sp, #UREGS_cpsr]
-- and r11, #PSR_MODE_MASK
-- cmp r11, #PSR_MODE_HYP
-- blne save_guest_regs
-
-+ /*
-+ * We need to distinguish whether we came from guest or
-+ * hypervisor context.
-+ */
-+ and r0, r11, #PSR_MODE_MASK
-+ cmp r0, #PSR_MODE_HYP
-+
-+ bne 1f
-+ /*
-+ * Trap from the hypervisor
-+ *
-+ * Inherit the state of the interrupts from the hypervisor
-+ * context. For that we need to use SPSR (stored in r11) and
-+ * modify CPSR accordingly.
-+ *
-+ * CPSR = (CPSR & ~hyp_iflags) | (SPSR & hyp_iflags)
-+ */
-+ mrs r10, cpsr
-+ bic r10, r10, #\hyp_iflags
-+ and r11, r11, #\hyp_iflags
-+ orr r10, r10, r11
-+ msr cpsr_cx, r10
-+ b 2f
-+
-+1:
-+ /* Trap from the guest */
-+ bl save_guest_regs
-+ .if \guest_iflags != n
-+ cpsie \guest_iflags
-+ .endif
-+2:
- /* We are ready to handle the trap, setup the registers and jump. */
-- cpsie \iflags
- adr lr, return_from_trap
- mov r0, sp
- /*
-@@ -144,20 +190,6 @@ skip_check:
- b do_trap_\trap
- .endm
-
--#define __DEFINE_TRAP_ENTRY(trap, iflags) \
-- ALIGN; \
--trap_##trap: \
-- vector trap, iflags
--
--/* Trap handler which unmask IRQ/Abort, keep FIQ masked */
--#define DEFINE_TRAP_ENTRY(trap) __DEFINE_TRAP_ENTRY(trap, ai)
--
--/* Trap handler which unmask Abort, keep IRQ/FIQ masked */
--#define DEFINE_TRAP_ENTRY_NOIRQ(trap) __DEFINE_TRAP_ENTRY(trap, a)
--
--/* Trap handler which unmask IRQ, keep Abort/FIQ masked */
--#define DEFINE_TRAP_ENTRY_NOABORT(trap) __DEFINE_TRAP_ENTRY(trap, i)
--
- .align 5
- GLOBAL(hyp_traps_vector)
- b trap_reset /* 0x00 - Reset */
-@@ -228,14 +260,62 @@ decode_vectors:
-
- #endif /* CONFIG_HARDEN_BRANCH_PREDICTOR */
-
--DEFINE_TRAP_ENTRY(reset)
--DEFINE_TRAP_ENTRY(undefined_instruction)
--DEFINE_TRAP_ENTRY(hypervisor_call)
--DEFINE_TRAP_ENTRY(prefetch_abort)
--DEFINE_TRAP_ENTRY(guest_sync)
--DEFINE_TRAP_ENTRY_NOIRQ(irq)
--DEFINE_TRAP_ENTRY_NOIRQ(fiq)
--DEFINE_TRAP_ENTRY_NOABORT(data_abort)
-+/* Vector not used by the Hypervisor. */
-+trap_reset:
-+ vector reset
-+
-+/*
-+ * Vector only used by the Hypervisor.
-+ *
-+ * While the exception can be executed with all the interrupts (e.g.
-+ * IRQ) unmasked, the interrupted context may have purposefully masked
-+ * some of them. So we want to inherit the state from the interrupted
-+ * context.
-+ */
-+trap_undefined_instruction:
-+ vector undefined_instruction, hyp_iflags=IFLAGS_AIF
-+
-+/* We should never reach this trap */
-+trap_hypervisor_call:
-+ vector hypervisor_call
-+
-+/*
-+ * Vector only used by the hypervisor.
-+ *
-+ * While the exception can be executed with all the interrupts (e.g.
-+ * IRQ) unmasked, the interrupted context may have purposefully masked
-+ * some of them. So we want to inherit the state from the interrupted
-+ * context.
-+ */
-+trap_prefetch_abort:
-+ vector prefetch_abort, hyp_iflags=IFLAGS_AIF
-+
-+/*
-+ * Vector only used by the hypervisor.
-+ *
-+ * Data Abort should be rare and most likely fatal. It is best to not
-+ * unmask any interrupts to limit the amount of code that can run before
-+ * the Data Abort is treated.
-+ */
-+trap_data_abort:
-+ vector data_abort
-+
-+/* Vector only used by the guest. We can unmask Abort/IRQ. */
-+trap_guest_sync:
-+ vector guest_sync, guest_iflags=ai
-+
-+
-+/* Vector used by the hypervisor and the guest. */
-+trap_irq:
-+ vector irq, guest_iflags=a, hyp_iflags=IFLAGS_A_F
-+
-+/*
-+ * Vector used by the hypervisor and the guest.
-+ *
-+ * FIQ are not meant to happen, so we don't unmask any interrupts.
-+ */
-+trap_fiq:
-+ vector fiq
-
- return_from_trap:
- /*
---
-2.11.0
-
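The key operation in the hypervisor path above is selective inheritance of
interrupt-mask bits: CPSR = (CPSR & ~hyp_iflags) | (SPSR & hyp_iflags). The
same merge is easy to check as ordinary C; the mask values below are
illustrative bit positions, not the real ARM PSR encodings:

    #include <stdint.h>
    #include <stdio.h>

    #define ABT_MASK 0x100u   /* illustrative bit positions only */
    #define IRQ_MASK 0x080u
    #define FIQ_MASK 0x040u

    /* Take the bits in 'inherit' from the saved status word, keep the rest
     * of the current word unchanged -- the CPSR/SPSR merge in the patch. */
    static uint32_t merge_flags(uint32_t cur, uint32_t saved, uint32_t inherit)
    {
        return (cur & ~inherit) | (saved & inherit);
    }

    int main(void)
    {
        uint32_t cpsr = ABT_MASK | IRQ_MASK | FIQ_MASK; /* all masked on entry */
        uint32_t spsr = FIQ_MASK;                       /* only FIQ was masked */

        /* Inherit A and F; IRQ stays as exception entry left it (masked). */
        uint32_t out = merge_flags(cpsr, spsr, ABT_MASK | FIQ_MASK);
        printf("result: %#x (expect IRQ|FIQ = %#x)\n", out, IRQ_MASK | FIQ_MASK);
        return 0;
    }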
diff --git a/main/xen/xsa303-0004-xen-arm64-Don-t-blindly-unmask-interrupts-on-trap-wi.patch b/main/xen/xsa303-0004-xen-arm64-Don-t-blindly-unmask-interrupts-on-trap-wi.patch
deleted file mode 100644
index 106cbf98f1..0000000000
--- a/main/xen/xsa303-0004-xen-arm64-Don-t-blindly-unmask-interrupts-on-trap-wi.patch
+++ /dev/null
@@ -1,114 +0,0 @@
-From c6d290ce157a044dec417fdda8db71e41a37d744 Mon Sep 17 00:00:00 2001
-From: Julien Grall <julien.grall@arm.com>
-Date: Mon, 7 Oct 2019 18:10:56 +0100
-Subject: [PATCH 4/4] xen/arm64: Don't blindly unmask interrupts on trap
- without a change of level
-
-Some of the traps without a change of the level (i.e. hypervisor ->
-hypervisor) will unmask interrupts regardless of their state in the
-interrupted context.
-
-One of the consequences is that IRQ will be unmasked when receiving a
-synchronous exception (used by WARN*()). This could result in unexpected
-behavior such as deadlock (if a lock was shared with interrupts).
-
-In a nutshell, interrupts should only be unmasked when it is safe to
-do so. Xen only unmasks IRQ and Abort interrupts, so the logic can stay
-simple:
- - hyp_error: All the interrupts are now kept masked. SError should
- be pretty rare and if one ever happens then we most likely want to
- avoid any other interrupts being generated. The potential main
- "caller" is during virtual SError synchronization on the exit
- path from the guest (see check_pending_vserror).
-
- - hyp_sync: The interrupts state is inherited from the interrupted
- context.
-
- - hyp_irq: All the interrupts but IRQ state are inherited from the
- interrupted context. IRQ is kept masked.
-
-This is part of XSA-303.
-
-Reported-by: Julien Grall <Julien.Grall@arm.com>
-Signed-off-by: Julien Grall <julien.grall@arm.com>
-Reviewed-by: Stefano Stabellini <sstabellini@kernel.org>
-Reviewed-by: Andre Przywara <andre.przywara@arm.com>
----
- xen/arch/arm/arm64/entry.S | 47 ++++++++++++++++++++++++++++++++++++++++++----
- 1 file changed, 43 insertions(+), 4 deletions(-)
-
-diff --git a/xen/arch/arm/arm64/entry.S b/xen/arch/arm/arm64/entry.S
-index 2d9a2713a1..3e41ba65b6 100644
---- a/xen/arch/arm/arm64/entry.S
-+++ b/xen/arch/arm/arm64/entry.S
-@@ -188,24 +188,63 @@ hyp_error_invalid:
- entry hyp=1
- invalid BAD_ERROR
-
-+/*
-+ * SError received while running in the hypervisor mode.
-+ *
-+ * Technically, we could unmask the IRQ if it were unmasked in the
-+ * interrupted context. However, this requires checking the PSTATE. For
-+ * simplicity, as SError should be rare and potentially fatal,
-+ * all interrupts are kept masked.
-+ */
- hyp_error:
- entry hyp=1
-- msr daifclr, #2
- mov x0, sp
- bl do_trap_hyp_serror
- exit hyp=1
-
--/* Traps taken in Current EL with SP_ELx */
-+/*
-+ * Synchronous exception received while running in the hypervisor mode.
-+ *
-+ * While the exception could be executed with all the interrupts (e.g.
-+ * IRQ) unmasked, the interrupted context may have purposefully masked
-+ * some of them. So we want to inherit the state from the interrupted
-+ * context.
-+ */
- hyp_sync:
- entry hyp=1
-- msr daifclr, #6
-+
-+ /* Inherit interrupts */
-+ mrs x0, SPSR_el2
-+ and x0, x0, #(PSR_DBG_MASK | PSR_ABT_MASK | PSR_IRQ_MASK | PSR_FIQ_MASK)
-+ msr daif, x0
-+
- mov x0, sp
- bl do_trap_hyp_sync
- exit hyp=1
-
-+/*
-+ * IRQ received while running in the hypervisor mode.
-+ *
-+ * While the exception could be executed with all the interrupts but IRQ
-+ * unmasked, the interrupted context may have purposefully masked some
-+ * of them. So we want to inherit the state from the interrupt context
-+ * and keep IRQ masked.
-+ *
-+ * XXX: We may want to consider an ordering between interrupts (e.g. if
-+ * SError are masked, then IRQ should be masked too). However, this
-+ * would require some rework in some paths (e.g. panic, livepatch) to
-+ * ensure the ordering is enforced everywhere.
-+ */
- hyp_irq:
- entry hyp=1
-- msr daifclr, #4
-+
-+ /* Inherit D, A, F interrupts and keep I masked */
-+ mrs x0, SPSR_el2
-+ mov x1, #(PSR_DBG_MASK | PSR_ABT_MASK | PSR_FIQ_MASK)
-+ and x0, x0, x1
-+ orr x0, x0, #PSR_IRQ_MASK
-+ msr daif, x0
-+
- mov x0, sp
- bl do_trap_irq
- exit hyp=1
---
-2.11.0
-
diff --git a/main/xen/xsa304-4.11-1.patch b/main/xen/xsa304-4.11-1.patch
deleted file mode 100644
index f78d15220f..0000000000
--- a/main/xen/xsa304-4.11-1.patch
+++ /dev/null
@@ -1,71 +0,0 @@
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Subject: x86/vtd: Hide superpage support for SandyBridge IOMMUs
-
-Something causes SandyBridge IOMMUs to choke when sharing EPT pagetables, and
-an EPT superpage gets shattered. The root cause is still under investigation,
-but the end result is unusable in combination with CVE-2018-12207 protections.
-
-This is part of XSA-304 / CVE-2018-12207
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-
-diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h
-index fb7edfaef9..d698b1d50a 100644
---- a/xen/drivers/passthrough/vtd/extern.h
-+++ b/xen/drivers/passthrough/vtd/extern.h
-@@ -96,6 +96,8 @@ void vtd_ops_postamble_quirk(struct iommu* iommu);
- int __must_check me_wifi_quirk(struct domain *domain,
- u8 bus, u8 devfn, int map);
- void pci_vtd_quirk(const struct pci_dev *);
-+void quirk_iommu_caps(struct iommu *iommu);
-+
- bool_t platform_supports_intremap(void);
- bool_t platform_supports_x2apic(void);
-
-diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
-index f242e30caf..8712d3b4dc 100644
---- a/xen/drivers/passthrough/vtd/iommu.c
-+++ b/xen/drivers/passthrough/vtd/iommu.c
-@@ -1211,6 +1211,8 @@ int __init iommu_alloc(struct acpi_drhd_unit *drhd)
- if ( !(iommu->cap + 1) || !(iommu->ecap + 1) )
- return -ENODEV;
-
-+ quirk_iommu_caps(iommu);
-+
- if ( cap_fault_reg_offset(iommu->cap) +
- cap_num_fault_regs(iommu->cap) * PRIMARY_FAULT_REG_LEN >= PAGE_SIZE ||
- ecap_iotlb_offset(iommu->ecap) >= PAGE_SIZE )
-diff --git a/xen/drivers/passthrough/vtd/quirks.c b/xen/drivers/passthrough/vtd/quirks.c
-index d6db862678..b02688e316 100644
---- a/xen/drivers/passthrough/vtd/quirks.c
-+++ b/xen/drivers/passthrough/vtd/quirks.c
-@@ -540,3 +540,28 @@ void pci_vtd_quirk(const struct pci_dev *pdev)
- break;
- }
- }
-+
-+void __init quirk_iommu_caps(struct iommu *iommu)
-+{
-+ /*
-+ * IOMMU Quirks:
-+ *
-+ * SandyBridge IOMMUs claim support for 2M and 1G superpages, but don't
-+ * implement superpages internally.
-+ *
-+ * There are issues changing the walk length under in-flight DMA, which
-+ * has manifested as incompatibility between EPT/IOMMU sharing and the
-+ * workaround for CVE-2018-12207 / XSA-304. Hide the superpages
-+ * capabilities in the IOMMU, which will prevent Xen from sharing the EPT
-+ * and IOMMU pagetables.
-+ *
-+ * Detection of SandyBridge unfortunately has to be done by processor
-+ * model because the client parts don't expose their IOMMUs as PCI devices
-+ * we could match with a Device ID.
-+ */
-+ if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
-+ boot_cpu_data.x86 == 6 &&
-+ (boot_cpu_data.x86_model == 0x2a ||
-+ boot_cpu_data.x86_model == 0x2d) )
-+ iommu->cap &= ~(0xful << 34);
-+}
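The quirk above keys off family/model because SandyBridge client parts do not
expose their IOMMUs as PCI devices with a matchable Device ID. A standalone
sketch of that detect-and-mask shape; the model numbers and the cap-bit range
(bits 34-37, the superpage capability field) come from the patch, while the
structs are simplified stand-ins:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct cpu   { uint8_t vendor_intel, family, model; };
    struct iommu { uint64_t cap; };

    static bool is_sandybridge(const struct cpu *c)
    {
        return c->vendor_intel && c->family == 6 &&
               (c->model == 0x2a || c->model == 0x2d);
    }

    static void quirk_iommu_caps(const struct cpu *c, struct iommu *iommu)
    {
        /* Hide the superpage capability bits so the rest of the code never
         * tries to share EPT pagetables with this IOMMU. */
        if ( is_sandybridge(c) )
            iommu->cap &= ~(0xfull << 34);
    }

    int main(void)
    {
        struct cpu snb = { .vendor_intel = 1, .family = 6, .model = 0x2a };
        struct iommu iommu = { .cap = 0xfull << 34 };

        quirk_iommu_caps(&snb, &iommu);
        printf("caps after quirk: %#llx\n", (unsigned long long)iommu.cap);
        return 0;
    }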
diff --git a/main/xen/xsa304-4.11-2.patch b/main/xen/xsa304-4.11-2.patch
deleted file mode 100644
index f22d7cc59b..0000000000
--- a/main/xen/xsa304-4.11-2.patch
+++ /dev/null
@@ -1,298 +0,0 @@
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Subject: x86/vtx: Disable executable EPT superpages to work around
- CVE-2018-12207
-
-CVE-2018-12207 covers a set of errata on various Intel processors, whereby a
-machine check exception can be generated in a corner case when an executable
-mapping changes size or cacheability without TLB invalidation. HVM guest
-kernels can trigger this to DoS the host.
-
-To mitigate, in affected hardware, all EPT superpages are marked NX. When an
-instruction fetch violation is observed against the superpage, the superpage
-is shattered to 4k and has execute permissions restored. This prevents the
-guest kernel from being able to create the necessary preconditions in the iTLB
-to exploit the vulnerability.
-
-This does come with a workload-dependent performance overhead, caused by
-increased TLB pressure. Performance can be restored, if guest kernels are
-trusted not to mount an attack, by specifying ept=exec-sp on the command line.
-
-This is part of XSA-304 / CVE-2018-12207
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Acked-by: George Dunlap <george.dunlap@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-
-diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
-index c63a07d29b..684671cb7b 100644
---- a/docs/misc/xen-command-line.markdown
-+++ b/docs/misc/xen-command-line.markdown
-@@ -828,7 +828,7 @@ effect the inverse meaning.
- >> set as UC.
-
- ### ept (Intel)
--> `= List of ( {no-}pml | {no-}ad )`
-+> `= List of [ {no-}pml, {no-}ad, {no-}exec-sp ]`
-
- Controls EPT related features.
-
-@@ -851,6 +851,16 @@ Controls EPT related features.
-
- >> Have hardware keep accessed/dirty (A/D) bits updated.
-
-+* The `exec-sp` boolean controls whether EPT superpages with execute
-+ permissions are permitted. In general this is good for performance.
-+
-+ However, on processors vulnerable to CVE-2018-12207, HVM guest kernels can
-+ use executable superpages to crash the host. By default, executable
-+ superpages are disabled on affected hardware.
-+
-+ If HVM guest kernels are trusted not to mount a DoS against the system,
-+ this option can be enabled to regain performance.
-+
- ### extra\_guest\_irqs
- > `= [<domU number>][,<dom0 number>]`
-
-diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
-index f4a6a37149..1924434960 100644
---- a/xen/arch/x86/hvm/hvm.c
-+++ b/xen/arch/x86/hvm/hvm.c
-@@ -1706,6 +1706,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
- struct p2m_domain *p2m, *hostp2m;
- int rc, fall_through = 0, paged = 0;
- int sharing_enomem = 0;
-+ unsigned int page_order = 0;
- vm_event_request_t *req_ptr = NULL;
- bool_t ap2m_active, sync = 0;
-
-@@ -1774,7 +1775,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
- hostp2m = p2m_get_hostp2m(currd);
- mfn = get_gfn_type_access(hostp2m, gfn, &p2mt, &p2ma,
- P2M_ALLOC | (npfec.write_access ? P2M_UNSHARE : 0),
-- NULL);
-+ &page_order);
-
- if ( ap2m_active )
- {
-@@ -1786,7 +1787,7 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
- goto out;
- }
-
-- mfn = get_gfn_type_access(p2m, gfn, &p2mt, &p2ma, 0, NULL);
-+ mfn = get_gfn_type_access(p2m, gfn, &p2mt, &p2ma, 0, &page_order);
- }
- else
- p2m = hostp2m;
-@@ -1828,6 +1829,24 @@ int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
- break;
- }
-
-+ /*
-+ * Workaround for XSA-304 / CVE-2018-12207. If we take an execution
-+ * fault against a non-executable superpage, shatter it to regain
-+ * execute permissions.
-+ */
-+ if ( page_order > 0 && npfec.insn_fetch && npfec.present && !violation )
-+ {
-+ int res = p2m_set_entry(p2m, _gfn(gfn), mfn, PAGE_ORDER_4K,
-+ p2mt, p2ma);
-+
-+ if ( res )
-+ printk(XENLOG_ERR "Failed to shatter gfn %"PRI_gfn": %d\n",
-+ gfn, res);
-+
-+ rc = !res;
-+ goto out_put_gfn;
-+ }
-+
- if ( violation )
- {
- /* Should #VE be emulated for this fault? */
-diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
-index 493986e84a..8821a3b536 100644
---- a/xen/arch/x86/hvm/vmx/vmcs.c
-+++ b/xen/arch/x86/hvm/vmx/vmcs.c
-@@ -67,6 +67,7 @@ integer_param("ple_window", ple_window);
-
- static bool_t __read_mostly opt_pml_enabled = 1;
- static s8 __read_mostly opt_ept_ad = -1;
-+int8_t __read_mostly opt_ept_exec_sp = -1;
-
- /*
- * The 'ept' parameter controls functionalities that depend on, or impact the
-@@ -94,6 +95,8 @@ static int __init parse_ept_param(const char *s)
- opt_pml_enabled = val;
- else if ( !cmdline_strcmp(s, "ad") )
- opt_ept_ad = val;
-+ else if ( !cmdline_strcmp(s, "exec-sp") )
-+ opt_ept_exec_sp = val;
- else
- rc = -EINVAL;
-
-diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
-index 840dc2b44d..a568d62643 100644
---- a/xen/arch/x86/hvm/vmx/vmx.c
-+++ b/xen/arch/x86/hvm/vmx/vmx.c
-@@ -2415,6 +2415,102 @@ static void pi_notification_interrupt(struct cpu_user_regs *regs)
- static void __init lbr_tsx_fixup_check(void);
- static void __init bdw_erratum_bdf14_fixup_check(void);
-
-+/*
-+ * Calculate whether the CPU is vulnerable to Instruction Fetch page
-+ * size-change MCEs.
-+ */
-+static bool __init has_if_pschange_mc(void)
-+{
-+ uint64_t caps = 0;
-+
-+ /*
-+ * If we are virtualised, there is nothing we can do. Our EPT tables are
-+ * shadowed by our hypervisor, and not walked by hardware.
-+ */
-+ if ( cpu_has_hypervisor )
-+ return false;
-+
-+ if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) )
-+ rdmsrl(MSR_ARCH_CAPABILITIES, caps);
-+
-+ if ( caps & ARCH_CAPS_IF_PSCHANGE_MC_NO )
-+ return false;
-+
-+ /*
-+ * IF_PSCHANGE_MC is only known to affect Intel Family 6 processors at
-+ * this time.
-+ */
-+ if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
-+ boot_cpu_data.x86 != 6 )
-+ return false;
-+
-+ switch ( boot_cpu_data.x86_model )
-+ {
-+ /*
-+ * Core processors since at least Nehalem are vulnerable.
-+ */
-+ case 0x1f: /* Auburndale / Havendale */
-+ case 0x1e: /* Nehalem */
-+ case 0x1a: /* Nehalem EP */
-+ case 0x2e: /* Nehalem EX */
-+ case 0x25: /* Westmere */
-+ case 0x2c: /* Westmere EP */
-+ case 0x2f: /* Westmere EX */
-+ case 0x2a: /* SandyBridge */
-+ case 0x2d: /* SandyBridge EP/EX */
-+ case 0x3a: /* IvyBridge */
-+ case 0x3e: /* IvyBridge EP/EX */
-+ case 0x3c: /* Haswell */
-+ case 0x3f: /* Haswell EX/EP */
-+ case 0x45: /* Haswell D */
-+ case 0x46: /* Haswell H */
-+ case 0x3d: /* Broadwell */
-+ case 0x47: /* Broadwell H */
-+ case 0x4f: /* Broadwell EP/EX */
-+ case 0x56: /* Broadwell D */
-+ case 0x4e: /* Skylake M */
-+ case 0x5e: /* Skylake D */
-+ case 0x55: /* Skylake-X / Cascade Lake */
-+ case 0x8e: /* Kaby / Coffee / Whiskey Lake M */
-+ case 0x9e: /* Kaby / Coffee / Whiskey Lake D */
-+ return true;
-+
-+ /*
-+ * Atom processors are not vulnerable.
-+ */
-+ case 0x1c: /* Pineview */
-+ case 0x26: /* Lincroft */
-+ case 0x27: /* Penwell */
-+ case 0x35: /* Cloverview */
-+ case 0x36: /* Cedarview */
-+ case 0x37: /* Baytrail / Valleyview (Silvermont) */
-+ case 0x4d: /* Avaton / Rangely (Silvermont) */
-+ case 0x4c: /* Cherrytrail / Brasswell */
-+ case 0x4a: /* Merrifield */
-+ case 0x5a: /* Moorefield */
-+ case 0x5c: /* Goldmont */
-+ case 0x5d: /* SoFIA 3G Granite/ES2.1 */
-+ case 0x65: /* SoFIA LTE AOSP */
-+ case 0x5f: /* Denverton */
-+ case 0x6e: /* Cougar Mountain */
-+ case 0x75: /* Lightning Mountain */
-+ case 0x7a: /* Gemini Lake */
-+ case 0x86: /* Jacobsville */
-+
-+ /*
-+ * Knights processors are not vulnerable.
-+ */
-+ case 0x57: /* Knights Landing */
-+ case 0x85: /* Knights Mill */
-+ return false;
-+
-+ default:
-+ printk("Unrecognised CPU model %#x - assuming vulnerable to IF_PSCHANGE_MC\n",
-+ boot_cpu_data.x86_model);
-+ return true;
-+ }
-+}
-+
- const struct hvm_function_table * __init start_vmx(void)
- {
- set_in_cr4(X86_CR4_VMXE);
-@@ -2435,6 +2531,17 @@ const struct hvm_function_table * __init start_vmx(void)
- */
- if ( cpu_has_vmx_ept && (cpu_has_vmx_pat || opt_force_ept) )
- {
-+ bool cpu_has_bug_pschange_mc = has_if_pschange_mc();
-+
-+ if ( opt_ept_exec_sp == -1 )
-+ {
-+ /* Default to non-executable superpages on vulnerable hardware. */
-+ opt_ept_exec_sp = !cpu_has_bug_pschange_mc;
-+
-+ if ( cpu_has_bug_pschange_mc )
-+ printk("VMX: Disabling executable EPT superpages due to CVE-2018-12207\n");
-+ }
-+
- vmx_function_table.hap_supported = 1;
- vmx_function_table.altp2m_supported = 1;
-
-diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
-index ce46201d45..93e08f89a2 100644
---- a/xen/arch/x86/mm/p2m-ept.c
-+++ b/xen/arch/x86/mm/p2m-ept.c
-@@ -215,6 +215,12 @@ static void ept_p2m_type_to_flags(struct p2m_domain *p2m, ept_entry_t *entry,
- break;
- }
-
-+ /*
-+ * Don't create executable superpages if we need to shatter them to
-+ * protect against CVE-2018-12207.
-+ */
-+ if ( !opt_ept_exec_sp && is_epte_superpage(entry) )
-+ entry->x = 0;
- }
-
- #define GUEST_TABLE_MAP_FAILED 0
-diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h b/xen/include/asm-x86/hvm/vmx/vmx.h
-index 89619e4afd..20eb7f6082 100644
---- a/xen/include/asm-x86/hvm/vmx/vmx.h
-+++ b/xen/include/asm-x86/hvm/vmx/vmx.h
-@@ -28,6 +28,8 @@
- #include <asm/hvm/trace.h>
- #include <asm/hvm/vmx/vmcs.h>
-
-+extern int8_t opt_ept_exec_sp;
-+
- typedef union {
- struct {
- u64 r : 1, /* bit 0 - Read permission */
-diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
-index b8151d2d9f..89ae3e03f1 100644
---- a/xen/include/asm-x86/msr-index.h
-+++ b/xen/include/asm-x86/msr-index.h
-@@ -54,6 +54,7 @@
- #define ARCH_CAPS_SKIP_L1DFL (_AC(1, ULL) << 3)
- #define ARCH_CAPS_SSB_NO (_AC(1, ULL) << 4)
- #define ARCH_CAPS_MDS_NO (_AC(1, ULL) << 5)
-+#define ARCH_CAPS_IF_PSCHANGE_MC_NO (_AC(1, ULL) << 6)
-
- #define MSR_FLUSH_CMD 0x0000010b
- #define FLUSH_CMD_L1D (_AC(1, ULL) << 0)
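The mitigation's hot path above is small: on an instruction-fetch violation
against a present superpage, shatter the mapping to 4k rather than treating
it as a real fault, so the guest's retried fetch succeeds against executable
4k PTEs. A simplified, compilable model of that decision; the types and the
shatter step are stand-ins for the p2m machinery:

    #include <stdbool.h>
    #include <stdio.h>

    struct fault {
        bool insn_fetch, present, violation;
        unsigned int page_order;     /* 0 = 4k, >0 = superpage */
    };

    /* Returns true when the fault is consumed by shattering the superpage,
     * mirroring the hvm_hap_nested_page_fault() hunk in the patch. */
    static bool maybe_shatter(struct fault *f)
    {
        if ( f->page_order > 0 && f->insn_fetch && f->present && !f->violation )
        {
            f->page_order = 0;   /* re-map the range with 4k, executable PTEs */
            return true;         /* guest retries the fetch and succeeds */
        }
        return false;
    }

    int main(void)
    {
        struct fault f = { .insn_fetch = true, .present = true,
                           .violation = false, .page_order = 9 };
        printf("shattered: %d, order now %u\n", maybe_shatter(&f), f.page_order);
        return 0;
    }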
diff --git a/main/xen/xsa304-4.11-3.patch b/main/xen/xsa304-4.11-3.patch
deleted file mode 100644
index bf30699efb..0000000000
--- a/main/xen/xsa304-4.11-3.patch
+++ /dev/null
@@ -1,110 +0,0 @@
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Subject: x86/vtx: Allow runtime modification of the exec-sp setting
-
-See patch for details.
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-Reviewed-by: George Dunlap <george.dunlap@citrix.com>
-
-diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
-index 684671cb7b..33ed1ffc40 100644
---- a/docs/misc/xen-command-line.markdown
-+++ b/docs/misc/xen-command-line.markdown
-@@ -861,6 +861,21 @@ Controls EPT related features.
- If HVM guest kernels are trusted not to mount a DoS against the system,
- this option can be enabled to regain performance.
-
-+ This boolean may be modified at runtime using `xl set-parameters
-+ ept=[no-]exec-sp` to switch between fast and secure.
-+
-+ * When switching from secure to fast, preexisting HVM domains will run
-+ at their current performance until they are rebooted; new domains will
-+ run without any overhead.
-+
-+ * When switching from fast to secure, all HVM domains will immediately
-+ suffer a performance penalty.
-+
-+ **Warning: No guarantee is made that this runtime option will be retained
-+ indefinitely, or that it will retain this exact behaviour. It is
-+ intended as an emergency option for people who first chose fast, then
-+ change their minds to secure, and wish not to reboot.**
-+
- ### extra\_guest\_irqs
- > `= [<domU number>][,<dom0 number>]`
-
-diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
-index 8821a3b536..15376e25ba 100644
---- a/xen/arch/x86/hvm/vmx/vmcs.c
-+++ b/xen/arch/x86/hvm/vmx/vmcs.c
-@@ -107,6 +107,41 @@ static int __init parse_ept_param(const char *s)
- }
- custom_param("ept", parse_ept_param);
-
-+static int parse_ept_param_runtime(const char *s)
-+{
-+ int val;
-+
-+ if ( !cpu_has_vmx_ept || !hvm_funcs.hap_supported ||
-+ !(hvm_funcs.hap_capabilities &
-+ (HVM_HAP_SUPERPAGE_2MB | HVM_HAP_SUPERPAGE_1GB)) )
-+ {
-+ printk("VMX: EPT not available, or not in use - ignoring\n");
-+ return 0;
-+ }
-+
-+ if ( (val = parse_boolean("exec-sp", s, NULL)) < 0 )
-+ return -EINVAL;
-+
-+ if ( val != opt_ept_exec_sp )
-+ {
-+ struct domain *d;
-+
-+ opt_ept_exec_sp = val;
-+
-+ rcu_read_lock(&domlist_read_lock);
-+ for_each_domain ( d )
-+ if ( paging_mode_hap(d) )
-+ p2m_change_entry_type_global(d, p2m_ram_rw, p2m_ram_rw);
-+ rcu_read_unlock(&domlist_read_lock);
-+ }
-+
-+ printk("VMX: EPT executable superpages %sabled\n",
-+ val ? "en" : "dis");
-+
-+ return 0;
-+}
-+custom_runtime_only_param("ept", parse_ept_param_runtime);
-+
- /* Dynamic (run-time adjusted) execution control flags. */
- u32 vmx_pin_based_exec_control __read_mostly;
- u32 vmx_cpu_based_exec_control __read_mostly;
-diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
-index 2b62bc61dd..97c417fc3e 100644
---- a/xen/arch/x86/mm/p2m.c
-+++ b/xen/arch/x86/mm/p2m.c
-@@ -257,17 +257,22 @@ int p2m_is_logdirty_range(struct p2m_domain *p2m, unsigned long start,
- return 0;
- }
-
-+/*
-+ * May be called with ot = nt = p2m_ram_rw for its side effect of
-+ * recalculating all PTEs in the p2m.
-+ */
- void p2m_change_entry_type_global(struct domain *d,
- p2m_type_t ot, p2m_type_t nt)
- {
- struct p2m_domain *p2m = p2m_get_hostp2m(d);
-
-- ASSERT(ot != nt);
- ASSERT(p2m_is_changeable(ot) && p2m_is_changeable(nt));
-
- p2m_lock(p2m);
- p2m->change_entry_type_global(p2m, ot, nt);
-- p2m->global_logdirty = (nt == p2m_ram_logdirty);
-+ /* Don't allow 'recalculate' operations to change the logdirty state. */
-+ if ( ot != nt )
-+ p2m->global_logdirty = (nt == p2m_ram_logdirty);
- p2m_unlock(p2m);
- }
-
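The runtime toggle above works by calling p2m_change_entry_type_global() with
ot == nt == p2m_ram_rw purely for its side effect of re-walking every PTE,
which re-applies the new exec-sp policy. A toy model of why an ot == nt pass
still does useful work; all names here are invented:

    #include <stdbool.h>
    #include <stdio.h>

    static bool exec_sp;             /* global policy, flipped at runtime */

    struct pte { bool superpage, exec; };

    /* Re-derive each entry's permissions from the current policy; with
     * ot == nt nothing about the type changes, but the recalculation
     * still picks up the new exec_sp setting. */
    static void recalc_all(struct pte *t, int n)
    {
        for ( int i = 0; i < n; i++ )
            t[i].exec = exec_sp || !t[i].superpage;
    }

    int main(void)
    {
        struct pte p2m[2] = { { .superpage = true,  .exec = true },
                              { .superpage = false, .exec = true } };

        exec_sp = false;     /* admin ran: xl set-parameters ept=no-exec-sp */
        recalc_all(p2m, 2);
        printf("superpage exec=%d, 4k exec=%d\n", p2m[0].exec, p2m[1].exec);
        return 0;
    }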
diff --git a/main/xen/xsa305-4.11-1.patch b/main/xen/xsa305-4.11-1.patch
deleted file mode 100644
index 92e6ed33d0..0000000000
--- a/main/xen/xsa305-4.11-1.patch
+++ /dev/null
@@ -1,288 +0,0 @@
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Subject: x86/tsx: Introduce tsx= to use MSR_TSX_CTRL when available
-
-To protect against the TSX Async Abort speculative vulnerability, Intel have
-released new microcode for affected parts which introduce the MSR_TSX_CTRL
-control, which allows TSX to be turned off. This will be architectural on
-future parts.
-
-Introduce tsx= to provide a global on/off for TSX, including its enumeration
-via CPUID. Provide stub virtualisation of this MSR, as it is not exposed to
-guests at the moment.
-
-VMs may have booted before microcode is loaded, or before hosts have rebooted,
-and they still want to migrate freely. A VM which booted seeing TSX can
-migrate safely to hosts with TSX disabled - TSX will start unconditionally
-aborting, but still behave in a manner compatible with the ABI.
-
-The guest-visible behaviour is equivalent to late loading the microcode and
-setting the RTM_DISABLE bit in the course of live patching.
-
-This is part of XSA-305 / CVE-2019-11135
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-
-diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
-index 684671cb7b..b86d26399a 100644
---- a/docs/misc/xen-command-line.markdown
-+++ b/docs/misc/xen-command-line.markdown
-@@ -1948,6 +1948,20 @@ pages) must also be specified via the tbuf\_size parameter.
- ### tsc (x86)
- > `= unstable | skewed | stable:socket`
-
-+### tsx
-+ = <bool>
-+
-+ Applicability: x86
-+ Default: true
-+
-+Controls for the use of Transactional Synchronization eXtensions.
-+
-+On Intel parts released in Q3 2019 (with updated microcode), and future parts,
-+a control has been introduced which allows TSX to be turned off.
-+
-+On systems with the ability to turn TSX off, this boolean offers system wide
-+control of whether TSX is enabled or disabled.
-+
- ### ucode (x86)
- > `= [<integer> | scan]`
-
-diff --git a/xen/arch/x86/Makefile b/xen/arch/x86/Makefile
-index da1e4827f4..4c82d9f710 100644
---- a/xen/arch/x86/Makefile
-+++ b/xen/arch/x86/Makefile
-@@ -65,6 +65,7 @@ obj-y += sysctl.o
- obj-y += time.o
- obj-y += trace.o
- obj-y += traps.o
-+obj-y += tsx.o
- obj-y += usercopy.o
- obj-y += x86_emulate.o
- obj-$(CONFIG_TBOOT) += tboot.o
-diff --git a/xen/arch/x86/cpuid.c b/xen/arch/x86/cpuid.c
-index 5e11970701..04aefa555d 100644
---- a/xen/arch/x86/cpuid.c
-+++ b/xen/arch/x86/cpuid.c
-@@ -622,6 +622,20 @@ void recalculate_cpuid_policy(struct domain *d)
- if ( cpu_has_itsc && (d->disable_migrate || d->arch.vtsc) )
- __set_bit(X86_FEATURE_ITSC, max_fs);
-
-+ /*
-+ * On hardware with MSR_TSX_CTRL, the admin may have elected to disable
-+ * TSX and hide the feature bits. Migrating-in VMs may have been booted
-+ * pre-mitigation when the TSX features were visible.
-+ *
-+ * This situation is compatible (albeit with a perf hit to any TSX code in
-+ * the guest), so allow the feature bits to remain set.
-+ */
-+ if ( cpu_has_tsx_ctrl )
-+ {
-+ __set_bit(X86_FEATURE_HLE, max_fs);
-+ __set_bit(X86_FEATURE_RTM, max_fs);
-+ }
-+
- /* Clamp the toolstacks choices to reality. */
- for ( i = 0; i < ARRAY_SIZE(fs); i++ )
- fs[i] &= max_fs[i];
-diff --git a/xen/arch/x86/msr.c b/xen/arch/x86/msr.c
-index ebc0665615..35d99a98a1 100644
---- a/xen/arch/x86/msr.c
-+++ b/xen/arch/x86/msr.c
-@@ -153,6 +153,7 @@ int guest_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
- case MSR_FLUSH_CMD:
- /* Write-only */
- case MSR_TSX_FORCE_ABORT:
-+ case MSR_TSX_CTRL:
- /* Not offered to guests. */
- goto gp_fault;
-
-@@ -233,6 +234,7 @@ int guest_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
- case MSR_ARCH_CAPABILITIES:
- /* Read-only */
- case MSR_TSX_FORCE_ABORT:
-+ case MSR_TSX_CTRL:
- /* Not offered to guests. */
- goto gp_fault;
-
-diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
-index 657160549f..dc13ad6c36 100644
---- a/xen/arch/x86/setup.c
-+++ b/xen/arch/x86/setup.c
-@@ -1551,6 +1551,8 @@ void __init noreturn __start_xen(unsigned long mbi_p)
-
- early_microcode_init();
-
-+ tsx_init(); /* Needs microcode. May change HLE/RTM feature bits. */
-+
- identify_cpu(&boot_cpu_data);
-
- set_in_cr4(X86_CR4_OSFXSR | X86_CR4_OSXMMEXCPT);
-diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
-index fd52a10cf9..bdc118d88b 100644
---- a/xen/arch/x86/smpboot.c
-+++ b/xen/arch/x86/smpboot.c
-@@ -376,6 +376,8 @@ void start_secondary(void *unused)
- if ( boot_cpu_has(X86_FEATURE_IBRSB) )
- wrmsrl(MSR_SPEC_CTRL, default_xen_spec_ctrl);
-
-+ tsx_init(); /* Needs microcode. May change HLE/RTM feature bits. */
-+
- if ( xen_guest )
- hypervisor_ap_setup();
-
-diff --git a/xen/arch/x86/tsx.c b/xen/arch/x86/tsx.c
-new file mode 100644
-index 0000000000..a8ec2ccc69
---- /dev/null
-+++ b/xen/arch/x86/tsx.c
-@@ -0,0 +1,74 @@
-+#include <xen/init.h>
-+#include <asm/msr.h>
-+
-+/*
-+ * Valid values:
-+ * 1 => Explicit tsx=1
-+ * 0 => Explicit tsx=0
-+ * -1 => Default, implicit tsx=1
-+ *
-+ * This is arranged such that the bottom bit encodes whether TSX is actually
-+ * disabled, while identifying various explicit (>=0) and implicit (<0)
-+ * conditions.
-+ */
-+int8_t __read_mostly opt_tsx = -1;
-+int8_t __read_mostly cpu_has_tsx_ctrl = -1;
-+
-+static int __init parse_tsx(const char *s)
-+{
-+ int rc = 0, val = parse_bool(s, NULL);
-+
-+ if ( val >= 0 )
-+ opt_tsx = val;
-+ else
-+ rc = -EINVAL;
-+
-+ return rc;
-+}
-+custom_param("tsx", parse_tsx);
-+
-+void tsx_init(void)
-+{
-+ /*
-+ * This function is first called between microcode being loaded, and CPUID
-+ * being scanned generally. Calculate from raw data whether MSR_TSX_CTRL
-+ * is available.
-+ */
-+ if ( unlikely(cpu_has_tsx_ctrl < 0) )
-+ {
-+ uint64_t caps = 0;
-+
-+ if ( boot_cpu_data.cpuid_level >= 7 &&
-+ (cpuid_count_edx(7, 0) & cpufeat_mask(X86_FEATURE_ARCH_CAPS)) )
-+ rdmsrl(MSR_ARCH_CAPABILITIES, caps);
-+
-+ cpu_has_tsx_ctrl = !!(caps & ARCH_CAPS_TSX_CTRL);
-+ }
-+
-+ if ( cpu_has_tsx_ctrl )
-+ {
-+ uint64_t val;
-+
-+ rdmsrl(MSR_TSX_CTRL, val);
-+
-+ val &= ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR);
-+ /* Check bottom bit only. Higher bits are various sentinels. */
-+ if ( !(opt_tsx & 1) )
-+ val |= TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR;
-+
-+ wrmsrl(MSR_TSX_CTRL, val);
-+ }
-+ else if ( opt_tsx >= 0 )
-+ printk_once(XENLOG_WARNING
-+ "MSR_TSX_CTRL not available - Ignoring tsx= setting\n");
-+}
-+
-+/*
-+ * Local variables:
-+ * mode: C
-+ * c-file-style: "BSD"
-+ * c-basic-offset: 4
-+ * tab-width: 4
-+ * indent-tabs-mode: nil
-+ * End:
-+ */
-diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
-index 89ae3e03f1..5ee7a37c12 100644
---- a/xen/include/asm-x86/msr-index.h
-+++ b/xen/include/asm-x86/msr-index.h
-@@ -55,6 +55,7 @@
- #define ARCH_CAPS_SSB_NO (_AC(1, ULL) << 4)
- #define ARCH_CAPS_MDS_NO (_AC(1, ULL) << 5)
- #define ARCH_CAPS_IF_PSCHANGE_MC_NO (_AC(1, ULL) << 6)
-+#define ARCH_CAPS_TSX_CTRL (_AC(1, ULL) << 7)
-
- #define MSR_FLUSH_CMD 0x0000010b
- #define FLUSH_CMD_L1D (_AC(1, ULL) << 0)
-@@ -62,6 +63,10 @@
- #define MSR_TSX_FORCE_ABORT 0x0000010f
- #define TSX_FORCE_ABORT_RTM (_AC(1, ULL) << 0)
-
-+#define MSR_TSX_CTRL 0x00000122
-+#define TSX_CTRL_RTM_DISABLE (_AC(1, ULL) << 0)
-+#define TSX_CTRL_CPUID_CLEAR (_AC(1, ULL) << 1)
-+
- /* Intel MSRs. Some also available on other CPUs */
- #define MSR_IA32_PERFCTR0 0x000000c1
- #define MSR_IA32_A_PERFCTR0 0x000004c1
-diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
-index 20d1ecb332..66224f23b9 100644
---- a/xen/include/asm-x86/processor.h
-+++ b/xen/include/asm-x86/processor.h
-@@ -258,6 +258,16 @@ static always_inline unsigned int cpuid_count_ebx(
- return ebx;
- }
-
-+static always_inline unsigned int cpuid_count_edx(
-+ unsigned int leaf, unsigned int subleaf)
-+{
-+ unsigned int edx, tmp;
-+
-+ cpuid_count(leaf, subleaf, &tmp, &tmp, &tmp, &edx);
-+
-+ return edx;
-+}
-+
- static always_inline void cpuid_count_leaf(uint32_t leaf, uint32_t subleaf,
- struct cpuid_leaf *data)
- {
-@@ -610,6 +620,9 @@ static inline uint8_t get_cpu_family(uint32_t raw, uint8_t *model,
- return fam;
- }
-
-+extern int8_t opt_tsx, cpu_has_tsx_ctrl;
-+void tsx_init(void);
-+
- #endif /* !__ASSEMBLY__ */
-
- #endif /* __ASM_X86_PROCESSOR_H */
-diff --git a/xen/include/xen/lib.h b/xen/include/xen/lib.h
-index 750f809968..be223a6950 100644
---- a/xen/include/xen/lib.h
-+++ b/xen/include/xen/lib.h
-@@ -116,6 +116,16 @@ extern int printk_ratelimit(void);
- #define gprintk(lvl, fmt, args...) \
- printk(XENLOG_GUEST lvl "%pv " fmt, current, ## args)
-
-+#define printk_once(fmt, args...) \
-+({ \
-+ static bool __read_mostly once_; \
-+ if ( unlikely(!once_) ) \
-+ { \
-+ once_ = true; \
-+ printk(fmt, ## args); \
-+ } \
-+})
-+
- #ifdef NDEBUG
-
- static inline void
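The printk_once() hunk above is a self-contained idiom: a function-local static flag makes a message fire at most once per expansion site. A userspace rendition under assumed names (Xen's version additionally tags the flag __read_mostly and wraps the test in unlikely()):

    #include <stdbool.h>
    #include <stdio.h>

    /* GCC statement expression, as in the Xen macro. */
    #define printf_once(fmt, ...)                   \
    ({                                              \
        static bool once_;                          \
        if ( !once_ )                               \
        {                                           \
            once_ = true;                           \
            printf(fmt, ##__VA_ARGS__);             \
        }                                           \
    })

    int main(void)
    {
        for ( int i = 0; i < 3; i++ )
            printf_once("fires once, i = %d\n", i);  /* prints i = 0 only */
        return 0;
    }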
diff --git a/main/xen/xsa305-4.11-2.patch b/main/xen/xsa305-4.11-2.patch
deleted file mode 100644
index 0b68ff1451..0000000000
--- a/main/xen/xsa305-4.11-2.patch
+++ /dev/null
@@ -1,192 +0,0 @@
-From: Andrew Cooper <andrew.cooper3@citrix.com>
-Subject: x86/spec-ctrl: Mitigate the TSX Asynchronous Abort sidechannel
-
-See patch documentation and comments.
-
-This is part of XSA-305 / CVE-2019-11135
-
-Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
-Reviewed-by: Jan Beulich <jbeulich@suse.com>
-
-diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
-index b86d26399a..31635a473a 100644
---- a/docs/misc/xen-command-line.markdown
-+++ b/docs/misc/xen-command-line.markdown
-@@ -1841,7 +1841,7 @@ extreme care.**
- An overall boolean value, `spec-ctrl=no`, can be specified to turn off all
- mitigations, including pieces of infrastructure used to virtualise certain
- mitigation features for guests. This also includes settings which `xpti`,
--`smt`, `pv-l1tf` control, unless the respective option(s) have been
-+`smt`, `pv-l1tf`, `tsx` control, unless the respective option(s) have been
- specified earlier on the command line.
-
- Alternatively, a slightly more restricted `spec-ctrl=no-xen` can be used to
-@@ -1952,7 +1952,7 @@ pages) must also be specified via the tbuf\_size parameter.
- = <bool>
-
- Applicability: x86
-- Default: true
-+ Default: false on parts vulnerable to TAA, true otherwise
-
- Controls for the use of Transactional Synchronization eXtensions.
-
-@@ -1962,6 +1962,19 @@ a control has been introduced which allows TSX to be turned off.
- On systems with the ability to turn TSX off, this boolean offers system wide
- control of whether TSX is enabled or disabled.
-
-+On parts vulnerable to CVE-2019-11135 / TSX Asynchronous Abort, the following
-+logic applies:
-+
-+ * An explicit `tsx=` choice is honoured, even if it is `true` and would
-+ result in a vulnerable system.
-+
-+ * When no explicit `tsx=` choice is given, parts vulnerable to TAA will be
-+ mitigated by disabling TSX, as this is the lowest overhead option.
-+
-+ * If the use of TSX is important, the more expensive TAA mitigations can be
-+ opted in to with `smt=0 spec-ctrl=md-clear`, at which point TSX will remain
-+ active by default.
-+
- ### ucode (x86)
- > `= [<integer> | scan]`
-
-diff --git a/xen/arch/x86/spec_ctrl.c b/xen/arch/x86/spec_ctrl.c
-index 2fe16b423d..ab196b156d 100644
---- a/xen/arch/x86/spec_ctrl.c
-+++ b/xen/arch/x86/spec_ctrl.c
-@@ -152,6 +152,9 @@ static int __init parse_spec_ctrl(const char *s)
- if ( opt_pv_l1tf_domu < 0 )
- opt_pv_l1tf_domu = 0;
-
-+ if ( opt_tsx == -1 )
-+ opt_tsx = -3;
-+
- disable_common:
- opt_rsb_pv = false;
- opt_rsb_hvm = false;
-@@ -362,7 +365,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
- printk("Speculative mitigation facilities:\n");
-
- /* Hardware features which pertain to speculative mitigations. */
-- printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s\n",
-+ printk(" Hardware features:%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
- (_7d0 & cpufeat_mask(X86_FEATURE_IBRSB)) ? " IBRS/IBPB" : "",
- (_7d0 & cpufeat_mask(X86_FEATURE_STIBP)) ? " STIBP" : "",
- (_7d0 & cpufeat_mask(X86_FEATURE_L1D_FLUSH)) ? " L1D_FLUSH" : "",
-@@ -374,7 +377,9 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
- (caps & ARCH_CAPS_RSBA) ? " RSBA" : "",
- (caps & ARCH_CAPS_SKIP_L1DFL) ? " SKIP_L1DFL": "",
- (caps & ARCH_CAPS_SSB_NO) ? " SSB_NO" : "",
-- (caps & ARCH_CAPS_MDS_NO) ? " MDS_NO" : "");
-+ (caps & ARCH_CAPS_MDS_NO) ? " MDS_NO" : "",
-+ (caps & ARCH_CAPS_TSX_CTRL) ? " TSX_CTRL" : "",
-+ (caps & ARCH_CAPS_TAA_NO) ? " TAA_NO" : "");
-
- /* Compiled-in support which pertains to mitigations. */
- if ( IS_ENABLED(CONFIG_INDIRECT_THUNK) || IS_ENABLED(CONFIG_SHADOW_PAGING) )
-@@ -388,7 +393,7 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
- "\n");
-
- /* Settings for Xen's protection, irrespective of guests. */
-- printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s, Other:%s%s%s\n",
-+ printk(" Xen settings: BTI-Thunk %s, SPEC_CTRL: %s%s%s, Other:%s%s%s\n",
- thunk == THUNK_NONE ? "N/A" :
- thunk == THUNK_RETPOLINE ? "RETPOLINE" :
- thunk == THUNK_LFENCE ? "LFENCE" :
-@@ -397,6 +402,8 @@ static void __init print_details(enum ind_thunk thunk, uint64_t caps)
- (default_xen_spec_ctrl & SPEC_CTRL_IBRS) ? "IBRS+" : "IBRS-",
- !boot_cpu_has(X86_FEATURE_SSBD) ? "" :
- (default_xen_spec_ctrl & SPEC_CTRL_SSBD) ? " SSBD+" : " SSBD-",
-+ !(caps & ARCH_CAPS_TSX_CTRL) ? "" :
-+ (opt_tsx & 1) ? " TSX+" : " TSX-",
- opt_ibpb ? " IBPB" : "",
- opt_l1d_flush ? " L1D_FLUSH" : "",
- opt_md_clear_pv || opt_md_clear_hvm ? " VERW" : "");
-@@ -911,6 +918,7 @@ void __init init_speculation_mitigations(void)
- {
- enum ind_thunk thunk = THUNK_DEFAULT;
- bool use_spec_ctrl = false, ibrs = false, hw_smt_enabled;
-+ bool cpu_has_bug_taa;
- uint64_t caps = 0;
-
- if ( boot_cpu_has(X86_FEATURE_ARCH_CAPS) )
-@@ -1140,6 +1148,53 @@ void __init init_speculation_mitigations(void)
- "enabled. Mitigations will not be fully effective. Please\n"
- "choose an explicit smt=<bool> setting. See XSA-297.\n");
-
-+ /*
-+ * Vulnerability to TAA is a little complicated to quantify.
-+ *
-+ * In the pipeline, it is just another way to get speculative access to
-+ * stale load port, store buffer or fill buffer data, and therefore can be
-+ * considered a superset of MDS (on TSX-capable parts). On parts which
-+ * predate MDS_NO, the existing VERW flushing will mitigate this
-+ * sidechannel as well.
-+ *
-+ * On parts which contain MDS_NO, the lack of VERW flushing means that an
-+ * attacker can still use TSX to target microarchitectural buffers to leak
-+ * secrets. Therefore, we consider TAA to be the set of TSX-capable parts
-+ * which have MDS_NO but lack TAA_NO.
-+ *
-+ * Note: cpu_has_rtm (== hle) could already be hidden by `tsx=0` on the
-+ * cmdline. MSR_TSX_CTRL will only appear on TSX-capable parts, so
-+ * we check both to spot TSX in a microcode/cmdline independent way.
-+ */
-+ cpu_has_bug_taa =
-+ (cpu_has_rtm || (caps & ARCH_CAPS_TSX_CTRL)) &&
-+ (caps & (ARCH_CAPS_MDS_NO | ARCH_CAPS_TAA_NO)) == ARCH_CAPS_MDS_NO;
-+
-+ /*
-+ * On TAA-affected hardware, disabling TSX is the preferred mitigation, vs
-+ * the MDS mitigation of disabling HT and using VERW flushing.
-+ *
-+ * On CPUs which advertise MDS_NO, VERW has no flushing side effect until
-+ * the TSX_CTRL microcode is loaded, despite the MD_CLEAR CPUID bit being
-+ * advertised, and there isn't a MD_CLEAR_2 flag to use...
-+ *
-+ * If we're on affected hardware, able to do something about it (which
-+ * implies that VERW now works), no explicit TSX choice and traditional
-+ * MDS mitigations (no-SMT, VERW) not obviously in use (someone might
-+ * plausibly value TSX higher than Hyperthreading...), disable TSX to
-+ * mitigate TAA.
-+ */
-+ if ( opt_tsx == -1 && cpu_has_bug_taa && (caps & ARCH_CAPS_TSX_CTRL) &&
-+ ((hw_smt_enabled && opt_smt) ||
-+ !boot_cpu_has(X86_FEATURE_SC_VERW_IDLE)) )
-+ {
-+ setup_clear_cpu_cap(X86_FEATURE_HLE);
-+ setup_clear_cpu_cap(X86_FEATURE_RTM);
-+
-+ opt_tsx = 0;
-+ tsx_init();
-+ }
-+
- print_details(thunk, caps);
-
- /*
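The affectedness test and the default-off decision in the hunk above can be restated as pure predicates. A simplified standalone sketch (the scaffolding and parameter names are assumptions; the bit values match the msr-index.h hunks):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define ARCH_CAPS_MDS_NO    (1ULL << 5)
    #define ARCH_CAPS_TSX_CTRL  (1ULL << 7)
    #define ARCH_CAPS_TAA_NO    (1ULL << 8)

    /* TAA-affected: TSX-capable (RTM enumerated, or MSR_TSX_CTRL present)
     * on a part advertising MDS_NO but not TAA_NO. */
    static bool taa_affected(bool cpu_has_rtm, uint64_t caps)
    {
        return (cpu_has_rtm || (caps & ARCH_CAPS_TSX_CTRL)) &&
               (caps & (ARCH_CAPS_MDS_NO | ARCH_CAPS_TAA_NO)) == ARCH_CAPS_MDS_NO;
    }

    /* Disable TSX by default only if: no explicit tsx= choice, the part
     * is affected, TSX can actually be turned off, and the MDS-style
     * mitigations (no SMT + VERW on idle) are not already in force. */
    static bool default_disable_tsx(int8_t opt_tsx, bool cpu_has_rtm,
                                    uint64_t caps, bool smt_active,
                                    bool verw_idle)
    {
        return opt_tsx == -1 && taa_affected(cpu_has_rtm, caps) &&
               (caps & ARCH_CAPS_TSX_CTRL) &&
               (smt_active || !verw_idle);
    }

    int main(void)
    {
        uint64_t caps = ARCH_CAPS_MDS_NO | ARCH_CAPS_TSX_CTRL;  /* affected part */

        printf("default disables TSX: %d\n",
               default_disable_tsx(-1, true, caps, true, false));
        return 0;
    }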
-diff --git a/xen/arch/x86/tsx.c b/xen/arch/x86/tsx.c
-index a8ec2ccc69..2d202a0d4e 100644
---- a/xen/arch/x86/tsx.c
-+++ b/xen/arch/x86/tsx.c
-@@ -5,7 +5,8 @@
- * Valid values:
- * 1 => Explicit tsx=1
- * 0 => Explicit tsx=0
-- * -1 => Default, implicit tsx=1
-+ * -1 => Default, implicit tsx=1, may change to 0 to mitigate TAA
-+ * -3 => Implicit tsx=1 (feed-through from spec-ctrl=0)
- *
- * This is arranged such that the bottom bit encodes whether TSX is actually
- * disabled, while identifying various explicit (>=0) and implicit (<0)
-diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
-index 5ee7a37c12..1761a01f1f 100644
---- a/xen/include/asm-x86/msr-index.h
-+++ b/xen/include/asm-x86/msr-index.h
-@@ -56,6 +56,7 @@
- #define ARCH_CAPS_MDS_NO (_AC(1, ULL) << 5)
- #define ARCH_CAPS_IF_PSCHANGE_MC_NO (_AC(1, ULL) << 6)
- #define ARCH_CAPS_TSX_CTRL (_AC(1, ULL) << 7)
-+#define ARCH_CAPS_TAA_NO (_AC(1, ULL) << 8)
-
- #define MSR_FLUSH_CMD 0x0000010b
- #define FLUSH_CMD_L1D (_AC(1, ULL) << 0)
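With ARCH_CAPS_TAA_NO in place, the diagnostic line can be decoded independently. A small sketch in the spirit of the print_details() hunk above (standalone and illustrative; the defines duplicate the msr-index.h values):

    #include <stdint.h>
    #include <stdio.h>

    #define ARCH_CAPS_MDS_NO    (1ULL << 5)
    #define ARCH_CAPS_TSX_CTRL  (1ULL << 7)
    #define ARCH_CAPS_TAA_NO    (1ULL << 8)

    /* Print the three MSR_ARCH_CAPABILITIES bits this patch set cares
     * about, one token per bit set, as print_details() does. */
    static void decode_arch_caps(uint64_t caps)
    {
        printf("Hardware features:%s%s%s\n",
               (caps & ARCH_CAPS_MDS_NO)   ? " MDS_NO"   : "",
               (caps & ARCH_CAPS_TSX_CTRL) ? " TSX_CTRL" : "",
               (caps & ARCH_CAPS_TAA_NO)   ? " TAA_NO"   : "");
    }

    int main(void)
    {
        /* A TAA-affected-looking part: MDS_NO and TSX_CTRL, no TAA_NO. */
        decode_arch_caps(ARCH_CAPS_MDS_NO | ARCH_CAPS_TSX_CTRL);
        return 0;
    }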