From: Alex Williamson
Subject: Re: [PATCH] vfio: Support P2P access in confidential VM
Date: Thu, 16 Jan 2025 12:47:40 -0500

On Thu, 16 Jan 2025 17:53:55 +0800
Wencheng Yang <east.moutain.yang@gmail.com> wrote:

> On confidential VM platforms, for example AMD SEV, P2P doesn't work.
> The underlying reason is that the IOMMU driver sets the encryption bit
> on IOMMU page table PTEs, which is reasonable if the PTE maps an IOVA
> to system memory. However, if the PTE maps an IOVA to a device's
> MMIO BAR space, setting the encryption bit on the PTE causes the IOMMU
> to translate the IOVA to an incorrect bus address rather than the MMIO
> BAR address.
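> 
> As a rough illustration only (the C-bit position and the BAR address
> below are made-up values; on real AMD parts the bit position is
> reported by CPUID 0x8000001F):
> 
>     /* Illustrative sketch, not QEMU or kernel code. */
>     #include <stdint.h>
>     #include <stdio.h>
> 
>     int main(void)
>     {
>         unsigned c_bit    = 47;            /* assumed C-bit position     */
>         uint64_t bar_addr = 0xf0000000ULL; /* hypothetical peer MMIO BAR */
> 
>         /* Address the IOMMU emits if the PTE carries the encryption bit: */
>         uint64_t bus_addr = bar_addr | (1ULL << c_bit);
> 
>         /* Prints 0x8000f0000000 vs 0xf0000000: the P2P access misses the BAR. */
>         printf("0x%llx vs 0x%llx\n",
>                (unsigned long long)bus_addr, (unsigned long long)bar_addr);
>         return 0;
>     }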
> 
> To fix the issue, the key point is to let the IOMMU driver know whether
> the target physical address is system memory or device MMIO.
> 
> VFIO allocates a virtual address and maps it to the device's MMIO BAR;
> the @ram_device member of MemoryRegion indicates that the memory
> region is MMIO. The patch passes this info down the VFIO DMA API so
> the IOMMU driver can do the correct thing.
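> 
> As a sketch, the new flag plumbing looks like this (names are taken
> from the hunks below; the kernel-side handling of VFIO_DMA_MAP_FLAG_MMIO
> is not part of this series):
> 
>     MemoryRegion.ram_device                        (system/memory.c)
>       -> MemoryRegionSection.ram_device / MRF_RAMDEV
>       -> VFIO_MRF_RAMDEV                           (hw/vfio/common.c)
>       -> VFIO_DMA_MAP_FLAG_MMIO in the type1 map ioctl (hw/vfio/container.c)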
> 
> Signed-off-by: Wencheng Yang <east.moutain.yang@gmail.com>
> ---
>  hw/vfio/common.c                      | 67 +++++++++++++++++----------
>  hw/vfio/container-base.c              |  4 +-
>  hw/vfio/container.c                   |  6 ++-
>  hw/vfio/iommufd.c                     |  4 +-
>  hw/virtio/vhost-vdpa.c                |  6 +--
>  include/exec/memory.h                 |  7 ++-
>  include/hw/vfio/vfio-common.h         |  4 ++
>  include/hw/vfio/vfio-container-base.h |  4 +-
>  linux-headers/linux/vfio.h            |  1 +
>  system/memory.c                       | 11 +++--
>  10 files changed, 74 insertions(+), 40 deletions(-)
> 
> diff --git a/hw/vfio/common.c b/hw/vfio/common.c
> index f7499a9b74..2660a42f9e 100644
> --- a/hw/vfio/common.c
> +++ b/hw/vfio/common.c
> @@ -247,31 +247,42 @@ static bool vfio_listener_skipped_section(MemoryRegionSection *section)
>  
>  /* Called with rcu_read_lock held.  */
>  static bool vfio_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
> -                               ram_addr_t *ram_addr, bool *read_only,
> +                               ram_addr_t *ram_addr, uint32_t *flag,
>                                 Error **errp)
>  {
>      bool ret, mr_has_discard_manager;
> +    uint32_t mr_flag = 0;
>  
> -    ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, read_only,
> +    ret = memory_get_xlat_addr(iotlb, vaddr, ram_addr, &mr_flag,
>                                 &mr_has_discard_manager, errp);
> -    if (ret && mr_has_discard_manager) {
> -        /*
> -         * Malicious VMs might trigger discarding of IOMMU-mapped memory. The
> -         * pages will remain pinned inside vfio until unmapped, resulting in a
> -         * higher memory consumption than expected. If memory would get
> -         * populated again later, there would be an inconsistency between pages
> -         * pinned by vfio and pages seen by QEMU. This is the case until
> -         * unmapped from the IOMMU (e.g., during device reset).
> -         *
> -         * With malicious guests, we really only care about pinning more memory
> -         * than expected. RLIMIT_MEMLOCK set for the user/process can never be
> -         * exceeded and can be used to mitigate this problem.
> -         */
> -        warn_report_once("Using vfio with vIOMMUs and coordinated discarding of"
> -                         " RAM (e.g., virtio-mem) works, however, malicious"
> -                         " guests can trigger pinning of more memory than"
> -                         " intended via an IOMMU. It's possible to mitigate "
> -                         " by setting/adjusting RLIMIT_MEMLOCK.");
> +    if (ret) {
> +        if (flag) {
> +            if (mr_flag & MRF_READONLY)
> +                *flag |= VFIO_MRF_READONLY;
> +
> +            if (mr_flag & MRF_RAMDEV)
> +                *flag |= VFIO_MRF_RAMDEV;
> +        }
> +
> +        if (mr_has_discard_manager) {
> +            /*
> +             * Malicious VMs might trigger discarding of IOMMU-mapped memory. The
> +             * pages will remain pinned inside vfio until unmapped, resulting in a
> +             * higher memory consumption than expected. If memory would get
> +             * populated again later, there would be an inconsistency between pages
> +             * pinned by vfio and pages seen by QEMU. This is the case until
> +             * unmapped from the IOMMU (e.g., during device reset).
> +             *
> +             * With malicious guests, we really only care about pinning more memory
> +             * than expected. RLIMIT_MEMLOCK set for the user/process can never be
> +             * exceeded and can be used to mitigate this problem.
> +             */
> +            warn_report_once("Using vfio with vIOMMUs and coordinated discarding of"
> +                             " RAM (e.g., virtio-mem) works, however, malicious"
> +                             " guests can trigger pinning of more memory than"
> +                             " intended via an IOMMU. It's possible to mitigate "
> +                             " by setting/adjusting RLIMIT_MEMLOCK.");
> +        }
>      }
>      return ret;
>  }
> @@ -298,9 +309,9 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
>      rcu_read_lock();
>  
>      if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
> -        bool read_only;
> +        uint32_t flag = 0;
>  
> -        if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, &local_err)) {
> +        if (!vfio_get_xlat_addr(iotlb, &vaddr, NULL, &flag, &local_err)) {
>              error_report_err(local_err);
>              goto out;
>          }
> @@ -313,7 +324,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
>           */
>          ret = vfio_container_dma_map(bcontainer, iova,
>                                       iotlb->addr_mask + 1, vaddr,
> -                                     read_only);
> +                                     flag);
>          if (ret) {
>              error_report("vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", "
>                           "0x%"HWADDR_PRIx", %p) = %d (%s)",
> @@ -363,6 +374,7 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
>                         int128_get64(section->size);
>      hwaddr start, next, iova;
>      void *vaddr;
> +    uint32_t flag = 0;
>      int ret;
>  
>      /*
> @@ -377,8 +389,10 @@ static int vfio_ram_discard_notify_populate(RamDiscardListener *rdl,
>                 section->offset_within_address_space;
>          vaddr = memory_region_get_ram_ptr(section->mr) + start;
>  
> +        flag |= section->readonly? VFIO_MRF_READONLY: 0;
> +        flag |= section->ram_device? VFIO_MRF_RAMDEV: 0;
>          ret = vfio_container_dma_map(bcontainer, iova, next - start,
> -                                     vaddr, section->readonly);
> +                                     vaddr, flag);
>          if (ret) {
>              /* Rollback */
>              vfio_ram_discard_notify_discard(rdl, section);
> @@ -563,6 +577,7 @@ static void vfio_listener_region_add(MemoryListener *listener,
>      hwaddr iova, end;
>      Int128 llend, llsize;
>      void *vaddr;
> +    uint32_t flag = 0;
>      int ret;
>      Error *err = NULL;
>  
> @@ -661,8 +676,10 @@ static void vfio_listener_region_add(MemoryListener *listener,
>          }
>      }
>  
> +    flag |= section->readonly? VFIO_MRF_READONLY: 0;
> +    flag |= section->ram_device? VFIO_MRF_RAMDEV: 0;
>      ret = vfio_container_dma_map(bcontainer, iova, int128_get64(llsize),
> -                                 vaddr, section->readonly);
> +                                 vaddr, flag);
>      if (ret) {
>          error_setg(&err, "vfio_container_dma_map(%p, 0x%"HWADDR_PRIx", "
>                     "0x%"HWADDR_PRIx", %p) = %d (%s)",
> diff --git a/hw/vfio/container-base.c b/hw/vfio/container-base.c
> index 749a3fd29d..7cee2ac562 100644
> --- a/hw/vfio/container-base.c
> +++ b/hw/vfio/container-base.c
> @@ -17,12 +17,12 @@
>  
>  int vfio_container_dma_map(VFIOContainerBase *bcontainer,
>                             hwaddr iova, ram_addr_t size,
> -                           void *vaddr, bool readonly)
> +                           void *vaddr, uint32_t flag)
>  {
>      VFIOIOMMUClass *vioc = VFIO_IOMMU_GET_CLASS(bcontainer);
>  
>      g_assert(vioc->dma_map);
> -    return vioc->dma_map(bcontainer, iova, size, vaddr, readonly);
> +    return vioc->dma_map(bcontainer, iova, size, vaddr, flag);
>  }
>  
>  int vfio_container_dma_unmap(VFIOContainerBase *bcontainer,
> diff --git a/hw/vfio/container.c b/hw/vfio/container.c
> index 4ebb526808..90c32cd16d 100644
> --- a/hw/vfio/container.c
> +++ b/hw/vfio/container.c
> @@ -176,7 +176,7 @@ static int vfio_legacy_dma_unmap(const VFIOContainerBase *bcontainer,
>  }
>  
>  static int vfio_legacy_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova,
> -                               ram_addr_t size, void *vaddr, bool readonly)
> +                               ram_addr_t size, void *vaddr, uint32_t flag)
>  {
>      const VFIOContainer *container = container_of(bcontainer, VFIOContainer,
>                                                    bcontainer);
> @@ -188,9 +188,11 @@ static int vfio_legacy_dma_map(const VFIOContainerBase *bcontainer, hwaddr iova,
>          .size = size,
>      };
>  
> -    if (!readonly) {
> +    if (!(flag & VFIO_MRF_READONLY)) {
>          map.flags |= VFIO_DMA_MAP_FLAG_WRITE;
>      }
> +    if (flag & VFIO_MRF_RAMDEV)
> +        map.flags |= VFIO_DMA_MAP_FLAG_MMIO;
>  
>      /*
>       * Try the mapping, if it fails with EBUSY, unmap the region and try
> diff --git a/hw/vfio/iommufd.c b/hw/vfio/iommufd.c
> index 3490a8f1eb..c773b45b5d 100644
> --- a/hw/vfio/iommufd.c
> +++ b/hw/vfio/iommufd.c
> @@ -28,14 +28,14 @@
>  #include "exec/ram_addr.h"
>  
>  static int iommufd_cdev_map(const VFIOContainerBase *bcontainer, hwaddr iova,
> -                            ram_addr_t size, void *vaddr, bool readonly)
> +                            ram_addr_t size, void *vaddr, uint32_t flag)
>  {
>      const VFIOIOMMUFDContainer *container =
>          container_of(bcontainer, VFIOIOMMUFDContainer, bcontainer);
>  
>      return iommufd_backend_map_dma(container->be,
>                                     container->ioas_id,
> -                                   iova, size, vaddr, readonly);
> +                                   iova, size, vaddr, flag & VFIO_MRF_READONLY);
>  }
>  
>  static int iommufd_cdev_unmap(const VFIOContainerBase *bcontainer,
> diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c
> index 3cdaa12ed5..dea733ef8a 100644
> --- a/hw/virtio/vhost-vdpa.c
> +++ b/hw/virtio/vhost-vdpa.c
> @@ -226,15 +226,15 @@ static void vhost_vdpa_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
>      }
>  
>      if ((iotlb->perm & IOMMU_RW) != IOMMU_NONE) {
> -        bool read_only;
> +        uint32_t flag;
>  
> -        if (!memory_get_xlat_addr(iotlb, &vaddr, NULL, &read_only, NULL,
> +        if (!memory_get_xlat_addr(iotlb, &vaddr, NULL, &flag, NULL,
>                                    &local_err)) {
>              error_report_err(local_err);
>              return;
>          }
>          ret = vhost_vdpa_dma_map(s, VHOST_VDPA_GUEST_PA_ASID, iova,
> -                                 iotlb->addr_mask + 1, vaddr, read_only);
> +                                 iotlb->addr_mask + 1, vaddr, flag & MRF_READONLY);
>          if (ret) {
>              error_report("vhost_vdpa_dma_map(%p, 0x%" HWADDR_PRIx ", "
>                           "0x%" HWADDR_PRIx ", %p) = %d (%m)",
> diff --git a/include/exec/memory.h b/include/exec/memory.h
> index 9458e2801d..24405af0be 100644
> --- a/include/exec/memory.h
> +++ b/include/exec/memory.h
> @@ -104,10 +104,15 @@ struct MemoryRegionSection {
>      hwaddr offset_within_region;
>      hwaddr offset_within_address_space;
>      bool readonly;
> +    bool ram_device;
>      bool nonvolatile;
>      bool unmergeable;
>  };
>  
> +/* memory region flag */
> +#define MRF_READONLY 0x1
> +#define MRF_RAMDEV   0x2
> +
>  typedef struct IOMMUTLBEntry IOMMUTLBEntry;
>  
>  /* See address_space_translate: bit 0 is read, bit 1 is write.  */
> @@ -742,7 +747,7 @@ void ram_discard_manager_unregister_listener(RamDiscardManager *rdm,
>   * Return: true on success, else false setting @errp with error.
>   */
>  bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
> -                          ram_addr_t *ram_addr, bool *read_only,
> +                          ram_addr_t *ram_addr, uint32_t *flag,
>                            bool *mr_has_discard_manager, Error **errp);
>  
>  typedef struct CoalescedMemoryRange CoalescedMemoryRange;
> diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
> index 0c60be5b15..48018dc751 100644
> --- a/include/hw/vfio/vfio-common.h
> +++ b/include/hw/vfio/vfio-common.h
> @@ -43,6 +43,10 @@ enum {
>      VFIO_DEVICE_TYPE_AP = 3,
>  };
>  
> +/* vfio memory region flag */
> +#define VFIO_MRF_READONLY 0x1
> +#define VFIO_MRF_RAMDEV   0x2
> +
>  typedef struct VFIOMmap {
>      MemoryRegion mem;
>      void *mmap;
> diff --git a/include/hw/vfio/vfio-container-base.h b/include/hw/vfio/vfio-container-base.h
> index 4cff9943ab..bb473e7201 100644
> --- a/include/hw/vfio/vfio-container-base.h
> +++ b/include/hw/vfio/vfio-container-base.h
> @@ -73,7 +73,7 @@ typedef struct VFIORamDiscardListener {
>  
>  int vfio_container_dma_map(VFIOContainerBase *bcontainer,
>                             hwaddr iova, ram_addr_t size,
> -                           void *vaddr, bool readonly);
> +                           void *vaddr, uint32_t flag);
>  int vfio_container_dma_unmap(VFIOContainerBase *bcontainer,
>                               hwaddr iova, ram_addr_t size,
>                               IOMMUTLBEntry *iotlb);
> @@ -113,7 +113,7 @@ struct VFIOIOMMUClass {
>      bool (*setup)(VFIOContainerBase *bcontainer, Error **errp);
>      int (*dma_map)(const VFIOContainerBase *bcontainer,
>                     hwaddr iova, ram_addr_t size,
> -                   void *vaddr, bool readonly);
> +                   void *vaddr, uint32_t flag);
>      int (*dma_unmap)(const VFIOContainerBase *bcontainer,
>                       hwaddr iova, ram_addr_t size,
>                       IOMMUTLBEntry *iotlb);
> diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
> index 1b5e254d6a..4a32e70c33 100644
> --- a/linux-headers/linux/vfio.h
> +++ b/linux-headers/linux/vfio.h
> @@ -1560,6 +1560,7 @@ struct vfio_iommu_type1_dma_map {
>  #define VFIO_DMA_MAP_FLAG_READ (1 << 0)              /* readable from device */
>  #define VFIO_DMA_MAP_FLAG_WRITE (1 << 1)     /* writable from device */
>  #define VFIO_DMA_MAP_FLAG_VADDR (1 << 2)
> +#define VFIO_DMA_MAP_FLAG_MMIO (1 << 3)

Where's the kernel patch that implements the MMIO map flag?  That needs
to come first.

I also don't understand why we're creating multiple read-only and
ramdev flags to distill back into vfio mapping flags.  Thanks,

Alex

>       __u64   vaddr;                          /* Process virtual address */
>       __u64   iova;                           /* IO virtual address */
>       __u64   size;                           /* Size of mapping (bytes) */
> diff --git a/system/memory.c b/system/memory.c
> index b17b5538ff..71c54fc045 100644
> --- a/system/memory.c
> +++ b/system/memory.c
> @@ -223,6 +223,7 @@ struct FlatRange {
>      uint8_t dirty_log_mask;
>      bool romd_mode;
>      bool readonly;
> +    bool ram_device;
>      bool nonvolatile;
>      bool unmergeable;
>  };
> @@ -240,6 +241,7 @@ section_from_flat_range(FlatRange *fr, FlatView *fv)
>          .size = fr->addr.size,
>          .offset_within_address_space = int128_get64(fr->addr.start),
>          .readonly = fr->readonly,
> +        .ram_device = fr->ram_device,
>          .nonvolatile = fr->nonvolatile,
>          .unmergeable = fr->unmergeable,
>      };
> @@ -252,6 +254,7 @@ static bool flatrange_equal(FlatRange *a, FlatRange *b)
>          && a->offset_in_region == b->offset_in_region
>          && a->romd_mode == b->romd_mode
>          && a->readonly == b->readonly
> +        && a->ram_device == b->ram_device
>          && a->nonvolatile == b->nonvolatile
>          && a->unmergeable == b->unmergeable;
>  }
> @@ -657,6 +660,7 @@ static void render_memory_region(FlatView *view,
>      fr.dirty_log_mask = memory_region_get_dirty_log_mask(mr);
>      fr.romd_mode = mr->romd_mode;
>      fr.readonly = readonly;
> +    fr.ram_device = mr->ram_device;
>      fr.nonvolatile = nonvolatile;
>      fr.unmergeable = unmergeable;
>  
> @@ -2184,7 +2188,7 @@ void ram_discard_manager_unregister_listener(RamDiscardManager *rdm,
>  
>  /* Called with rcu_read_lock held.  */
>  bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
> -                          ram_addr_t *ram_addr, bool *read_only,
> +                          ram_addr_t *ram_addr, uint32_t *flag,
>                            bool *mr_has_discard_manager, Error **errp)
>  {
>      MemoryRegion *mr;
> @@ -2246,8 +2250,9 @@ bool memory_get_xlat_addr(IOMMUTLBEntry *iotlb, void **vaddr,
>          *ram_addr = memory_region_get_ram_addr(mr) + xlat;
>      }
>  
> -    if (read_only) {
> -        *read_only = !writable || mr->readonly;
> +    if (flag) {
> +        *flag |= (!writable || mr->readonly)? MRF_READONLY: 0;
> +        *flag |= mr->ram_device? MRF_RAMDEV: 0;
>      }
>  
>      return true;



