aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn Levon <john.levon@nutanix.com>2022-05-30 09:41:32 +0100
committerGitHub <noreply@github.com>2022-05-30 09:41:32 +0100
commite036ac145acea1a5aa77879e978ac2fff909a657 (patch)
tree1f0837b4c79feb97aa642d4e505e3d64012896d7
parent79e83e482d4eb0b7a07cfa207506d33edf05d04b (diff)
downloadlibvfio-user-e036ac145acea1a5aa77879e978ac2fff909a657.zip
libvfio-user-e036ac145acea1a5aa77879e978ac2fff909a657.tar.gz
libvfio-user-e036ac145acea1a5aa77879e978ac2fff909a657.tar.bz2
allow concurrent dirty bitmap get (#677)
Use atomic operations to allow concurrent bitmap updates with VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP operations. Dirtying clients can race against each other, so we must use atomic or when marking dirty: we do this byte-by-byte. When reading the dirty bitmap, we must be careful to not race and lose any set bits within the same byte. If we miss an update, we'll catch it the next time around, presuming that before the final pass we'll have quiesced all I/O. Signed-off-by: John Levon <john.levon@nutanix.com> Reviewed-by: Raphael Norwitz <raphael.norwitz@nutanix.com> Reviewed-by: Thanos Makatos <thanos.makatos@nutanix.com>
-rw-r--r--lib/dma.c34
-rw-r--r--lib/dma.h57
-rw-r--r--lib/libvfio-user.c11
-rw-r--r--test/py/test_dirty_pages.py104
4 files changed, 161 insertions, 45 deletions
diff --git a/lib/dma.c b/lib/dma.c
index 5ca897f..ac3ddfe 100644
--- a/lib/dma.c
+++ b/lib/dma.c
@@ -31,7 +31,6 @@
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
-#include <stdio.h>
#include <sys/param.h>
#include <stddef.h>
@@ -281,7 +280,8 @@ dirty_page_logging_start_on_region(dma_memory_region_t *region, size_t pgsize)
if (size < 0) {
return size;
}
- region->dirty_bitmap = calloc(size, sizeof(char));
+
+ region->dirty_bitmap = calloc(size, 1);
if (region->dirty_bitmap == NULL) {
return ERROR_INT(errno);
}
@@ -553,10 +553,11 @@ dma_controller_dirty_page_get(dma_controller_t *dma, vfu_dma_addr_t addr,
uint64_t len, size_t pgsize, size_t size,
char *bitmap)
{
- int ret;
+ dma_memory_region_t *region;
ssize_t bitmap_size;
dma_sg_t sg;
- dma_memory_region_t *region;
+ size_t i;
+ int ret;
assert(dma != NULL);
assert(bitmap != NULL);
@@ -599,11 +600,32 @@ dma_controller_dirty_page_get(dma_controller_t *dma, vfu_dma_addr_t addr,
return ERROR_INT(EINVAL);
}
- memcpy(bitmap, region->dirty_bitmap, size);
+ for (i = 0; i < (size_t)bitmap_size; i++) {
+ uint8_t val = region->dirty_bitmap[i];
+ uint8_t *outp = (uint8_t *)&bitmap[i];
+
+ /*
+ * If no bits are dirty, avoid the atomic exchange. This is obviously
+ * racy, but it's OK: if we miss a dirty bit being set, we'll catch it
+ * the next time around.
+ *
+ * Otherwise, atomically exchange the dirty bits with zero: as we use
+ * atomic or in _dma_mark_dirty(), this cannot lose set bits - we might
+ * miss a bit being set after, but again, we'll catch that next time
+ * around.
+ */
+ if (val == 0) {
+ *outp = 0;
+ } else {
+ uint8_t zero = 0;
+ __atomic_exchange(&region->dirty_bitmap[i], &zero,
+ outp, __ATOMIC_SEQ_CST);
+ }
+ }
+
#ifdef DEBUG
log_dirty_bitmap(dma->vfu_ctx, region, bitmap, size);
#endif
- memset(region->dirty_bitmap, 0, size);
return 0;
}
diff --git a/lib/dma.h b/lib/dma.h
index 3fdbd65..089f973 100644
--- a/lib/dma.h
+++ b/lib/dma.h
@@ -58,6 +58,7 @@
* effectively a no-op.
*/
+#include <stdio.h>
#ifdef DMA_MAP_PROTECTED
#undef DMA_MAP_FAST
#define DMA_MAP_FAST_IMPL 0
@@ -95,7 +96,7 @@ typedef struct {
vfu_dma_info_t info;
int fd; // File descriptor to mmap
off_t offset; // File offset
- char *dirty_bitmap; // Dirty page bitmap
+ uint8_t *dirty_bitmap; // Dirty page bitmap
} dma_memory_region_t;
typedef struct dma_controller {
@@ -140,22 +141,64 @@ _dma_addr_sg_split(const dma_controller_t *dma,
vfu_dma_addr_t dma_addr, uint64_t len,
dma_sg_t *sg, int max_nr_sgs, int prot);
-static void
+/* Convert a start address and length to its containing page numbers. */
+static inline void
+range_to_pages(size_t start, size_t len, size_t pgsize,
+ size_t *pgstart, size_t *pgend)
+{
+ *pgstart = start / pgsize;
+ *pgend = ROUND_UP(start + len, pgsize) / pgsize;
+}
+
+/* Given a bit position, return the containing byte. */
+static inline size_t
+bit_to_u8(size_t val)
+{
+ return val / (CHAR_BIT);
+}
+
+/* Return a value modulo the bitsize of a uint8_t. */
+static inline size_t
+bit_to_u8off(size_t val)
+{
+ return val % (CHAR_BIT);
+}
+
+static inline void
_dma_mark_dirty(const dma_controller_t *dma, const dma_memory_region_t *region,
dma_sg_t *sg)
{
- size_t i, start, end;
+ size_t index;
+ size_t end;
+ size_t pgstart;
+ size_t pgend;
+ size_t i;
assert(dma != NULL);
assert(region != NULL);
assert(sg != NULL);
assert(region->dirty_bitmap != NULL);
- start = sg->offset / dma->dirty_pgsize;
- end = start + (sg->length / dma->dirty_pgsize) + (sg->length % dma->dirty_pgsize != 0) - 1;
+ range_to_pages(sg->offset, sg->length, dma->dirty_pgsize,
+ &pgstart, &pgend);
+
+ index = bit_to_u8(pgstart);
+ end = bit_to_u8(pgend) + !!(bit_to_u8off(pgend));
+
+ for (i = index; i < end; i++) {
+ uint8_t bm = ~0;
+
+ /* Mask off any pages in the first u8 that aren't in the range. */
+ if (i == index && bit_to_u8off(pgstart) != 0) {
+ bm &= ~((1 << bit_to_u8off(pgstart)) - 1);
+ }
+
+ /* Mask off any pages in the last u8 that aren't in the range. */
+ if (i == end - 1 && bit_to_u8off(pgend) != 0) {
+ bm &= ((1 << bit_to_u8off(pgend)) - 1);
+ }
- for (i = start; i <= end; i++) {
- region->dirty_bitmap[i / CHAR_BIT] |= 1 << (i % CHAR_BIT);
+ __atomic_or_fetch(&region->dirty_bitmap[i], bm, __ATOMIC_SEQ_CST);
}
}
diff --git a/lib/libvfio-user.c b/lib/libvfio-user.c
index 90c4b39..566ece0 100644
--- a/lib/libvfio-user.c
+++ b/lib/libvfio-user.c
@@ -1310,8 +1310,15 @@ command_needs_quiesce(vfu_ctx_t *vfu_ctx, const vfu_msg_t *msg)
case VFIO_USER_DEVICE_RESET:
return true;
- case VFIO_USER_DIRTY_PAGES:
- return true;
+ case VFIO_USER_DIRTY_PAGES: {
+ struct vfio_user_dirty_pages *dirty_pages = msg->in.iov.iov_base;
+
+ if (msg->in.iov.iov_len < sizeof(*dirty_pages)) {
+ return false;
+ }
+
+ return !(dirty_pages->flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP);
+ }
case VFIO_USER_REGION_WRITE:
if (msg->in.iov.iov_len < sizeof(*reg)) {
diff --git a/test/py/test_dirty_pages.py b/test/py/test_dirty_pages.py
index 6cf87fb..8b4e3dc 100644
--- a/test/py/test_dirty_pages.py
+++ b/test/py/test_dirty_pages.py
@@ -89,13 +89,13 @@ def test_dirty_pages_setup():
payload = vfio_user_dma_map(argsz=len(vfio_user_dma_map()),
flags=(VFIO_USER_F_DMA_REGION_READ | VFIO_USER_F_DMA_REGION_WRITE),
- offset=0, addr=0x10000, size=0x10000)
+ offset=0, addr=0x10000, size=0x20000)
msg(ctx, sock, VFIO_USER_DMA_MAP, payload, fds=[f.fileno()])
payload = vfio_user_dma_map(argsz=len(vfio_user_dma_map()),
flags=(VFIO_USER_F_DMA_REGION_READ | VFIO_USER_F_DMA_REGION_WRITE),
- offset=0, addr=0x20000, size=0x10000)
+ offset=0, addr=0x40000, size=0x10000)
msg(ctx, sock, VFIO_USER_DMA_MAP, payload)
@@ -247,7 +247,7 @@ def test_get_dirty_page_bitmap_unmapped():
dirty_pages = vfio_user_dirty_pages(argsz=argsz,
flags=VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP)
bitmap = vfio_user_bitmap(pgsize=0x1000, size=8)
- br = vfio_user_bitmap_range(iova=0x20000, size=0x10000, bitmap=bitmap)
+ br = vfio_user_bitmap_range(iova=0x40000, size=0x10000, bitmap=bitmap)
payload = bytes(dirty_pages) + bytes(br)
@@ -283,13 +283,7 @@ def test_dirty_pages_get_unmodified():
assert br.bitmap.size == 8
-def get_dirty_page_bitmap_result(resp):
- _, resp = vfio_user_dirty_pages.pop_from_buffer(resp)
- _, resp = vfio_user_bitmap_range.pop_from_buffer(resp)
- return struct.unpack("Q", resp)[0]
-
-
-def send_dirty_page_bitmap(busy=False):
+def get_dirty_page_bitmap():
argsz = len(vfio_user_dirty_pages()) + len(vfio_user_bitmap_range()) + 8
dirty_pages = vfio_user_dirty_pages(argsz=argsz,
@@ -299,18 +293,33 @@ def send_dirty_page_bitmap(busy=False):
payload = bytes(dirty_pages) + bytes(br)
- return msg(ctx, sock, VFIO_USER_DIRTY_PAGES, payload, busy=busy)
+ result = msg(ctx, sock, VFIO_USER_DIRTY_PAGES, payload)
+
+ _, result = vfio_user_dirty_pages.pop_from_buffer(result)
+ _, result = vfio_user_bitmap_range.pop_from_buffer(result)
+ assert(len(result) == 8)
-def get_dirty_page_bitmap():
- result = send_dirty_page_bitmap()
- return get_dirty_page_bitmap_result(result)
+ return struct.unpack("Q", result)[0]
sg3 = None
iovec3 = None
+def write_to_addr(ctx, addr, size, get_bitmap=True):
+ """Simulate a write to the given address and size."""
+ ret, sg = vfu_addr_to_sgl(ctx, dma_addr=addr, length=size)
+ assert ret == 1
+ iovec = iovec_t()
+ ret = vfu_sgl_get(ctx, sg, iovec)
+ assert ret == 0
+ vfu_sgl_put(ctx, sg, iovec)
+ if get_bitmap:
+ return get_dirty_page_bitmap()
+ return None
+
+
def test_dirty_pages_get_modified():
ret, sg1 = vfu_addr_to_sgl(ctx, dma_addr=0x10000, length=0x1000)
assert ret == 1
@@ -326,13 +335,15 @@ def test_dirty_pages_get_modified():
ret = vfu_sgl_get(ctx, sg2, iovec2)
assert ret == 0
+ # simple single bitmap entry map
ret, sg3 = vfu_addr_to_sgl(ctx, dma_addr=0x12000, length=0x1000)
assert ret == 1
iovec3 = iovec_t()
ret = vfu_sgl_get(ctx, sg3, iovec3)
assert ret == 0
- ret, sg4 = vfu_addr_to_sgl(ctx, dma_addr=0x14000, length=0x4000)
+ # write that spans bytes in bitmap
+ ret, sg4 = vfu_addr_to_sgl(ctx, dma_addr=0x16000, length=0x4000)
assert ret == 1
iovec4 = iovec_t()
ret = vfu_sgl_get(ctx, sg4, iovec4)
@@ -340,23 +351,65 @@ def test_dirty_pages_get_modified():
# not put yet, dirty bitmap should be zero
bitmap = get_dirty_page_bitmap()
- assert bitmap == 0b00000000
+ assert bitmap == 0b0000000000000000
# put SGLs, dirty bitmap should be updated
vfu_sgl_put(ctx, sg1, iovec1)
vfu_sgl_put(ctx, sg4, iovec4)
bitmap = get_dirty_page_bitmap()
- assert bitmap == 0b11110001
+ assert bitmap == 0b0000001111000001
# after another two puts, should just be one dirty page
vfu_sgl_put(ctx, sg2, iovec2)
vfu_sgl_put(ctx, sg3, iovec3)
bitmap = get_dirty_page_bitmap()
- assert bitmap == 0b00000100
+ assert bitmap == 0b0000000000000100
# and should now be clear
bitmap = get_dirty_page_bitmap()
- assert bitmap == 0b00000000
+ assert bitmap == 0b0000000000000000
+
+ #
+ # check various edge cases of bitmap values.
+ #
+
+ # very first bit
+ bitmap = write_to_addr(ctx, 0x10000, 0x1000)
+ assert bitmap == 0b0000000000000001
+
+ # top bit of first byte
+ bitmap = write_to_addr(ctx, 0x17000, 0x1000)
+ assert bitmap == 0b0000000010000000
+
+ # all bits except top one of first byte
+ bitmap = write_to_addr(ctx, 0x10000, 0x7000)
+ assert bitmap == 0b0000000001111111
+
+ # all bits of first byte
+ bitmap = write_to_addr(ctx, 0x10000, 0x8000)
+ assert bitmap == 0b0000000011111111
+
+ # all bits of first byte plus bottom bit of next
+ bitmap = write_to_addr(ctx, 0x10000, 0x9000)
+ assert bitmap == 0b0000000111111111
+
+ # straddle top/bottom bit
+ bitmap = write_to_addr(ctx, 0x17000, 0x2000)
+ assert bitmap == 0b0000000110000000
+
+ # top bit of second byte
+ bitmap = write_to_addr(ctx, 0x1f000, 0x1000)
+ assert bitmap == 0b1000000000000000
+
+ # top bit of third byte
+ bitmap = write_to_addr(ctx, 0x27000, 0x1000)
+ assert bitmap == 0b100000000000000000000000
+
+ # bits in third and first byte
+ write_to_addr(ctx, 0x26000, 0x1000, get_bitmap=False)
+ write_to_addr(ctx, 0x12000, 0x2000, get_bitmap=False)
+ bitmap = get_dirty_page_bitmap()
+ assert bitmap == 0b010000000000000000001100
def test_dirty_pages_stop():
@@ -399,17 +452,8 @@ def test_dirty_pages_bitmap_with_quiesce():
assert ret == 0
vfu_sgl_put(ctx, sg1, iovec1)
- send_dirty_page_bitmap(busy=True)
-
- ret = vfu_device_quiesced(ctx, 0)
- assert ret == 0
-
- # now should be able to get the reply
- result = get_reply(sock, expect=0)
- bitmap = get_dirty_page_bitmap_result(result)
- assert bitmap == 0b00000001
-
- quiesce_errno = 0
+ bitmap = get_dirty_page_bitmap()
+ assert bitmap == 0b0000000000000001
def test_dirty_pages_stop_with_quiesce():