From ea8f942fe46dd10e0946f02ab9d698fb41e958f7 Mon Sep 17 00:00:00 2001
From: Markus Armbruster <armbru@redhat.com>
Date: Wed, 20 Jul 2011 18:23:35 +0200
Subject: blockdev: Make eject fail for non-removable drives even with -f

Ejecting hard disk platters can only end in tears.

If you need to revoke access to an image, use drive_del, not eject -f.

Signed-off-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 blockdev.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index 0b8d3a4..a25367a 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -646,16 +646,13 @@ out:
 
 static int eject_device(Monitor *mon, BlockDriverState *bs, int force)
 {
-    if (!force) {
-        if (!bdrv_is_removable(bs)) {
-            qerror_report(QERR_DEVICE_NOT_REMOVABLE,
-                           bdrv_get_device_name(bs));
-            return -1;
-        }
-        if (bdrv_is_locked(bs)) {
-            qerror_report(QERR_DEVICE_LOCKED, bdrv_get_device_name(bs));
-            return -1;
-        }
+    if (!bdrv_is_removable(bs)) {
+        qerror_report(QERR_DEVICE_NOT_REMOVABLE, bdrv_get_device_name(bs));
+        return -1;
+    }
+    if (!force && bdrv_is_locked(bs)) {
+        qerror_report(QERR_DEVICE_LOCKED, bdrv_get_device_name(bs));
+        return -1;
     }
     bdrv_close(bs);
     return 0;
-- 
cgit v1.1


From a19712b0dbe43016fb17ec48bfff2f360225fe97 Mon Sep 17 00:00:00 2001
From: Markus Armbruster <armbru@redhat.com>
Date: Wed, 20 Jul 2011 18:23:36 +0200
Subject: block: Reset device model callbacks on detach

BlockDriverState members change_cb and change_opaque are initially
null.  The device model may set them, with bdrv_set_change_cb().  If
the device model gets detached (hot unplug), they're left dangling.
Only safe because device hot unplug automatically destroys the
BlockDriverState.  But that's a questionable feature, best not to rely
on it.

Signed-off-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/block.c b/block.c
index 9549b9e..81a8257 100644
--- a/block.c
+++ b/block.c
@@ -730,6 +730,8 @@ void bdrv_detach(BlockDriverState *bs, DeviceState *qdev)
 {
     assert(bs->peer == qdev);
     bs->peer = NULL;
+    bs->change_cb = NULL;
+    bs->change_opaque = NULL;
 }
 
 DeviceState *bdrv_get_attached(BlockDriverState *bs)
-- 
cgit v1.1


From 02266d547a6c7b10e1ac1574ec69b92f4e28f817 Mon Sep 17 00:00:00 2001
From: Markus Armbruster <armbru@redhat.com>
Date: Wed, 20 Jul 2011 18:23:40 +0200
Subject: block/raw-win32: Drop disabled code for removable host devices

It's been disabled since the start (commit 19cb3738, Aug 2006), and
has been untouched except for spelling fixes and such.  I don't feel
like dragging it along any further.

Signed-off-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/raw-win32.c | 35 -----------------------------------
 1 file changed, 35 deletions(-)

diff --git a/block/raw-win32.c b/block/raw-win32.c
index 91067e7..e47cfe0 100644
--- a/block/raw-win32.c
+++ b/block/raw-win32.c
@@ -393,41 +393,6 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
     return 0;
 }
 
-#if 0
-/***********************************************/
-/* removable device additional commands */
-
-static int raw_is_inserted(BlockDriverState *bs)
-{
-    return 1;
-}
-
-static int raw_media_changed(BlockDriverState *bs)
-{
-    return -ENOTSUP;
-}
-
-static int raw_eject(BlockDriverState *bs, int eject_flag)
-{
-    DWORD ret_count;
-
-    if (s->type == FTYPE_FILE)
-        return -ENOTSUP;
-    if (eject_flag) {
-        DeviceIoControl(s->hfile, IOCTL_STORAGE_EJECT_MEDIA,
-                        NULL, 0, NULL, 0, &lpBytesReturned, NULL);
-    } else {
-        DeviceIoControl(s->hfile, IOCTL_STORAGE_LOAD_MEDIA,
-                        NULL, 0, NULL, 0, &lpBytesReturned, NULL);
-    }
-}
-
-static int raw_set_locked(BlockDriverState *bs, int locked)
-{
-    return -ENOTSUP;
-}
-#endif
-
 static int hdev_has_zero_init(BlockDriverState *bs)
 {
     return 0;
-- 
cgit v1.1


From 7bf37feddcfa527304cfdc02bd2db8912ee9bf8c Mon Sep 17 00:00:00 2001
From: Markus Armbruster <armbru@redhat.com>
Date: Wed, 20 Jul 2011 18:23:41 +0200
Subject: block: Make BlockDriver method bdrv_set_locked() return void

The only caller is bdrv_set_locked(), and it ignores the value.

Callees always return 0, except for FreeBSD's cdrom_set_locked(),
which returns -ENOTSUP when the device is in a terminally wedged
state.

Signed-off-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/raw-posix.c | 10 +++-------
 block/raw.c       |  3 +--
 block_int.h       |  2 +-
 3 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/block/raw-posix.c b/block/raw-posix.c
index cd89c83..5241308 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -1363,7 +1363,7 @@ static int cdrom_eject(BlockDriverState *bs, int eject_flag)
     return 0;
 }
 
-static int cdrom_set_locked(BlockDriverState *bs, int locked)
+static void cdrom_set_locked(BlockDriverState *bs, int locked)
 {
     BDRVRawState *s = bs->opaque;
 
@@ -1374,8 +1374,6 @@ static int cdrom_set_locked(BlockDriverState *bs, int locked)
          */
         /* perror("CDROM_LOCKDOOR"); */
     }
-
-    return 0;
 }
 
 static BlockDriver bdrv_host_cdrom = {
@@ -1486,12 +1484,12 @@ static int cdrom_eject(BlockDriverState *bs, int eject_flag)
     return 0;
 }
 
-static int cdrom_set_locked(BlockDriverState *bs, int locked)
+static void cdrom_set_locked(BlockDriverState *bs, int locked)
 {
     BDRVRawState *s = bs->opaque;
 
     if (s->fd < 0)
-        return -ENOTSUP;
+        return;
     if (ioctl(s->fd, (locked ? CDIOCPREVENT : CDIOCALLOW)) < 0) {
         /*
          * Note: an error can happen if the distribution automatically
@@ -1499,8 +1497,6 @@ static int cdrom_set_locked(BlockDriverState *bs, int locked)
          */
         /* perror("CDROM_LOCKDOOR"); */
     }
-
-    return 0;
 }
 
 static BlockDriver bdrv_host_cdrom = {
diff --git a/block/raw.c b/block/raw.c
index b0f72d6..1398a9c 100644
--- a/block/raw.c
+++ b/block/raw.c
@@ -80,10 +80,9 @@ static int raw_eject(BlockDriverState *bs, int eject_flag)
     return bdrv_eject(bs->file, eject_flag);
 }
 
-static int raw_set_locked(BlockDriverState *bs, int locked)
+static void raw_set_locked(BlockDriverState *bs, int locked)
 {
     bdrv_set_locked(bs->file, locked);
-    return 0;
 }
 
 static int raw_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
diff --git a/block_int.h b/block_int.h
index efb6803..e0b638c 100644
--- a/block_int.h
+++ b/block_int.h
@@ -113,7 +113,7 @@ struct BlockDriver {
     int (*bdrv_is_inserted)(BlockDriverState *bs);
     int (*bdrv_media_changed)(BlockDriverState *bs);
     int (*bdrv_eject)(BlockDriverState *bs, int eject_flag);
-    int (*bdrv_set_locked)(BlockDriverState *bs, int locked);
+    void (*bdrv_set_locked)(BlockDriverState *bs, int locked);
 
     /* to control generic scsi devices */
     int (*bdrv_ioctl)(BlockDriverState *bs, unsigned long int req, void *buf);
-- 
cgit v1.1


From 822e1cd17e8fa3ae98d0481c20f042316ace3fbc Mon Sep 17 00:00:00 2001
From: Markus Armbruster <armbru@redhat.com>
Date: Wed, 20 Jul 2011 18:23:42 +0200
Subject: block: Make BlockDriver method bdrv_eject() return void

Callees always return 0, except for FreeBSD's cdrom_eject(), which
returns -ENOTSUP when the device is in a terminally wedged state.

The only caller is bdrv_eject(), and it maps -ENOTSUP to 0 since
commit 4be9762a.

Signed-off-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c           | 17 ++++-------------
 block/raw-posix.c | 16 +++++-----------
 block/raw.c       |  4 ++--
 block_int.h       |  2 +-
 4 files changed, 12 insertions(+), 27 deletions(-)

diff --git a/block.c b/block.c
index 81a8257..7c25fe4 100644
--- a/block.c
+++ b/block.c
@@ -2770,25 +2770,16 @@ int bdrv_media_changed(BlockDriverState *bs)
 int bdrv_eject(BlockDriverState *bs, int eject_flag)
 {
     BlockDriver *drv = bs->drv;
-    int ret;
 
     if (bs->locked) {
         return -EBUSY;
     }
 
-    if (!drv || !drv->bdrv_eject) {
-        ret = -ENOTSUP;
-    } else {
-        ret = drv->bdrv_eject(bs, eject_flag);
-    }
-    if (ret == -ENOTSUP) {
-        ret = 0;
+    if (drv && drv->bdrv_eject) {
+        drv->bdrv_eject(bs, eject_flag);
     }
-    if (ret >= 0) {
-        bs->tray_open = eject_flag;
-    }
-
-    return ret;
+    bs->tray_open = eject_flag;
+    return 0;
 }
 
 int bdrv_is_locked(BlockDriverState *bs)
diff --git a/block/raw-posix.c b/block/raw-posix.c
index 5241308..6672d31 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -1254,7 +1254,7 @@ static int floppy_media_changed(BlockDriverState *bs)
     return ret;
 }
 
-static int floppy_eject(BlockDriverState *bs, int eject_flag)
+static void floppy_eject(BlockDriverState *bs, int eject_flag)
 {
     BDRVRawState *s = bs->opaque;
     int fd;
@@ -1269,8 +1269,6 @@ static int floppy_eject(BlockDriverState *bs, int eject_flag)
             perror("FDEJECT");
         close(fd);
     }
-
-    return 0;
 }
 
 static BlockDriver bdrv_host_floppy = {
@@ -1348,7 +1346,7 @@ static int cdrom_is_inserted(BlockDriverState *bs)
     return 0;
 }
 
-static int cdrom_eject(BlockDriverState *bs, int eject_flag)
+static void cdrom_eject(BlockDriverState *bs, int eject_flag)
 {
     BDRVRawState *s = bs->opaque;
 
@@ -1359,8 +1357,6 @@ static int cdrom_eject(BlockDriverState *bs, int eject_flag)
         if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0)
             perror("CDROMEJECT");
     }
-
-    return 0;
 }
 
 static void cdrom_set_locked(BlockDriverState *bs, int locked)
@@ -1462,12 +1458,12 @@ static int cdrom_is_inserted(BlockDriverState *bs)
     return raw_getlength(bs) > 0;
 }
 
-static int cdrom_eject(BlockDriverState *bs, int eject_flag)
+static void cdrom_eject(BlockDriverState *bs, int eject_flag)
 {
     BDRVRawState *s = bs->opaque;
 
     if (s->fd < 0)
-        return -ENOTSUP;
+        return;
 
     (void) ioctl(s->fd, CDIOCALLOW);
 
@@ -1479,9 +1475,7 @@ static int cdrom_eject(BlockDriverState *bs, int eject_flag)
             perror("CDIOCCLOSE");
     }
 
-    if (cdrom_reopen(bs) < 0)
-        return -ENOTSUP;
-    return 0;
+    cdrom_reopen(bs);
 }
 
 static void cdrom_set_locked(BlockDriverState *bs, int locked)
diff --git a/block/raw.c b/block/raw.c
index 1398a9c..cb6203e 100644
--- a/block/raw.c
+++ b/block/raw.c
@@ -75,9 +75,9 @@ static int raw_is_inserted(BlockDriverState *bs)
     return bdrv_is_inserted(bs->file);
 }
 
-static int raw_eject(BlockDriverState *bs, int eject_flag)
+static void raw_eject(BlockDriverState *bs, int eject_flag)
 {
-    return bdrv_eject(bs->file, eject_flag);
+    bdrv_eject(bs->file, eject_flag);
 }
 
 static void raw_set_locked(BlockDriverState *bs, int locked)
diff --git a/block_int.h b/block_int.h
index e0b638c..efefbee 100644
--- a/block_int.h
+++ b/block_int.h
@@ -112,7 +112,7 @@ struct BlockDriver {
     /* removable device specific */
     int (*bdrv_is_inserted)(BlockDriverState *bs);
     int (*bdrv_media_changed)(BlockDriverState *bs);
-    int (*bdrv_eject)(BlockDriverState *bs, int eject_flag);
+    void (*bdrv_eject)(BlockDriverState *bs, int eject_flag);
     void (*bdrv_set_locked)(BlockDriverState *bs, int locked);
 
     /* to control generic scsi devices */
-- 
cgit v1.1


From 49aa46bb4b894ff8bdb0339ee2a5dd3fcfe93ecd Mon Sep 17 00:00:00 2001
From: Markus Armbruster <armbru@redhat.com>
Date: Wed, 20 Jul 2011 18:23:43 +0200
Subject: block: Don't let locked flag prevent medium load

Commit aea2a33c made bdrv_eject() obey the locked flag.  Correct for
medium eject (eject_flag set), incorrect for medium load (eject_flag
clear).  See MMC-5 Table 341 "Actions for Lock/Unlock/Eject".

Signed-off-by: Markus Armbruster <armbru@redhat.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block.c b/block.c
index 7c25fe4..8859f9b 100644
--- a/block.c
+++ b/block.c
@@ -2771,7 +2771,7 @@ int bdrv_eject(BlockDriverState *bs, int eject_flag)
 {
     BlockDriver *drv = bs->drv;
 
-    if (bs->locked) {
+    if (eject_flag && bs->locked) {
         return -EBUSY;
     }
 
-- 
cgit v1.1


From efc8243d00ab4cf4fa05a9be93233cb883b7caa0 Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serge@hallyn.com>
Date: Mon, 25 Jul 2011 18:34:35 +0000
Subject: block/vpc.c: Detect too-large vpc file

VHD files technically can be up to 2Tb, but virtual pc is limited
to 127G.  Currently qemu-img refused to create vpc files > 127G,
but it is failing to return error when converting from a non-vpc
VHD file which is >127G.  It returns success, but creates a truncated
converted image.  Also, qemu-img info claims the vpc file is 127G
(and clean).

This patch detects a too-large vpc file and returns -EFBIG.  Without
this patch,

=============================================================
root@ip-10-38-123-242:~/qemu-fixed# qemu-img info /mnt/140g-dynamic.vhd
image: /mnt/140g-dynamic.vhd
file format: vpc
virtual size: 127G (136899993600 bytes)
disk size: 284K
root@ip-10-38-123-242:~/qemu-fixed# qemu-img convert -f vpc -O raw /mnt/140g-dynamic.vhd /mnt/y
root@ip-10-38-123-242:~/qemu-fixed# echo $?
0
root@ip-10-38-123-242:~/qemu-fixed# qemu-img info /mnt/y
image: /mnt/y
file format: raw
virtual size: 127G (136899993600 bytes)
disk size: 0
=============================================================

(The 140G image was truncated with no warning or error.)

With the patch, I get:

=============================================================
root@ip-10-38-123-242:~/qemu-fixed# ./qemu-img info /mnt/140g-dynamic.vhd
qemu-img: Could not open '/mnt/140g-dynamic.vhd': File too large
root@ip-10-38-123-242:~/qemu-fixed# ./qemu-img convert -f vpc -O raw /mnt/140g-dynamic.vhd /mnt/y
qemu-img: Could not open '/mnt/140g-dynamic.vhd': File too large
qemu-img: Could not open '/mnt/140g-dynamic.vhd'
=============================================================

See https://bugs.launchpad.net/qemu/+bug/814222 for details.

Signed-off-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/vpc.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/block/vpc.c b/block/vpc.c
index 56865da..fdd5236 100644
--- a/block/vpc.c
+++ b/block/vpc.c
@@ -156,6 +156,7 @@ static int vpc_open(BlockDriverState *bs, int flags)
     struct vhd_dyndisk_header* dyndisk_header;
     uint8_t buf[HEADER_SIZE];
     uint32_t checksum;
+    int err = -1;
 
     if (bdrv_pread(bs->file, 0, s->footer_buf, HEADER_SIZE) != HEADER_SIZE)
         goto fail;
@@ -176,6 +177,11 @@ static int vpc_open(BlockDriverState *bs, int flags)
     bs->total_sectors = (int64_t)
         be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;
 
+    if (bs->total_sectors >= 65535 * 16 * 255) {
+        err = -EFBIG;
+        goto fail;
+    }
+
     if (bdrv_pread(bs->file, be64_to_cpu(footer->data_offset), buf, HEADER_SIZE)
             != HEADER_SIZE)
         goto fail;
@@ -222,7 +228,7 @@ static int vpc_open(BlockDriverState *bs, int flags)
 
     return 0;
  fail:
-    return -1;
+    return err;
 }
 
 /*
-- 
cgit v1.1


From 5f71d32f0da4d1e578738f765b57fbfaf4bd3214 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Fri, 22 Jul 2011 16:51:12 +0200
Subject: scsi-disk: Codingstyle fixes

Replace tabs with spaces.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/scsi-disk.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c
index f42a5d1..715f2cd 100644
--- a/hw/scsi-disk.c
+++ b/hw/scsi-disk.c
@@ -526,7 +526,7 @@ static int scsi_disk_emulate_inquiry(SCSIRequest *req, uint8_t *outbuf)
     memset(outbuf, 0, buflen);
 
     if (req->lun) {
-        outbuf[0] = 0x7f;	/* LUN not supported */
+        outbuf[0] = 0x7f;       /* LUN not supported */
         return buflen;
     }
 
@@ -836,7 +836,7 @@ static int scsi_disk_emulate_command(SCSIDiskReq *r, uint8_t *outbuf)
     case TEST_UNIT_READY:
         if (!bdrv_is_inserted(s->bs))
             goto not_ready;
-	break;
+        break;
     case REQUEST_SENSE:
         if (req->cmd.xfer < 4)
             goto illegal_request;
@@ -848,7 +848,7 @@ static int scsi_disk_emulate_command(SCSIDiskReq *r, uint8_t *outbuf)
         buflen = scsi_disk_emulate_inquiry(req, outbuf);
         if (buflen < 0)
             goto illegal_request;
-	break;
+        break;
     case MODE_SENSE:
     case MODE_SENSE_10:
         buflen = scsi_disk_emulate_mode_sense(req, outbuf);
@@ -881,14 +881,14 @@ static int scsi_disk_emulate_command(SCSIDiskReq *r, uint8_t *outbuf)
             /* load/eject medium */
             bdrv_eject(s->bs, !(req->cmd.buf[4] & 1));
         }
-	break;
+        break;
     case ALLOW_MEDIUM_REMOVAL:
         bdrv_set_locked(s->bs, req->cmd.buf[4] & 1);
-	break;
+        break;
     case READ_CAPACITY:
         /* The normal LEN field for this command is zero.  */
-	memset(outbuf, 0, 8);
-	bdrv_get_geometry(s->bs, &nb_sectors);
+        memset(outbuf, 0, 8);
+        bdrv_get_geometry(s->bs, &nb_sectors);
         if (!nb_sectors)
             goto not_ready;
         nb_sectors /= s->cluster_size;
@@ -908,7 +908,7 @@ static int scsi_disk_emulate_command(SCSIDiskReq *r, uint8_t *outbuf)
         outbuf[6] = s->cluster_size * 2;
         outbuf[7] = 0;
         buflen = 8;
-	break;
+        break;
     case SYNCHRONIZE_CACHE:
         ret = bdrv_flush(s->bs);
         if (ret < 0) {
-- 
cgit v1.1


From 3790372c963dbc87d4efdf24f8b718c283798fa0 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Fri, 22 Jul 2011 16:51:13 +0200
Subject: scsi: Remove references to SET_WINDOW

SET_WINDOW command is vendor-specific only.
So we shouldn't try to emulate it.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/scsi-bus.c  | 2 --
 hw/scsi-defs.h | 1 -
 2 files changed, 3 deletions(-)

diff --git a/hw/scsi-bus.c b/hw/scsi-bus.c
index 8b1a412..facc98d 100644
--- a/hw/scsi-bus.c
+++ b/hw/scsi-bus.c
@@ -350,7 +350,6 @@ static void scsi_req_xfer_mode(SCSIRequest *req)
     case SEARCH_HIGH_12:
     case SEARCH_EQUAL_12:
     case SEARCH_LOW_12:
-    case SET_WINDOW:
     case MEDIUM_SCAN:
     case SEND_VOLUME_TAG:
     case WRITE_LONG_2:
@@ -544,7 +543,6 @@ static const char *scsi_command_name(uint8_t cmd)
         [ SEND_DIAGNOSTIC          ] = "SEND_DIAGNOSTIC",
         [ ALLOW_MEDIUM_REMOVAL     ] = "ALLOW_MEDIUM_REMOVAL",
 
-        [ SET_WINDOW               ] = "SET_WINDOW",
         [ READ_CAPACITY            ] = "READ_CAPACITY",
         [ READ_10                  ] = "READ_10",
         [ WRITE_10                 ] = "WRITE_10",
diff --git a/hw/scsi-defs.h b/hw/scsi-defs.h
index 413cce0..8513983 100644
--- a/hw/scsi-defs.h
+++ b/hw/scsi-defs.h
@@ -49,7 +49,6 @@
 #define SEND_DIAGNOSTIC       0x1d
 #define ALLOW_MEDIUM_REMOVAL  0x1e
 
-#define SET_WINDOW            0x24
 #define READ_CAPACITY         0x25
 #define READ_10               0x28
 #define WRITE_10              0x2a
-- 
cgit v1.1


From 8bd3e139c638d9742e12da33007a19c5204302af Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Fri, 22 Jul 2011 16:51:14 +0200
Subject: scsi: Remove REZERO_UNIT emulation

REZERO_UNIT command is obsolete. Remove support for it.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/scsi-bus.c  | 3 ---
 hw/scsi-defs.h | 1 -
 hw/scsi-disk.c | 7 -------
 3 files changed, 11 deletions(-)

diff --git a/hw/scsi-bus.c b/hw/scsi-bus.c
index facc98d..52a6784 100644
--- a/hw/scsi-bus.c
+++ b/hw/scsi-bus.c
@@ -223,7 +223,6 @@ static int scsi_req_length(SCSIRequest *req, uint8_t *cmd)
 
     switch(cmd[0]) {
     case TEST_UNIT_READY:
-    case REZERO_UNIT:
     case START_STOP:
     case SEEK_6:
     case WRITE_FILEMARKS:
@@ -516,8 +515,6 @@ static const char *scsi_command_name(uint8_t cmd)
 {
     static const char *names[] = {
         [ TEST_UNIT_READY          ] = "TEST_UNIT_READY",
-        [ REZERO_UNIT              ] = "REZERO_UNIT",
-        /* REWIND and REZERO_UNIT use the same operation code */
         [ REQUEST_SENSE            ] = "REQUEST_SENSE",
         [ FORMAT_UNIT              ] = "FORMAT_UNIT",
         [ READ_BLOCK_LIMITS        ] = "READ_BLOCK_LIMITS",
diff --git a/hw/scsi-defs.h b/hw/scsi-defs.h
index 8513983..1f40c5c 100644
--- a/hw/scsi-defs.h
+++ b/hw/scsi-defs.h
@@ -25,7 +25,6 @@
  */
 
 #define TEST_UNIT_READY       0x00
-#define REZERO_UNIT           0x01
 #define REQUEST_SENSE         0x03
 #define FORMAT_UNIT           0x04
 #define READ_BLOCK_LIMITS     0x05
diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c
index 715f2cd..abf0bd2 100644
--- a/hw/scsi-disk.c
+++ b/hw/scsi-disk.c
@@ -972,12 +972,6 @@ static int scsi_disk_emulate_command(SCSIDiskReq *r, uint8_t *outbuf)
         break;
     case VERIFY:
         break;
-    case REZERO_UNIT:
-        DPRINTF("Rezero Unit\n");
-        if (!bdrv_is_inserted(s->bs)) {
-            goto not_ready;
-        }
-        break;
     default:
         scsi_command_complete(r, CHECK_CONDITION, SENSE_CODE(INVALID_OPCODE));
         return -1;
@@ -1059,7 +1053,6 @@ static int32_t scsi_send_command(SCSIRequest *req, uint8_t *buf)
     case SERVICE_ACTION_IN:
     case REPORT_LUNS:
     case VERIFY:
-    case REZERO_UNIT:
         rc = scsi_disk_emulate_command(r, outbuf);
         if (rc < 0) {
             return 0;
-- 
cgit v1.1


From 5e30a07d6d70d3073ff61e6db79d61c2b688502f Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Fri, 22 Jul 2011 16:51:15 +0200
Subject: scsi: Sanitize command definitions

Sanitize SCSI command definitions.
Add _10 suffix to READ_CAPACITY, WRITE_VERIFY, VERIFY, READ_LONG,
WRITE_LONG, and WRITE_SAME.
Add new command definitions for LOCATE_10, UNMAP, VARLENGTH_CDB,
WRITE_FILEMARKS_16, EXTENDED_COPY, ATA_PASSTHROUGH, ACCESS_CONTROL_IN,
ACCESS_CONTROL_OUT, COMPARE_AND_WRITE, VERIFY_16, SYNCHRONIZE_CACHE_16,
LOCATE_16, ERASE_16, WRITE_LONG_16, LOAD_UNLOAD, VERIFY_12.
Remove invalid definition of WRITE_LONG_2.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/scsi-bus.c     | 69 +++++++++++++++++++++++++++++++++----------------------
 hw/scsi-defs.h    | 54 ++++++++++++++++++++++++++-----------------
 hw/scsi-disk.c    | 10 ++++----
 hw/scsi-generic.c |  2 +-
 4 files changed, 81 insertions(+), 54 deletions(-)

diff --git a/hw/scsi-bus.c b/hw/scsi-bus.c
index 52a6784..0b0344c 100644
--- a/hw/scsi-bus.c
+++ b/hw/scsi-bus.c
@@ -223,6 +223,7 @@ static int scsi_req_length(SCSIRequest *req, uint8_t *cmd)
 
     switch(cmd[0]) {
     case TEST_UNIT_READY:
+    case REWIND:
     case START_STOP:
     case SEEK_6:
     case WRITE_FILEMARKS:
@@ -231,24 +232,24 @@ static int scsi_req_length(SCSIRequest *req, uint8_t *cmd)
     case RELEASE:
     case ERASE:
     case ALLOW_MEDIUM_REMOVAL:
-    case VERIFY:
+    case VERIFY_10:
     case SEEK_10:
     case SYNCHRONIZE_CACHE:
     case LOCK_UNLOCK_CACHE:
     case LOAD_UNLOAD:
     case SET_CD_SPEED:
     case SET_LIMITS:
-    case WRITE_LONG:
+    case WRITE_LONG_10:
     case MOVE_MEDIUM:
     case UPDATE_BLOCK:
         req->cmd.xfer = 0;
         break;
     case MODE_SENSE:
         break;
-    case WRITE_SAME:
+    case WRITE_SAME_10:
         req->cmd.xfer = 1;
         break;
-    case READ_CAPACITY:
+    case READ_CAPACITY_10:
         req->cmd.xfer = 8;
         break;
     case READ_BLOCK_LIMITS:
@@ -264,7 +265,7 @@ static int scsi_req_length(SCSIRequest *req, uint8_t *cmd)
         req->cmd.xfer *= 8;
         break;
     case WRITE_10:
-    case WRITE_VERIFY:
+    case WRITE_VERIFY_10:
     case WRITE_6:
     case WRITE_12:
     case WRITE_VERIFY_12:
@@ -324,7 +325,7 @@ static void scsi_req_xfer_mode(SCSIRequest *req)
     switch (req->cmd.buf[0]) {
     case WRITE_6:
     case WRITE_10:
-    case WRITE_VERIFY:
+    case WRITE_VERIFY_10:
     case WRITE_12:
     case WRITE_VERIFY_12:
     case WRITE_16:
@@ -344,14 +345,13 @@ static void scsi_req_xfer_mode(SCSIRequest *req)
     case SEARCH_HIGH:
     case SEARCH_LOW:
     case UPDATE_BLOCK:
-    case WRITE_LONG:
-    case WRITE_SAME:
+    case WRITE_LONG_10:
+    case WRITE_SAME_10:
     case SEARCH_HIGH_12:
     case SEARCH_EQUAL_12:
     case SEARCH_LOW_12:
     case MEDIUM_SCAN:
     case SEND_VOLUME_TAG:
-    case WRITE_LONG_2:
     case PERSISTENT_RESERVE_OUT:
     case MAINTENANCE_OUT:
         req->cmd.mode = SCSI_XFER_TO_DEV;
@@ -515,6 +515,7 @@ static const char *scsi_command_name(uint8_t cmd)
 {
     static const char *names[] = {
         [ TEST_UNIT_READY          ] = "TEST_UNIT_READY",
+        [ REWIND                   ] = "REWIND",
         [ REQUEST_SENSE            ] = "REQUEST_SENSE",
         [ FORMAT_UNIT              ] = "FORMAT_UNIT",
         [ READ_BLOCK_LIMITS        ] = "READ_BLOCK_LIMITS",
@@ -539,13 +540,12 @@ static const char *scsi_command_name(uint8_t cmd)
         [ RECEIVE_DIAGNOSTIC       ] = "RECEIVE_DIAGNOSTIC",
         [ SEND_DIAGNOSTIC          ] = "SEND_DIAGNOSTIC",
         [ ALLOW_MEDIUM_REMOVAL     ] = "ALLOW_MEDIUM_REMOVAL",
-
-        [ READ_CAPACITY            ] = "READ_CAPACITY",
+        [ READ_CAPACITY_10         ] = "READ_CAPACITY_10",
         [ READ_10                  ] = "READ_10",
         [ WRITE_10                 ] = "WRITE_10",
         [ SEEK_10                  ] = "SEEK_10",
-        [ WRITE_VERIFY             ] = "WRITE_VERIFY",
-        [ VERIFY                   ] = "VERIFY",
+        [ WRITE_VERIFY_10          ] = "WRITE_VERIFY_10",
+        [ VERIFY_10                ] = "VERIFY_10",
         [ SEARCH_HIGH              ] = "SEARCH_HIGH",
         [ SEARCH_EQUAL             ] = "SEARCH_EQUAL",
         [ SEARCH_LOW               ] = "SEARCH_LOW",
@@ -561,11 +561,14 @@ static const char *scsi_command_name(uint8_t cmd)
         [ WRITE_BUFFER             ] = "WRITE_BUFFER",
         [ READ_BUFFER              ] = "READ_BUFFER",
         [ UPDATE_BLOCK             ] = "UPDATE_BLOCK",
-        [ READ_LONG                ] = "READ_LONG",
-        [ WRITE_LONG               ] = "WRITE_LONG",
+        [ READ_LONG_10             ] = "READ_LONG_10",
+        [ WRITE_LONG_10            ] = "WRITE_LONG_10",
         [ CHANGE_DEFINITION        ] = "CHANGE_DEFINITION",
-        [ WRITE_SAME               ] = "WRITE_SAME",
+        [ WRITE_SAME_10            ] = "WRITE_SAME_10",
+        [ UNMAP                    ] = "UNMAP",
         [ READ_TOC                 ] = "READ_TOC",
+        [ REPORT_DENSITY_SUPPORT   ] = "REPORT_DENSITY_SUPPORT",
+        [ GET_CONFIGURATION        ] = "GET_CONFIGURATION",
         [ LOG_SELECT               ] = "LOG_SELECT",
         [ LOG_SENSE                ] = "LOG_SENSE",
         [ MODE_SELECT_10           ] = "MODE_SELECT_10",
@@ -574,27 +577,39 @@ static const char *scsi_command_name(uint8_t cmd)
         [ MODE_SENSE_10            ] = "MODE_SENSE_10",
         [ PERSISTENT_RESERVE_IN    ] = "PERSISTENT_RESERVE_IN",
         [ PERSISTENT_RESERVE_OUT   ] = "PERSISTENT_RESERVE_OUT",
+        [ WRITE_FILEMARKS_16       ] = "WRITE_FILEMARKS_16",
+        [ EXTENDED_COPY            ] = "EXTENDED_COPY",
+        [ ATA_PASSTHROUGH          ] = "ATA_PASSTHROUGH",
+        [ ACCESS_CONTROL_IN        ] = "ACCESS_CONTROL_IN",
+        [ ACCESS_CONTROL_OUT       ] = "ACCESS_CONTROL_OUT",
+        [ READ_16                  ] = "READ_16",
+        [ COMPARE_AND_WRITE        ] = "COMPARE_AND_WRITE",
+        [ WRITE_16                 ] = "WRITE_16",
+        [ WRITE_VERIFY_16          ] = "WRITE_VERIFY_16",
+        [ VERIFY_16                ] = "VERIFY_16",
+        [ SYNCHRONIZE_CACHE_16     ] = "SYNCHRONIZE_CACHE_16",
+        [ LOCATE_16                ] = "LOCATE_16",
+        [ WRITE_SAME_16            ] = "WRITE_SAME_16",
+        [ ERASE_16                 ] = "ERASE_16",
+        [ SERVICE_ACTION_IN        ] = "SERVICE_ACTION_IN",
+        [ WRITE_LONG_16            ] = "WRITE_LONG_16",
+        [ REPORT_LUNS              ] = "REPORT_LUNS",
+        [ BLANK                    ] = "BLANK",
+        [ MAINTENANCE_IN           ] = "MAINTENANCE_IN",
+        [ MAINTENANCE_OUT          ] = "MAINTENANCE_OUT",
         [ MOVE_MEDIUM              ] = "MOVE_MEDIUM",
+        [ LOAD_UNLOAD              ] = "LOAD_UNLOAD",
         [ READ_12                  ] = "READ_12",
         [ WRITE_12                 ] = "WRITE_12",
         [ WRITE_VERIFY_12          ] = "WRITE_VERIFY_12",
+        [ VERIFY_12                ] = "VERIFY_12",
         [ SEARCH_HIGH_12           ] = "SEARCH_HIGH_12",
         [ SEARCH_EQUAL_12          ] = "SEARCH_EQUAL_12",
         [ SEARCH_LOW_12            ] = "SEARCH_LOW_12",
         [ READ_ELEMENT_STATUS      ] = "READ_ELEMENT_STATUS",
         [ SEND_VOLUME_TAG          ] = "SEND_VOLUME_TAG",
-        [ WRITE_LONG_2             ] = "WRITE_LONG_2",
-
-        [ REPORT_DENSITY_SUPPORT   ] = "REPORT_DENSITY_SUPPORT",
-        [ GET_CONFIGURATION        ] = "GET_CONFIGURATION",
-        [ READ_16                  ] = "READ_16",
-        [ WRITE_16                 ] = "WRITE_16",
-        [ WRITE_VERIFY_16          ] = "WRITE_VERIFY_16",
-        [ SERVICE_ACTION_IN        ] = "SERVICE_ACTION_IN",
-        [ REPORT_LUNS              ] = "REPORT_LUNS",
-        [ LOAD_UNLOAD              ] = "LOAD_UNLOAD",
+        [ READ_DEFECT_DATA_12      ] = "READ_DEFECT_DATA_12",
         [ SET_CD_SPEED             ] = "SET_CD_SPEED",
-        [ BLANK                    ] = "BLANK",
     };
 
     if (cmd >= ARRAY_SIZE(names) || names[cmd] == NULL)
diff --git a/hw/scsi-defs.h b/hw/scsi-defs.h
index 1f40c5c..f644860 100644
--- a/hw/scsi-defs.h
+++ b/hw/scsi-defs.h
@@ -25,6 +25,7 @@
  */
 
 #define TEST_UNIT_READY       0x00
+#define REWIND                0x01
 #define REQUEST_SENSE         0x03
 #define FORMAT_UNIT           0x04
 #define READ_BLOCK_LIMITS     0x05
@@ -47,13 +48,13 @@
 #define RECEIVE_DIAGNOSTIC    0x1c
 #define SEND_DIAGNOSTIC       0x1d
 #define ALLOW_MEDIUM_REMOVAL  0x1e
-
-#define READ_CAPACITY         0x25
+#define READ_CAPACITY_10      0x25
 #define READ_10               0x28
 #define WRITE_10              0x2a
 #define SEEK_10               0x2b
-#define WRITE_VERIFY          0x2e
-#define VERIFY                0x2f
+#define LOCATE_10             0x2b
+#define WRITE_VERIFY_10       0x2e
+#define VERIFY_10             0x2f
 #define SEARCH_HIGH           0x30
 #define SEARCH_EQUAL          0x31
 #define SEARCH_LOW            0x32
@@ -69,11 +70,14 @@
 #define WRITE_BUFFER          0x3b
 #define READ_BUFFER           0x3c
 #define UPDATE_BLOCK          0x3d
-#define READ_LONG             0x3e
-#define WRITE_LONG            0x3f
+#define READ_LONG_10          0x3e
+#define WRITE_LONG_10         0x3f
 #define CHANGE_DEFINITION     0x40
-#define WRITE_SAME            0x41
+#define WRITE_SAME_10         0x41
+#define UNMAP                 0x42
 #define READ_TOC              0x43
+#define REPORT_DENSITY_SUPPORT 0x44
+#define GET_CONFIGURATION     0x46
 #define LOG_SELECT            0x4c
 #define LOG_SENSE             0x4d
 #define MODE_SELECT_10        0x55
@@ -82,32 +86,40 @@
 #define MODE_SENSE_10         0x5a
 #define PERSISTENT_RESERVE_IN 0x5e
 #define PERSISTENT_RESERVE_OUT 0x5f
+#define VARLENGTH_CDB         0x7f
+#define WRITE_FILEMARKS_16    0x80
+#define EXTENDED_COPY         0x83
+#define ATA_PASSTHROUGH       0x85
+#define ACCESS_CONTROL_IN     0x86
+#define ACCESS_CONTROL_OUT    0x87
+#define READ_16               0x88
+#define COMPARE_AND_WRITE     0x89
+#define WRITE_16              0x8a
+#define WRITE_VERIFY_16       0x8e
+#define VERIFY_16             0x8f
+#define SYNCHRONIZE_CACHE_16  0x91
+#define LOCATE_16             0x92
 #define WRITE_SAME_16         0x93
+#define ERASE_16              0x93
+#define SERVICE_ACTION_IN     0x9e
+#define WRITE_LONG_16         0x9f
+#define REPORT_LUNS           0xa0
+#define BLANK                 0xa1
 #define MAINTENANCE_IN        0xa3
 #define MAINTENANCE_OUT       0xa4
 #define MOVE_MEDIUM           0xa5
+#define LOAD_UNLOAD           0xa6
 #define READ_12               0xa8
 #define WRITE_12              0xaa
 #define WRITE_VERIFY_12       0xae
+#define VERIFY_12             0xaf
 #define SEARCH_HIGH_12        0xb0
 #define SEARCH_EQUAL_12       0xb1
 #define SEARCH_LOW_12         0xb2
 #define READ_ELEMENT_STATUS   0xb8
 #define SEND_VOLUME_TAG       0xb6
-#define WRITE_LONG_2          0xea
-
-/* from hw/scsi-generic.c */
-#define REWIND 0x01
-#define REPORT_DENSITY_SUPPORT 0x44
-#define GET_CONFIGURATION 0x46
-#define READ_16 0x88
-#define WRITE_16 0x8a
-#define WRITE_VERIFY_16 0x8e
-#define SERVICE_ACTION_IN 0x9e
-#define REPORT_LUNS 0xa0
-#define LOAD_UNLOAD 0xa6
-#define SET_CD_SPEED 0xbb
-#define BLANK 0xa1
+#define READ_DEFECT_DATA_12   0xb7
+#define SET_CD_SPEED          0xbb
 
 /*
  *  SAM Status codes
diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c
index abf0bd2..03f244e 100644
--- a/hw/scsi-disk.c
+++ b/hw/scsi-disk.c
@@ -885,7 +885,7 @@ static int scsi_disk_emulate_command(SCSIDiskReq *r, uint8_t *outbuf)
     case ALLOW_MEDIUM_REMOVAL:
         bdrv_set_locked(s->bs, req->cmd.buf[4] & 1);
         break;
-    case READ_CAPACITY:
+    case READ_CAPACITY_10:
         /* The normal LEN field for this command is zero.  */
         memset(outbuf, 0, 8);
         bdrv_get_geometry(s->bs, &nb_sectors);
@@ -970,7 +970,7 @@ static int scsi_disk_emulate_command(SCSIDiskReq *r, uint8_t *outbuf)
         outbuf[3] = 8;
         buflen = 16;
         break;
-    case VERIFY:
+    case VERIFY_10:
         break;
     default:
         scsi_command_complete(r, CHECK_CONDITION, SENSE_CODE(INVALID_OPCODE));
@@ -1046,13 +1046,13 @@ static int32_t scsi_send_command(SCSIRequest *req, uint8_t *buf)
     case RELEASE_10:
     case START_STOP:
     case ALLOW_MEDIUM_REMOVAL:
-    case READ_CAPACITY:
+    case READ_CAPACITY_10:
     case SYNCHRONIZE_CACHE:
     case READ_TOC:
     case GET_CONFIGURATION:
     case SERVICE_ACTION_IN:
     case REPORT_LUNS:
-    case VERIFY:
+    case VERIFY_10:
         rc = scsi_disk_emulate_command(r, outbuf);
         if (rc < 0) {
             return 0;
@@ -1075,7 +1075,7 @@ static int32_t scsi_send_command(SCSIRequest *req, uint8_t *buf)
     case WRITE_10:
     case WRITE_12:
     case WRITE_16:
-    case WRITE_VERIFY:
+    case WRITE_VERIFY_10:
     case WRITE_VERIFY_12:
     case WRITE_VERIFY_16:
         len = r->req.cmd.xfer / s->qdev.blocksize;
diff --git a/hw/scsi-generic.c b/hw/scsi-generic.c
index 63361b3..7b0026e 100644
--- a/hw/scsi-generic.c
+++ b/hw/scsi-generic.c
@@ -406,7 +406,7 @@ static int get_blocksize(BlockDriverState *bdrv)
 
     memset(cmd, 0, sizeof(cmd));
     memset(buf, 0, sizeof(buf));
-    cmd[0] = READ_CAPACITY;
+    cmd[0] = READ_CAPACITY_10;
 
     memset(&io_header, 0, sizeof(io_header));
     io_header.interface_id = 'S';
-- 
cgit v1.1


From f37bd73b76e7f1e300e6acfe1bb6d3b2bc63714b Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Fri, 22 Jul 2011 16:44:46 +0200
Subject: scsi-disk: Remove 'drive_kind'

Instead of using its own definitions scsi-disk should
be using the device type of the parent device.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 hw/scsi-defs.h |  6 +++++-
 hw/scsi-disk.c | 46 ++++++++++++++++++++++------------------------
 2 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/hw/scsi-defs.h b/hw/scsi-defs.h
index f644860..27010b7 100644
--- a/hw/scsi-defs.h
+++ b/hw/scsi-defs.h
@@ -164,6 +164,7 @@
 
 #define TYPE_DISK           0x00
 #define TYPE_TAPE           0x01
+#define TYPE_PRINTER        0x02
 #define TYPE_PROCESSOR      0x03    /* HP scanners use this */
 #define TYPE_WORM           0x04    /* Treated as ROM by our system */
 #define TYPE_ROM            0x05
@@ -171,6 +172,9 @@
 #define TYPE_MOD            0x07    /* Magneto-optical disk -
 				     * - treated as TYPE_DISK */
 #define TYPE_MEDIUM_CHANGER 0x08
-#define TYPE_ENCLOSURE	    0x0d    /* Enclosure Services Device */
+#define TYPE_STORAGE_ARRAY  0x0c    /* Storage array device */
+#define TYPE_ENCLOSURE      0x0d    /* Enclosure Services Device */
+#define TYPE_RBC            0x0e    /* Simplified Direct-Access Device */
+#define TYPE_OSD            0x11    /* Object-storage Device */
 #define TYPE_NO_LUN         0x7f
 
diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c
index 03f244e..fa198f9 100644
--- a/hw/scsi-disk.c
+++ b/hw/scsi-disk.c
@@ -59,8 +59,6 @@ typedef struct SCSIDiskReq {
     uint32_t status;
 } SCSIDiskReq;
 
-typedef enum { SCSI_HD, SCSI_CD } SCSIDriveKind;
-
 struct SCSIDiskState
 {
     SCSIDevice qdev;
@@ -74,7 +72,6 @@ struct SCSIDiskState
     char *version;
     char *serial;
     SCSISense sense;
-    SCSIDriveKind drive_kind;
 };
 
 static int scsi_handle_rw_error(SCSIDiskReq *r, int error, int type);
@@ -382,7 +379,7 @@ static int scsi_disk_emulate_inquiry(SCSIRequest *req, uint8_t *outbuf)
             return -1;
         }
 
-        if (s->drive_kind == SCSI_CD) {
+        if (s->qdev.type == TYPE_ROM) {
             outbuf[buflen++] = 5;
         } else {
             outbuf[buflen++] = 0;
@@ -401,7 +398,7 @@ static int scsi_disk_emulate_inquiry(SCSIRequest *req, uint8_t *outbuf)
             if (s->serial)
                 outbuf[buflen++] = 0x80; // unit serial number
             outbuf[buflen++] = 0x83; // device identification
-            if (s->drive_kind == SCSI_HD) {
+            if (s->qdev.type == TYPE_DISK) {
                 outbuf[buflen++] = 0xb0; // block limits
                 outbuf[buflen++] = 0xb2; // thin provisioning
             }
@@ -460,7 +457,7 @@ static int scsi_disk_emulate_inquiry(SCSIRequest *req, uint8_t *outbuf)
             unsigned int opt_io_size =
                     s->qdev.conf.opt_io_size / s->qdev.blocksize;
 
-            if (s->drive_kind == SCSI_CD) {
+            if (s->qdev.type == TYPE_ROM) {
                 DPRINTF("Inquiry (EVPD[%02X] not supported for CDROM\n",
                         page_code);
                 return -1;
@@ -530,12 +527,11 @@ static int scsi_disk_emulate_inquiry(SCSIRequest *req, uint8_t *outbuf)
         return buflen;
     }
 
-    if (s->drive_kind == SCSI_CD) {
-        outbuf[0] = 5;
+    outbuf[0] = s->qdev.type & 0x1f;
+    if (s->qdev.type == TYPE_ROM) {
         outbuf[1] = 0x80;
         memcpy(&outbuf[16], "QEMU CD-ROM     ", 16);
     } else {
-        outbuf[0] = 0;
         outbuf[1] = s->removable ? 0x80 : 0;
         memcpy(&outbuf[16], "QEMU HARDDISK   ", 16);
     }
@@ -661,7 +657,7 @@ static int mode_sense_page(SCSIRequest *req, int page, uint8_t *p,
         return p[1] + 2;
 
     case 0x2a: /* CD Capabilities and Mechanical Status page. */
-        if (s->drive_kind != SCSI_CD)
+        if (s->qdev.type != TYPE_ROM)
             return 0;
         p[0] = 0x2a;
         p[1] = 0x14;
@@ -877,7 +873,7 @@ static int scsi_disk_emulate_command(SCSIDiskReq *r, uint8_t *outbuf)
             goto illegal_request;
         break;
     case START_STOP:
-        if (s->drive_kind == SCSI_CD && (req->cmd.buf[4] & 2)) {
+        if (s->qdev.type == TYPE_ROM && (req->cmd.buf[4] & 2)) {
             /* load/eject medium */
             bdrv_eject(s->bs, !(req->cmd.buf[4] & 1));
         }
@@ -1183,7 +1179,7 @@ static void scsi_destroy(SCSIDevice *dev)
     blockdev_mark_auto_del(s->qdev.conf.bs);
 }
 
-static int scsi_initfn(SCSIDevice *dev, SCSIDriveKind kind)
+static int scsi_initfn(SCSIDevice *dev, uint8_t scsi_type)
 {
     SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, dev);
     DriveInfo *dinfo;
@@ -1193,9 +1189,8 @@ static int scsi_initfn(SCSIDevice *dev, SCSIDriveKind kind)
         return -1;
     }
     s->bs = s->qdev.conf.bs;
-    s->drive_kind = kind;
 
-    if (kind == SCSI_HD && !bdrv_is_inserted(s->bs)) {
+    if (scsi_type == TYPE_DISK && !bdrv_is_inserted(s->bs)) {
         error_report("Device needs media, but drive is empty");
         return -1;
     }
@@ -1217,44 +1212,47 @@ static int scsi_initfn(SCSIDevice *dev, SCSIDriveKind kind)
         return -1;
     }
 
-    if (kind == SCSI_CD) {
+    if (scsi_type == TYPE_ROM) {
         s->qdev.blocksize = 2048;
-    } else {
+    } else if (scsi_type == TYPE_DISK) {
         s->qdev.blocksize = s->qdev.conf.logical_block_size;
+    } else {
+        error_report("scsi-disk: Unhandled SCSI type %02x", scsi_type);
+        return -1;
     }
     s->cluster_size = s->qdev.blocksize / 512;
     s->bs->buffer_alignment = s->qdev.blocksize;
 
-    s->qdev.type = TYPE_DISK;
+    s->qdev.type = scsi_type;
     qemu_add_vm_change_state_handler(scsi_dma_restart_cb, s);
-    bdrv_set_removable(s->bs, kind == SCSI_CD);
+    bdrv_set_removable(s->bs, scsi_type == TYPE_ROM);
     add_boot_device_path(s->qdev.conf.bootindex, &dev->qdev, ",0");
     return 0;
 }
 
 static int scsi_hd_initfn(SCSIDevice *dev)
 {
-    return scsi_initfn(dev, SCSI_HD);
+    return scsi_initfn(dev, TYPE_DISK);
 }
 
 static int scsi_cd_initfn(SCSIDevice *dev)
 {
-    return scsi_initfn(dev, SCSI_CD);
+    return scsi_initfn(dev, TYPE_ROM);
 }
 
 static int scsi_disk_initfn(SCSIDevice *dev)
 {
-    SCSIDriveKind kind;
     DriveInfo *dinfo;
+    uint8_t scsi_type;
 
     if (!dev->conf.bs) {
-        kind = SCSI_HD;         /* will die in scsi_initfn() */
+        scsi_type = TYPE_DISK;  /* will die in scsi_initfn() */
     } else {
         dinfo = drive_get_by_blockdev(dev->conf.bs);
-        kind = dinfo->media_cd ? SCSI_CD : SCSI_HD;
+        scsi_type = dinfo->media_cd ? TYPE_ROM : TYPE_DISK;
     }
 
-    return scsi_initfn(dev, kind);
+    return scsi_initfn(dev, scsi_type);
 }
 
 #define DEFINE_SCSI_DISK_PROPERTIES()                           \
-- 
cgit v1.1


From 5bf3f8e4f71e46f80b76dc8a03cc0c37cad5cde0 Mon Sep 17 00:00:00 2001
From: Frediano Ziglio <freddy77@gmail.com>
Date: Wed, 27 Jul 2011 20:12:00 +0200
Subject: block: Removed unused function bdrv_write_sync

Signed-off-by: Frediano Ziglio <freddy77@gmail.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c | 13 -------------
 block.h |  2 --
 2 files changed, 15 deletions(-)

diff --git a/block.c b/block.c
index 8859f9b..4c66b2c 100644
--- a/block.c
+++ b/block.c
@@ -1110,19 +1110,6 @@ int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
     return 0;
 }
 
-/*
- * Writes to the file and ensures that no writes are reordered across this
- * request (acts as a barrier)
- *
- * Returns 0 on success, -errno in error cases.
- */
-int bdrv_write_sync(BlockDriverState *bs, int64_t sector_num,
-    const uint8_t *buf, int nb_sectors)
-{
-    return bdrv_pwrite_sync(bs, BDRV_SECTOR_SIZE * sector_num,
-        buf, BDRV_SECTOR_SIZE * nb_sectors);
-}
-
 /**
  * Truncate file to 'offset' bytes (needed only for file protocols)
  */
diff --git a/block.h b/block.h
index 59cc410..e672bc6 100644
--- a/block.h
+++ b/block.h
@@ -85,8 +85,6 @@ int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                 const void *buf, int count);
 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
     const void *buf, int count);
-int bdrv_write_sync(BlockDriverState *bs, int64_t sector_num,
-    const uint8_t *buf, int nb_sectors);
 int bdrv_truncate(BlockDriverState *bs, int64_t offset);
 int64_t bdrv_getlength(BlockDriverState *bs);
 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs);
-- 
cgit v1.1


From c1ee7d56f3f2c8c5fb90452f4df72fa402f61527 Mon Sep 17 00:00:00 2001
From: Frediano Ziglio <freddy77@gmail.com>
Date: Wed, 27 Jul 2011 20:12:01 +0200
Subject: raw-posix: Typo fix

Signed-off-by: Frediano Ziglio <freddy77@gmail.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/raw-posix.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/raw-posix.c b/block/raw-posix.c
index 6672d31..6dd7086 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -587,7 +587,7 @@ static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs,
 
     /*
      * If O_DIRECT is used the buffer needs to be aligned on a sector
-     * boundary.  Check if this is the case or telll the low-level
+     * boundary.  Check if this is the case or tell the low-level
      * driver that it needs to copy the buffer.
      */
     if (s->aligned_buf) {
-- 
cgit v1.1


From f6e8ffc22fe153ba981f2747e4c52ea7e55f6ecc Mon Sep 17 00:00:00 2001
From: Frediano Ziglio <freddy77@gmail.com>
Date: Wed, 27 Jul 2011 20:12:02 +0200
Subject: raw-posix: Always check paio_init result

Signed-off-by: Frediano Ziglio <freddy77@gmail.com>
Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/raw-posix.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/block/raw-posix.c b/block/raw-posix.c
index 6dd7086..c5c9944 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -230,13 +230,15 @@ static int raw_open_common(BlockDriverState *bs, const char *filename,
         }
     }
 
+    /* We're falling back to POSIX AIO in some cases so init always */
+    if (paio_init() < 0) {
+        goto out_free_buf;
+    }
+
 #ifdef CONFIG_LINUX_AIO
     if ((bdrv_flags & (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) ==
                       (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) {
 
-        /* We're falling back to POSIX AIO in some cases */
-        paio_init();
-
         s->aio_ctx = laio_init();
         if (!s->aio_ctx) {
             goto out_free_buf;
@@ -245,9 +247,6 @@ static int raw_open_common(BlockDriverState *bs, const char *filename,
     } else
 #endif
     {
-        if (paio_init() < 0) {
-            goto out_free_buf;
-        }
 #ifdef CONFIG_LINUX_AIO
         s->use_aio = 0;
 #endif
-- 
cgit v1.1


From 00dccaf1f848290d979a4b1e6248281ce1b32aaa Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Mon, 17 Jan 2011 16:08:14 +0000
Subject: coroutine: introduce coroutines

Asynchronous code is becoming very complex.  At the same time
synchronous code is growing because it is convenient to write.
Sometimes duplicate code paths are even added, one synchronous and the
other asynchronous.  This patch introduces coroutines which allow code
that looks synchronous but is asynchronous under the covers.

A coroutine has its own stack and is therefore able to preserve state
across blocking operations, which traditionally require callback
functions and manual marshalling of parameters.

Creating and starting a coroutine is easy:

  coroutine = qemu_coroutine_create(my_coroutine);
  qemu_coroutine_enter(coroutine, my_data);

The coroutine then executes until it returns or yields:

  void coroutine_fn my_coroutine(void *opaque) {
      MyData *my_data = opaque;

      /* do some work */

      qemu_coroutine_yield();

      /* do some more work */
  }

Yielding switches control back to the caller of qemu_coroutine_enter().
This is typically used to switch back to the main thread's event loop
after issuing an asynchronous I/O request.  The request callback will
then invoke qemu_coroutine_enter() once more to switch back to the
coroutine.

Note that if coroutines are used only from threads which hold the global
mutex they will never execute concurrently.  This makes programming with
coroutines easier than with threads.  Race conditions cannot occur since
only one coroutine may be active at any time.  Other coroutines can only
run across yield.

This coroutines implementation is based on the gtk-vnc implementation
written by Anthony Liguori <anthony@codemonkey.ws> but it has been
significantly rewritten by Kevin Wolf <kwolf@redhat.com> to use
setjmp()/longjmp() instead of the more expensive swapcontext() and by
Paolo Bonzini <pbonzini@redhat.com> for Windows Fibers support.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
---
 Makefile.objs        |   7 ++
 coroutine-ucontext.c | 230 +++++++++++++++++++++++++++++++++++++++++++++++++++
 coroutine-win32.c    |  92 +++++++++++++++++++++
 qemu-coroutine-int.h |  48 +++++++++++
 qemu-coroutine.c     |  75 +++++++++++++++++
 qemu-coroutine.h     |  95 +++++++++++++++++++++
 trace-events         |   5 ++
 7 files changed, 552 insertions(+)
 create mode 100644 coroutine-ucontext.c
 create mode 100644 coroutine-win32.c
 create mode 100644 qemu-coroutine-int.h
 create mode 100644 qemu-coroutine.c
 create mode 100644 qemu-coroutine.h

diff --git a/Makefile.objs b/Makefile.objs
index 6991a9f..28e1762 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -11,6 +11,12 @@ oslib-obj-$(CONFIG_WIN32) += oslib-win32.o qemu-thread-win32.o
 oslib-obj-$(CONFIG_POSIX) += oslib-posix.o qemu-thread-posix.o
 
 #######################################################################
+# coroutines
+coroutine-obj-y = qemu-coroutine.o
+coroutine-obj-$(CONFIG_POSIX) += coroutine-ucontext.o
+coroutine-obj-$(CONFIG_WIN32) += coroutine-win32.o
+
+#######################################################################
 # block-obj-y is code used by both qemu system emulation and qemu-img
 
 block-obj-y = cutils.o cache-utils.o qemu-malloc.o qemu-option.o module.o async.o
@@ -69,6 +75,7 @@ common-obj-y += readline.o console.o cursor.o qemu-error.o
 common-obj-y += $(oslib-obj-y)
 common-obj-$(CONFIG_WIN32) += os-win32.o
 common-obj-$(CONFIG_POSIX) += os-posix.o
+common-obj-y += $(coroutine-obj-y)
 
 common-obj-y += tcg-runtime.o host-utils.o
 common-obj-y += irq.o ioport.o input.o
diff --git a/coroutine-ucontext.c b/coroutine-ucontext.c
new file mode 100644
index 0000000..41c2379
--- /dev/null
+++ b/coroutine-ucontext.c
@@ -0,0 +1,230 @@
+/*
+ * ucontext coroutine initialization code
+ *
+ * Copyright (C) 2006  Anthony Liguori <anthony@codemonkey.ws>
+ * Copyright (C) 2011  Kevin Wolf <kwolf@redhat.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.0 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* XXX Is there a nicer way to disable glibc's stack check for longjmp? */
+#ifdef _FORTIFY_SOURCE
+#undef _FORTIFY_SOURCE
+#endif
+#include <stdlib.h>
+#include <setjmp.h>
+#include <stdint.h>
+#include <pthread.h>
+#include <ucontext.h>
+#include "qemu-common.h"
+#include "qemu-coroutine-int.h"
+
+enum {
+    /* Maximum free pool size prevents holding too many freed coroutines */
+    POOL_MAX_SIZE = 64,
+};
+
+typedef struct {
+    Coroutine base;
+    void *stack;
+    jmp_buf env;
+} CoroutineUContext;
+
+/**
+ * Per-thread coroutine bookkeeping
+ */
+typedef struct {
+    /** Currently executing coroutine */
+    Coroutine *current;
+
+    /** Free list to speed up creation */
+    QLIST_HEAD(, Coroutine) pool;
+    unsigned int pool_size;
+
+    /** The default coroutine */
+    CoroutineUContext leader;
+} CoroutineThreadState;
+
+static pthread_key_t thread_state_key;
+
+/*
+ * va_args to makecontext() must be type 'int', so passing
+ * the pointer we need may require several int args. This
+ * union is a quick hack to let us do that
+ */
+union cc_arg {
+    void *p;
+    int i[2];
+};
+
+static CoroutineThreadState *coroutine_get_thread_state(void)
+{
+    CoroutineThreadState *s = pthread_getspecific(thread_state_key);
+
+    if (!s) {
+        s = qemu_mallocz(sizeof(*s));
+        s->current = &s->leader.base;
+        QLIST_INIT(&s->pool);
+        pthread_setspecific(thread_state_key, s);
+    }
+    return s;
+}
+
+static void qemu_coroutine_thread_cleanup(void *opaque)
+{
+    CoroutineThreadState *s = opaque;
+    Coroutine *co;
+    Coroutine *tmp;
+
+    QLIST_FOREACH_SAFE(co, &s->pool, pool_next, tmp) {
+        qemu_free(DO_UPCAST(CoroutineUContext, base, co)->stack);
+        qemu_free(co);
+    }
+    qemu_free(s);
+}
+
+static void __attribute__((constructor)) coroutine_init(void)
+{
+    int ret;
+
+    ret = pthread_key_create(&thread_state_key, qemu_coroutine_thread_cleanup);
+    if (ret != 0) {
+        fprintf(stderr, "unable to create leader key: %s\n", strerror(errno));
+        abort();
+    }
+}
+
+static void coroutine_trampoline(int i0, int i1)
+{
+    union cc_arg arg;
+    CoroutineUContext *self;
+    Coroutine *co;
+
+    arg.i[0] = i0;
+    arg.i[1] = i1;
+    self = arg.p;
+    co = &self->base;
+
+    /* Initialize longjmp environment and switch back the caller */
+    if (!setjmp(self->env)) {
+        longjmp(*(jmp_buf *)co->entry_arg, 1);
+    }
+
+    while (true) {
+        co->entry(co->entry_arg);
+        qemu_coroutine_switch(co, co->caller, COROUTINE_TERMINATE);
+    }
+}
+
+static Coroutine *coroutine_new(void)
+{
+    const size_t stack_size = 1 << 20;
+    CoroutineUContext *co;
+    ucontext_t old_uc, uc;
+    jmp_buf old_env;
+    union cc_arg arg;
+
+    /* The ucontext functions preserve signal masks which incurs a system call
+     * overhead.  setjmp()/longjmp() does not preserve signal masks but only
+     * works on the current stack.  Since we need a way to create and switch to
+     * a new stack, use the ucontext functions for that but setjmp()/longjmp()
+     * for everything else.
+     */
+
+    if (getcontext(&uc) == -1) {
+        abort();
+    }
+
+    co = qemu_mallocz(sizeof(*co));
+    co->stack = qemu_malloc(stack_size);
+    co->base.entry_arg = &old_env; /* stash away our jmp_buf */
+
+    uc.uc_link = &old_uc;
+    uc.uc_stack.ss_sp = co->stack;
+    uc.uc_stack.ss_size = stack_size;
+    uc.uc_stack.ss_flags = 0;
+
+    arg.p = co;
+
+    makecontext(&uc, (void (*)(void))coroutine_trampoline,
+                2, arg.i[0], arg.i[1]);
+
+    /* swapcontext() in, longjmp() back out */
+    if (!setjmp(old_env)) {
+        swapcontext(&old_uc, &uc);
+    }
+    return &co->base;
+}
+
+Coroutine *qemu_coroutine_new(void)
+{
+    CoroutineThreadState *s = coroutine_get_thread_state();
+    Coroutine *co;
+
+    co = QLIST_FIRST(&s->pool);
+    if (co) {
+        QLIST_REMOVE(co, pool_next);
+        s->pool_size--;
+    } else {
+        co = coroutine_new();
+    }
+    return co;
+}
+
+void qemu_coroutine_delete(Coroutine *co_)
+{
+    CoroutineThreadState *s = coroutine_get_thread_state();
+    CoroutineUContext *co = DO_UPCAST(CoroutineUContext, base, co_);
+
+    if (s->pool_size < POOL_MAX_SIZE) {
+        QLIST_INSERT_HEAD(&s->pool, &co->base, pool_next);
+        co->base.caller = NULL;
+        s->pool_size++;
+        return;
+    }
+
+    qemu_free(co->stack);
+    qemu_free(co);
+}
+
+CoroutineAction qemu_coroutine_switch(Coroutine *from_, Coroutine *to_,
+                                      CoroutineAction action)
+{
+    CoroutineUContext *from = DO_UPCAST(CoroutineUContext, base, from_);
+    CoroutineUContext *to = DO_UPCAST(CoroutineUContext, base, to_);
+    CoroutineThreadState *s = coroutine_get_thread_state();
+    int ret;
+
+    s->current = to_;
+
+    ret = setjmp(from->env);
+    if (ret == 0) {
+        longjmp(to->env, action);
+    }
+    return ret;
+}
+
+Coroutine *qemu_coroutine_self(void)
+{
+    CoroutineThreadState *s = coroutine_get_thread_state();
+
+    return s->current;
+}
+
+bool qemu_in_coroutine(void)
+{
+    CoroutineThreadState *s = pthread_getspecific(thread_state_key);
+
+    return s && s->current->caller;
+}
diff --git a/coroutine-win32.c b/coroutine-win32.c
new file mode 100644
index 0000000..0e29448
--- /dev/null
+++ b/coroutine-win32.c
@@ -0,0 +1,92 @@
+/*
+ * Win32 coroutine initialization code
+ *
+ * Copyright (c) 2011 Kevin Wolf <kwolf@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu-common.h"
+#include "qemu-coroutine-int.h"
+
+typedef struct
+{
+    Coroutine base;
+
+    LPVOID fiber;
+    CoroutineAction action;
+} CoroutineWin32;
+
+static __thread CoroutineWin32 leader;
+static __thread Coroutine *current;
+
+CoroutineAction qemu_coroutine_switch(Coroutine *from_, Coroutine *to_,
+                                      CoroutineAction action)
+{
+    CoroutineWin32 *from = DO_UPCAST(CoroutineWin32, base, from_);
+    CoroutineWin32 *to = DO_UPCAST(CoroutineWin32, base, to_);
+
+    current = to_;
+
+    to->action = action;
+    SwitchToFiber(to->fiber);
+    return from->action;
+}
+
+static void CALLBACK coroutine_trampoline(void *co_)
+{
+    Coroutine *co = co_;
+
+    while (true) {
+        co->entry(co->entry_arg);
+        qemu_coroutine_switch(co, co->caller, COROUTINE_TERMINATE);
+    }
+}
+
+Coroutine *qemu_coroutine_new(void)
+{
+    const size_t stack_size = 1 << 20;
+    CoroutineWin32 *co;
+
+    co = qemu_mallocz(sizeof(*co));
+    co->fiber = CreateFiber(stack_size, coroutine_trampoline, &co->base);
+    return &co->base;
+}
+
+void qemu_coroutine_delete(Coroutine *co_)
+{
+    CoroutineWin32 *co = DO_UPCAST(CoroutineWin32, base, co_);
+
+    DeleteFiber(co->fiber);
+    qemu_free(co);
+}
+
+Coroutine *qemu_coroutine_self(void)
+{
+    if (!current) {
+        current = &leader.base;
+        leader.fiber = ConvertThreadToFiber(NULL);
+    }
+    return current;
+}
+
+bool qemu_in_coroutine(void)
+{
+    return current && current->caller;
+}
diff --git a/qemu-coroutine-int.h b/qemu-coroutine-int.h
new file mode 100644
index 0000000..64915c2
--- /dev/null
+++ b/qemu-coroutine-int.h
@@ -0,0 +1,48 @@
+/*
+ * Coroutine internals
+ *
+ * Copyright (c) 2011 Kevin Wolf <kwolf@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef QEMU_COROUTINE_INT_H
+#define QEMU_COROUTINE_INT_H
+
+#include "qemu-queue.h"
+#include "qemu-coroutine.h"
+
+typedef enum {
+    COROUTINE_YIELD = 1,
+    COROUTINE_TERMINATE = 2,
+} CoroutineAction;
+
+struct Coroutine {
+    CoroutineEntry *entry;
+    void *entry_arg;
+    Coroutine *caller;
+    QLIST_ENTRY(Coroutine) pool_next;
+};
+
+Coroutine *qemu_coroutine_new(void);
+void qemu_coroutine_delete(Coroutine *co);
+CoroutineAction qemu_coroutine_switch(Coroutine *from, Coroutine *to,
+                                      CoroutineAction action);
+
+#endif
diff --git a/qemu-coroutine.c b/qemu-coroutine.c
new file mode 100644
index 0000000..600be26
--- /dev/null
+++ b/qemu-coroutine.c
@@ -0,0 +1,75 @@
+/*
+ * QEMU coroutines
+ *
+ * Copyright IBM, Corp. 2011
+ *
+ * Authors:
+ *  Stefan Hajnoczi    <stefanha@linux.vnet.ibm.com>
+ *  Kevin Wolf         <kwolf@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "trace.h"
+#include "qemu-common.h"
+#include "qemu-coroutine.h"
+#include "qemu-coroutine-int.h"
+
+Coroutine *qemu_coroutine_create(CoroutineEntry *entry)
+{
+    Coroutine *co = qemu_coroutine_new();
+    co->entry = entry;
+    return co;
+}
+
+static void coroutine_swap(Coroutine *from, Coroutine *to)
+{
+    CoroutineAction ret;
+
+    ret = qemu_coroutine_switch(from, to, COROUTINE_YIELD);
+
+    switch (ret) {
+    case COROUTINE_YIELD:
+        return;
+    case COROUTINE_TERMINATE:
+        trace_qemu_coroutine_terminate(to);
+        qemu_coroutine_delete(to);
+        return;
+    default:
+        abort();
+    }
+}
+
+void qemu_coroutine_enter(Coroutine *co, void *opaque)
+{
+    Coroutine *self = qemu_coroutine_self();
+
+    trace_qemu_coroutine_enter(self, co, opaque);
+
+    if (co->caller) {
+        fprintf(stderr, "Co-routine re-entered recursively\n");
+        abort();
+    }
+
+    co->caller = self;
+    co->entry_arg = opaque;
+    coroutine_swap(self, co);
+}
+
+void coroutine_fn qemu_coroutine_yield(void)
+{
+    Coroutine *self = qemu_coroutine_self();
+    Coroutine *to = self->caller;
+
+    trace_qemu_coroutine_yield(self, to);
+
+    if (!to) {
+        fprintf(stderr, "Co-routine is yielding to no one\n");
+        abort();
+    }
+
+    self->caller = NULL;
+    coroutine_swap(self, to);
+}
diff --git a/qemu-coroutine.h b/qemu-coroutine.h
new file mode 100644
index 0000000..08255c7
--- /dev/null
+++ b/qemu-coroutine.h
@@ -0,0 +1,95 @@
+/*
+ * QEMU coroutine implementation
+ *
+ * Copyright IBM, Corp. 2011
+ *
+ * Authors:
+ *  Stefan Hajnoczi    <stefanha@linux.vnet.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#ifndef QEMU_COROUTINE_H
+#define QEMU_COROUTINE_H
+
+#include <stdbool.h>
+
+/**
+ * Coroutines are a mechanism for stack switching and can be used for
+ * cooperative userspace threading.  These functions provide a simple but
+ * useful flavor of coroutines that is suitable for writing sequential code,
+ * rather than callbacks, for operations that need to give up control while
+ * waiting for events to complete.
+ *
+ * These functions are re-entrant and may be used outside the global mutex.
+ */
+
+/**
+ * Mark a function that executes in coroutine context
+ *
+ * Functions that execute in coroutine context cannot be called directly from
+ * normal functions.  In the future it would be nice to enable compiler or
+ * static checker support for catching such errors.  This annotation might make
+ * it possible and in the meantime it serves as documentation.
+ *
+ * For example:
+ *
+ *   static void coroutine_fn foo(void) {
+ *       ....
+ *   }
+ */
+#define coroutine_fn
+
+typedef struct Coroutine Coroutine;
+
+/**
+ * Coroutine entry point
+ *
+ * When the coroutine is entered for the first time, opaque is passed in as an
+ * argument.
+ *
+ * When this function returns, the coroutine is destroyed automatically and
+ * execution continues in the caller who last entered the coroutine.
+ */
+typedef void coroutine_fn CoroutineEntry(void *opaque);
+
+/**
+ * Create a new coroutine
+ *
+ * Use qemu_coroutine_enter() to actually transfer control to the coroutine.
+ */
+Coroutine *qemu_coroutine_create(CoroutineEntry *entry);
+
+/**
+ * Transfer control to a coroutine
+ *
+ * The opaque argument is passed as the argument to the entry point when
+ * entering the coroutine for the first time.  It is subsequently ignored.
+ */
+void qemu_coroutine_enter(Coroutine *coroutine, void *opaque);
+
+/**
+ * Transfer control back to a coroutine's caller
+ *
+ * This function does not return until the coroutine is re-entered using
+ * qemu_coroutine_enter().
+ */
+void coroutine_fn qemu_coroutine_yield(void);
+
+/**
+ * Get the currently executing coroutine
+ */
+Coroutine *coroutine_fn qemu_coroutine_self(void);
+
+/**
+ * Return whether or not currently inside a coroutine
+ *
+ * This can be used to write functions that work both when in coroutine context
+ * and when not in coroutine context.  Note that such functions cannot use the
+ * coroutine_fn annotation since they work outside coroutine context.
+ */
+bool qemu_in_coroutine(void);
+
+#endif /* QEMU_COROUTINE_H */
diff --git a/trace-events b/trace-events
index 713f042..136f775 100644
--- a/trace-events
+++ b/trace-events
@@ -425,3 +425,8 @@ disable qemu_put_ram_ptr(void* addr) "%p"
 
 # hw/xen_platform.c
 disable xen_platform_log(char *s) "xen platform: %s"
+
+# qemu-coroutine.c
+disable qemu_coroutine_enter(void *from, void *to, void *opaque) "from %p to %p opaque %p"
+disable qemu_coroutine_yield(void *from, void *to) "from %p to %p"
+disable qemu_coroutine_terminate(void *co) "self %p"
-- 
cgit v1.1


From d0e2fce5366bff40cf2bf48d2f6ff72b4d8de124 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Thu, 9 Jun 2011 23:11:06 +0530
Subject: coroutine: implement coroutines using gthread
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On platforms that don't support makecontext(3) use gthread based
coroutine implementation.

Darwin has makecontext(3) but getcontext(3) is stubbed out to return
ENOTSUP.  Andreas Färber <andreas.faerber@web.de> debugged this and
contributed the ./configure test which solves the issue for Darwin/ppc64
(and ppc) v10.5.

[Original patch by Aneesh, made consistent with coroutine-ucontext.c and
switched to GStaticPrivate by Stefan.  Tested on Linux and OpenBSD.]

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
---
 Makefile.objs       |   4 ++
 configure           |  18 ++++++++
 coroutine-gthread.c | 131 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 153 insertions(+)
 create mode 100644 coroutine-gthread.c

diff --git a/Makefile.objs b/Makefile.objs
index 28e1762..5679e1f 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -13,7 +13,11 @@ oslib-obj-$(CONFIG_POSIX) += oslib-posix.o qemu-thread-posix.o
 #######################################################################
 # coroutines
 coroutine-obj-y = qemu-coroutine.o
+ifeq ($(CONFIG_UCONTEXT_COROUTINE),y)
 coroutine-obj-$(CONFIG_POSIX) += coroutine-ucontext.o
+else
+coroutine-obj-$(CONFIG_POSIX) += coroutine-gthread.o
+endif
 coroutine-obj-$(CONFIG_WIN32) += coroutine-win32.o
 
 #######################################################################
diff --git a/configure b/configure
index 77194cf..1eed0cd 100755
--- a/configure
+++ b/configure
@@ -2541,6 +2541,20 @@ EOF
 fi
 
 ##########################################
+# check if we have makecontext
+
+ucontext_coroutine=no
+if test "$darwin" != "yes"; then
+  cat > $TMPC << EOF
+#include <ucontext.h>
+int main(void) { makecontext(0, 0, 0); }
+EOF
+  if compile_prog "" "" ; then
+      ucontext_coroutine=yes
+  fi
+fi
+
+##########################################
 # End of CC checks
 # After here, no more $cc or $ld runs
 
@@ -3015,6 +3029,10 @@ if test "$rbd" = "yes" ; then
   echo "CONFIG_RBD=y" >> $config_host_mak
 fi
 
+if test "$ucontext_coroutine" = "yes" ; then
+  echo "CONFIG_UCONTEXT_COROUTINE=y" >> $config_host_mak
+fi
+
 # USB host support
 case "$usb" in
 linux)
diff --git a/coroutine-gthread.c b/coroutine-gthread.c
new file mode 100644
index 0000000..f09877e
--- /dev/null
+++ b/coroutine-gthread.c
@@ -0,0 +1,131 @@
+/*
+ * GThread coroutine initialization code
+ *
+ * Copyright (C) 2006  Anthony Liguori <anthony@codemonkey.ws>
+ * Copyright (C) 2011  Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.0 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <glib.h>
+#include "qemu-common.h"
+#include "qemu-coroutine-int.h"
+
+typedef struct {
+    Coroutine base;
+    GThread *thread;
+    bool runnable;
+    CoroutineAction action;
+} CoroutineGThread;
+
+static GCond *coroutine_cond;
+static GStaticMutex coroutine_lock = G_STATIC_MUTEX_INIT;
+static GStaticPrivate coroutine_key = G_STATIC_PRIVATE_INIT;
+
+static void __attribute__((constructor)) coroutine_init(void)
+{
+    if (!g_thread_supported()) {
+        g_thread_init(NULL);
+    }
+
+    coroutine_cond = g_cond_new();
+}
+
+static void coroutine_wait_runnable_locked(CoroutineGThread *co)
+{
+    while (!co->runnable) {
+        g_cond_wait(coroutine_cond, g_static_mutex_get_mutex(&coroutine_lock));
+    }
+}
+
+static void coroutine_wait_runnable(CoroutineGThread *co)
+{
+    g_static_mutex_lock(&coroutine_lock);
+    coroutine_wait_runnable_locked(co);
+    g_static_mutex_unlock(&coroutine_lock);
+}
+
+static gpointer coroutine_thread(gpointer opaque)
+{
+    CoroutineGThread *co = opaque;
+
+    g_static_private_set(&coroutine_key, co, NULL);
+    coroutine_wait_runnable(co);
+    co->base.entry(co->base.entry_arg);
+    qemu_coroutine_switch(&co->base, co->base.caller, COROUTINE_TERMINATE);
+    return NULL;
+}
+
+Coroutine *qemu_coroutine_new(void)
+{
+    CoroutineGThread *co;
+
+    co = qemu_mallocz(sizeof(*co));
+    co->thread = g_thread_create_full(coroutine_thread, co, 0, TRUE, TRUE,
+                                      G_THREAD_PRIORITY_NORMAL, NULL);
+    if (!co->thread) {
+        qemu_free(co);
+        return NULL;
+    }
+    return &co->base;
+}
+
+void qemu_coroutine_delete(Coroutine *co_)
+{
+    CoroutineGThread *co = DO_UPCAST(CoroutineGThread, base, co_);
+
+    g_thread_join(co->thread);
+    qemu_free(co);
+}
+
+CoroutineAction qemu_coroutine_switch(Coroutine *from_,
+                                      Coroutine *to_,
+                                      CoroutineAction action)
+{
+    CoroutineGThread *from = DO_UPCAST(CoroutineGThread, base, from_);
+    CoroutineGThread *to = DO_UPCAST(CoroutineGThread, base, to_);
+
+    g_static_mutex_lock(&coroutine_lock);
+    from->runnable = false;
+    from->action = action;
+    to->runnable = true;
+    to->action = action;
+    g_cond_broadcast(coroutine_cond);
+
+    if (action != COROUTINE_TERMINATE) {
+        coroutine_wait_runnable_locked(from);
+    }
+    g_static_mutex_unlock(&coroutine_lock);
+    return from->action;
+}
+
+Coroutine *qemu_coroutine_self(void)
+{
+    CoroutineGThread *co = g_static_private_get(&coroutine_key);
+
+    if (!co) {
+        co = qemu_mallocz(sizeof(*co));
+        co->runnable = true;
+        g_static_private_set(&coroutine_key, co, (GDestroyNotify)qemu_free);
+    }
+
+    return &co->base;
+}
+
+bool qemu_in_coroutine(void)
+{
+    CoroutineGThread *co = g_static_private_get(&coroutine_key);
+
+    return co && co->base.caller;
+}
-- 
cgit v1.1


From aa7ee42ed3b2c196de9f390fc9a5e339dd7417e3 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Date: Tue, 10 May 2011 10:21:18 +0100
Subject: coroutine: add test-coroutine automated tests

To run automated tests for coroutines:

  make test-coroutine
  ./test-coroutine

On success the program terminates with exit status 0.  On failure an
error message is written to stderr and the program exits with exit
status 1.

Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
---
 .gitignore       |   1 +
 Makefile         |   3 +-
 test-coroutine.c | 162 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 165 insertions(+), 1 deletion(-)
 create mode 100644 test-coroutine.c

diff --git a/.gitignore b/.gitignore
index 54835bc..59c343c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -36,6 +36,7 @@ qemu-io
 qemu-ga
 qemu-monitor.texi
 QMP/qmp-commands.txt
+test-coroutine
 .gdbinit
 *.a
 *.aux
diff --git a/Makefile b/Makefile
index 4855251..2becedc 100644
--- a/Makefile
+++ b/Makefile
@@ -151,7 +151,7 @@ qemu-io$(EXESUF): qemu-io.o cmd.o qemu-tool.o qemu-error.o $(oslib-obj-y) $(trac
 qemu-img-cmds.h: $(SRC_PATH)/qemu-img-cmds.hx
 	$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -h < $< > $@,"  GEN   $@")
 
-check-qint.o check-qstring.o check-qdict.o check-qlist.o check-qfloat.o check-qjson.o: $(GENERATED_HEADERS)
+check-qint.o check-qstring.o check-qdict.o check-qlist.o check-qfloat.o check-qjson.o test-coroutine.o: $(GENERATED_HEADERS)
 
 CHECK_PROG_DEPS = qemu-malloc.o $(oslib-obj-y) $(trace-obj-y) qemu-tool.o
 
@@ -161,6 +161,7 @@ check-qdict: check-qdict.o qdict.o qfloat.o qint.o qstring.o qbool.o qlist.o $(C
 check-qlist: check-qlist.o qlist.o qint.o $(CHECK_PROG_DEPS)
 check-qfloat: check-qfloat.o qfloat.o $(CHECK_PROG_DEPS)
 check-qjson: check-qjson.o qfloat.o qint.o qdict.o qstring.o qlist.o qbool.o qjson.o json-streamer.o json-lexer.o json-parser.o error.o qerror.o qemu-error.o $(CHECK_PROG_DEPS)
+test-coroutine: test-coroutine.o qemu-timer-common.o async.o $(coroutine-obj-y) $(CHECK_PROG_DEPS)
 
 $(qapi-obj-y): $(GENERATED_HEADERS)
 qapi-dir := qapi-generated
diff --git a/test-coroutine.c b/test-coroutine.c
new file mode 100644
index 0000000..9e9d3c9
--- /dev/null
+++ b/test-coroutine.c
@@ -0,0 +1,162 @@
+/*
+ * Coroutine tests
+ *
+ * Copyright IBM, Corp. 2011
+ *
+ * Authors:
+ *  Stefan Hajnoczi    <stefanha@linux.vnet.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include <glib.h>
+#include "qemu-coroutine.h"
+
+/*
+ * Check that qemu_in_coroutine() works
+ */
+
+static void coroutine_fn verify_in_coroutine(void *opaque)
+{
+    g_assert(qemu_in_coroutine());
+}
+
+static void test_in_coroutine(void)
+{
+    Coroutine *coroutine;
+
+    g_assert(!qemu_in_coroutine());
+
+    coroutine = qemu_coroutine_create(verify_in_coroutine);
+    qemu_coroutine_enter(coroutine, NULL);
+}
+
+/*
+ * Check that qemu_coroutine_self() works
+ */
+
+static void coroutine_fn verify_self(void *opaque)
+{
+    g_assert(qemu_coroutine_self() == opaque);
+}
+
+static void test_self(void)
+{
+    Coroutine *coroutine;
+
+    coroutine = qemu_coroutine_create(verify_self);
+    qemu_coroutine_enter(coroutine, coroutine);
+}
+
+/*
+ * Check that coroutines may nest multiple levels
+ */
+
+typedef struct {
+    unsigned int n_enter;   /* num coroutines entered */
+    unsigned int n_return;  /* num coroutines returned */
+    unsigned int max;       /* maximum level of nesting */
+} NestData;
+
+static void coroutine_fn nest(void *opaque)
+{
+    NestData *nd = opaque;
+
+    nd->n_enter++;
+
+    if (nd->n_enter < nd->max) {
+        Coroutine *child;
+
+        child = qemu_coroutine_create(nest);
+        qemu_coroutine_enter(child, nd);
+    }
+
+    nd->n_return++;
+}
+
+static void test_nesting(void)
+{
+    Coroutine *root;
+    NestData nd = {
+        .n_enter  = 0,
+        .n_return = 0,
+        .max      = 128,
+    };
+
+    root = qemu_coroutine_create(nest);
+    qemu_coroutine_enter(root, &nd);
+
+    /* Must enter and return from max nesting level */
+    g_assert_cmpint(nd.n_enter, ==, nd.max);
+    g_assert_cmpint(nd.n_return, ==, nd.max);
+}
+
+/*
+ * Check that yield/enter transfer control correctly
+ */
+
+static void coroutine_fn yield_5_times(void *opaque)
+{
+    bool *done = opaque;
+    int i;
+
+    for (i = 0; i < 5; i++) {
+        qemu_coroutine_yield();
+    }
+    *done = true;
+}
+
+static void test_yield(void)
+{
+    Coroutine *coroutine;
+    bool done = false;
+    int i = -1; /* one extra time to return from coroutine */
+
+    coroutine = qemu_coroutine_create(yield_5_times);
+    while (!done) {
+        qemu_coroutine_enter(coroutine, &done);
+        i++;
+    }
+    g_assert_cmpint(i, ==, 5); /* coroutine must yield 5 times */
+}
+
+/*
+ * Check that creation, enter, and return work
+ */
+
+static void coroutine_fn set_and_exit(void *opaque)
+{
+    bool *done = opaque;
+
+    *done = true;
+}
+
+static void test_lifecycle(void)
+{
+    Coroutine *coroutine;
+    bool done = false;
+
+    /* Create, enter, and return from coroutine */
+    coroutine = qemu_coroutine_create(set_and_exit);
+    qemu_coroutine_enter(coroutine, &done);
+    g_assert(done); /* expect done to be true (first time) */
+
+    /* Repeat to check that no state affects this test */
+    done = false;
+    coroutine = qemu_coroutine_create(set_and_exit);
+    qemu_coroutine_enter(coroutine, &done);
+    g_assert(done); /* expect done to be true (second time) */
+}
+
+int main(int argc, char **argv)
+{
+    g_test_init(&argc, &argv, NULL);
+    g_test_add_func("/basic/lifecycle", test_lifecycle);
+    g_test_add_func("/basic/yield", test_yield);
+    g_test_add_func("/basic/nesting", test_nesting);
+    g_test_add_func("/basic/self", test_self);
+    g_test_add_func("/basic/in_coroutine", test_in_coroutine);
+    return g_test_run();
+}
-- 
cgit v1.1


From 5e3840ce24040cbd1957008489cbc136c43ca391 Mon Sep 17 00:00:00 2001
From: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
Date: Thu, 12 May 2011 08:27:39 +0100
Subject: coroutine: add test-coroutine --benchmark-lifecycle

Add a microbenchmark for coroutine create, enter, and return (aka
lifecycle).  This is a useful benchmark because users are expected to
create many coroutines, one per I/O request for example, and we
therefore need to provide good performance in that scenario.

To run:

  make test-coroutine
  ./test-coroutine --benchmark-lifecycle 20000000

This will do 20,000,000 coroutine create, enter, return iterations and
print the resulting time.

Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
---
 test-coroutine.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/test-coroutine.c b/test-coroutine.c
index 9e9d3c9..bf9f3e9 100644
--- a/test-coroutine.c
+++ b/test-coroutine.c
@@ -150,6 +150,33 @@ static void test_lifecycle(void)
     g_assert(done); /* expect done to be true (second time) */
 }
 
+/*
+ * Lifecycle benchmark
+ */
+
+static void coroutine_fn empty_coroutine(void *opaque)
+{
+    /* Do nothing */
+}
+
+static void perf_lifecycle(void)
+{
+    Coroutine *coroutine;
+    unsigned int i, max;
+    double duration;
+
+    max = 1000000;
+
+    g_test_timer_start();
+    for (i = 0; i < max; i++) {
+        coroutine = qemu_coroutine_create(empty_coroutine);
+        qemu_coroutine_enter(coroutine, NULL);
+    }
+    duration = g_test_timer_elapsed();
+
+    g_test_message("Lifecycle %u iterations: %f s\n", max, duration);
+}
+
 int main(int argc, char **argv)
 {
     g_test_init(&argc, &argv, NULL);
@@ -158,5 +185,8 @@ int main(int argc, char **argv)
     g_test_add_func("/basic/nesting", test_nesting);
     g_test_add_func("/basic/self", test_self);
     g_test_add_func("/basic/in_coroutine", test_in_coroutine);
+    if (g_test_perf()) {
+        g_test_add_func("/perf/lifecycle", perf_lifecycle);
+    }
     return g_test_run();
 }
-- 
cgit v1.1


From da1fa91d6cca8a6d3da9c2b222fa485429db297c Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Thu, 14 Jul 2011 17:27:13 +0200
Subject: block: Add bdrv_co_readv/writev

Add new block driver callbacks bdrv_co_readv/writev, which work on a
QEMUIOVector like bdrv_aio_*, but don't need a callback. The function may only
be called inside a coroutine, so a block driver implementing this interface can
yield instead of blocking during I/O.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 Makefile.objs |  2 +-
 block.c       | 45 +++++++++++++++++++++++++++++++++++++++++++++
 block.h       |  5 +++++
 block_int.h   |  6 ++++++
 trace-events  |  2 ++
 5 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/Makefile.objs b/Makefile.objs
index 5679e1f..9549e2a 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -25,6 +25,7 @@ coroutine-obj-$(CONFIG_WIN32) += coroutine-win32.o
 
 block-obj-y = cutils.o cache-utils.o qemu-malloc.o qemu-option.o module.o async.o
 block-obj-y += nbd.o block.o aio.o aes.o qemu-config.o qemu-progress.o qemu-sockets.o
+block-obj-y += $(coroutine-obj-y)
 block-obj-$(CONFIG_POSIX) += posix-aio-compat.o
 block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
 
@@ -79,7 +80,6 @@ common-obj-y += readline.o console.o cursor.o qemu-error.o
 common-obj-y += $(oslib-obj-y)
 common-obj-$(CONFIG_WIN32) += os-win32.o
 common-obj-$(CONFIG_POSIX) += os-posix.o
-common-obj-y += $(coroutine-obj-y)
 
 common-obj-y += tcg-runtime.o host-utils.o
 common-obj-y += irq.o ioport.o input.o
diff --git a/block.c b/block.c
index 4c66b2c..1329299 100644
--- a/block.c
+++ b/block.c
@@ -1110,6 +1110,51 @@ int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
     return 0;
 }
 
+int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
+    int nb_sectors, QEMUIOVector *qiov)
+{
+    BlockDriver *drv = bs->drv;
+
+    trace_bdrv_co_readv(bs, sector_num, nb_sectors);
+
+    if (!drv) {
+        return -ENOMEDIUM;
+    }
+    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
+        return -EIO;
+    }
+
+    return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
+}
+
+int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
+    int nb_sectors, QEMUIOVector *qiov)
+{
+    BlockDriver *drv = bs->drv;
+
+    trace_bdrv_co_writev(bs, sector_num, nb_sectors);
+
+    if (!bs->drv) {
+        return -ENOMEDIUM;
+    }
+    if (bs->read_only) {
+        return -EACCES;
+    }
+    if (bdrv_check_request(bs, sector_num, nb_sectors)) {
+        return -EIO;
+    }
+
+    if (bs->dirty_bitmap) {
+        set_dirty_bitmap(bs, sector_num, nb_sectors, 1);
+    }
+
+    if (bs->wr_highest_sector < sector_num + nb_sectors - 1) {
+        bs->wr_highest_sector = sector_num + nb_sectors - 1;
+    }
+
+    return drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
+}
+
 /**
  * Truncate file to 'offset' bytes (needed only for file protocols)
  */
diff --git a/block.h b/block.h
index e672bc6..a3bfaaf 100644
--- a/block.h
+++ b/block.h
@@ -4,6 +4,7 @@
 #include "qemu-aio.h"
 #include "qemu-common.h"
 #include "qemu-option.h"
+#include "qemu-coroutine.h"
 #include "qobject.h"
 
 /* block.c */
@@ -85,6 +86,10 @@ int bdrv_pwrite(BlockDriverState *bs, int64_t offset,
                 const void *buf, int count);
 int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
     const void *buf, int count);
+int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
+    int nb_sectors, QEMUIOVector *qiov);
+int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
+    int nb_sectors, QEMUIOVector *qiov);
 int bdrv_truncate(BlockDriverState *bs, int64_t offset);
 int64_t bdrv_getlength(BlockDriverState *bs);
 int64_t bdrv_get_allocated_file_size(BlockDriverState *bs);
diff --git a/block_int.h b/block_int.h
index efefbee..f6d02b3 100644
--- a/block_int.h
+++ b/block_int.h
@@ -27,6 +27,7 @@
 #include "block.h"
 #include "qemu-option.h"
 #include "qemu-queue.h"
+#include "qemu-coroutine.h"
 
 #define BLOCK_FLAG_ENCRYPT	1
 #define BLOCK_FLAG_COMPAT6	4
@@ -77,6 +78,11 @@ struct BlockDriver {
     int (*bdrv_discard)(BlockDriverState *bs, int64_t sector_num,
                         int nb_sectors);
 
+    int coroutine_fn (*bdrv_co_readv)(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
+    int coroutine_fn (*bdrv_co_writev)(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
+
     int (*bdrv_aio_multiwrite)(BlockDriverState *bs, BlockRequest *reqs,
         int num_reqs);
     int (*bdrv_merge_requests)(BlockDriverState *bs, BlockRequest* a,
diff --git a/trace-events b/trace-events
index 136f775..46bceca 100644
--- a/trace-events
+++ b/trace-events
@@ -66,6 +66,8 @@ disable bdrv_aio_flush(void *bs, void *opaque) "bs %p opaque %p"
 disable bdrv_aio_readv(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
 disable bdrv_aio_writev(void *bs, int64_t sector_num, int nb_sectors, void *opaque) "bs %p sector_num %"PRId64" nb_sectors %d opaque %p"
 disable bdrv_set_locked(void *bs, int locked) "bs %p locked %d"
+disable bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
+disable bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
 
 # hw/virtio-blk.c
 disable virtio_blk_req_complete(void *req, int status) "req %p status %d"
-- 
cgit v1.1


From 68485420187094c26f86faee5c7f68b5d6a03603 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Thu, 30 Jun 2011 10:05:46 +0200
Subject: block: Emulate AIO functions with bdrv_co_readv/writev

Use the bdrv_co_readv/writev callbacks to implement bdrv_aio_readv/writev and
bdrv_read/write if a driver provides the coroutine version instead of the
synchronous or AIO version.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 97 insertions(+), 1 deletion(-)

diff --git a/block.c b/block.c
index 1329299..0d973e6 100644
--- a/block.c
+++ b/block.c
@@ -28,6 +28,7 @@
 #include "block_int.h"
 #include "module.h"
 #include "qemu-objects.h"
+#include "qemu-coroutine.h"
 
 #ifdef CONFIG_BSD
 #include <sys/types.h>
@@ -57,6 +58,12 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
                         uint8_t *buf, int nb_sectors);
 static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors);
+static BlockDriverAIOCB *bdrv_co_aio_readv_em(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque);
+static BlockDriverAIOCB *bdrv_co_aio_writev_em(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque);
 
 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
     QTAILQ_HEAD_INITIALIZER(bdrv_states);
@@ -169,7 +176,13 @@ void path_combine(char *dest, int dest_size,
 
 void bdrv_register(BlockDriver *bdrv)
 {
-    if (!bdrv->bdrv_aio_readv) {
+    if (bdrv->bdrv_co_readv) {
+        /* Emulate AIO by coroutines, and sync by AIO */
+        bdrv->bdrv_aio_readv = bdrv_co_aio_readv_em;
+        bdrv->bdrv_aio_writev = bdrv_co_aio_writev_em;
+        bdrv->bdrv_read = bdrv_read_em;
+        bdrv->bdrv_write = bdrv_write_em;
+     } else if (!bdrv->bdrv_aio_readv) {
         /* add AIO emulation layer */
         bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
         bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
@@ -2614,6 +2627,89 @@ static BlockDriverAIOCB *bdrv_aio_writev_em(BlockDriverState *bs,
     return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
 }
 
+
+typedef struct BlockDriverAIOCBCoroutine {
+    BlockDriverAIOCB common;
+    BlockRequest req;
+    bool is_write;
+    QEMUBH* bh;
+} BlockDriverAIOCBCoroutine;
+
+static void bdrv_aio_co_cancel_em(BlockDriverAIOCB *blockacb)
+{
+    qemu_aio_flush();
+}
+
+static AIOPool bdrv_em_co_aio_pool = {
+    .aiocb_size         = sizeof(BlockDriverAIOCBCoroutine),
+    .cancel             = bdrv_aio_co_cancel_em,
+};
+
+static void bdrv_co_rw_bh(void *opaque)
+{
+    BlockDriverAIOCBCoroutine *acb = opaque;
+
+    acb->common.cb(acb->common.opaque, acb->req.error);
+    qemu_bh_delete(acb->bh);
+    qemu_aio_release(acb);
+}
+
+static void coroutine_fn bdrv_co_rw(void *opaque)
+{
+    BlockDriverAIOCBCoroutine *acb = opaque;
+    BlockDriverState *bs = acb->common.bs;
+
+    if (!acb->is_write) {
+        acb->req.error = bs->drv->bdrv_co_readv(bs, acb->req.sector,
+            acb->req.nb_sectors, acb->req.qiov);
+    } else {
+        acb->req.error = bs->drv->bdrv_co_writev(bs, acb->req.sector,
+            acb->req.nb_sectors, acb->req.qiov);
+    }
+
+    acb->bh = qemu_bh_new(bdrv_co_rw_bh, acb);
+    qemu_bh_schedule(acb->bh);
+}
+
+static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs,
+                                               int64_t sector_num,
+                                               QEMUIOVector *qiov,
+                                               int nb_sectors,
+                                               BlockDriverCompletionFunc *cb,
+                                               void *opaque,
+                                               bool is_write)
+{
+    Coroutine *co;
+    BlockDriverAIOCBCoroutine *acb;
+
+    acb = qemu_aio_get(&bdrv_em_co_aio_pool, bs, cb, opaque);
+    acb->req.sector = sector_num;
+    acb->req.nb_sectors = nb_sectors;
+    acb->req.qiov = qiov;
+    acb->is_write = is_write;
+
+    co = qemu_coroutine_create(bdrv_co_rw);
+    qemu_coroutine_enter(co, acb);
+
+    return &acb->common;
+}
+
+static BlockDriverAIOCB *bdrv_co_aio_readv_em(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque,
+                                 false);
+}
+
+static BlockDriverAIOCB *bdrv_co_aio_writev_em(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque,
+                                 true);
+}
+
 static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs,
         BlockDriverCompletionFunc *cb, void *opaque)
 {
-- 
cgit v1.1


From f9f05dc58c50d19ad762e6c1ce6b5def9814a4ed Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Fri, 15 Jul 2011 13:50:26 +0200
Subject: block: Add bdrv_co_readv/writev emulation

In order to be able to call bdrv_co_readv/writev for drivers that don't
implement the functions natively, add an emulation that uses the AIO functions
to implement them.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c      | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++++++------
 trace-events |  1 +
 2 files changed, 76 insertions(+), 8 deletions(-)

diff --git a/block.c b/block.c
index 0d973e6..e6abea8 100644
--- a/block.c
+++ b/block.c
@@ -64,6 +64,12 @@ static BlockDriverAIOCB *bdrv_co_aio_readv_em(BlockDriverState *bs,
 static BlockDriverAIOCB *bdrv_co_aio_writev_em(BlockDriverState *bs,
         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
         BlockDriverCompletionFunc *cb, void *opaque);
+static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
+                                         int64_t sector_num, int nb_sectors,
+                                         QEMUIOVector *iov);
+static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
+                                         int64_t sector_num, int nb_sectors,
+                                         QEMUIOVector *iov);
 
 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
     QTAILQ_HEAD_INITIALIZER(bdrv_states);
@@ -182,14 +188,19 @@ void bdrv_register(BlockDriver *bdrv)
         bdrv->bdrv_aio_writev = bdrv_co_aio_writev_em;
         bdrv->bdrv_read = bdrv_read_em;
         bdrv->bdrv_write = bdrv_write_em;
-     } else if (!bdrv->bdrv_aio_readv) {
-        /* add AIO emulation layer */
-        bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
-        bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
-    } else if (!bdrv->bdrv_read) {
-        /* add synchronous IO emulation layer */
-        bdrv->bdrv_read = bdrv_read_em;
-        bdrv->bdrv_write = bdrv_write_em;
+     } else {
+        bdrv->bdrv_co_readv = bdrv_co_readv_em;
+        bdrv->bdrv_co_writev = bdrv_co_writev_em;
+
+        if (!bdrv->bdrv_aio_readv) {
+            /* add AIO emulation layer */
+            bdrv->bdrv_aio_readv = bdrv_aio_readv_em;
+            bdrv->bdrv_aio_writev = bdrv_aio_writev_em;
+        } else if (!bdrv->bdrv_read) {
+            /* add synchronous IO emulation layer */
+            bdrv->bdrv_read = bdrv_read_em;
+            bdrv->bdrv_write = bdrv_write_em;
+        }
     }
 
     if (!bdrv->bdrv_aio_flush)
@@ -2856,6 +2867,62 @@ void qemu_aio_release(void *p)
 }
 
 /**************************************************************/
+/* Coroutine block device emulation */
+
+typedef struct CoroutineIOCompletion {
+    Coroutine *coroutine;
+    int ret;
+} CoroutineIOCompletion;
+
+static void bdrv_co_io_em_complete(void *opaque, int ret)
+{
+    CoroutineIOCompletion *co = opaque;
+
+    co->ret = ret;
+    qemu_coroutine_enter(co->coroutine, NULL);
+}
+
+static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num,
+                                      int nb_sectors, QEMUIOVector *iov,
+                                      bool is_write)
+{
+    CoroutineIOCompletion co = {
+        .coroutine = qemu_coroutine_self(),
+    };
+    BlockDriverAIOCB *acb;
+
+    if (is_write) {
+        acb = bdrv_aio_writev(bs, sector_num, iov, nb_sectors,
+                              bdrv_co_io_em_complete, &co);
+    } else {
+        acb = bdrv_aio_readv(bs, sector_num, iov, nb_sectors,
+                             bdrv_co_io_em_complete, &co);
+    }
+
+    trace_bdrv_co_io(is_write, acb);
+    if (!acb) {
+        return -EIO;
+    }
+    qemu_coroutine_yield();
+
+    return co.ret;
+}
+
+static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
+                                         int64_t sector_num, int nb_sectors,
+                                         QEMUIOVector *iov)
+{
+    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false);
+}
+
+static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
+                                         int64_t sector_num, int nb_sectors,
+                                         QEMUIOVector *iov)
+{
+    return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
+}
+
+/**************************************************************/
 /* removable device support */
 
 /**
diff --git a/trace-events b/trace-events
index 46bceca..bc9be30 100644
--- a/trace-events
+++ b/trace-events
@@ -68,6 +68,7 @@ disable bdrv_aio_writev(void *bs, int64_t sector_num, int nb_sectors, void *opaq
 disable bdrv_set_locked(void *bs, int locked) "bs %p locked %d"
 disable bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
 disable bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
+disable bdrv_co_io(int is_write, void *acb) "is_write %d acb %p"
 
 # hw/virtio-blk.c
 disable virtio_blk_req_complete(void *req, int status) "req %p status %d"
-- 
cgit v1.1


From b96e92470ab4a87268e8b174602eaea6c508003b Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Thu, 30 Jun 2011 17:56:46 +0200
Subject: coroutines: Locks

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 Makefile.objs         |   2 +-
 qemu-coroutine-int.h  |   1 +
 qemu-coroutine-lock.c | 124 ++++++++++++++++++++++++++++++++++++++++++++++++++
 qemu-coroutine.h      |  64 ++++++++++++++++++++++++++
 trace-events          |   8 ++++
 5 files changed, 198 insertions(+), 1 deletion(-)
 create mode 100644 qemu-coroutine-lock.c

diff --git a/Makefile.objs b/Makefile.objs
index 9549e2a..89ca361 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -12,7 +12,7 @@ oslib-obj-$(CONFIG_POSIX) += oslib-posix.o qemu-thread-posix.o
 
 #######################################################################
 # coroutines
-coroutine-obj-y = qemu-coroutine.o
+coroutine-obj-y = qemu-coroutine.o qemu-coroutine-lock.o
 ifeq ($(CONFIG_UCONTEXT_COROUTINE),y)
 coroutine-obj-$(CONFIG_POSIX) += coroutine-ucontext.o
 else
diff --git a/qemu-coroutine-int.h b/qemu-coroutine-int.h
index 64915c2..d495615 100644
--- a/qemu-coroutine-int.h
+++ b/qemu-coroutine-int.h
@@ -38,6 +38,7 @@ struct Coroutine {
     void *entry_arg;
     Coroutine *caller;
     QLIST_ENTRY(Coroutine) pool_next;
+    QTAILQ_ENTRY(Coroutine) co_queue_next;
 };
 
 Coroutine *qemu_coroutine_new(void);
diff --git a/qemu-coroutine-lock.c b/qemu-coroutine-lock.c
new file mode 100644
index 0000000..abaa1f7
--- /dev/null
+++ b/qemu-coroutine-lock.c
@@ -0,0 +1,124 @@
+/*
+ * coroutine queues and locks
+ *
+ * Copyright (c) 2011 Kevin Wolf <kwolf@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu-common.h"
+#include "qemu-coroutine.h"
+#include "qemu-coroutine-int.h"
+#include "qemu-queue.h"
+#include "trace.h"
+
+static QTAILQ_HEAD(, Coroutine) unlock_bh_queue =
+    QTAILQ_HEAD_INITIALIZER(unlock_bh_queue);
+
+struct unlock_bh {
+    QEMUBH *bh;
+};
+
+static void qemu_co_queue_next_bh(void *opaque)
+{
+    struct unlock_bh *unlock_bh = opaque;
+    Coroutine *next;
+
+    trace_qemu_co_queue_next_bh();
+    while ((next = QTAILQ_FIRST(&unlock_bh_queue))) {
+        QTAILQ_REMOVE(&unlock_bh_queue, next, co_queue_next);
+        qemu_coroutine_enter(next, NULL);
+    }
+
+    qemu_bh_delete(unlock_bh->bh);
+    qemu_free(unlock_bh);
+}
+
+void qemu_co_queue_init(CoQueue *queue)
+{
+    QTAILQ_INIT(&queue->entries);
+}
+
+void coroutine_fn qemu_co_queue_wait(CoQueue *queue)
+{
+    Coroutine *self = qemu_coroutine_self();
+    QTAILQ_INSERT_TAIL(&queue->entries, self, co_queue_next);
+    qemu_coroutine_yield();
+    assert(qemu_in_coroutine());
+}
+
+bool qemu_co_queue_next(CoQueue *queue)
+{
+    struct unlock_bh *unlock_bh;
+    Coroutine *next;
+
+    next = QTAILQ_FIRST(&queue->entries);
+    if (next) {
+        QTAILQ_REMOVE(&queue->entries, next, co_queue_next);
+        QTAILQ_INSERT_TAIL(&unlock_bh_queue, next, co_queue_next);
+        trace_qemu_co_queue_next(next);
+
+        unlock_bh = qemu_malloc(sizeof(*unlock_bh));
+        unlock_bh->bh = qemu_bh_new(qemu_co_queue_next_bh, unlock_bh);
+        qemu_bh_schedule(unlock_bh->bh);
+    }
+
+    return (next != NULL);
+}
+
+bool qemu_co_queue_empty(CoQueue *queue)
+{
+    return (QTAILQ_FIRST(&queue->entries) == NULL);
+}
+
+void qemu_co_mutex_init(CoMutex *mutex)
+{
+    memset(mutex, 0, sizeof(*mutex));
+    qemu_co_queue_init(&mutex->queue);
+}
+
+void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex)
+{
+    Coroutine *self = qemu_coroutine_self();
+
+    trace_qemu_co_mutex_lock_entry(mutex, self);
+
+    while (mutex->locked) {
+        qemu_co_queue_wait(&mutex->queue);
+    }
+
+    mutex->locked = true;
+
+    trace_qemu_co_mutex_lock_return(mutex, self);
+}
+
+void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex)
+{
+    Coroutine *self = qemu_coroutine_self();
+
+    trace_qemu_co_mutex_unlock_entry(mutex, self);
+
+    assert(mutex->locked == true);
+    assert(qemu_in_coroutine());
+
+    mutex->locked = false;
+    qemu_co_queue_next(&mutex->queue);
+
+    trace_qemu_co_mutex_unlock_return(mutex, self);
+}
diff --git a/qemu-coroutine.h b/qemu-coroutine.h
index 08255c7..2f2fd95 100644
--- a/qemu-coroutine.h
+++ b/qemu-coroutine.h
@@ -5,6 +5,7 @@
  *
  * Authors:
  *  Stefan Hajnoczi    <stefanha@linux.vnet.ibm.com>
+ *  Kevin Wolf         <kwolf@redhat.com>
  *
  * This work is licensed under the terms of the GNU LGPL, version 2 or later.
  * See the COPYING.LIB file in the top-level directory.
@@ -15,6 +16,7 @@
 #define QEMU_COROUTINE_H
 
 #include <stdbool.h>
+#include "qemu-queue.h"
 
 /**
  * Coroutines are a mechanism for stack switching and can be used for
@@ -92,4 +94,66 @@ Coroutine *coroutine_fn qemu_coroutine_self(void);
  */
 bool qemu_in_coroutine(void);
 
+
+
+/**
+ * CoQueues are a mechanism to queue coroutines in order to continue executing
+ * them later. They provide the fundamental primitives on which coroutine locks
+ * are built.
+ */
+typedef struct CoQueue {
+    QTAILQ_HEAD(, Coroutine) entries;
+} CoQueue;
+
+/**
+ * Initialise a CoQueue. This must be called before any other operation is used
+ * on the CoQueue.
+ */
+void qemu_co_queue_init(CoQueue *queue);
+
+/**
+ * Adds the current coroutine to the CoQueue and transfers control to the
+ * caller of the coroutine.
+ */
+void coroutine_fn qemu_co_queue_wait(CoQueue *queue);
+
+/**
+ * Restarts the next coroutine in the CoQueue and removes it from the queue.
+ *
+ * Returns true if a coroutine was restarted, false if the queue is empty.
+ */
+bool qemu_co_queue_next(CoQueue *queue);
+
+/**
+ * Checks if the CoQueue is empty.
+ */
+bool qemu_co_queue_empty(CoQueue *queue);
+
+
+/**
+ * Provides a mutex that can be used to synchronise coroutines
+ */
+typedef struct CoMutex {
+    bool locked;
+    CoQueue queue;
+} CoMutex;
+
+/**
+ * Initialises a CoMutex. This must be called before any other operation is used
+ * on the CoMutex.
+ */
+void qemu_co_mutex_init(CoMutex *mutex);
+
+/**
+ * Locks the mutex. If the lock cannot be taken immediately, control is
+ * transferred to the caller of the current coroutine.
+ */
+void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex);
+
+/**
+ * Unlocks the mutex and schedules the next coroutine that was waiting for this
+ * lock to be run.
+ */
+void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex);
+
 #endif /* QEMU_COROUTINE_H */
diff --git a/trace-events b/trace-events
index bc9be30..19d31e3 100644
--- a/trace-events
+++ b/trace-events
@@ -433,3 +433,11 @@ disable xen_platform_log(char *s) "xen platform: %s"
 disable qemu_coroutine_enter(void *from, void *to, void *opaque) "from %p to %p opaque %p"
 disable qemu_coroutine_yield(void *from, void *to) "from %p to %p"
 disable qemu_coroutine_terminate(void *co) "self %p"
+
+# qemu-coroutine-lock.c
+disable qemu_co_queue_next_bh(void) ""
+disable qemu_co_queue_next(void *next) "next %p"
+disable qemu_co_mutex_lock_entry(void *mutex, void *self) "mutex %p self %p"
+disable qemu_co_mutex_lock_return(void *mutex, void *self) "mutex %p self %p"
+disable qemu_co_mutex_unlock_entry(void *mutex, void *self) "mutex %p self %p"
+disable qemu_co_mutex_unlock_return(void *mutex, void *self) "mutex %p self %p"
-- 
cgit v1.1


From 68d100e905453ebbeea8e915f4f18a2bd4339fe8 Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Thu, 30 Jun 2011 17:42:09 +0200
Subject: qcow2: Use coroutines

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2-cluster.c |  26 +++---
 block/qcow2.c         | 240 ++++++++++++++++++--------------------------------
 block/qcow2.h         |   5 +-
 3 files changed, 102 insertions(+), 169 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 882f50a..81cf77d 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -697,12 +697,12 @@ err:
  * m->depends_on is set to NULL and the other fields in m are meaningless.
  *
  * If the cluster is newly allocated, m->nb_clusters is set to the number of
- * contiguous clusters that have been allocated. This may be 0 if the request
- * conflict with another write request in flight; in this case, m->depends_on
- * is set and the remaining fields of m are meaningless.
+ * contiguous clusters that have been allocated. In this case, the other
+ * fields of m are valid and contain information about the first allocated
+ * cluster.
  *
- * If m->nb_clusters is non-zero, the other fields of m are valid and contain
- * information about the first allocated cluster.
+ * If the request conflicts with another write request in flight, the coroutine
+ * is queued and will be reentered when the dependency has completed.
  *
  * Return 0 on success and -errno in error cases
  */
@@ -721,6 +721,7 @@ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
         return ret;
     }
 
+again:
     nb_clusters = size_to_clusters(s, n_end << 9);
 
     nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
@@ -792,12 +793,12 @@ int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
             }
 
             if (nb_clusters == 0) {
-                /* Set dependency and wait for a callback */
-                m->depends_on = old_alloc;
-                m->nb_clusters = 0;
-                *num = 0;
-
-                goto out_wait_dependency;
+                /* Wait for the dependency to complete. We need to recheck
+                 * the free/allocated clusters when we continue. */
+                qemu_co_mutex_unlock(&s->lock);
+                qemu_co_queue_wait(&old_alloc->dependent_requests);
+                qemu_co_mutex_lock(&s->lock);
+                goto again;
             }
         }
     }
@@ -834,9 +835,6 @@ out:
 
     return 0;
 
-out_wait_dependency:
-    return qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
-
 fail:
     qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
 fail_put:
diff --git a/block/qcow2.c b/block/qcow2.c
index 48e1b95..f07d550 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -276,6 +276,9 @@ static int qcow2_open(BlockDriverState *bs, int flags)
         goto fail;
     }
 
+    /* Initialise locks */
+    qemu_co_mutex_init(&s->lock);
+
 #ifdef DEBUG_ALLOC
     qcow2_check_refcounts(bs);
 #endif
@@ -379,7 +382,6 @@ typedef struct QCowAIOCB {
     uint64_t cluster_offset;
     uint8_t *cluster_data;
     bool is_write;
-    BlockDriverAIOCB *hd_aiocb;
     QEMUIOVector hd_qiov;
     QEMUBH *bh;
     QCowL2Meta l2meta;
@@ -389,8 +391,6 @@ typedef struct QCowAIOCB {
 static void qcow2_aio_cancel(BlockDriverAIOCB *blockacb)
 {
     QCowAIOCB *acb = container_of(blockacb, QCowAIOCB, common);
-    if (acb->hd_aiocb)
-        bdrv_aio_cancel(acb->hd_aiocb);
     qemu_aio_release(acb);
 }
 
@@ -399,46 +399,16 @@ static AIOPool qcow2_aio_pool = {
     .cancel             = qcow2_aio_cancel,
 };
 
-static void qcow2_aio_read_cb(void *opaque, int ret);
-static void qcow2_aio_write_cb(void *opaque, int ret);
-
-static void qcow2_aio_rw_bh(void *opaque)
-{
-    QCowAIOCB *acb = opaque;
-    qemu_bh_delete(acb->bh);
-    acb->bh = NULL;
-
-    if (acb->is_write) {
-        qcow2_aio_write_cb(opaque, 0);
-    } else {
-        qcow2_aio_read_cb(opaque, 0);
-    }
-}
-
-static int qcow2_schedule_bh(QEMUBHFunc *cb, QCowAIOCB *acb)
-{
-    if (acb->bh)
-        return -EIO;
-
-    acb->bh = qemu_bh_new(cb, acb);
-    if (!acb->bh)
-        return -EIO;
-
-    qemu_bh_schedule(acb->bh);
-
-    return 0;
-}
-
-static void qcow2_aio_read_cb(void *opaque, int ret)
+/*
+ * Returns 0 when the request is completed successfully, 1 when there is still
+ * a part left to do and -errno in error cases.
+ */
+static int qcow2_aio_read_cb(QCowAIOCB *acb)
 {
-    QCowAIOCB *acb = opaque;
     BlockDriverState *bs = acb->common.bs;
     BDRVQcowState *s = bs->opaque;
     int index_in_cluster, n1;
-
-    acb->hd_aiocb = NULL;
-    if (ret < 0)
-        goto done;
+    int ret;
 
     /* post process the read buffer */
     if (!acb->cluster_offset) {
@@ -463,8 +433,7 @@ static void qcow2_aio_read_cb(void *opaque, int ret)
 
     if (acb->remaining_sectors == 0) {
         /* request completed */
-        ret = 0;
-        goto done;
+        return 0;
     }
 
     /* prepare next AIO request */
@@ -477,7 +446,7 @@ static void qcow2_aio_read_cb(void *opaque, int ret)
     ret = qcow2_get_cluster_offset(bs, acb->sector_num << 9,
         &acb->cur_nr_sectors, &acb->cluster_offset);
     if (ret < 0) {
-        goto done;
+        return ret;
     }
 
     index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
@@ -494,42 +463,35 @@ static void qcow2_aio_read_cb(void *opaque, int ret)
                 acb->sector_num, acb->cur_nr_sectors);
             if (n1 > 0) {
                 BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
-                acb->hd_aiocb = bdrv_aio_readv(bs->backing_hd, acb->sector_num,
-                                    &acb->hd_qiov, n1, qcow2_aio_read_cb, acb);
-                if (acb->hd_aiocb == NULL) {
-                    ret = -EIO;
-                    goto done;
+                qemu_co_mutex_unlock(&s->lock);
+                ret = bdrv_co_readv(bs->backing_hd, acb->sector_num,
+                                    n1, &acb->hd_qiov);
+                qemu_co_mutex_lock(&s->lock);
+                if (ret < 0) {
+                    return ret;
                 }
-            } else {
-                ret = qcow2_schedule_bh(qcow2_aio_rw_bh, acb);
-                if (ret < 0)
-                    goto done;
             }
+            return 1;
         } else {
             /* Note: in this case, no need to wait */
             qemu_iovec_memset(&acb->hd_qiov, 0, 512 * acb->cur_nr_sectors);
-            ret = qcow2_schedule_bh(qcow2_aio_rw_bh, acb);
-            if (ret < 0)
-                goto done;
+            return 1;
         }
     } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
         /* add AIO support for compressed blocks ? */
         ret = qcow2_decompress_cluster(bs, acb->cluster_offset);
         if (ret < 0) {
-            goto done;
+            return ret;
         }
 
         qemu_iovec_from_buffer(&acb->hd_qiov,
             s->cluster_cache + index_in_cluster * 512,
             512 * acb->cur_nr_sectors);
 
-        ret = qcow2_schedule_bh(qcow2_aio_rw_bh, acb);
-        if (ret < 0)
-            goto done;
+        return 1;
     } else {
         if ((acb->cluster_offset & 511) != 0) {
-            ret = -EIO;
-            goto done;
+            return -EIO;
         }
 
         if (s->crypt_method) {
@@ -550,21 +512,17 @@ static void qcow2_aio_read_cb(void *opaque, int ret)
         }
 
         BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
-        acb->hd_aiocb = bdrv_aio_readv(bs->file,
+        qemu_co_mutex_unlock(&s->lock);
+        ret = bdrv_co_readv(bs->file,
                             (acb->cluster_offset >> 9) + index_in_cluster,
-                            &acb->hd_qiov, acb->cur_nr_sectors,
-                            qcow2_aio_read_cb, acb);
-        if (acb->hd_aiocb == NULL) {
-            ret = -EIO;
-            goto done;
+                            acb->cur_nr_sectors, &acb->hd_qiov);
+        qemu_co_mutex_lock(&s->lock);
+        if (ret < 0) {
+            return ret;
         }
     }
 
-    return;
-done:
-    acb->common.cb(acb->common.opaque, ret);
-    qemu_iovec_destroy(&acb->hd_qiov);
-    qemu_aio_release(acb);
+    return 1;
 }
 
 static QCowAIOCB *qcow2_aio_setup(BlockDriverState *bs, int64_t sector_num,
@@ -577,7 +535,6 @@ static QCowAIOCB *qcow2_aio_setup(BlockDriverState *bs, int64_t sector_num,
     acb = qemu_aio_get(&qcow2_aio_pool, bs, cb, opaque);
     if (!acb)
         return NULL;
-    acb->hd_aiocb = NULL;
     acb->sector_num = sector_num;
     acb->qiov = qiov;
     acb->is_write = is_write;
@@ -589,70 +546,65 @@ static QCowAIOCB *qcow2_aio_setup(BlockDriverState *bs, int64_t sector_num,
     acb->cur_nr_sectors = 0;
     acb->cluster_offset = 0;
     acb->l2meta.nb_clusters = 0;
-    QLIST_INIT(&acb->l2meta.dependent_requests);
+    qemu_co_queue_init(&acb->l2meta.dependent_requests);
     return acb;
 }
 
-static BlockDriverAIOCB *qcow2_aio_readv(BlockDriverState *bs,
-                                         int64_t sector_num,
-                                         QEMUIOVector *qiov, int nb_sectors,
-                                         BlockDriverCompletionFunc *cb,
-                                         void *opaque)
+static int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
+                          int nb_sectors, QEMUIOVector *qiov)
 {
+    BDRVQcowState *s = bs->opaque;
     QCowAIOCB *acb;
     int ret;
 
-    acb = qcow2_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
-    if (!acb)
-        return NULL;
+    acb = qcow2_aio_setup(bs, sector_num, qiov, nb_sectors, NULL, NULL, 0);
 
-    ret = qcow2_schedule_bh(qcow2_aio_rw_bh, acb);
-    if (ret < 0) {
-        qemu_iovec_destroy(&acb->hd_qiov);
-        qemu_aio_release(acb);
-        return NULL;
-    }
+    qemu_co_mutex_lock(&s->lock);
+    do {
+        ret = qcow2_aio_read_cb(acb);
+    } while (ret > 0);
+    qemu_co_mutex_unlock(&s->lock);
 
-    return &acb->common;
+    qemu_iovec_destroy(&acb->hd_qiov);
+    qemu_aio_release(acb);
+
+    return ret;
 }
 
-static void run_dependent_requests(QCowL2Meta *m)
+static void run_dependent_requests(BDRVQcowState *s, QCowL2Meta *m)
 {
-    QCowAIOCB *req;
-    QCowAIOCB *next;
-
     /* Take the request off the list of running requests */
     if (m->nb_clusters != 0) {
         QLIST_REMOVE(m, next_in_flight);
     }
 
     /* Restart all dependent requests */
-    QLIST_FOREACH_SAFE(req, &m->dependent_requests, next_depend, next) {
-        qcow2_aio_write_cb(req, 0);
+    if (!qemu_co_queue_empty(&m->dependent_requests)) {
+        qemu_co_mutex_unlock(&s->lock);
+        while(qemu_co_queue_next(&m->dependent_requests));
+        qemu_co_mutex_lock(&s->lock);
     }
-
-    /* Empty the list for the next part of the request */
-    QLIST_INIT(&m->dependent_requests);
 }
 
-static void qcow2_aio_write_cb(void *opaque, int ret)
+/*
+ * Returns 0 when the request is completed successfully, 1 when there is still
+ * a part left to do and -errno in error cases.
+ */
+static int qcow2_aio_write_cb(QCowAIOCB *acb)
 {
-    QCowAIOCB *acb = opaque;
     BlockDriverState *bs = acb->common.bs;
     BDRVQcowState *s = bs->opaque;
     int index_in_cluster;
     int n_end;
+    int ret;
 
-    acb->hd_aiocb = NULL;
-
-    if (ret >= 0) {
-        ret = qcow2_alloc_cluster_link_l2(bs, &acb->l2meta);
-    }
+    ret = qcow2_alloc_cluster_link_l2(bs, &acb->l2meta);
 
-    run_dependent_requests(&acb->l2meta);
+    run_dependent_requests(s, &acb->l2meta);
 
-    if (ret < 0)
-        goto done;
+    if (ret < 0) {
+        return ret;
+    }
 
     acb->remaining_sectors -= acb->cur_nr_sectors;
     acb->sector_num += acb->cur_nr_sectors;
@@ -660,8 +612,7 @@ static void qcow2_aio_write_cb(void *opaque, int ret)
 
     if (acb->remaining_sectors == 0) {
         /* request completed */
-        ret = 0;
-        goto done;
+        return 0;
     }
 
     index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
@@ -673,18 +624,10 @@ static void qcow2_aio_write_cb(void *opaque, int ret)
     ret = qcow2_alloc_cluster_offset(bs, acb->sector_num << 9,
         index_in_cluster, n_end, &acb->cur_nr_sectors, &acb->l2meta);
     if (ret < 0) {
-        goto done;
+        return ret;
     }
 
     acb->cluster_offset = acb->l2meta.cluster_offset;
-
-    /* Need to wait for another request? If so, we are done for now. */
-    if (acb->l2meta.nb_clusters == 0 && acb->l2meta.depends_on != NULL) {
-        QLIST_INSERT_HEAD(&acb->l2meta.depends_on->dependent_requests,
-            acb, next_depend);
-        return;
-    }
-
     assert((acb->cluster_offset & 511) == 0);
 
     qemu_iovec_reset(&acb->hd_qiov);
@@ -709,51 +652,40 @@ static void qcow2_aio_write_cb(void *opaque, int ret)
     }
 
     BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
-    acb->hd_aiocb = bdrv_aio_writev(bs->file,
-                                    (acb->cluster_offset >> 9) + index_in_cluster,
-                                    &acb->hd_qiov, acb->cur_nr_sectors,
-                                    qcow2_aio_write_cb, acb);
-    if (acb->hd_aiocb == NULL) {
-        ret = -EIO;
-        goto fail;
+    qemu_co_mutex_unlock(&s->lock);
+    ret = bdrv_co_writev(bs->file,
+                         (acb->cluster_offset >> 9) + index_in_cluster,
+                         acb->cur_nr_sectors, &acb->hd_qiov);
+    qemu_co_mutex_lock(&s->lock);
+    if (ret < 0) {
+        return ret;
     }
 
-    return;
-
-fail:
-    if (acb->l2meta.nb_clusters != 0) {
-        QLIST_REMOVE(&acb->l2meta, next_in_flight);
-    }
-done:
-    acb->common.cb(acb->common.opaque, ret);
-    qemu_iovec_destroy(&acb->hd_qiov);
-    qemu_aio_release(acb);
+    return 1;
 }
 
-static BlockDriverAIOCB *qcow2_aio_writev(BlockDriverState *bs,
-                                          int64_t sector_num,
-                                          QEMUIOVector *qiov, int nb_sectors,
-                                          BlockDriverCompletionFunc *cb,
-                                          void *opaque)
+static int qcow2_co_writev(BlockDriverState *bs,
+                           int64_t sector_num,
+                           int nb_sectors,
+                           QEMUIOVector *qiov)
 {
     BDRVQcowState *s = bs->opaque;
     QCowAIOCB *acb;
     int ret;
 
+    acb = qcow2_aio_setup(bs, sector_num, qiov, nb_sectors, NULL, NULL, 1);
     s->cluster_cache_offset = -1; /* disable compressed cache */
 
-    acb = qcow2_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
-    if (!acb)
-        return NULL;
+    qemu_co_mutex_lock(&s->lock);
+    do {
+        ret = qcow2_aio_write_cb(acb);
+    } while (ret > 0);
+    qemu_co_mutex_unlock(&s->lock);
 
-    ret = qcow2_schedule_bh(qcow2_aio_rw_bh, acb);
-    if (ret < 0) {
-        qemu_iovec_destroy(&acb->hd_qiov);
-        qemu_aio_release(acb);
-        return NULL;
-    }
+    qemu_iovec_destroy(&acb->hd_qiov);
+    qemu_aio_release(acb);
 
-    return &acb->common;
+    return ret;
 }
 
 static void qcow2_close(BlockDriverState *bs)
@@ -881,7 +813,7 @@ static int preallocate(BlockDriverState *bs)
 
     nb_sectors = bdrv_getlength(bs) >> 9;
     offset = 0;
-    QLIST_INIT(&meta.dependent_requests);
+    qemu_co_queue_init(&meta.dependent_requests);
     meta.cluster_offset = 0;
 
     while (nb_sectors) {
@@ -899,7 +831,7 @@ static int preallocate(BlockDriverState *bs)
 
         /* There are no dependent requests, but we need to remove our request
          * from the list of in-flight requests */
-        run_dependent_requests(&meta);
+        run_dependent_requests(bs->opaque, &meta);
 
         /* TODO Preallocate data if requested */
 
@@ -1387,8 +1319,8 @@ static BlockDriver bdrv_qcow2 = {
     .bdrv_set_key       = qcow2_set_key,
     .bdrv_make_empty    = qcow2_make_empty,
 
-    .bdrv_aio_readv     = qcow2_aio_readv,
-    .bdrv_aio_writev    = qcow2_aio_writev,
+    .bdrv_co_readv      = qcow2_co_readv,
+    .bdrv_co_writev     = qcow2_co_writev,
     .bdrv_aio_flush     = qcow2_aio_flush,
 
     .bdrv_discard           = qcow2_discard,
diff --git a/block/qcow2.h b/block/qcow2.h
index 6a0a21b..de23abe 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -26,6 +26,7 @@
 #define BLOCK_QCOW2_H
 
 #include "aes.h"
+#include "qemu-coroutine.h"
 
 //#define DEBUG_ALLOC
 //#define DEBUG_ALLOC2
@@ -114,6 +115,8 @@ typedef struct BDRVQcowState {
     int64_t free_cluster_index;
     int64_t free_byte_offset;
 
+    CoMutex lock;
+
     uint32_t crypt_method; /* current crypt method, 0 if no key yet */
     uint32_t crypt_method_header;
     AES_KEY aes_encrypt_key;
@@ -146,7 +149,7 @@ typedef struct QCowL2Meta
     int nb_available;
     int nb_clusters;
     struct QCowL2Meta *depends_on;
-    QLIST_HEAD(QCowAioDependencies, QCowAIOCB) dependent_requests;
+    CoQueue dependent_requests;
 
     QLIST_ENTRY(QCowL2Meta) next_in_flight;
 } QCowL2Meta;
-- 
cgit v1.1


From 52b8eb60132b27ad53476490e9d7579003390cfa Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Fri, 15 Jul 2011 16:27:42 +0200
Subject: qcow: Use coroutines

The old qcow format is another user of the AsyncContext infrastructure.
Converting it to coroutines (and therefore CoMutexes) allows to remove
AsyncContexts.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow.c | 180 ++++++++++++++++++++---------------------------------------
 1 file changed, 62 insertions(+), 118 deletions(-)

diff --git a/block/qcow.c b/block/qcow.c
index 227b104..6447c2a 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -73,6 +73,7 @@ typedef struct BDRVQcowState {
     uint32_t crypt_method_header;
     AES_KEY aes_encrypt_key;
     AES_KEY aes_decrypt_key;
+    CoMutex lock;
 } BDRVQcowState;
 
 static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset);
@@ -517,11 +518,11 @@ static AIOPool qcow_aio_pool = {
 
 static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque, int is_write)
+        int is_write)
 {
     QCowAIOCB *acb;
 
-    acb = qemu_aio_get(&qcow_aio_pool, bs, cb, opaque);
+    acb = qemu_aio_get(&qcow_aio_pool, bs, NULL, NULL);
     if (!acb)
         return NULL;
     acb->hd_aiocb = NULL;
@@ -542,48 +543,15 @@ static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
     return acb;
 }
 
-static void qcow_aio_read_cb(void *opaque, int ret);
-static void qcow_aio_write_cb(void *opaque, int ret);
-
-static void qcow_aio_rw_bh(void *opaque)
-{
-    QCowAIOCB *acb = opaque;
-    qemu_bh_delete(acb->bh);
-    acb->bh = NULL;
-
-    if (acb->is_write) {
-        qcow_aio_write_cb(opaque, 0);
-    } else {
-        qcow_aio_read_cb(opaque, 0);
-    }
-}
-
-static int qcow_schedule_bh(QEMUBHFunc *cb, QCowAIOCB *acb)
-{
-    if (acb->bh) {
-        return -EIO;
-    }
-
-    acb->bh = qemu_bh_new(cb, acb);
-    if (!acb->bh) {
-        return -EIO;
-    }
-
-    qemu_bh_schedule(acb->bh);
-
-    return 0;
-}
-
-static void qcow_aio_read_cb(void *opaque, int ret)
+static int qcow_aio_read_cb(void *opaque)
 {
     QCowAIOCB *acb = opaque;
     BlockDriverState *bs = acb->common.bs;
     BDRVQcowState *s = bs->opaque;
     int index_in_cluster;
+    int ret;
 
     acb->hd_aiocb = NULL;
-    if (ret < 0)
-        goto done;
 
  redo:
     /* post process the read buffer */
@@ -605,8 +573,7 @@ static void qcow_aio_read_cb(void *opaque, int ret)
 
     if (acb->nb_sectors == 0) {
         /* request completed */
-        ret = 0;
-        goto done;
+        return 0;
     }
 
     /* prepare next AIO request */
@@ -623,11 +590,12 @@ static void qcow_aio_read_cb(void *opaque, int ret)
             acb->hd_iov.iov_base = (void *)acb->buf;
             acb->hd_iov.iov_len = acb->n * 512;
             qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
-            acb->hd_aiocb = bdrv_aio_readv(bs->backing_hd, acb->sector_num,
-                &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb);
-            if (acb->hd_aiocb == NULL) {
-                ret = -EIO;
-                goto done;
+            qemu_co_mutex_unlock(&s->lock);
+            ret = bdrv_co_readv(bs->backing_hd, acb->sector_num,
+                                acb->n, &acb->hd_qiov);
+            qemu_co_mutex_lock(&s->lock);
+            if (ret < 0) {
+                return -EIO;
             }
         } else {
             /* Note: in this case, no need to wait */
@@ -637,64 +605,56 @@ static void qcow_aio_read_cb(void *opaque, int ret)
     } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
         /* add AIO support for compressed blocks ? */
         if (decompress_cluster(bs, acb->cluster_offset) < 0) {
-            ret = -EIO;
-            goto done;
+            return -EIO;
         }
         memcpy(acb->buf,
                s->cluster_cache + index_in_cluster * 512, 512 * acb->n);
         goto redo;
     } else {
         if ((acb->cluster_offset & 511) != 0) {
-            ret = -EIO;
-            goto done;
+            return -EIO;
         }
         acb->hd_iov.iov_base = (void *)acb->buf;
         acb->hd_iov.iov_len = acb->n * 512;
         qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
-        acb->hd_aiocb = bdrv_aio_readv(bs->file,
+        qemu_co_mutex_unlock(&s->lock);
+        ret = bdrv_co_readv(bs->file,
                             (acb->cluster_offset >> 9) + index_in_cluster,
-                            &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb);
-        if (acb->hd_aiocb == NULL) {
-            ret = -EIO;
-            goto done;
+                            acb->n, &acb->hd_qiov);
+        qemu_co_mutex_lock(&s->lock);
+        if (ret < 0) {
+            return ret;
         }
     }
 
-    return;
-
-done:
-    if (acb->qiov->niov > 1) {
-        qemu_iovec_from_buffer(acb->qiov, acb->orig_buf, acb->qiov->size);
-        qemu_vfree(acb->orig_buf);
-    }
-    acb->common.cb(acb->common.opaque, ret);
-    qemu_aio_release(acb);
+    return 1;
 }
 
-static BlockDriverAIOCB *qcow_aio_readv(BlockDriverState *bs,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
+static int qcow_co_readv(BlockDriverState *bs, int64_t sector_num,
+                         int nb_sectors, QEMUIOVector *qiov)
 {
+    BDRVQcowState *s = bs->opaque;
     QCowAIOCB *acb;
     int ret;
 
-    acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
-    if (!acb)
-        return NULL;
+    acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 0);
 
-    ret = qcow_schedule_bh(qcow_aio_rw_bh, acb);
-    if (ret < 0) {
-        if (acb->qiov->niov > 1) {
-            qemu_vfree(acb->orig_buf);
-        }
-        qemu_aio_release(acb);
-        return NULL;
+    qemu_co_mutex_lock(&s->lock);
+    do {
+        ret = qcow_aio_read_cb(acb);
+    } while (ret > 0);
+    qemu_co_mutex_unlock(&s->lock);
+
+    if (acb->qiov->niov > 1) {
+        qemu_iovec_from_buffer(acb->qiov, acb->orig_buf, acb->qiov->size);
+        qemu_vfree(acb->orig_buf);
     }
+    qemu_aio_release(acb);
 
-    return &acb->common;
+    return ret;
 }
 
-static void qcow_aio_write_cb(void *opaque, int ret)
+static int qcow_aio_write_cb(void *opaque)
 {
     QCowAIOCB *acb = opaque;
     BlockDriverState *bs = acb->common.bs;
@@ -702,20 +662,17 @@ static void qcow_aio_write_cb(void *opaque, int ret)
     int index_in_cluster;
     uint64_t cluster_offset;
     const uint8_t *src_buf;
+    int ret;
 
     acb->hd_aiocb = NULL;
 
-    if (ret < 0)
-        goto done;
-
     acb->nb_sectors -= acb->n;
     acb->sector_num += acb->n;
     acb->buf += acb->n * 512;
 
     if (acb->nb_sectors == 0) {
         /* request completed */
-        ret = 0;
-        goto done;
+        return 0;
     }
 
     index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
@@ -726,16 +683,11 @@ static void qcow_aio_write_cb(void *opaque, int ret)
                                         index_in_cluster,
                                         index_in_cluster + acb->n);
     if (!cluster_offset || (cluster_offset & 511) != 0) {
-        ret = -EIO;
-        goto done;
+        return -EIO;
     }
     if (s->crypt_method) {
         if (!acb->cluster_data) {
             acb->cluster_data = qemu_mallocz(s->cluster_size);
-            if (!acb->cluster_data) {
-                ret = -ENOMEM;
-                goto done;
-            }
         }
         encrypt_sectors(s, acb->sector_num, acb->cluster_data, acb->buf,
                         acb->n, 1, &s->aes_encrypt_key);
@@ -747,26 +699,19 @@ static void qcow_aio_write_cb(void *opaque, int ret)
     acb->hd_iov.iov_base = (void *)src_buf;
     acb->hd_iov.iov_len = acb->n * 512;
     qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
-    acb->hd_aiocb = bdrv_aio_writev(bs->file,
-                                    (cluster_offset >> 9) + index_in_cluster,
-                                    &acb->hd_qiov, acb->n,
-                                    qcow_aio_write_cb, acb);
-    if (acb->hd_aiocb == NULL) {
-        ret = -EIO;
-        goto done;
+    qemu_co_mutex_unlock(&s->lock);
+    ret = bdrv_co_writev(bs->file,
+                         (cluster_offset >> 9) + index_in_cluster,
+                         acb->n, &acb->hd_qiov);
+    qemu_co_mutex_lock(&s->lock);
+    if (ret < 0) {
+        return ret;
     }
-    return;
-
-done:
-    if (acb->qiov->niov > 1)
-        qemu_vfree(acb->orig_buf);
-    acb->common.cb(acb->common.opaque, ret);
-    qemu_aio_release(acb);
+    return 1;
 }
 
-static BlockDriverAIOCB *qcow_aio_writev(BlockDriverState *bs,
-        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
-        BlockDriverCompletionFunc *cb, void *opaque)
+static int qcow_co_writev(BlockDriverState *bs, int64_t sector_num,
+                          int nb_sectors, QEMUIOVector *qiov)
 {
     BDRVQcowState *s = bs->opaque;
     QCowAIOCB *acb;
@@ -774,21 +719,20 @@ static BlockDriverAIOCB *qcow_aio_writev(BlockDriverState *bs,
 
     s->cluster_cache_offset = -1; /* disable compressed cache */
 
-    acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
-    if (!acb)
-        return NULL;
+    acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, 1);
 
+    qemu_co_mutex_lock(&s->lock);
+    do {
+        ret = qcow_aio_write_cb(acb);
+    } while (ret > 0);
+    qemu_co_mutex_unlock(&s->lock);
 
-    ret = qcow_schedule_bh(qcow_aio_rw_bh, acb);
-    if (ret < 0) {
-        if (acb->qiov->niov > 1) {
-            qemu_vfree(acb->orig_buf);
-        }
-        qemu_aio_release(acb);
-        return NULL;
+    if (acb->qiov->niov > 1) {
+        qemu_vfree(acb->orig_buf);
     }
+    qemu_aio_release(acb);
 
-    return &acb->common;
+    return ret;
 }
 
 static void qcow_close(BlockDriverState *bs)
@@ -1020,8 +964,8 @@ static BlockDriver bdrv_qcow = {
     .bdrv_is_allocated	= qcow_is_allocated,
     .bdrv_set_key	= qcow_set_key,
     .bdrv_make_empty	= qcow_make_empty,
-    .bdrv_aio_readv	= qcow_aio_readv,
-    .bdrv_aio_writev	= qcow_aio_writev,
+    .bdrv_co_readv  = qcow_co_readv,
+    .bdrv_co_writev = qcow_co_writev,
     .bdrv_aio_flush	= qcow_aio_flush,
     .bdrv_write_compressed = qcow_write_compressed,
     .bdrv_get_info	= qcow_get_info,
-- 
cgit v1.1


From 384acbf46b70edf0d2c1648aa1a92a90bcf7057d Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Fri, 15 Jul 2011 16:36:40 +0200
Subject: async: Remove AsyncContext

The purpose of AsyncContexts was to protect qcow and qcow2 against reentrancy
during an emulated bdrv_read/write (which includes a qemu_aio_wait() call and
can run AIO callbacks of different requests if it weren't for AsyncContexts).

Now both qcow and qcow2 are protected by CoMutexes and AsyncContexts can be
removed.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 async.c            | 98 ++++--------------------------------------------------
 block.c            |  6 ----
 block/qed-table.c  | 14 --------
 block/qed.c        |  4 ---
 linux-aio.c        | 43 +++---------------------
 posix-aio-compat.c | 11 ------
 qemu-common.h      |  4 ---
 7 files changed, 11 insertions(+), 169 deletions(-)

diff --git a/async.c b/async.c
index fd313df..3fe70b9 100644
--- a/async.c
+++ b/async.c
@@ -25,92 +25,8 @@
 #include "qemu-common.h"
 #include "qemu-aio.h"
 
-/*
- * An AsyncContext protects the callbacks of AIO requests and Bottom Halves
- * against interfering with each other. A typical example is qcow2 that accepts
- * asynchronous requests, but relies for manipulation of its metadata on
- * synchronous bdrv_read/write that doesn't trigger any callbacks.
- *
- * However, these functions are often emulated using AIO which means that AIO
- * callbacks must be run - but at the same time we must not run callbacks of
- * other requests as they might start to modify metadata and corrupt the
- * internal state of the caller of bdrv_read/write.
- *
- * To achieve the desired semantics we switch into a new AsyncContext.
- * Callbacks must only be run if they belong to the current AsyncContext.
- * Otherwise they need to be queued until their own context is active again.
- * This is how you can make qemu_aio_wait() wait only for your own callbacks.
- *
- * The AsyncContexts form a stack. When you leave a AsyncContexts, you always
- * return to the old ("parent") context.
- */
-struct AsyncContext {
-    /* Consecutive number of the AsyncContext (position in the stack) */
-    int id;
-
-    /* Anchor of the list of Bottom Halves belonging to the context */
-    struct QEMUBH *first_bh;
-
-    /* Link to parent context */
-    struct AsyncContext *parent;
-};
-
-/* The currently active AsyncContext */
-static struct AsyncContext *async_context = &(struct AsyncContext) { 0 };
-
-/*
- * Enter a new AsyncContext. Already scheduled Bottom Halves and AIO callbacks
- * won't be called until this context is left again.
- */
-void async_context_push(void)
-{
-    struct AsyncContext *new = qemu_mallocz(sizeof(*new));
-    new->parent = async_context;
-    new->id = async_context->id + 1;
-    async_context = new;
-}
-
-/* Run queued AIO completions and destroy Bottom Half */
-static void bh_run_aio_completions(void *opaque)
-{
-    QEMUBH **bh = opaque;
-    qemu_bh_delete(*bh);
-    qemu_free(bh);
-    qemu_aio_process_queue();
-}
-/*
- * Leave the currently active AsyncContext. All Bottom Halves belonging to the
- * old context are executed before changing the context.
- */
-void async_context_pop(void)
-{
-    struct AsyncContext *old = async_context;
-    QEMUBH **bh;
-
-    /* Flush the bottom halves, we don't want to lose them */
-    while (qemu_bh_poll());
-
-    /* Switch back to the parent context */
-    async_context = async_context->parent;
-    qemu_free(old);
-
-    if (async_context == NULL) {
-        abort();
-    }
-
-    /* Schedule BH to run any queued AIO completions as soon as possible */
-    bh = qemu_malloc(sizeof(*bh));
-    *bh = qemu_bh_new(bh_run_aio_completions, bh);
-    qemu_bh_schedule(*bh);
-}
-
-/*
- * Returns the ID of the currently active AsyncContext
- */
-int get_async_context_id(void)
-{
-    return async_context->id;
-}
+/* Anchor of the list of Bottom Halves belonging to the context */
+static struct QEMUBH *first_bh;
 
 /***********************************************************/
 /* bottom halves (can be seen as timers which expire ASAP) */
@@ -130,8 +46,8 @@ QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque)
     bh = qemu_mallocz(sizeof(QEMUBH));
     bh->cb = cb;
     bh->opaque = opaque;
-    bh->next = async_context->first_bh;
-    async_context->first_bh = bh;
+    bh->next = first_bh;
+    first_bh = bh;
     return bh;
 }
 
@@ -141,7 +57,7 @@ int qemu_bh_poll(void)
     int ret;
 
     ret = 0;
-    for (bh = async_context->first_bh; bh; bh = next) {
+    for (bh = first_bh; bh; bh = next) {
         next = bh->next;
         if (!bh->deleted && bh->scheduled) {
             bh->scheduled = 0;
@@ -153,7 +69,7 @@ int qemu_bh_poll(void)
     }
 
     /* remove deleted bhs */
-    bhp = &async_context->first_bh;
+    bhp = &first_bh;
     while (*bhp) {
         bh = *bhp;
         if (bh->deleted) {
@@ -199,7 +115,7 @@ void qemu_bh_update_timeout(int *timeout)
 {
     QEMUBH *bh;
 
-    for (bh = async_context->first_bh; bh; bh = bh->next) {
+    for (bh = first_bh; bh; bh = bh->next) {
         if (!bh->deleted && bh->scheduled) {
             if (bh->idle) {
                 /* idle bottom halves will be polled at least
diff --git a/block.c b/block.c
index e6abea8..0d05b4b 100644
--- a/block.c
+++ b/block.c
@@ -2777,8 +2777,6 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
     struct iovec iov;
     QEMUIOVector qiov;
 
-    async_context_push();
-
     async_ret = NOT_DONE;
     iov.iov_base = (void *)buf;
     iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
@@ -2796,7 +2794,6 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
 
 
 fail:
-    async_context_pop();
     return async_ret;
 }
 
@@ -2808,8 +2805,6 @@ static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
     struct iovec iov;
     QEMUIOVector qiov;
 
-    async_context_push();
-
     async_ret = NOT_DONE;
     iov.iov_base = (void *)buf;
     iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE;
@@ -2825,7 +2820,6 @@ static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
     }
 
 fail:
-    async_context_pop();
     return async_ret;
 }
 
diff --git a/block/qed-table.c b/block/qed-table.c
index d38c673..d96afa8 100644
--- a/block/qed-table.c
+++ b/block/qed-table.c
@@ -179,16 +179,12 @@ int qed_read_l1_table_sync(BDRVQEDState *s)
 {
     int ret = -EINPROGRESS;
 
-    async_context_push();
-
     qed_read_table(s, s->header.l1_table_offset,
                    s->l1_table, qed_sync_cb, &ret);
     while (ret == -EINPROGRESS) {
         qemu_aio_wait();
     }
 
-    async_context_pop();
-
     return ret;
 }
 
@@ -205,15 +201,11 @@ int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index,
 {
     int ret = -EINPROGRESS;
 
-    async_context_push();
-
     qed_write_l1_table(s, index, n, qed_sync_cb, &ret);
     while (ret == -EINPROGRESS) {
         qemu_aio_wait();
     }
 
-    async_context_pop();
-
     return ret;
 }
 
@@ -282,14 +274,11 @@ int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset
 {
     int ret = -EINPROGRESS;
 
-    async_context_push();
-
     qed_read_l2_table(s, request, offset, qed_sync_cb, &ret);
     while (ret == -EINPROGRESS) {
         qemu_aio_wait();
     }
 
-    async_context_pop();
     return ret;
 }
 
@@ -307,13 +296,10 @@ int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request,
 {
     int ret = -EINPROGRESS;
 
-    async_context_push();
-
     qed_write_l2_table(s, request, index, n, flush, qed_sync_cb, &ret);
     while (ret == -EINPROGRESS) {
         qemu_aio_wait();
     }
 
-    async_context_pop();
     return ret;
 }
diff --git a/block/qed.c b/block/qed.c
index 3970379..333f067 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -680,16 +680,12 @@ static int bdrv_qed_is_allocated(BlockDriverState *bs, int64_t sector_num,
     };
     QEDRequest request = { .l2_table = NULL };
 
-    async_context_push();
-
     qed_find_cluster(s, &request, pos, len, qed_is_allocated_cb, &cb);
 
     while (cb.is_allocated == -1) {
         qemu_aio_wait();
     }
 
-    async_context_pop();
-
     qed_unref_l2_cache_entry(request.l2_table);
 
     return cb.is_allocated;
diff --git a/linux-aio.c b/linux-aio.c
index 68f4b3d..dc3faf2 100644
--- a/linux-aio.c
+++ b/linux-aio.c
@@ -31,7 +31,6 @@ struct qemu_laiocb {
     struct iocb iocb;
     ssize_t ret;
     size_t nbytes;
-    int async_context_id;
     QLIST_ENTRY(qemu_laiocb) node;
 };
 
@@ -39,7 +38,6 @@ struct qemu_laio_state {
     io_context_t ctx;
     int efd;
     int count;
-    QLIST_HEAD(, qemu_laiocb) completed_reqs;
 };
 
 static inline ssize_t io_event_ret(struct io_event *ev)
@@ -49,7 +47,6 @@ static inline ssize_t io_event_ret(struct io_event *ev)
 
 /*
  * Completes an AIO request (calls the callback and frees the ACB).
- * Be sure to be in the right AsyncContext before calling this function.
  */
 static void qemu_laio_process_completion(struct qemu_laio_state *s,
     struct qemu_laiocb *laiocb)
@@ -72,42 +69,12 @@ static void qemu_laio_process_completion(struct qemu_laio_state *s,
 }
 
 /*
- * Processes all queued AIO requests, i.e. requests that have return from OS
- * but their callback was not called yet. Requests that cannot have their
- * callback called in the current AsyncContext, remain in the queue.
- *
- * Returns 1 if at least one request could be completed, 0 otherwise.
+ * All requests are directly processed when they complete, so there's nothing
+ * left to do during qemu_aio_wait().
  */
 static int qemu_laio_process_requests(void *opaque)
 {
-    struct qemu_laio_state *s = opaque;
-    struct qemu_laiocb *laiocb, *next;
-    int res = 0;
-
-    QLIST_FOREACH_SAFE (laiocb, &s->completed_reqs, node, next) {
-        if (laiocb->async_context_id == get_async_context_id()) {
-            qemu_laio_process_completion(s, laiocb);
-            QLIST_REMOVE(laiocb, node);
-            res = 1;
-        }
-    }
-
-    return res;
-}
-
-/*
- * Puts a request in the completion queue so that its callback is called the
- * next time when it's possible. If we already are in the right AsyncContext,
- * the request is completed immediately instead.
- */
-static void qemu_laio_enqueue_completed(struct qemu_laio_state *s,
-    struct qemu_laiocb* laiocb)
-{
-    if (laiocb->async_context_id == get_async_context_id()) {
-        qemu_laio_process_completion(s, laiocb);
-    } else {
-        QLIST_INSERT_HEAD(&s->completed_reqs, laiocb, node);
-    }
+    return 0;
 }
 
 static void qemu_laio_completion_cb(void *opaque)
@@ -141,7 +108,7 @@ static void qemu_laio_completion_cb(void *opaque)
                     container_of(iocb, struct qemu_laiocb, iocb);
 
             laiocb->ret = io_event_ret(&events[i]);
-            qemu_laio_enqueue_completed(s, laiocb);
+            qemu_laio_process_completion(s, laiocb);
         }
     }
 }
@@ -204,7 +171,6 @@ BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
     laiocb->nbytes = nb_sectors * 512;
     laiocb->ctx = s;
     laiocb->ret = -EINPROGRESS;
-    laiocb->async_context_id = get_async_context_id();
 
     iocbs = &laiocb->iocb;
 
@@ -239,7 +205,6 @@ void *laio_init(void)
     struct qemu_laio_state *s;
 
     s = qemu_mallocz(sizeof(*s));
-    QLIST_INIT(&s->completed_reqs);
     s->efd = eventfd(0, 0);
     if (s->efd == -1)
         goto out_free_state;
diff --git a/posix-aio-compat.c b/posix-aio-compat.c
index c4116e3..788d113 100644
--- a/posix-aio-compat.c
+++ b/posix-aio-compat.c
@@ -49,8 +49,6 @@ struct qemu_paiocb {
     ssize_t ret;
     int active;
     struct qemu_paiocb *next;
-
-    int async_context_id;
 };
 
 typedef struct PosixAioState {
@@ -420,7 +418,6 @@ static int posix_aio_process_queue(void *opaque)
     struct qemu_paiocb *acb, **pacb;
     int ret;
     int result = 0;
-    int async_context_id = get_async_context_id();
 
     for(;;) {
         pacb = &s->first_aio;
@@ -429,12 +426,6 @@ static int posix_aio_process_queue(void *opaque)
             if (!acb)
                 return result;
 
-            /* we're only interested in requests in the right context */
-            if (acb->async_context_id != async_context_id) {
-                pacb = &acb->next;
-                continue;
-            }
-
             ret = qemu_paio_error(acb);
             if (ret == ECANCELED) {
                 /* remove the request */
@@ -575,7 +566,6 @@ BlockDriverAIOCB *paio_submit(BlockDriverState *bs, int fd,
     acb->aio_type = type;
     acb->aio_fildes = fd;
     acb->ev_signo = SIGUSR2;
-    acb->async_context_id = get_async_context_id();
 
     if (qiov) {
         acb->aio_iov = qiov->iov;
@@ -604,7 +594,6 @@ BlockDriverAIOCB *paio_ioctl(BlockDriverState *bs, int fd,
     acb->aio_type = QEMU_AIO_IOCTL;
     acb->aio_fildes = fd;
     acb->ev_signo = SIGUSR2;
-    acb->async_context_id = get_async_context_id();
     acb->aio_offset = 0;
     acb->aio_ioctl_buf = buf;
     acb->aio_ioctl_cmd = req;
diff --git a/qemu-common.h b/qemu-common.h
index 1e3c665..8f21a8c 100644
--- a/qemu-common.h
+++ b/qemu-common.h
@@ -115,10 +115,6 @@ int qemu_main(int argc, char **argv, char **envp);
 /* bottom halves */
 typedef void QEMUBHFunc(void *opaque);
 
-void async_context_push(void);
-void async_context_pop(void);
-int get_async_context_id(void);
-
 QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque);
 void qemu_bh_schedule(QEMUBH *bh);
 /* Bottom halfs that are scheduled from a bottom half handler are instantly
-- 
cgit v1.1


From e680cfa7e20f5049c475ac94f998a79c9997b48d Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Wed, 13 Jul 2011 15:42:52 +0200
Subject: coroutines: Use one global bottom half for CoQueue

Now that AsyncContexts don't exist any more, we can use one global bottom half
for restarting coroutines instead of allocating a new one every time (before
removing AsyncContexts, the problem with having a global BH was that it had to
belong to a single AsyncContexts and wouldn't be executed in a different one -
which leads to deadlocks)

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 qemu-coroutine-lock.c | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/qemu-coroutine-lock.c b/qemu-coroutine-lock.c
index abaa1f7..a80f437 100644
--- a/qemu-coroutine-lock.c
+++ b/qemu-coroutine-lock.c
@@ -30,14 +30,10 @@
 
 static QTAILQ_HEAD(, Coroutine) unlock_bh_queue =
     QTAILQ_HEAD_INITIALIZER(unlock_bh_queue);
-
-struct unlock_bh {
-    QEMUBH *bh;
-};
+static QEMUBH* unlock_bh;
 
 static void qemu_co_queue_next_bh(void *opaque)
 {
-    struct unlock_bh *unlock_bh = opaque;
     Coroutine *next;
 
     trace_qemu_co_queue_next_bh();
@@ -45,14 +41,15 @@ static void qemu_co_queue_next_bh(void *opaque)
         QTAILQ_REMOVE(&unlock_bh_queue, next, co_queue_next);
         qemu_coroutine_enter(next, NULL);
     }
-
-    qemu_bh_delete(unlock_bh->bh);
-    qemu_free(unlock_bh);
 }
 
 void qemu_co_queue_init(CoQueue *queue)
 {
     QTAILQ_INIT(&queue->entries);
+
+    if (!unlock_bh) {
+        unlock_bh = qemu_bh_new(qemu_co_queue_next_bh, NULL);
+    }
 }
 
 void coroutine_fn qemu_co_queue_wait(CoQueue *queue)
@@ -65,7 +62,6 @@ void coroutine_fn qemu_co_queue_wait(CoQueue *queue)
 
 bool qemu_co_queue_next(CoQueue *queue)
 {
-    struct unlock_bh *unlock_bh;
     Coroutine *next;
 
     next = QTAILQ_FIRST(&queue->entries);
@@ -73,10 +69,7 @@ bool qemu_co_queue_next(CoQueue *queue)
         QTAILQ_REMOVE(&queue->entries, next, co_queue_next);
         QTAILQ_INSERT_TAIL(&unlock_bh_queue, next, co_queue_next);
         trace_qemu_co_queue_next(next);
-
-        unlock_bh = qemu_malloc(sizeof(*unlock_bh));
-        unlock_bh->bh = qemu_bh_new(qemu_co_queue_next_bh, unlock_bh);
-        qemu_bh_schedule(unlock_bh->bh);
+        qemu_bh_schedule(unlock_bh);
     }
 
     return (next != NULL);
-- 
cgit v1.1


From ba1d1afdfeeb5ec6cbfd0052ab298d9b7cb7e5ba Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Mon, 25 Jul 2011 19:42:37 +0200
Subject: posix-aio-compat: Allow read after EOF

In order to be able to transparently replace bdrv_read calls by bdrv_co_read,
reading beyond EOF must produce zeros instead of short reads for AIO, too.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 posix-aio-compat.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/posix-aio-compat.c b/posix-aio-compat.c
index 788d113..8dc00cb 100644
--- a/posix-aio-compat.c
+++ b/posix-aio-compat.c
@@ -198,6 +198,12 @@ static ssize_t handle_aiocb_rw_vector(struct qemu_paiocb *aiocb)
     return len;
 }
 
+/*
+ * Read/writes the data to/from a given linear buffer.
+ *
+ * Returns the number of bytes handles or -errno in case of an error. Short
+ * reads are only returned if the end of the file is reached.
+ */
 static ssize_t handle_aiocb_rw_linear(struct qemu_paiocb *aiocb, char *buf)
 {
     ssize_t offset = 0;
@@ -334,6 +340,19 @@ static void *aio_thread(void *unused)
 
         switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) {
         case QEMU_AIO_READ:
+            ret = handle_aiocb_rw(aiocb);
+            if (ret >= 0 && ret < aiocb->aio_nbytes && aiocb->common.bs->growable) {
+                /* A short read means that we have reached EOF. Pad the buffer
+                 * with zeros for bytes after EOF. */
+                QEMUIOVector qiov;
+
+                qemu_iovec_init_external(&qiov, aiocb->aio_iov,
+                                         aiocb->aio_niov);
+                qemu_iovec_memset_skip(&qiov, 0, aiocb->aio_nbytes - ret, ret);
+
+                ret = aiocb->aio_nbytes;
+            }
+            break;
         case QEMU_AIO_WRITE:
             ret = handle_aiocb_rw(aiocb);
             break;
-- 
cgit v1.1


From e7a8a7837a964e0fe327e6ef8dde02c6a53dd14a Mon Sep 17 00:00:00 2001
From: Kevin Wolf <kwolf@redhat.com>
Date: Fri, 15 Jul 2011 16:05:00 +0200
Subject: block: Use bdrv_co_* instead of synchronous versions in coroutines

If we're already in a coroutine, there is no reason to use the synchronous
version of block layer functions when a coroutine one exists. This makes
bdrv_read/write/flush use bdrv_co_* when used inside a coroutine.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/block.c b/block.c
index 0d05b4b..26910ca 100644
--- a/block.c
+++ b/block.c
@@ -70,6 +70,7 @@ static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs,
 static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
                                          int64_t sector_num, int nb_sectors,
                                          QEMUIOVector *iov);
+static int coroutine_fn bdrv_co_flush_em(BlockDriverState *bs);
 
 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
     QTAILQ_HEAD_INITIALIZER(bdrv_states);
@@ -946,6 +947,17 @@ static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num,
                                    nb_sectors * BDRV_SECTOR_SIZE);
 }
 
+static inline bool bdrv_has_async_rw(BlockDriver *drv)
+{
+    return drv->bdrv_co_readv != bdrv_co_readv_em
+        || drv->bdrv_aio_readv != bdrv_aio_readv_em;
+}
+
+static inline bool bdrv_has_async_flush(BlockDriver *drv)
+{
+    return drv->bdrv_aio_flush != bdrv_aio_flush_em;
+}
+
 /* return < 0 if error. See bdrv_write() for the return codes */
 int bdrv_read(BlockDriverState *bs, int64_t sector_num,
               uint8_t *buf, int nb_sectors)
@@ -954,6 +966,18 @@ int bdrv_read(BlockDriverState *bs, int64_t sector_num,
 
     if (!drv)
         return -ENOMEDIUM;
+
+    if (bdrv_has_async_rw(drv) && qemu_in_coroutine()) {
+        QEMUIOVector qiov;
+        struct iovec iov = {
+            .iov_base = (void *)buf,
+            .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
+        };
+
+        qemu_iovec_init_external(&qiov, &iov, 1);
+        return bdrv_co_readv(bs, sector_num, nb_sectors, &qiov);
+    }
+
     if (bdrv_check_request(bs, sector_num, nb_sectors))
         return -EIO;
 
@@ -998,8 +1022,21 @@ int bdrv_write(BlockDriverState *bs, int64_t sector_num,
                const uint8_t *buf, int nb_sectors)
 {
     BlockDriver *drv = bs->drv;
+
     if (!bs->drv)
         return -ENOMEDIUM;
+
+    if (bdrv_has_async_rw(drv) && qemu_in_coroutine()) {
+        QEMUIOVector qiov;
+        struct iovec iov = {
+            .iov_base = (void *)buf,
+            .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
+        };
+
+        qemu_iovec_init_external(&qiov, &iov, 1);
+        return bdrv_co_writev(bs, sector_num, nb_sectors, &qiov);
+    }
+
     if (bs->read_only)
         return -EACCES;
     if (bdrv_check_request(bs, sector_num, nb_sectors))
@@ -1649,6 +1686,10 @@ int bdrv_flush(BlockDriverState *bs)
         return 0;
     }
 
+    if (bs->drv && bdrv_has_async_flush(bs->drv) && qemu_in_coroutine()) {
+        return bdrv_co_flush_em(bs);
+    }
+
     if (bs->drv && bs->drv->bdrv_flush) {
         return bs->drv->bdrv_flush(bs);
     }
@@ -2916,6 +2957,21 @@ static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs,
     return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true);
 }
 
+static int coroutine_fn bdrv_co_flush_em(BlockDriverState *bs)
+{
+    CoroutineIOCompletion co = {
+        .coroutine = qemu_coroutine_self(),
+    };
+    BlockDriverAIOCB *acb;
+
+    acb = bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
+    if (!acb) {
+        return -EIO;
+    }
+    qemu_coroutine_yield();
+    return co.ret;
+}
+
 /**************************************************************/
 /* removable device support */
 
-- 
cgit v1.1