aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Makefile1
-rw-r--r--block.c3
-rw-r--r--block.h1
-rw-r--r--block/raw-posix-aio.h6
-rw-r--r--block/raw-posix.c30
-rwxr-xr-xconfigure31
-rw-r--r--linux-aio.c204
-rw-r--r--qemu-config.c4
-rw-r--r--qemu-io.c7
-rw-r--r--qemu-options.hx4
-rw-r--r--vl.c22
11 files changed, 305 insertions, 8 deletions
diff --git a/Makefile b/Makefile
index b0a84f3..bdac9b3 100644
--- a/Makefile
+++ b/Makefile
@@ -56,6 +56,7 @@ recurse-all: $(SUBDIR_RULES) $(ROMSUBDIR_RULES)
block-obj-y = cutils.o cache-utils.o qemu-malloc.o qemu-option.o module.o
block-obj-y += nbd.o block.o aio.o aes.o osdep.o
block-obj-$(CONFIG_POSIX) += posix-aio-compat.o
+block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o
block-nested-y += cow.o qcow.o vdi.o vmdk.o cloop.o dmg.o bochs.o vpc.o vvfat.o
block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o
diff --git a/block.c b/block.c
index 82ffea8..033957d 100644
--- a/block.c
+++ b/block.c
@@ -411,7 +411,8 @@ int bdrv_open2(BlockDriverState *bs, const char *filename, int flags,
/* Note: for compatibility, we open disk image files as RDWR, and
RDONLY as fallback */
if (!(flags & BDRV_O_FILE))
- open_flags = BDRV_O_RDWR | (flags & BDRV_O_CACHE_MASK);
+ open_flags = BDRV_O_RDWR |
+ (flags & (BDRV_O_CACHE_MASK|BDRV_O_NATIVE_AIO));
else
open_flags = flags & ~(BDRV_O_FILE | BDRV_O_SNAPSHOT);
ret = drv->bdrv_open(bs, filename, open_flags);
diff --git a/block.h b/block.h
index ccd4c1e..28bf357 100644
--- a/block.h
+++ b/block.h
@@ -37,6 +37,7 @@ typedef struct QEMUSnapshotInfo {
bdrv_file_open()) */
#define BDRV_O_NOCACHE 0x0020 /* do not use the host page cache */
#define BDRV_O_CACHE_WB 0x0040 /* use write-back caching */
+#define BDRV_O_NATIVE_AIO 0x0080 /* use native AIO instead of the thread pool */
#define BDRV_O_CACHE_MASK (BDRV_O_NOCACHE | BDRV_O_CACHE_WB)
diff --git a/block/raw-posix-aio.h b/block/raw-posix-aio.h
index 6761cd3..244bc8b 100644
--- a/block/raw-posix-aio.h
+++ b/block/raw-posix-aio.h
@@ -33,4 +33,10 @@ BlockDriverAIOCB *paio_ioctl(BlockDriverState *bs, int fd,
unsigned long int req, void *buf,
BlockDriverCompletionFunc *cb, void *opaque);
+/* linux-aio.c - Linux native implementation */
+void *laio_init(void);
+BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
+ int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque, int type);
+
#endif /* QEMU_RAW_POSIX_AIO_H */
diff --git a/block/raw-posix.c b/block/raw-posix.c
index ca9bc61..8a7dc15 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -115,6 +115,7 @@ typedef struct BDRVRawState {
int fd_got_error;
int fd_media_changed;
#endif
+ int use_aio;
uint8_t* aligned_buf;
} BDRVRawState;
@@ -159,6 +160,7 @@ static int raw_open_common(BlockDriverState *bs, const char *filename,
}
s->fd = fd;
s->aligned_buf = NULL;
+
if ((bdrv_flags & BDRV_O_NOCACHE)) {
s->aligned_buf = qemu_blockalign(bs, ALIGNED_BUFFER_SIZE);
if (s->aligned_buf == NULL) {
@@ -166,9 +168,22 @@ static int raw_open_common(BlockDriverState *bs, const char *filename,
}
}
- s->aio_ctx = paio_init();
- if (!s->aio_ctx) {
- goto out_free_buf;
+#ifdef CONFIG_LINUX_AIO
+ if ((bdrv_flags & (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) ==
+ (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) {
+ s->aio_ctx = laio_init();
+ if (!s->aio_ctx) {
+ goto out_free_buf;
+ }
+ s->use_aio = 1;
+ } else
+#endif
+ {
+ s->aio_ctx = paio_init();
+ if (!s->aio_ctx) {
+ goto out_free_buf;
+ }
+ s->use_aio = 0;
}
return 0;
@@ -524,8 +539,13 @@ static BlockDriverAIOCB *raw_aio_submit(BlockDriverState *bs,
* boundary. Check if this is the case or telll the low-level
* driver that it needs to copy the buffer.
*/
- if (s->aligned_buf && !qiov_is_aligned(qiov)) {
- type |= QEMU_AIO_MISALIGNED;
+ if (s->aligned_buf) {
+ if (!qiov_is_aligned(qiov)) {
+ type |= QEMU_AIO_MISALIGNED;
+ } else if (s->use_aio) {
+ return laio_submit(bs, s->aio_ctx, s->fd, sector_num, qiov,
+ nb_sectors, cb, opaque, type);
+ }
}
return paio_submit(bs, s->aio_ctx, s->fd, sector_num, qiov, nb_sectors,
diff --git a/configure b/configure
index b993b65..81272fa 100755
--- a/configure
+++ b/configure
@@ -191,6 +191,7 @@ vde=""
vnc_tls=""
vnc_sasl=""
xen=""
+linux_aio=""
gprof="no"
debug_tcg="no"
@@ -523,6 +524,10 @@ for opt do
;;
--enable-mixemu) mixemu="yes"
;;
+ --disable-linux-aio) linux_aio="no"
+ ;;
+ --enable-linux-aio) linux_aio="yes"
+ ;;
--enable-io-thread) io_thread="yes"
;;
--disable-blobs) blobs="no"
@@ -674,6 +679,8 @@ echo " --enable-uname-release=R Return R for uname -r in usermode emulation"
echo " --sparc_cpu=V Build qemu for Sparc architecture v7, v8, v8plus, v8plusa, v9"
echo " --disable-vde disable support for vde network"
echo " --enable-vde enable support for vde network"
+echo " --disable-linux-aio disable Linux AIO support"
+echo " --enable-linux-aio enable Linux AIO support"
echo " --enable-io-thread enable IO thread"
echo " --disable-blobs disable installing provided firmware blobs"
echo " --kerneldir=PATH look for kernel includes in PATH"
@@ -1304,6 +1311,26 @@ if test "$pthread" = no; then
fi
##########################################
+# linux-aio probe
+AIOLIBS=""
+
+if test "$linux_aio" != "no" ; then
+ cat > $TMPC <<EOF
+#include <libaio.h>
+#include <sys/eventfd.h>
+int main(void) { io_setup(0, NULL); io_set_eventfd(NULL, 0); eventfd(0, 0); return 0; }
+EOF
+ if compile_prog "" "-laio" ; then
+ linux_aio=yes
+ LIBS="$LIBS -laio"
+ else
+ if test "$linux_aio" = "yes" ; then
+ feature_not_found "linux AIO"
+ fi
+ fi
+fi
+
+##########################################
# iovec probe
cat > $TMPC <<EOF
#include <sys/types.h>
@@ -1638,6 +1665,7 @@ echo "NPTL support $nptl"
echo "GUEST_BASE $guest_base"
echo "vde support $vde"
echo "IO thread $io_thread"
+echo "Linux AIO support $linux_aio"
echo "Install blobs $blobs"
echo "KVM support $kvm"
echo "fdt support $fdt"
@@ -1811,6 +1839,9 @@ fi
if test "$io_thread" = "yes" ; then
echo "CONFIG_IOTHREAD=y" >> $config_host_mak
fi
+if test "$linux_aio" = "yes" ; then
+ echo "CONFIG_LINUX_AIO=y" >> $config_host_mak
+fi
if test "$blobs" = "yes" ; then
echo "INSTALL_BLOBS=yes" >> $config_host_mak
fi
diff --git a/linux-aio.c b/linux-aio.c
new file mode 100644
index 0000000..f53a08c
--- /dev/null
+++ b/linux-aio.c
@@ -0,0 +1,204 @@
+/*
+ * Linux native AIO support.
+ *
+ * Copyright (C) 2009 IBM, Corp.
+ * Copyright (C) 2009 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#include "qemu-common.h"
+#include "qemu-aio.h"
+#include "block_int.h"
+#include "block/raw-posix-aio.h"
+
+#include <sys/eventfd.h>
+#include <libaio.h>
+
+/*
+ * Queue size (per-device).
+ *
+ * XXX: eventually we need to communicate this to the guest and/or make it
+ * tunable by the guest. If we get more outstanding requests at a time
+ * than this we will get EAGAIN from io_submit which is communicated to
+ * the guest as an I/O error.
+ */
+#define MAX_EVENTS 128
+
+struct qemu_laiocb { /* state of one request submitted through laio_submit() */
+ BlockDriverAIOCB common; /* generic AIOCB; supplies the completion cb and its opaque */
+ struct qemu_laio_state *ctx; /* AIO state this request was submitted to */
+ struct iocb iocb; /* libaio control block handed to io_submit() */
+ ssize_t ret; /* -EINPROGRESS while pending; then the event result or -ECANCELED */
+ size_t nbytes; /* expected transfer size, nb_sectors * 512 */
+};
+
+struct qemu_laio_state { /* state created by laio_init() and passed back as aio_ctx */
+ io_context_t ctx; /* AIO context set up with io_setup(MAX_EVENTS, ...) */
+ int efd; /* non-blocking eventfd attached to every iocb via io_set_eventfd() */
+ int count; /* requests submitted but not yet reaped by the completion handler */
+};
+/* Fold an io_event's res2 (shifted up 32 bits) and res into one ssize_t. */
+static inline ssize_t io_event_ret(struct io_event *ev)
+{
+    return (ssize_t)(ev->res | ((uint64_t)ev->res2 << 32));
+}
+/*
+ * Handler for s->efd (registered by laio_init(), polled by laio_cancel()):
+ * drains the eventfd, reaps finished requests and runs their callbacks.
+ */
+static void qemu_laio_completion_cb(void *opaque)
+{
+    struct qemu_laio_state *s = opaque;
+
+    while (1) {
+        struct io_event events[MAX_EVENTS];
+        uint64_t val;
+        ssize_t ret;
+        struct timespec ts = { 0 };
+        int nevents, i;
+
+        /* An interrupted read() returns -1 (not 1) with errno == EINTR. */
+        do {
+            ret = read(s->efd, &val, sizeof(val));
+        } while (ret == -1 && errno == EINTR);
+
+        /* EAGAIN (nothing pending), any other error, or a short read. */
+        if (ret != (ssize_t)sizeof(val))
+            break;
+
+        do {
+            nevents = io_getevents(s->ctx, val, MAX_EVENTS, events, &ts);
+        } while (nevents == -EINTR);
+
+        for (i = 0; i < nevents; i++) {
+            struct iocb *iocb = events[i].obj;
+            struct qemu_laiocb *laiocb =
+                container_of(iocb, struct qemu_laiocb, iocb);
+
+            s->count--;
+            ret = laiocb->ret = io_event_ret(&events[i]);
+            if (ret != -ECANCELED) {
+                /* full transfer -> 0, short transfer -> -EINVAL, else -errno */
+                if (ret == laiocb->nbytes)
+                    ret = 0;
+                else if (ret >= 0)
+                    ret = -EINVAL;
+                laiocb->common.cb(laiocb->common.opaque, ret);
+            }
+            qemu_aio_release(laiocb);
+        }
+    }
+}
+/* Flush callback: returns 1 while requests are still in flight, else 0. */
+static int qemu_laio_flush_cb(void *opaque)
+{
+    const struct qemu_laio_state *state = opaque;
+
+    return state->count > 0;
+}
+/* .cancel hook of laio_pool: returns once the request is no longer in flight. */
+static void laio_cancel(BlockDriverAIOCB *blockacb)
+{
+    struct qemu_laiocb *req =
+        container_of(blockacb, struct qemu_laiocb, common);
+    struct io_event ev;
+
+    /* Nothing to do once the request has completed or been cancelled. */
+    if (req->ret != -EINPROGRESS) {
+        return;
+    }
+
+    /*
+     * As of Linux 2.6.31 neither the block device code nor any filesystem
+     * implements cancellation of AIO requests, so io_cancel() is expected
+     * to fail and the polling loop at the bottom is the normal code path.
+     */
+    if (io_cancel(req->ctx->ctx, &req->iocb, &ev) == 0) {
+        req->ret = -ECANCELED;
+        return;
+    }
+
+    /*
+     * Cancellation failed, so wait for the iocb to finish.  Its status is
+     * only updated by polling the io context; dropping O_NONBLOCK from the
+     * eventfd might make this wait slightly more efficient.
+     */
+    while (req->ret == -EINPROGRESS) {
+        qemu_laio_completion_cb(req->ctx);
+    }
+}
+
+static AIOPool laio_pool = { /* pool passed to qemu_aio_get() in laio_submit() */
+ .aiocb_size = sizeof(struct qemu_laiocb), /* presumably the allocation size per AIOCB - confirm in AIOPool */
+ .cancel = laio_cancel, /* presumably invoked to cancel a request from this pool - confirm */
+};
+/* Queue one vectored read or write on fd at sector_num; returns the new
+ * AIOCB, or NULL when allocation, the type check or io_submit() fails. */
+BlockDriverAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque, int type)
+{
+    struct qemu_laio_state *s = aio_ctx;
+    struct qemu_laiocb *laiocb;
+    struct iocb *iocbs;
+    off_t offset = sector_num * 512;
+
+    laiocb = qemu_aio_get(&laio_pool, bs, cb, opaque);
+    if (!laiocb)
+        return NULL;
+    laiocb->nbytes = nb_sectors * 512;
+    laiocb->ctx = s;
+    laiocb->ret = -EINPROGRESS;
+    iocbs = &laiocb->iocb;
+
+    switch (type) {
+    case QEMU_AIO_WRITE:
+        io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset);
+        break;
+    case QEMU_AIO_READ:
+        io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset);
+        break;
+    default:
+        fprintf(stderr, "%s: invalid AIO request type 0x%x.\n",
+                        __func__, type);
+        goto out_free_aiocb;
+    }
+    io_set_eventfd(&laiocb->iocb, s->efd);
+    s->count++;
+    /* count is bumped before submitting, so a failed submit must undo it */
+    if (io_submit(s->ctx, 1, &iocbs) < 0)
+        goto out_dec_count;
+    return &laiocb->common;
+
+out_dec_count:
+    s->count--;
+out_free_aiocb:
+    qemu_aio_release(laiocb);
+    return NULL;
+}
+/* Set up native AIO state (eventfd + AIO context); returns NULL on failure. */
+void *laio_init(void)
+{
+    struct qemu_laio_state *s;
+
+    s = qemu_mallocz(sizeof(*s));
+    s->efd = eventfd(0, 0);
+    if (s->efd == -1)
+        goto out_free_state;
+    /* the completion handler relies on read() failing with EAGAIN, not blocking */
+    if (fcntl(s->efd, F_SETFL, O_NONBLOCK) == -1)
+        goto out_close_efd;
+    if (io_setup(MAX_EVENTS, &s->ctx) != 0)
+        goto out_close_efd;
+
+    qemu_aio_set_fd_handler(s->efd, qemu_laio_completion_cb,
+                            NULL, qemu_laio_flush_cb, s);
+    return s;
+
+out_close_efd:
+    close(s->efd);
+out_free_state:
+    qemu_free(s);
+    return NULL;
+}
diff --git a/qemu-config.c b/qemu-config.c
index 3dd473a..4808db0 100644
--- a/qemu-config.c
+++ b/qemu-config.c
@@ -53,6 +53,10 @@ QemuOptsList qemu_drive_opts = {
.type = QEMU_OPT_STRING,
.help = "host cache usage (none, writeback, writethrough)",
},{
+ .name = "aio",
+ .type = QEMU_OPT_STRING,
+ .help = "host AIO implementation (threads, native)",
+ },{
.name = "format",
.type = QEMU_OPT_STRING,
.help = "disk format (raw, qcow2, ...)",
diff --git a/qemu-io.c b/qemu-io.c
index a68f195..f96a4de 100644
--- a/qemu-io.c
+++ b/qemu-io.c
@@ -1401,6 +1401,7 @@ static void usage(const char *name)
" -n, --nocache disable host cache\n"
" -g, --growable allow file to grow (only applies to protocols)\n"
" -m, --misalign misalign allocations for O_DIRECT\n"
+" -k, --native-aio use kernel AIO implementation (on Linux only)\n"
" -h, --help display this help and exit\n"
" -V, --version output version information and exit\n"
"\n",
@@ -1412,7 +1413,7 @@ int main(int argc, char **argv)
{
int readonly = 0;
int growable = 0;
- const char *sopt = "hVc:Crsnmg";
+ const char *sopt = "hVc:Crsnmgk";
struct option lopt[] = {
{ "help", 0, NULL, 'h' },
{ "version", 0, NULL, 'V' },
@@ -1424,6 +1425,7 @@ int main(int argc, char **argv)
{ "nocache", 0, NULL, 'n' },
{ "misalign", 0, NULL, 'm' },
{ "growable", 0, NULL, 'g' },
+ { "native-aio", 0, NULL, 'k' },
{ NULL, 0, NULL, 0 }
};
int c;
@@ -1455,6 +1457,9 @@ int main(int argc, char **argv)
case 'g':
growable = 1;
break;
+ case 'k':
+ flags |= BDRV_O_NATIVE_AIO;
+ break;
case 'V':
printf("%s version %s\n", progname, VERSION);
exit(0);
diff --git a/qemu-options.hx b/qemu-options.hx
index e3bd314..0c2b310 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -95,7 +95,7 @@ DEF("drive", HAS_ARG, QEMU_OPTION_drive,
"-drive [file=file][,if=type][,bus=n][,unit=m][,media=d][,index=i]\n"
" [,cyls=c,heads=h,secs=s[,trans=t]][,snapshot=on|off]\n"
" [,cache=writethrough|writeback|none][,format=f][,serial=s]\n"
- " [,addr=A][,id=name]\n"
+ " [,addr=A][,id=name][,aio=threads|native]\n"
" use 'file' as a drive image\n")
DEF("set", HAS_ARG, QEMU_OPTION_set,
"-set group.id.arg=value\n"
@@ -128,6 +128,8 @@ These options have the same definition as they have in @option{-hdachs}.
@var{snapshot} is "on" or "off" and allows to enable snapshot for given drive (see @option{-snapshot}).
@item cache=@var{cache}
@var{cache} is "none", "writeback", or "writethrough" and controls how the host cache is used to access block data.
+@item aio=@var{aio}
+@var{aio} is "threads" or "native" and selects between pthread-based disk I/O and native Linux AIO.
@item format=@var{format}
Specify which disk @var{format} will be used rather than detecting
the format. Can be used to specifiy format=raw to avoid interpreting
diff --git a/vl.c b/vl.c
index 9b2bf00..1085794 100644
--- a/vl.c
+++ b/vl.c
@@ -1916,6 +1916,7 @@ DriveInfo *drive_init(QemuOpts *opts, void *opaque,
int max_devs;
int index;
int cache;
+ int aio = 0;
int bdrv_flags, onerror;
const char *devaddr;
DriveInfo *dinfo;
@@ -2049,6 +2050,19 @@ DriveInfo *drive_init(QemuOpts *opts, void *opaque,
}
}
+#ifdef CONFIG_LINUX_AIO
+ if ((buf = qemu_opt_get(opts, "aio")) != NULL) {
+ if (!strcmp(buf, "threads"))
+ aio = 0;
+ else if (!strcmp(buf, "native"))
+ aio = 1;
+ else {
+ fprintf(stderr, "qemu: invalid aio option\n");
+ return NULL;
+ }
+ }
+#endif
+
if ((buf = qemu_opt_get(opts, "format")) != NULL) {
if (strcmp(buf, "?") == 0) {
fprintf(stderr, "qemu: Supported formats:");
@@ -2218,11 +2232,19 @@ DriveInfo *drive_init(QemuOpts *opts, void *opaque,
bdrv_flags |= BDRV_O_NOCACHE;
else if (cache == 2) /* write-back */
bdrv_flags |= BDRV_O_CACHE_WB;
+
+ if (aio == 1) {
+ bdrv_flags |= BDRV_O_NATIVE_AIO;
+ } else {
+ bdrv_flags &= ~BDRV_O_NATIVE_AIO;
+ }
+
if (bdrv_open2(dinfo->bdrv, file, bdrv_flags, drv) < 0) {
fprintf(stderr, "qemu: could not open disk image %s\n",
file);
return NULL;
}
+
if (bdrv_key_required(dinfo->bdrv))
autostart = 0;
*fatal_error = 0;