From 019d6b8ff0d495ded6977f24a4e8fd1c7fec09e0 Mon Sep 17 00:00:00 2001
From: Anthony Liguori <aliguori@us.ibm.com>
Date: Sat, 9 May 2009 17:14:19 -0500
Subject: Move block drivers into their own directory

Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
---
 block/bochs.c     |  259 +++++
 block/cloop.c     |  171 ++++
 block/cow.c       |  275 +++++
 block/dmg.c       |  301 ++++++
 block/nbd.c       |  196 ++++
 block/parallels.c |  181 ++++
 block/qcow.c      |  945 +++++++++++++++++
 block/qcow2.c     | 2931 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 block/raw-posix.c | 1438 ++++++++++++++++++++++++++
 block/raw-win32.c |  394 +++++++
 block/vmdk.c      |  833 +++++++++++++++
 block/vpc.c       |  606 +++++++++++
 block/vvfat.c     | 2855 +++++++++++++++++++++++++++++++++++++++++++++++++++
 13 files changed, 11385 insertions(+)
 create mode 100644 block/bochs.c
 create mode 100644 block/cloop.c
 create mode 100644 block/cow.c
 create mode 100644 block/dmg.c
 create mode 100644 block/nbd.c
 create mode 100644 block/parallels.c
 create mode 100644 block/qcow.c
 create mode 100644 block/qcow2.c
 create mode 100644 block/raw-posix.c
 create mode 100644 block/raw-win32.c
 create mode 100644 block/vmdk.c
 create mode 100644 block/vpc.c
 create mode 100644 block/vvfat.c

(limited to 'block')

diff --git a/block/bochs.c b/block/bochs.c
new file mode 100644
index 0000000..bac81c4
--- /dev/null
+++ b/block/bochs.c
@@ -0,0 +1,259 @@
+/*
+ * Block driver for the various disk image formats used by Bochs
+ * Currently only for "growing" type in read-only mode
+ *
+ * Copyright (c) 2005 Alex Beregszaszi
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+
+/**************************************************************/
+
+#define HEADER_MAGIC "Bochs Virtual HD Image"
+#define HEADER_VERSION 0x00020000
+#define HEADER_V1 0x00010000
+#define HEADER_SIZE 512
+
+#define REDOLOG_TYPE "Redolog"
+#define GROWING_TYPE "Growing"
+
+// not allocated: 0xffffffff
+
+// always little-endian; v1 layout: 64-bit disk size directly follows extent
+struct bochs_header_v1 {
+    char magic[32]; // "Bochs Virtual HD Image"
+    char type[16]; // "Redolog"
+    char subtype[16]; // "Undoable" / "Volatile" / "Growing"
+    uint32_t version;
+    uint32_t header; // size of header
+
+    union {
+	struct {
+	    uint32_t catalog; // num of entries
+	    uint32_t bitmap; // bitmap size
+	    uint32_t extent; // extent size
+	    uint64_t disk; // disk size, must sit at byte 12 of this struct
+	    char padding[HEADER_SIZE - 64 - 8 - 20];
+	} __attribute__((packed)) redolog; // packed: no hole before disk
+	char padding[HEADER_SIZE - 64 - 8];
+    } extra;
+};
+
+// always little-endian; header layout used when version is not HEADER_V1
+struct bochs_header {
+    char magic[32]; // "Bochs Virtual HD Image"
+    char type[16]; // "Redolog"
+    char subtype[16]; // "Undoable" / "Volatile" / "Growing"
+    uint32_t version; // compared against HEADER_VERSION / HEADER_V1
+    uint32_t header; // size of header; the catalog is read from this offset
+
+    union {
+	struct {
+	    uint32_t catalog; // num of entries
+	    uint32_t bitmap; // bitmap size in bytes
+	    uint32_t extent; // extent size in bytes
+	    uint32_t reserved; // for ???
+	    uint64_t disk; // disk size in bytes
+	    char padding[HEADER_SIZE - 64 - 8 - 24];
+	} redolog;
+	char padding[HEADER_SIZE - 64 - 8];
+    } extra;
+};
+
+typedef struct BDRVBochsState {
+    int fd; // image file descriptor
+
+    uint32_t *catalog_bitmap; // extent catalog, converted to host endianness
+    int catalog_size; // number of catalog entries
+
+    int data_offset; // file offset just past header and catalog
+
+    int bitmap_blocks; // 512-byte blocks taken by one extent's bitmap
+    int extent_blocks; // 512-byte blocks taken by one extent's data
+    int extent_size; // extent size in bytes
+} BDRVBochsState;
+
+/* Probe: score 100 for a "Growing" redolog header of a known version. */
+static int bochs_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    const struct bochs_header *hdr = (const void *)buf;
+    uint32_t version;
+
+    if (buf_size < HEADER_SIZE)
+	return 0;
+    if (strcmp(hdr->magic, HEADER_MAGIC) != 0 ||
+	strcmp(hdr->type, REDOLOG_TYPE) != 0 ||
+	strcmp(hdr->subtype, GROWING_TYPE) != 0)
+	return 0;
+
+    version = le32_to_cpu(hdr->version);
+    return (version == HEADER_VERSION || version == HEADER_V1) ? 100 : 0;
+}
+
+/* Open a Bochs "growing" redolog read-only and load its extent catalog;
+ * returns 0 on success, -1 on any I/O or format error. */
+static int bochs_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVBochsState *s = bs->opaque;
+    int fd, i;
+    struct bochs_header bochs;
+    struct bochs_header_v1 header_v1;
+
+    fd = open(filename, O_RDWR | O_BINARY);
+    if (fd < 0) {
+        fd = open(filename, O_RDONLY | O_BINARY);
+        if (fd < 0)
+            return -1;
+    }
+    bs->read_only = 1; // no write support yet
+    s->fd = fd;
+    s->catalog_bitmap = NULL; // lets the fail path free unconditionally
+    if (read(fd, &bochs, sizeof(bochs)) != sizeof(bochs))
+        goto fail;
+    if (strcmp(bochs.magic, HEADER_MAGIC) ||
+        strcmp(bochs.type, REDOLOG_TYPE) ||
+        strcmp(bochs.subtype, GROWING_TYPE) ||
+	((le32_to_cpu(bochs.version) != HEADER_VERSION) &&
+	(le32_to_cpu(bochs.version) != HEADER_V1)))
+        goto fail;
+
+    if (le32_to_cpu(bochs.version) == HEADER_V1) {
+      memcpy(&header_v1, &bochs, sizeof(bochs));
+      bs->total_sectors = le64_to_cpu(header_v1.extra.redolog.disk) / 512;
+    } else {
+      bs->total_sectors = le64_to_cpu(bochs.extra.redolog.disk) / 512;
+    }
+
+    // the header comes from an untrusted file: reject sizes that would
+    // overflow the catalog allocation or divide by zero in seek_to_sector()
+    s->catalog_size = le32_to_cpu(bochs.extra.redolog.catalog);
+    s->extent_size = le32_to_cpu(bochs.extra.redolog.extent);
+    if (s->catalog_size < 0 || s->catalog_size > 0x7fffffff / 4 ||
+        s->extent_size <= 0)
+        goto fail;
+
+    if (lseek(s->fd, le32_to_cpu(bochs.header), SEEK_SET) < 0)
+        goto fail;
+    s->catalog_bitmap = qemu_malloc(s->catalog_size * 4);
+    if (read(s->fd, s->catalog_bitmap, s->catalog_size * 4) !=
+	s->catalog_size * 4)
+	goto fail;
+    for (i = 0; i < s->catalog_size; i++)
+	le32_to_cpus(&s->catalog_bitmap[i]);
+    s->data_offset = le32_to_cpu(bochs.header) + (s->catalog_size * 4);
+    s->bitmap_blocks = 1 + (le32_to_cpu(bochs.extra.redolog.bitmap) - 1) / 512;
+    s->extent_blocks = 1 + (le32_to_cpu(bochs.extra.redolog.extent) - 1) / 512;
+    return 0;
+ fail:
+    qemu_free(s->catalog_bitmap);
+    close(fd);
+    return -1;
+}
+
+/*
+ * Position s->fd at the data of guest sector sector_num.
+ * Returns 0 with the file offset set, or -1 when the sector is not
+ * allocated in the image or its allocation bitmap cannot be read.
+ */
+static inline int seek_to_sector(BlockDriverState *bs, int64_t sector_num)
+{
+    BDRVBochsState *s = bs->opaque;
+    int64_t offset = sector_num * 512;
+    int64_t extent_index, extent_offset, bitmap_offset, block_offset;
+    char bitmap_entry;
+
+    // a zero extent size would divide by zero below
+    if (sector_num < 0 || s->extent_size <= 0)
+	return -1;
+
+    extent_index = offset / s->extent_size;
+    extent_offset = (offset % s->extent_size) / 512;
+
+    // never index past the catalog loaded by bochs_open()
+    if (extent_index >= s->catalog_size ||
+	s->catalog_bitmap[extent_index] == 0xffffffff)
+	return -1; // not allocated
+
+    // widen before multiplying: 32-bit math wraps for data beyond 4 GiB
+    bitmap_offset = s->data_offset +
+	512 * (int64_t)s->catalog_bitmap[extent_index] *
+	(s->extent_blocks + s->bitmap_blocks);
+    block_offset = bitmap_offset + (512 * (s->bitmap_blocks + extent_offset));
+
+    // read in bitmap for current extent
+    if (lseek(s->fd, bitmap_offset + (extent_offset / 8), SEEK_SET) < 0 ||
+	read(s->fd, &bitmap_entry, 1) != 1)
+	return -1; // an unreadable bitmap is treated as not allocated
+
+    if (!((bitmap_entry >> (extent_offset % 8)) & 1))
+	return -1; // not allocated
+
+    if (lseek(s->fd, block_offset, SEEK_SET) < 0)
+	return -1;
+
+    return 0;
+}
+
+/* Read nb_sectors sectors starting at sector_num into buf; sectors that
+ * are not allocated in the image read back as zeros.  Returns 0 or -1. */
+static int bochs_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
+{
+    BDRVBochsState *s = bs->opaque;
+    uint8_t *dst = buf;
+    int64_t sector = sector_num;
+    int remaining;
+
+    for (remaining = nb_sectors; remaining > 0; remaining--) {
+	if (seek_to_sector(bs, sector) != 0) {
+	    memset(dst, 0, 512);
+	} else if (read(s->fd, dst, 512) != 512) {
+	    return -1;
+	}
+	sector++;
+	dst += 512;
+    }
+    return 0;
+}
+
+/* Free the extent catalog and close the image file descriptor. */
+static void bochs_close(BlockDriverState *bs)
+{
+    qemu_free(((BDRVBochsState *)bs->opaque)->catalog_bitmap);
+    close(((BDRVBochsState *)bs->opaque)->fd);
+}
+
+static BlockDriver bdrv_bochs = {
+    .format_name	= "bochs",
+    .instance_size	= sizeof(BDRVBochsState), // state reached via bs->opaque
+    .bdrv_probe		= bochs_probe,
+    .bdrv_open		= bochs_open,
+    .bdrv_read		= bochs_read, // read-only driver: no .bdrv_write
+    .bdrv_close		= bochs_close,
+};
+
+static void bdrv_bochs_init(void)
+{
+    bdrv_register(&bdrv_bochs); // hand the driver table to the block layer
+}
+
+block_init(bdrv_bochs_init); // presumably a module.h hook that runs it -- confirm
diff --git a/block/cloop.c b/block/cloop.c
new file mode 100644
index 0000000..06c687e
--- /dev/null
+++ b/block/cloop.c
@@ -0,0 +1,171 @@
+/*
+ * QEMU Block driver for CLOOP images
+ *
+ * Copyright (c) 2004 Johannes E. Schindelin
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+#include <zlib.h>
+
+typedef struct BDRVCloopState {
+    int fd; /* image file descriptor */
+    uint32_t block_size; /* uncompressed block size in bytes, from header */
+    uint32_t n_blocks; /* number of compressed blocks, from header */
+    uint64_t* offsets; /* file offset of each compressed block */
+    uint32_t sectors_per_block; /* block_size / 512 */
+    uint32_t current_block; /* block held in uncompressed_block; n_blocks = none */
+    uint8_t *compressed_block; /* scratch buffer for one compressed block */
+    uint8_t *uncompressed_block; /* most recently inflated block */
+    z_stream zstream; /* zlib inflate state */
+} BDRVCloopState;
+
+/* Probe: score 2 when buf starts with the cloop V2.0 shell preamble
+ * (compared over at most buf_size bytes), 0 otherwise. */
+static int cloop_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    static const char preamble[] = "#!/bin/sh\n"
+	"#V2.0 Format\n"
+	"modprobe cloop file=$0 && mount -r -t iso9660 /dev/cloop $1\n";
+    int cmp_len = (int)sizeof(preamble) - 1;
+    if (buf_size < cmp_len)
+        cmp_len = buf_size;
+    return memcmp(preamble, buf, cmp_len) == 0 ? 2 : 0;
+}
+
+/* Open a cloop image read-only: parse the header and the n_blocks + 1 block
+ * offsets, allocate buffers.  Returns 0, -errno if open() fails, else -1. */
+static int cloop_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVCloopState *s = bs->opaque;
+    uint32_t offsets_size,max_compressed_block_size=1,i;
+
+    s->offsets = NULL;
+    s->compressed_block = s->uncompressed_block = NULL;
+    s->fd = open(filename, O_RDONLY | O_BINARY);
+    if (s->fd < 0)
+        return -errno;
+    bs->read_only = 1;
+    if(lseek(s->fd,128,SEEK_SET)<0 ||
+       read(s->fd,&s->block_size,4)!=4 || read(s->fd,&s->n_blocks,4)!=4)
+	goto fail;
+    s->block_size=be32_to_cpu(s->block_size);
+    s->n_blocks=be32_to_cpu(s->n_blocks);
+    if(s->block_size<512 || s->block_size%512 || s->n_blocks>=UINT32_MAX/8)
+	goto fail; /* cloop_read() would divide by 0, or offsets_size wrap */
+    /* read offsets: one per block plus the end offset of the last block */
+    offsets_size=(s->n_blocks+1)*sizeof(uint64_t);
+    s->offsets=(uint64_t*)qemu_malloc(offsets_size);
+    if(read(s->fd,s->offsets,offsets_size)!=(ssize_t)offsets_size)
+	goto fail;
+    for(i=0;i<=s->n_blocks;i++) {
+	s->offsets[i]=be64_to_cpu(s->offsets[i]);
+	if(i>0) {
+	    uint32_t size=s->offsets[i]-s->offsets[i-1];
+	    if(size>max_compressed_block_size)
+		max_compressed_block_size=size;
+	}
+    }
+    s->compressed_block = qemu_malloc(max_compressed_block_size+1);
+    s->uncompressed_block = qemu_malloc(s->block_size);
+    if(inflateInit(&s->zstream) != Z_OK)
+	goto fail;
+    s->current_block=s->n_blocks;
+    s->sectors_per_block = s->block_size/512;
+    bs->total_sectors = (int64_t)s->n_blocks*s->sectors_per_block;
+    return 0;
+fail:
+    qemu_free(s->uncompressed_block);
+    qemu_free(s->compressed_block);
+    qemu_free(s->offsets);
+    close(s->fd);
+    return -1;
+}
+
+/* Ensure uncompressed_block holds block block_num, inflating it from the
+ * file unless it is already cached.  Returns 0 on success, -1 on error. */
+static inline int cloop_read_block(BDRVCloopState *s,int block_num)
+{
+    uint32_t bytes;
+    if(block_num < 0 || (uint32_t)block_num >= s->n_blocks)
+	return -1; /* offsets[] is indexed below: stay inside the image */
+    if(s->current_block == block_num)
+	return 0;
+    bytes = s->offsets[block_num+1]-s->offsets[block_num];
+    s->current_block = s->n_blocks; /* buffer is about to be overwritten */
+    if(lseek(s->fd, s->offsets[block_num], SEEK_SET) < 0 ||
+       read(s->fd, s->compressed_block, bytes) != (ssize_t)bytes)
+	return -1;
+    s->zstream.next_in = s->compressed_block;
+    s->zstream.avail_in = bytes;
+    s->zstream.next_out = s->uncompressed_block;
+    s->zstream.avail_out = s->block_size;
+    if(inflateReset(&s->zstream) != Z_OK)
+	return -1;
+    if(inflate(&s->zstream, Z_FINISH) != Z_STREAM_END ||
+       s->zstream.total_out != s->block_size)
+	return -1;
+    s->current_block = block_num;
+    return 0;
+}
+
+/* Read nb_sectors sectors at sector_num into buf; returns 0 or -1. */
+static int cloop_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
+{
+    BDRVCloopState *s = bs->opaque;
+    int64_t sector, end = sector_num + nb_sectors;
+    for(sector = sector_num; sector < end; sector++, buf += 512) {
+	uint32_t block = sector / s->sectors_per_block;
+	uint32_t in_block = sector % s->sectors_per_block;
+	if(cloop_read_block(s, block) != 0)
+	    return -1;
+	memcpy(buf, s->uncompressed_block + in_block*512, 512);
+    }
+    return 0;
+}
+
+/* Free cloop_open()'s resources (qemu_malloc() pairs with qemu_free()). */
+static void cloop_close(BlockDriverState *bs)
+{
+    BDRVCloopState *s = bs->opaque;
+    close(s->fd);
+    qemu_free(s->offsets);
+    qemu_free(s->compressed_block);
+    qemu_free(s->uncompressed_block);
+    inflateEnd(&s->zstream);
+}
+
+static BlockDriver bdrv_cloop = {
+    .format_name	= "cloop",
+    .instance_size	= sizeof(BDRVCloopState), /* state reached via bs->opaque */
+    .bdrv_probe		= cloop_probe,
+    .bdrv_open		= cloop_open,
+    .bdrv_read		= cloop_read, /* read-only driver: no .bdrv_write */
+    .bdrv_close		= cloop_close,
+};
+
+static void bdrv_cloop_init(void)
+{
+    bdrv_register(&bdrv_cloop); /* hand the driver table to the block layer */
+}
+
+block_init(bdrv_cloop_init); /* presumably a module.h hook that runs it -- confirm */
diff --git a/block/cow.c b/block/cow.c
new file mode 100644
index 0000000..94b3549
--- /dev/null
+++ b/block/cow.c
@@ -0,0 +1,275 @@
+/*
+ * Block driver for the COW format
+ *
+ * Copyright (c) 2004 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef _WIN32
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+#include <sys/mman.h>
+
+/**************************************************************/
+/* COW block driver using file system holes */
+
+/* user mode linux compatible COW file */
+#define COW_MAGIC 0x4f4f4f4d  /* MOOO */
+#define COW_VERSION 2
+
+struct cow_header_v2 {
+    uint32_t magic; /* COW_MAGIC, stored big-endian */
+    uint32_t version; /* COW_VERSION, stored big-endian */
+    char backing_file[1024]; /* copied into bs->backing_file on open */
+    int32_t mtime; /* backing file st_mtime recorded by cow_create(), or 0 */
+    uint64_t size; /* image size in bytes, stored big-endian */
+    uint32_t sectorsize; /* written as 512 by cow_create() */
+};
+
+typedef struct BDRVCowState {
+    int fd; /* image file descriptor */
+    uint8_t *cow_bitmap; /* if non NULL, COW mappings are used first */
+    uint8_t *cow_bitmap_addr; /* start of the mmap()ed region (header + bitmap) */
+    int cow_bitmap_size; /* bytes mapped: header plus one bit per sector */
+    int64_t cow_sectors_offset; /* file offset where sector 0's data is stored */
+} BDRVCowState;
+
+/* Probe: score 100 for a big-endian COW v2 header, 0 otherwise. */
+static int cow_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    const struct cow_header_v2 *hdr = (const void *)buf;
+
+    if (buf_size < sizeof(struct cow_header_v2))
+        return 0;
+    if (be32_to_cpu(hdr->magic) != COW_MAGIC)
+        return 0;
+    return be32_to_cpu(hdr->version) == COW_VERSION ? 100 : 0;
+}
+
+/* Open a COW v2 image (read-write, falling back to read-only), record its
+ * size and backing file, and mmap the header plus allocation bitmap.
+ * Returns 0 on success, -1 on error. */
+static int cow_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVCowState *s = bs->opaque;
+    struct cow_header_v2 hdr;
+    const int open_flags = O_BINARY | O_LARGEFILE;
+    int fd = open(filename, O_RDWR | open_flags);
+
+    if (fd < 0)
+        fd = open(filename, O_RDONLY | open_flags);
+    if (fd < 0)
+        return -1;
+    s->fd = fd;
+
+    /* reject anything that does not carry a COW v2 header */
+    if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
+        goto fail;
+    if (be32_to_cpu(hdr.magic) != COW_MAGIC)
+        goto fail;
+    if (be32_to_cpu(hdr.version) != COW_VERSION)
+        goto fail;
+
+    /* the header stores the size in bytes; total_sectors is in 512s */
+    bs->total_sectors = (int64_t)be64_to_cpu(hdr.size) / 512;
+    pstrcpy(bs->backing_file, sizeof(bs->backing_file), hdr.backing_file);
+
+    /* map header + bitmap (one bit per sector) in a single shared mapping */
+    s->cow_bitmap_size = ((bs->total_sectors + 7) >> 3) + sizeof(hdr);
+    s->cow_bitmap_addr = (void *)mmap(get_mmap_addr(s->cow_bitmap_size),
+                                      s->cow_bitmap_size,
+                                      PROT_READ | PROT_WRITE,
+                                      MAP_SHARED, fd, 0);
+    if (s->cow_bitmap_addr == MAP_FAILED)
+        goto fail;
+
+    /* the bitmap follows the header; sector data starts at the next
+     * 512-byte boundary after the bitmap */
+    s->cow_bitmap = s->cow_bitmap_addr + sizeof(hdr);
+    s->cow_sectors_offset = (s->cow_bitmap_size + 511) & ~511;
+    return 0;
+ fail:
+    close(fd);
+    return -1;
+}
+
+static inline void cow_set_bit(uint8_t *bitmap, int64_t bitnum)
+{
+    *(bitmap + bitnum / 8) |= 1 << (bitnum % 8); /* LSB-first within a byte */
+}
+
+static inline int is_bit_set(const uint8_t *bitmap, int64_t bitnum)
+{
+    return (bitmap[bitnum / 8] >> (bitnum % 8)) & 1; /* always 0 or 1 */
+}
+
+
+/* Report whether sector_num is present in the COW file (bit set in bitmap).
+ * *num_same receives the length of the run, starting at sector_num and capped
+ * at nb_sectors, that shares this state.  No bitmap means "unchanged". */
+static inline int is_changed(uint8_t *bitmap,
+                             int64_t sector_num, int nb_sectors,
+                             int *num_same)
+{
+    int first, run = 1;
+
+    if (!bitmap || nb_sectors == 0) {
+	*num_same = nb_sectors;
+	return 0;
+    }
+
+    first = is_bit_set(bitmap, sector_num);
+    while (run < nb_sectors && is_bit_set(bitmap, sector_num + run) == first)
+	run++;
+
+    *num_same = run;
+
+    return first;
+}
+
+static int cow_is_allocated(BlockDriverState *bs, int64_t sector_num,
+                            int nb_sectors, int *pnum)
+{
+    BDRVCowState *cow = bs->opaque; /* "allocated" == bit set in the bitmap */
+    return is_changed(cow->cow_bitmap, sector_num, nb_sectors, pnum);
+}
+
+/* Read nb_sectors sectors from sector_num into buf.  Runs present in the COW
+ * file are read from it; the rest come from the backing image, or are
+ * zero-filled when there is none.  Returns 0 on success, -1 on error. */
+static int cow_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
+{
+    BDRVCowState *s = bs->opaque;
+    int run;
+
+    /* run = number of consecutive sectors sharing the first one's state */
+    for (; nb_sectors > 0; nb_sectors -= run, sector_num += run,
+                           buf += run * 512) {
+        if (is_changed(s->cow_bitmap, sector_num, nb_sectors, &run)) {
+            /* sector data lives in the COW file at cow_sectors_offset */
+            lseek(s->fd, s->cow_sectors_offset + sector_num * 512, SEEK_SET);
+            if (read(s->fd, buf, run * 512) != run * 512)
+                return -1;
+        } else if (!bs->backing_hd) {
+            /* no backing image: unchanged sectors read as zeros */
+            memset(buf, 0, run * 512);
+        } else if (bdrv_read(bs->backing_hd, sector_num, buf, run) < 0) {
+            /* read from the base image failed */
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+/* Write sectors into the COW file and mark them changed; returns 0 or -1. */
+static int cow_write(BlockDriverState *bs, int64_t sector_num,
+                     const uint8_t *buf, int nb_sectors)
+{
+    BDRVCowState *s = bs->opaque;
+    const int64_t pos = s->cow_sectors_offset + sector_num * 512;
+    int64_t sector;
+    lseek(s->fd, pos, SEEK_SET);
+    if (write(s->fd, buf, nb_sectors * 512) != nb_sectors * 512)
+        return -1;
+    for (sector = sector_num; sector < sector_num + nb_sectors; sector++)
+        cow_set_bit(s->cow_bitmap, sector);
+    return 0;
+}
+
+static void cow_close(BlockDriverState *bs)
+{
+    BDRVCowState *cow = bs->opaque; /* unmap header+bitmap, then close */
+    munmap((void *)cow->cow_bitmap_addr, cow->cow_bitmap_size);
+    close(cow->fd);
+}
+
+/* Create an empty COW v2 image at filename covering image_sectors sectors,
+ * optionally recording image_filename as backing file (with its mtime when
+ * it can be read).  Returns 0, -ENOTSUP if flags is non-zero, else -1. */
+static int cow_create(const char *filename, int64_t image_sectors,
+                      const char *image_filename, int flags)
+{
+    int fd, cow_fd, ret = -1;
+    struct cow_header_v2 cow_header;
+    struct stat st;
+
+    if (flags)
+        return -ENOTSUP;
+
+    cow_fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY,
+              0644);
+    if (cow_fd < 0)
+        return -1;
+
+    memset(&cow_header, 0, sizeof(cow_header));
+    cow_header.magic = cpu_to_be32(COW_MAGIC);
+    cow_header.version = cpu_to_be32(COW_VERSION);
+    if (image_filename) {
+        /* mtime stays 0 (memset above) if the backing file is unreadable;
+         * cow_fd must stay open either way, the header is written below */
+        fd = open(image_filename, O_RDONLY | O_BINARY);
+        if (fd >= 0) {
+            if (fstat(fd, &st) == 0)
+                cow_header.mtime = cpu_to_be32(st.st_mtime);
+            close(fd);
+        }
+        pstrcpy(cow_header.backing_file, sizeof(cow_header.backing_file),
+                image_filename);
+    }
+
+    cow_header.sectorsize = cpu_to_be32(512);
+    cow_header.size = cpu_to_be64(image_sectors * 512);
+    /* write the header, then resize to include at least all the bitmap */
+    if (write(cow_fd, &cow_header, sizeof(cow_header)) == sizeof(cow_header) &&
+        ftruncate(cow_fd, sizeof(cow_header) + ((image_sectors + 7) >> 3)) == 0)
+        ret = 0;
+    if (close(cow_fd) != 0)
+        ret = -1;
+    return ret;
+}
+
+/* bdrv_flush hook: fsync() the image file descriptor. */
+static void cow_flush(BlockDriverState *bs)
+{
+    fsync(((BDRVCowState *)bs->opaque)->fd);
+}
+
+static BlockDriver bdrv_cow = {
+    .format_name	= "cow",
+    .instance_size	= sizeof(BDRVCowState), /* state reached via bs->opaque */
+    .bdrv_probe		= cow_probe,
+    .bdrv_open		= cow_open,
+    .bdrv_read		= cow_read,
+    .bdrv_write		= cow_write, /* also sets the sectors' bitmap bits */
+    .bdrv_close		= cow_close,
+    .bdrv_create	= cow_create,
+    .bdrv_flush		= cow_flush,
+    .bdrv_is_allocated	= cow_is_allocated,
+};
+
+static void bdrv_cow_init(void)
+{
+    bdrv_register(&bdrv_cow); /* hand the driver table to the block layer */
+}
+
+block_init(bdrv_cow_init); /* presumably a module.h hook that runs it -- confirm */
+#endif
diff --git a/block/dmg.c b/block/dmg.c
new file mode 100644
index 0000000..262560f
--- /dev/null
+++ b/block/dmg.c
@@ -0,0 +1,301 @@
+/*
+ * QEMU Block driver for DMG images
+ *
+ * Copyright (c) 2004 Johannes E. Schindelin
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "block_int.h"
+#include "bswap.h"
+#include "module.h"
+#include <zlib.h>
+
+typedef struct BDRVDMGState {
+    int fd; /* image file descriptor */
+
+    /* each chunk contains a certain number of sectors,
+     * offsets[i] is the offset in the .dmg file,
+     * lengths[i] is the length of the compressed chunk,
+     * sectors[i] is the first sector covered by chunk i,
+     * sectorcounts[i] is the number of sectors in that chunk,
+     * the sectors array is ordered
+     * 0<=i<n_chunks */
+
+    uint32_t n_chunks; /* number of valid entries in the arrays below */
+    uint32_t* types; /* 0x80000005 zlib, 1 stored copy, 2 zero-filled */
+    uint64_t* offsets;
+    uint64_t* lengths;
+    uint64_t* sectors;
+    uint64_t* sectorcounts;
+    uint32_t current_chunk; /* chunk cached in uncompressed_chunk; n_chunks = none */
+    uint8_t *compressed_chunk; /* staging buffer for one zlib chunk */
+    uint8_t *uncompressed_chunk; /* decoded data of current_chunk */
+    z_stream zstream; /* zlib inflate state */
+} BDRVDMGState;
+
+/* Probe by file name only: score 2 when the name is longer than four
+ * characters and ends in ".dmg", 0 otherwise. */
+static int dmg_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    int name_len=strlen(filename);
+    return (name_len>4 && strcmp(filename+name_len-4,".dmg")==0) ? 2 : 0;
+}
+
+/* Read a big-endian 64-bit value at the current position of fd.
+ * Returns 0 when fewer than 8 bytes could be read. */
+static off_t read_off(int fd)
+{
+	uint64_t raw;
+	return read(fd,&raw,8)<8 ? 0 : be64_to_cpu(raw);
+}
+
+/* Read a big-endian 32-bit value at the current position of fd.
+ * Returns 0 when fewer than 4 bytes could be read. */
+static off_t read_uint32(int fd)
+{
+	uint32_t raw;
+	return read(fd,&raw,4)<4 ? 0 : be32_to_cpu(raw);
+}
+
+/* Open a DMG image read-only and build the chunk tables from its "mish"
+ * blocks; when the expected layout is not found, hand the file to the "raw"
+ * driver instead.  Returns -errno if open() itself fails. */
+static int dmg_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVDMGState *s = bs->opaque;
+    off_t info_begin,info_end,last_in_offset,last_out_offset;
+    uint32_t count;
+    uint32_t max_compressed_size=1,max_sectors_per_chunk=1,i;
+
+    s->fd = open(filename, O_RDONLY | O_BINARY);
+    if (s->fd < 0)
+        return -errno;
+    bs->read_only = 1;
+    s->n_chunks = 0;
+    s->types = NULL; /* grown by qemu_realloc() below: must start out NULL */
+    s->offsets = s->lengths = s->sectors = s->sectorcounts = NULL;
+
+    /* read offset of info blocks */
+    if(lseek(s->fd,-0x1d8,SEEK_END)<0) {
+dmg_close:
+	close(s->fd);
+	/* open raw instead */
+	bs->drv=bdrv_find_format("raw");
+	return bs->drv->bdrv_open(bs, filename, flags);
+    }
+    info_begin=read_off(s->fd);
+    if(info_begin==0)
+	goto dmg_close;
+    if(lseek(s->fd,info_begin,SEEK_SET)<0)
+	goto dmg_close;
+    if(read_uint32(s->fd)!=0x100)
+	goto dmg_close;
+    if((count = read_uint32(s->fd))==0)
+	goto dmg_close;
+    info_end = info_begin+count;
+    if(lseek(s->fd,0xf8,SEEK_CUR)<0)
+	goto dmg_close;
+
+    /* read offsets */
+    last_in_offset = last_out_offset = 0;
+    while(lseek(s->fd,0,SEEK_CUR)<info_end) {
+        uint32_t type;
+	count = read_uint32(s->fd);
+	if(count==0)
+	    goto dmg_close;
+	type = read_uint32(s->fd);
+	if(type!=0x6d697368 || count<244)
+	    lseek(s->fd,count-4,SEEK_CUR);
+	else {
+	    int new_size, chunk_count;
+	    if(lseek(s->fd,200,SEEK_CUR)<0)
+	        goto dmg_close;
+	    chunk_count = (count-204)/40;
+	    new_size = sizeof(uint64_t) * (s->n_chunks + chunk_count);
+	    s->types = qemu_realloc(s->types, new_size/2);
+	    s->offsets = qemu_realloc(s->offsets, new_size);
+	    s->lengths = qemu_realloc(s->lengths, new_size);
+	    s->sectors = qemu_realloc(s->sectors, new_size);
+	    s->sectorcounts = qemu_realloc(s->sectorcounts, new_size);
+	    for(i=s->n_chunks;i<s->n_chunks+chunk_count;i++) {
+		s->types[i] = read_uint32(s->fd);
+		if(s->types[i]!=0x80000005 && s->types[i]!=1 && s->types[i]!=2) {
+		    if(s->types[i]==0xffffffff && i>0) { /* entry i-1 must exist */
+			last_in_offset = s->offsets[i-1]+s->lengths[i-1];
+			last_out_offset = s->sectors[i-1]+s->sectorcounts[i-1];
+		    }
+		    chunk_count--;
+		    i--;
+		    if(lseek(s->fd,36,SEEK_CUR)<0)
+			goto dmg_close;
+		    continue;
+		}
+		read_uint32(s->fd); /* 32-bit field that is not used */
+		s->sectors[i] = last_out_offset+read_off(s->fd);
+		s->sectorcounts[i] = read_off(s->fd);
+		s->offsets[i] = last_in_offset+read_off(s->fd);
+		s->lengths[i] = read_off(s->fd);
+		if(s->lengths[i]>max_compressed_size)
+		    max_compressed_size = s->lengths[i];
+		if(s->sectorcounts[i]>max_sectors_per_chunk)
+		    max_sectors_per_chunk = s->sectorcounts[i];
+	    }
+	    s->n_chunks+=chunk_count;
+	}
+    }
+
+    /* initialize zlib engine */
+    s->compressed_chunk = qemu_malloc(max_compressed_size+1);
+    s->uncompressed_chunk = qemu_malloc(512*max_sectors_per_chunk);
+    if(inflateInit(&s->zstream) != Z_OK)
+	goto dmg_close;
+    s->current_chunk = s->n_chunks;
+    return 0;
+}
+
+/* Return -1 (true) if chunk_num is a valid chunk holding sector_num, else 0. */
+static inline int is_sector_in_chunk(BDRVDMGState* s,
+		uint32_t chunk_num,int sector_num)
+{
+    if(chunk_num<s->n_chunks && s->sectors[chunk_num]<=sector_num &&
+	    s->sectors[chunk_num]+s->sectorcounts[chunk_num]>sector_num)
+	return -1;
+    return 0;
+}
+
+/* Binary search sectors[] for sector_num's chunk; n_chunks if not found. */
+static inline uint32_t search_chunk(BDRVDMGState* s,int sector_num)
+{
+    uint32_t lo=0,hi=s->n_chunks,mid; /* candidates: chunks in [lo, hi) */
+    while(lo<hi) {
+	mid = lo+(hi-lo)/2;
+	if(s->sectors[mid]>sector_num)
+	    hi = mid;
+	else if(s->sectors[mid]+s->sectorcounts[mid]>sector_num)
+	    return mid;
+	else
+	    lo = mid+1; /* exclude mid, otherwise lo==mid would loop forever */
+    }
+    return s->n_chunks; /* error */
+}
+
+/* Ensure uncompressed_chunk holds the chunk containing sector_num, loading it
+ * from the file unless it is already cached.  Returns 0 or -1 on error. */
+static inline int dmg_read_chunk(BDRVDMGState *s,int sector_num)
+{
+    if(!is_sector_in_chunk(s,s->current_chunk,sector_num)) {
+	int ret;
+	uint32_t chunk = search_chunk(s,sector_num);
+
+	if(chunk>=s->n_chunks)
+	    return -1;
+	/* the cache is invalid until the new chunk is fully loaded */
+	s->current_chunk = s->n_chunks;
+	switch(s->types[chunk]) {
+	case 0x80000005: { /* zlib compressed */
+	    uint64_t done = 0;
+	    if(lseek(s->fd, s->offsets[chunk], SEEK_SET)<0)
+		return -1;
+	    /* we need to buffer, because only the chunk as whole can be
+	     * inflated. */
+	    while(done<s->lengths[chunk]) {
+		ret = read(s->fd, s->compressed_chunk+done,
+			   s->lengths[chunk]-done);
+		if(ret<0 && errno==EINTR)
+		    continue;
+		if(ret<=0) /* I/O error or unexpected end of file */
+		    return -1;
+		done+=ret;
+	    }
+
+	    s->zstream.next_in = s->compressed_chunk;
+	    s->zstream.avail_in = s->lengths[chunk];
+	    s->zstream.next_out = s->uncompressed_chunk;
+	    s->zstream.avail_out = 512*s->sectorcounts[chunk];
+	    ret = inflateReset(&s->zstream);
+	    if(ret != Z_OK)
+		return -1;
+	    ret = inflate(&s->zstream, Z_FINISH);
+	    if(ret != Z_STREAM_END || s->zstream.total_out != 512*s->sectorcounts[chunk])
+		return -1;
+	    break; }
+	case 1: /* copy: raw data stored at offsets[chunk] */
+	    if(lseek(s->fd, s->offsets[chunk], SEEK_SET)<0)
+		return -1;
+	    ret = read(s->fd, s->uncompressed_chunk, s->lengths[chunk]);
+	    if (ret != s->lengths[chunk])
+		return -1;
+	    break;
+	case 2: /* zero */
+	    memset(s->uncompressed_chunk, 0, 512*s->sectorcounts[chunk]);
+	    break;
+	}
+	s->current_chunk = chunk;
+    }
+    return 0;
+}
+
+/* Read nb_sectors sectors at sector_num into buf; returns 0 or -1. */
+static int dmg_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
+{
+    BDRVDMGState *s = bs->opaque;
+    int64_t sector, end = sector_num + nb_sectors;
+    for(sector = sector_num; sector < end; sector++, buf += 512) {
+	uint32_t in_chunk;
+	if(dmg_read_chunk(s, sector) != 0)
+	    return -1;
+	in_chunk = sector-s->sectors[s->current_chunk];
+	memcpy(buf,s->uncompressed_chunk+in_chunk*512,512);
+    }
+    return 0;
+}
+
+/* Release dmg_open()'s resources; qemu_malloc() memory goes to qemu_free(). */
+static void dmg_close(BlockDriverState *bs)
+{
+    BDRVDMGState *s = bs->opaque;
+    close(s->fd);
+    if(s->n_chunks>0) /* types is only known valid once chunks were read */
+	qemu_free(s->types);
+    qemu_free(s->offsets);
+    qemu_free(s->lengths);
+    qemu_free(s->sectors);
+    qemu_free(s->sectorcounts);
+    qemu_free(s->compressed_chunk);
+    qemu_free(s->uncompressed_chunk);
+    inflateEnd(&s->zstream);
+}
+
+static BlockDriver bdrv_dmg = {
+    .format_name	= "dmg",
+    .instance_size	= sizeof(BDRVDMGState), /* state reached via bs->opaque */
+    .bdrv_probe		= dmg_probe,
+    .bdrv_open		= dmg_open,
+    .bdrv_read		= dmg_read, /* read-only driver: no .bdrv_write */
+    .bdrv_close		= dmg_close,
+};
+
+static void bdrv_dmg_init(void)
+{
+    bdrv_register(&bdrv_dmg); /* hand the driver table to the block layer */
+}
+
+block_init(bdrv_dmg_init); /* presumably a module.h hook that runs it -- confirm */
diff --git a/block/nbd.c b/block/nbd.c
new file mode 100644
index 0000000..47d4778
--- /dev/null
+++ b/block/nbd.c
@@ -0,0 +1,196 @@
+/*
+ * QEMU Block driver for  NBD
+ *
+ * Copyright (C) 2008 Bull S.A.S.
+ *     Author: Laurent Vivier <Laurent.Vivier@bull.net>
+ *
+ * Some parts:
+ *    Copyright (C) 2007 Anthony Liguori <anthony@codemonkey.ws>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu-common.h"
+#include "nbd.h"
+#include "module.h"
+
+#include <sys/types.h>
+#include <unistd.h>
+
+/* Per-device state of an NBD client connection. */
+typedef struct BDRVNBDState {
+    int sock;           /* connected socket to the NBD server */
+    off_t size;         /* size obtained during negotiation; returned by nbd_getlength() */
+    size_t blocksize;   /* block size obtained during negotiation */
+} BDRVNBDState;
+
+/*
+ * Open an NBD export.
+ *
+ * filename must have one of the forms
+ *   nbd:unix:/absolute/path   - connect to a UNIX domain socket
+ *   nbd:hostname:port         - connect over TCP
+ *
+ * After connecting, the NBD negotiation is performed and the socket, size
+ * and block size are stored in the driver state.
+ *
+ * Returns 0 on success, -EINVAL for a malformed filename, an out-of-range
+ * port or BDRV_O_CREAT, and -errno when connecting or negotiating fails.
+ */
+static int nbd_open(BlockDriverState *bs, const char* filename, int flags)
+{
+    BDRVNBDState *s = bs->opaque;
+    const char *host;
+    const char *unixpath;
+    int sock;
+    off_t size;
+    size_t blocksize;
+    int ret;
+
+    if ((flags & BDRV_O_CREAT))
+        return -EINVAL;
+
+    if (!strstart(filename, "nbd:", &host))
+        return -EINVAL;
+
+    if (strstart(host, "unix:", &unixpath)) {
+
+        if (unixpath[0] != '/')
+            return -EINVAL;
+
+        sock = unix_socket_outgoing(unixpath);
+
+    } else {
+        long port;
+        char *p, *r;
+        char hostname[128];
+
+        pstrcpy(hostname, sizeof(hostname), host);
+
+        p = strchr(hostname, ':');
+        if (p == NULL)
+            return -EINVAL;
+
+        *p = '\0';
+        p++;
+
+        /* reject ports that would otherwise be silently truncated to 16 bits */
+        port = strtol(p, &r, 0);
+        if (r == p || port < 0 || port > 65535)
+            return -EINVAL;
+        sock = tcp_socket_outgoing(hostname, (uint16_t)port);
+    }
+
+    if (sock == -1)
+        return -errno;
+
+    ret = nbd_receive_negotiate(sock, &size, &blocksize);
+    if (ret == -1) {
+        /* do not leak the socket; close() may clobber errno, so save it */
+        int err = errno;
+
+        close(sock);
+        return -err;
+    }
+
+    s->sock = sock;
+    s->size = size;
+    s->blocksize = blocksize;
+
+    return 0;
+}
+
+/*
+ * Read nb_sectors 512-byte sectors starting at sector_num from the server.
+ *
+ * A NBD_CMD_READ request is sent, the reply header is checked (error code
+ * and handle) and the payload is then received into buf.
+ *
+ * Returns 0 on success, -errno if the request or reply exchange fails, the
+ * negated server error code if the server reports one, and -EIO on a handle
+ * mismatch or short payload.
+ */
+static int nbd_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
+{
+    BDRVNBDState *s = bs->opaque;
+    struct nbd_request req;
+    struct nbd_reply rep;
+
+    req.type = NBD_CMD_READ;
+    req.handle = (uint64_t)(intptr_t)bs;
+    req.from = sector_num * 512;
+    req.len = nb_sectors * 512;
+
+    if (nbd_send_request(s->sock, &req) == -1 ||
+        nbd_receive_reply(s->sock, &rep) == -1) {
+        return -errno;
+    }
+
+    if (rep.error != 0) {
+        return -rep.error;
+    }
+
+    /* the reply must belong to the request that was just sent */
+    if (rep.handle != req.handle) {
+        return -EIO;
+    }
+
+    if (nbd_wr_sync(s->sock, buf, req.len, 1) != req.len) {
+        return -EIO;
+    }
+
+    return 0;
+}
+
+/*
+ * Write nb_sectors 512-byte sectors starting at sector_num to the server.
+ *
+ * A NBD_CMD_WRITE request is sent, followed by the payload from buf; the
+ * reply header is then received and checked (error code and handle).
+ *
+ * Returns 0 on success, -errno if sending the request or receiving the
+ * reply fails, the negated server error code if the server reports one, and
+ * -EIO on a short payload transfer or handle mismatch.
+ */
+static int nbd_write(BlockDriverState *bs, int64_t sector_num,
+                     const uint8_t *buf, int nb_sectors)
+{
+    BDRVNBDState *s = bs->opaque;
+    struct nbd_request req;
+    struct nbd_reply rep;
+
+    req.type = NBD_CMD_WRITE;
+    req.handle = (uint64_t)(intptr_t)bs;
+    req.from = sector_num * 512;
+    req.len = nb_sectors * 512;
+
+    if (nbd_send_request(s->sock, &req) == -1) {
+        return -errno;
+    }
+
+    /* the payload follows the request header */
+    if (nbd_wr_sync(s->sock, (uint8_t*)buf, req.len, 0) != req.len) {
+        return -EIO;
+    }
+
+    if (nbd_receive_reply(s->sock, &rep) == -1) {
+        return -errno;
+    }
+
+    if (rep.error != 0) {
+        return -rep.error;
+    }
+
+    if (rep.handle != req.handle) {
+        return -EIO;
+    }
+
+    return 0;
+}
+
+/*
+ * Send a NBD_CMD_DISC request to the server and close the socket.
+ */
+static void nbd_close(BlockDriverState *bs)
+{
+    BDRVNBDState *s = bs->opaque;
+    struct nbd_request disc;
+
+    disc.len = 0;
+    disc.from = 0;
+    disc.handle = (uint64_t)(intptr_t)bs;
+    disc.type = NBD_CMD_DISC;
+
+    /* best effort: the result of the disconnect request is not checked */
+    nbd_send_request(s->sock, &disc);
+
+    close(s->sock);
+}
+
+/* Return the export size that nbd_open() stored in the driver state. */
+static int64_t nbd_getlength(BlockDriverState *bs)
+{
+    const BDRVNBDState *state = bs->opaque;
+
+    return state->size;
+}
+
+/*
+ * Driver description for NBD. No probe callback is set; the driver is
+ * registered with the protocol name "nbd" (nbd_open() requires filenames
+ * starting with "nbd:").
+ */
+static BlockDriver bdrv_nbd = {
+    .format_name	= "nbd",
+    .instance_size	= sizeof(BDRVNBDState),
+    .bdrv_open		= nbd_open,
+    .bdrv_read		= nbd_read,
+    .bdrv_write		= nbd_write,
+    .bdrv_close		= nbd_close,
+    .bdrv_getlength	= nbd_getlength,
+    .protocol_name	= "nbd",
+};
+
+/* Register the NBD driver with the block layer. */
+static void bdrv_nbd_init(void)
+{
+    bdrv_register(&bdrv_nbd);
+}
+
+/* Hook bdrv_nbd_init into the block_init() module mechanism. */
+block_init(bdrv_nbd_init);
diff --git a/block/parallels.c b/block/parallels.c
new file mode 100644
index 0000000..0b64a5c
--- /dev/null
+++ b/block/parallels.c
@@ -0,0 +1,181 @@
+/*
+ * Block driver for Parallels disk image format
+ *
+ * Copyright (c) 2007 Alex Beregszaszi
+ *
+ * This code is based on comparing different disk images created by Parallels.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+
+/**************************************************************/
+
+#define HEADER_MAGIC "WithoutFreeSpace"
+#define HEADER_VERSION 2
+#define HEADER_SIZE 64
+
+/*
+ * On-disk image header (HEADER_SIZE bytes). All multi-byte fields are
+ * stored little-endian and converted with le32_to_cpu() when read.
+ */
+struct parallels_header {
+    char magic[16]; // "WithoutFreeSpace"
+    uint32_t version;           // must equal HEADER_VERSION
+    uint32_t heads;
+    uint32_t cylinders;
+    uint32_t tracks;            // number of sectors covered by one catalog entry
+    uint32_t catalog_entries;   // number of 32-bit catalog entries following the header
+    uint32_t nb_sectors;        // becomes bs->total_sectors
+    char padding[24];
+} __attribute__((packed));
+
+/* Per-image state of an open Parallels disk. */
+typedef struct BDRVParallelsState {
+    int fd;                     /* descriptor of the image file */
+
+    /* catalog read from the image: one start sector per entry, 0 = not allocated */
+    uint32_t *catalog_bitmap;
+    int catalog_size;           /* number of entries in catalog_bitmap */
+
+    int tracks;                 /* sectors per catalog entry (header field "tracks") */
+} BDRVParallelsState;
+
+/*
+ * Probe for a Parallels image: the buffer must hold at least HEADER_SIZE
+ * bytes, start with HEADER_MAGIC and carry version HEADER_VERSION.
+ *
+ * Returns 100 on a match, 0 otherwise.
+ */
+static int parallels_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    const struct parallels_header *ph = (const void *)buf;
+
+    if (buf_size < HEADER_SIZE) {
+        return 0;
+    }
+    if (memcmp(ph->magic, HEADER_MAGIC, 16) != 0) {
+        return 0;
+    }
+
+    return (le32_to_cpu(ph->version) == HEADER_VERSION) ? 100 : 0;
+}
+
+/*
+ * Open a Parallels image.
+ *
+ * The file is opened read-write if possible, read-only otherwise; the
+ * device itself is always marked read-only. The header is validated and
+ * the catalog that follows it is loaded into s->catalog_bitmap and
+ * converted to host byte order.
+ *
+ * Returns 0 on success, -1 on any failure (file cannot be opened, short
+ * read, bad magic/version, or nonsensical header values).
+ */
+static int parallels_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVParallelsState *s = bs->opaque;
+    int fd, i;
+    uint32_t catalog_entries;
+    struct parallels_header ph;
+
+    fd = open(filename, O_RDWR | O_BINARY | O_LARGEFILE);
+    if (fd < 0) {
+        fd = open(filename, O_RDONLY | O_BINARY | O_LARGEFILE);
+        if (fd < 0)
+            return -1;
+    }
+
+    bs->read_only = 1; // no write support yet
+
+    s->fd = fd;
+    /* make the fail path independent of how the state was initialised */
+    s->catalog_bitmap = NULL;
+
+    if (read(fd, &ph, sizeof(ph)) != sizeof(ph))
+        goto fail;
+
+    if (memcmp(ph.magic, HEADER_MAGIC, 16) ||
+        (le32_to_cpu(ph.version) != HEADER_VERSION)) {
+        goto fail;
+    }
+
+    bs->total_sectors = le32_to_cpu(ph.nb_sectors);
+
+    /* the catalog starts right after the header */
+    if (lseek(s->fd, HEADER_SIZE, SEEK_SET) != HEADER_SIZE)
+        goto fail;
+
+    /* tracks is used as a divisor in seek_to_sector(): it must be positive */
+    s->tracks = le32_to_cpu(ph.tracks);
+    if (s->tracks <= 0)
+        goto fail;
+
+    /*
+     * catalog_size is an int and is multiplied by 4 below; refuse entry
+     * counts whose byte size would not fit in a positive 32-bit value.
+     */
+    catalog_entries = le32_to_cpu(ph.catalog_entries);
+    if (catalog_entries > 0x7fffffff / 4)
+        goto fail;
+    s->catalog_size = catalog_entries;
+
+    s->catalog_bitmap = qemu_malloc(s->catalog_size * 4);
+    if (read(s->fd, s->catalog_bitmap, s->catalog_size * 4) !=
+        s->catalog_size * 4)
+        goto fail;
+    for (i = 0; i < s->catalog_size; i++)
+        le32_to_cpus(&s->catalog_bitmap[i]);
+
+    return 0;
+fail:
+    if (s->catalog_bitmap) {
+        qemu_free(s->catalog_bitmap);
+        /* do not leave a dangling pointer in the driver state */
+        s->catalog_bitmap = NULL;
+    }
+    close(fd);
+    return -1;
+}
+
+/*
+ * Position s->fd at the on-disk location of sector_num.
+ *
+ * The sector is mapped through the catalog: entry sector_num / tracks
+ * holds the start sector of the area, sector_num % tracks is the offset
+ * inside it.
+ *
+ * Returns 0 when the file position was set, -1 when the sector is not
+ * allocated (index outside the catalog or entry equal to 0) or the seek
+ * failed.
+ */
+static inline int seek_to_sector(BlockDriverState *bs, int64_t sector_num)
+{
+    BDRVParallelsState *s = bs->opaque;
+    uint32_t index, offset;
+    uint64_t position;
+
+    index = sector_num / s->tracks;
+    offset = sector_num % s->tracks;
+
+    /* not allocated; valid catalog slots are 0 .. catalog_size - 1 */
+    if ((index >= s->catalog_size) || (s->catalog_bitmap[index] == 0))
+        return -1;
+
+    /* widen before scaling so offsets at or beyond 4 GiB are not truncated */
+    position = ((uint64_t)s->catalog_bitmap[index] + offset) * 512;
+
+    if (lseek(s->fd, position, SEEK_SET) != position)
+        return -1;
+
+    return 0;
+}
+
+/*
+ * Read nb_sectors 512-byte sectors starting at sector_num into buf, one
+ * sector at a time.
+ *
+ * Sectors for which seek_to_sector() reports failure are returned as
+ * zeroes. Returns 0 on success, -1 on a short read from the image file.
+ */
+static int parallels_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
+{
+    BDRVParallelsState *s = bs->opaque;
+    int left;
+
+    for (left = nb_sectors; left > 0; left--, sector_num++, buf += 512) {
+        if (seek_to_sector(bs, sector_num) != 0) {
+            memset(buf, 0, 512);
+            continue;
+        }
+        if (read(s->fd, buf, 512) != 512) {
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+/* Free the in-memory catalog and close the image file. */
+static void parallels_close(BlockDriverState *bs)
+{
+    BDRVParallelsState *state = bs->opaque;
+
+    qemu_free(state->catalog_bitmap);
+    close(state->fd);
+}
+
+/*
+ * Driver description for the "parallels" format. Only probe, open, read
+ * and close callbacks are provided; no write callback is set.
+ */
+static BlockDriver bdrv_parallels = {
+    .format_name	= "parallels",
+    .instance_size	= sizeof(BDRVParallelsState),
+    .bdrv_probe		= parallels_probe,
+    .bdrv_open		= parallels_open,
+    .bdrv_read		= parallels_read,
+    .bdrv_close		= parallels_close,
+};
+
+/* Register the parallels driver with the block layer. */
+static void bdrv_parallels_init(void)
+{
+    bdrv_register(&bdrv_parallels);
+}
+
+/* Hook bdrv_parallels_init into the block_init() module mechanism. */
+block_init(bdrv_parallels_init);
diff --git a/block/qcow.c b/block/qcow.c
new file mode 100644
index 0000000..1cf7c3b
--- /dev/null
+++ b/block/qcow.c
@@ -0,0 +1,945 @@
+/*
+ * Block driver for the QCOW format
+ *
+ * Copyright (c) 2004-2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+#include <zlib.h>
+#include "aes.h"
+
+/**************************************************************/
+/* QEMU COW block driver with compression and encryption support */
+
+#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
+#define QCOW_VERSION 1
+
+#define QCOW_CRYPT_NONE 0
+#define QCOW_CRYPT_AES  1
+
+#define QCOW_OFLAG_COMPRESSED (1LL << 63)
+
+/*
+ * On-disk image header. The multi-byte fields are stored big-endian and
+ * are byte-swapped with be32_to_cpus()/be64_to_cpus() in qcow_open().
+ */
+typedef struct QCowHeader {
+    uint32_t magic;               /* must equal QCOW_MAGIC */
+    uint32_t version;             /* must equal QCOW_VERSION */
+    uint64_t backing_file_offset; /* file offset of the backing file name, 0 if none */
+    uint32_t backing_file_size;   /* length of the backing file name in bytes */
+    uint32_t mtime;
+    uint64_t size; /* virtual disk size in bytes */
+    uint8_t cluster_bits;         /* cluster size is 1 << cluster_bits bytes */
+    uint8_t l2_bits;              /* an L2 table holds 1 << l2_bits entries */
+    uint32_t crypt_method;        /* QCOW_CRYPT_NONE or QCOW_CRYPT_AES */
+    uint64_t l1_table_offset;     /* file offset of the L1 table */
+} QCowHeader;
+
+#define L2_CACHE_SIZE 16
+
+/* Per-image state of an open QCOW image. */
+typedef struct BDRVQcowState {
+    BlockDriverState *hd;          /* underlying image file */
+    int cluster_bits;              /* from the header */
+    int cluster_size;              /* 1 << cluster_bits, in bytes */
+    int cluster_sectors;           /* cluster size in 512-byte sectors */
+    int l2_bits;                   /* from the header */
+    int l2_size;                   /* 1 << l2_bits, entries per L2 table */
+    int l1_size;                   /* number of L1 entries */
+    uint64_t cluster_offset_mask;  /* extracts the file offset of a compressed cluster entry */
+    uint64_t l1_table_offset;      /* file offset of the L1 table */
+    uint64_t *l1_table;            /* L1 table, in host byte order */
+    uint64_t *l2_cache;            /* L2_CACHE_SIZE cached L2 tables (entries kept big-endian) */
+    uint64_t l2_cache_offsets[L2_CACHE_SIZE]; /* file offset of each cached L2 table */
+    uint32_t l2_cache_counts[L2_CACHE_SIZE];  /* hit counters used to pick the slot to replace */
+    uint8_t *cluster_cache;        /* last decompressed cluster */
+    uint8_t *cluster_data;         /* scratch buffer of cluster_size bytes */
+    uint64_t cluster_cache_offset; /* file offset of the cluster in cluster_cache, -1 if none */
+    uint32_t crypt_method; /* current crypt method, 0 if no key yet */
+    uint32_t crypt_method_header;  /* crypt method read from the header */
+    AES_KEY aes_encrypt_key;
+    AES_KEY aes_decrypt_key;
+} BDRVQcowState;
+
+static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset);
+
+/*
+ * Probe for a QCOW version 1 image: the buffer must be large enough for a
+ * QCowHeader and carry the expected big-endian magic and version.
+ *
+ * Returns 100 on a match, 0 otherwise.
+ */
+static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    const QCowHeader *cow_header = (const void *)buf;
+
+    if (buf_size < sizeof(QCowHeader)) {
+        return 0;
+    }
+    if (be32_to_cpu(cow_header->magic) != QCOW_MAGIC) {
+        return 0;
+    }
+    if (be32_to_cpu(cow_header->version) != QCOW_VERSION) {
+        return 0;
+    }
+
+    return 100;
+}
+
+/*
+ * Open a QCOW image.
+ *
+ * Reads and validates the header, derives the cluster/L2 geometry, loads
+ * the L1 table (converted to host byte order), allocates the L2 cache and
+ * the cluster buffers, and reads the backing file name if there is one.
+ *
+ * The header comes from the image file and is treated as untrusted: the
+ * shift counts and sizes taken from it are bounded before they are used.
+ *
+ * Returns 0 on success, the bdrv_file_open() error if the file cannot be
+ * opened, and -1 on any other failure.
+ */
+static int qcow_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVQcowState *s = bs->opaque;
+    int len, i, shift, ret;
+    uint64_t l1_entries;
+    QCowHeader header;
+
+    ret = bdrv_file_open(&s->hd, filename, flags);
+    if (ret < 0)
+        return ret;
+    if (bdrv_pread(s->hd, 0, &header, sizeof(header)) != sizeof(header))
+        goto fail;
+    be32_to_cpus(&header.magic);
+    be32_to_cpus(&header.version);
+    be64_to_cpus(&header.backing_file_offset);
+    be32_to_cpus(&header.backing_file_size);
+    be32_to_cpus(&header.mtime);
+    be64_to_cpus(&header.size);
+    be32_to_cpus(&header.crypt_method);
+    be64_to_cpus(&header.l1_table_offset);
+
+    if (header.magic != QCOW_MAGIC || header.version != QCOW_VERSION)
+        goto fail;
+    if (header.size <= 1 || header.cluster_bits < 9)
+        goto fail;
+    /*
+     * Bound the shift counts: clusters of at most 64 KiB and L2 tables of
+     * at most 64 KiB (8-byte entries) keep "1 << bits", the L2 cache size
+     * and the combined shift below well inside the range of an int.
+     */
+    if (header.cluster_bits > 16 || header.l2_bits > 16 - 3)
+        goto fail;
+    if (header.crypt_method > QCOW_CRYPT_AES)
+        goto fail;
+    s->crypt_method_header = header.crypt_method;
+    if (s->crypt_method_header)
+        bs->encrypted = 1;
+    s->cluster_bits = header.cluster_bits;
+    s->cluster_size = 1 << s->cluster_bits;
+    s->cluster_sectors = 1 << (s->cluster_bits - 9);
+    s->l2_bits = header.l2_bits;
+    s->l2_size = 1 << s->l2_bits;
+    bs->total_sectors = header.size / 512;
+    s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1;
+
+    /* read the level 1 table */
+    shift = s->cluster_bits + s->l2_bits;
+    /* round up without adding to header.size, which could wrap around */
+    l1_entries = header.size >> shift;
+    if (header.size & ((1ULL << shift) - 1))
+        l1_entries++;
+    /* l1_size is an int and the table size in bytes must be representable */
+    if (l1_entries > 0x7fffffff / sizeof(uint64_t))
+        goto fail;
+    s->l1_size = l1_entries;
+
+    s->l1_table_offset = header.l1_table_offset;
+    s->l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
+    if (!s->l1_table)
+        goto fail;
+    if (bdrv_pread(s->hd, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)) !=
+        s->l1_size * sizeof(uint64_t))
+        goto fail;
+    for(i = 0;i < s->l1_size; i++) {
+        be64_to_cpus(&s->l1_table[i]);
+    }
+    /* alloc L2 cache */
+    s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
+    if (!s->l2_cache)
+        goto fail;
+    s->cluster_cache = qemu_malloc(s->cluster_size);
+    if (!s->cluster_cache)
+        goto fail;
+    s->cluster_data = qemu_malloc(s->cluster_size);
+    if (!s->cluster_data)
+        goto fail;
+    s->cluster_cache_offset = -1;
+
+    /* read the backing file name */
+    if (header.backing_file_offset != 0) {
+        /*
+         * Clamp on the unsigned header value: converting a huge size to
+         * int first would yield a negative length that escapes the clamp.
+         */
+        if (header.backing_file_size > 1023)
+            len = 1023;
+        else
+            len = header.backing_file_size;
+        if (bdrv_pread(s->hd, header.backing_file_offset, bs->backing_file, len) != len)
+            goto fail;
+        bs->backing_file[len] = '\0';
+    }
+    return 0;
+
+ fail:
+    qemu_free(s->l1_table);
+    qemu_free(s->l2_cache);
+    qemu_free(s->cluster_cache);
+    qemu_free(s->cluster_data);
+    bdrv_delete(s->hd);
+    return -1;
+}
+
+/*
+ * Install the encryption key for the image.
+ *
+ * The first (at most) 16 bytes of the key string form a zero-padded
+ * 128-bit AES key, from which the encrypt and decrypt schedules are set up.
+ * The active crypt method is switched to the one announced in the header
+ * before the schedules are computed.
+ *
+ * Returns 0 on success, -1 if one of the AES key setups fails.
+ */
+static int qcow_set_key(BlockDriverState *bs, const char *key)
+{
+    BDRVQcowState *s = bs->opaque;
+    uint8_t keybuf[16] = { 0 };
+    int len = strlen(key);
+
+    if (len > 16) {
+        len = 16;
+    }
+    /* XXX: we could compress the chars to 7 bits to increase
+       entropy */
+    if (len > 0) {
+        memcpy(keybuf, key, len);
+    }
+
+    s->crypt_method = s->crypt_method_header;
+
+    if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0) {
+        return -1;
+    }
+    if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0) {
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Encrypt (enc != 0) or decrypt (enc == 0) nb_sectors 512-byte sectors
+ * from in_buf into out_buf with AES-CBC. Each sector gets its own IV: the
+ * little-endian sector number in the first 8 bytes, zero in the rest.
+ *
+ * The crypt function is compatible with the linux cryptoloop algorithm
+ * for < 4 GB images. NOTE: out_buf == in_buf is supported.
+ */
+static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
+                            uint8_t *out_buf, const uint8_t *in_buf,
+                            int nb_sectors, int enc,
+                            const AES_KEY *key)
+{
+    union {
+        uint64_t ll[2];
+        uint8_t b[16];
+    } ivec;
+    int idx;
+
+    for (idx = 0; idx < nb_sectors; idx++) {
+        const uint8_t *src = in_buf + (size_t)idx * 512;
+        uint8_t *dst = out_buf + (size_t)idx * 512;
+
+        /* the IV is rebuilt for every sector */
+        ivec.ll[0] = cpu_to_le64(sector_num + idx);
+        ivec.ll[1] = 0;
+        AES_cbc_encrypt(src, dst, 512, key, ivec.b, enc);
+    }
+}
+
+/*
+ * Look up, and optionally allocate, the cluster that holds the guest byte
+ * offset 'offset'.
+ *
+ * 'allocate' is:
+ *
+ * 0 to not allocate.
+ *
+ * 1 to allocate a normal cluster (for sector indexes 'n_start' to
+ * 'n_end')
+ *
+ * 2 to allocate a compressed cluster of size
+ * 'compressed_size'. 'compressed_size' must be > 0 and <
+ * cluster_size
+ *
+ * The L2 table is taken from the L2 cache, loading (or creating) it in the
+ * least used cache slot when needed.
+ *
+ * return the L2 entry of the cluster (file offset, possibly carrying
+ * QCOW_OFLAG_COMPRESSED and the compressed size), or 0 if the cluster is
+ * not allocated or an error occurred. 0 is the only failure value: the
+ * return type is unsigned and callers test for !cluster_offset.
+ */
+static uint64_t get_cluster_offset(BlockDriverState *bs,
+                                   uint64_t offset, int allocate,
+                                   int compressed_size,
+                                   int n_start, int n_end)
+{
+    BDRVQcowState *s = bs->opaque;
+    int min_index, i, j, l1_index, l2_index;
+    uint64_t l2_offset, *l2_table, cluster_offset, tmp;
+    uint32_t min_count;
+    int new_l2_table;
+
+    l1_index = offset >> (s->l2_bits + s->cluster_bits);
+    l2_offset = s->l1_table[l1_index];
+    new_l2_table = 0;
+    if (!l2_offset) {
+        if (!allocate)
+            return 0;
+        /* allocate a new l2 entry */
+        l2_offset = bdrv_getlength(s->hd);
+        /* round to cluster size */
+        l2_offset = (l2_offset + s->cluster_size - 1) & ~(s->cluster_size - 1);
+        /* update the L1 entry */
+        s->l1_table[l1_index] = l2_offset;
+        tmp = cpu_to_be64(l2_offset);
+        if (bdrv_pwrite(s->hd, s->l1_table_offset + l1_index * sizeof(tmp),
+                        &tmp, sizeof(tmp)) != sizeof(tmp))
+            return 0;
+        new_l2_table = 1;
+    }
+    for(i = 0; i < L2_CACHE_SIZE; i++) {
+        if (l2_offset == s->l2_cache_offsets[i]) {
+            /* increment the hit count; halve all counters before one wraps */
+            if (++s->l2_cache_counts[i] == 0xffffffff) {
+                for(j = 0; j < L2_CACHE_SIZE; j++) {
+                    s->l2_cache_counts[j] >>= 1;
+                }
+            }
+            l2_table = s->l2_cache + (i << s->l2_bits);
+            goto found;
+        }
+    }
+    /* not found: load a new entry in the least used one */
+    min_index = 0;
+    min_count = 0xffffffff;
+    for(i = 0; i < L2_CACHE_SIZE; i++) {
+        if (s->l2_cache_counts[i] < min_count) {
+            min_count = s->l2_cache_counts[i];
+            min_index = i;
+        }
+    }
+    l2_table = s->l2_cache + (min_index << s->l2_bits);
+    if (new_l2_table) {
+        /* a fresh L2 table is all zeroes and is written out immediately */
+        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
+        if (bdrv_pwrite(s->hd, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) !=
+            s->l2_size * sizeof(uint64_t))
+            return 0;
+    } else {
+        if (bdrv_pread(s->hd, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) !=
+            s->l2_size * sizeof(uint64_t))
+            return 0;
+    }
+    s->l2_cache_offsets[min_index] = l2_offset;
+    s->l2_cache_counts[min_index] = 1;
+ found:
+    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
+    cluster_offset = be64_to_cpu(l2_table[l2_index]);
+    if (!cluster_offset ||
+        ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1)) {
+        if (!allocate)
+            return 0;
+        /* allocate a new cluster */
+        if ((cluster_offset & QCOW_OFLAG_COMPRESSED) &&
+            (n_end - n_start) < s->cluster_sectors) {
+            /* if the cluster is already compressed, we must
+               decompress it in the case it is not completely
+               overwritten */
+            if (decompress_cluster(s, cluster_offset) < 0)
+                return 0;
+            cluster_offset = bdrv_getlength(s->hd);
+            cluster_offset = (cluster_offset + s->cluster_size - 1) &
+                ~(s->cluster_size - 1);
+            /* write the cluster content */
+            if (bdrv_pwrite(s->hd, cluster_offset, s->cluster_cache, s->cluster_size) !=
+                s->cluster_size)
+                return 0; /* failure is signalled with 0, not -1 */
+        } else {
+            cluster_offset = bdrv_getlength(s->hd);
+            if (allocate == 1) {
+                /* round to cluster size */
+                cluster_offset = (cluster_offset + s->cluster_size - 1) &
+                    ~(s->cluster_size - 1);
+                bdrv_truncate(s->hd, cluster_offset + s->cluster_size);
+                /* if encrypted, we must initialize the cluster
+                   content which won't be written */
+                if (s->crypt_method &&
+                    (n_end - n_start) < s->cluster_sectors) {
+                    uint64_t start_sect;
+                    start_sect = (offset & ~(s->cluster_size - 1)) >> 9;
+                    /* second half of cluster_data holds a zero sector to encrypt */
+                    memset(s->cluster_data + 512, 0x00, 512);
+                    for(i = 0; i < s->cluster_sectors; i++) {
+                        if (i < n_start || i >= n_end) {
+                            encrypt_sectors(s, start_sect + i,
+                                            s->cluster_data,
+                                            s->cluster_data + 512, 1, 1,
+                                            &s->aes_encrypt_key);
+                            if (bdrv_pwrite(s->hd, cluster_offset + i * 512,
+                                            s->cluster_data, 512) != 512)
+                                return 0; /* failure is signalled with 0, not -1 */
+                        }
+                    }
+                }
+            } else if (allocate == 2) {
+                cluster_offset |= QCOW_OFLAG_COMPRESSED |
+                    (uint64_t)compressed_size << (63 - s->cluster_bits);
+            }
+        }
+        /* update L2 table */
+        tmp = cpu_to_be64(cluster_offset);
+        l2_table[l2_index] = tmp;
+        if (bdrv_pwrite(s->hd,
+                        l2_offset + l2_index * sizeof(tmp), &tmp, sizeof(tmp)) != sizeof(tmp))
+            return 0;
+    }
+    return cluster_offset;
+}
+
+/*
+ * Report whether the cluster containing sector_num is allocated.
+ *
+ * *pnum receives the number of sectors, starting at sector_num and capped
+ * at nb_sectors, that lie in the same cluster.
+ *
+ * Returns 1 if the cluster has a non-zero L2 entry, 0 otherwise.
+ */
+static int qcow_is_allocated(BlockDriverState *bs, int64_t sector_num,
+                             int nb_sectors, int *pnum)
+{
+    BDRVQcowState *s = bs->opaque;
+    uint64_t cluster_offset;
+    int first, span;
+
+    cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0);
+
+    /* sectors left in this cluster from sector_num onwards */
+    first = sector_num & (s->cluster_sectors - 1);
+    span = s->cluster_sectors - first;
+
+    *pnum = (span > nb_sectors) ? nb_sectors : span;
+    return (cluster_offset != 0);
+}
+
+/*
+ * Inflate buf (buf_size bytes of raw deflate data, window bits -12) into
+ * out_buf.
+ *
+ * Succeeds only when exactly out_buf_size bytes are produced and inflate()
+ * ends with Z_STREAM_END or Z_BUF_ERROR. Returns 0 on success, -1 on
+ * failure.
+ */
+static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
+                             const uint8_t *buf, int buf_size)
+{
+    z_stream strm;
+    int ret, out_len;
+
+    memset(&strm, 0, sizeof(strm));
+    strm.next_in = (uint8_t *)buf;
+    strm.avail_in = buf_size;
+    strm.next_out = out_buf;
+    strm.avail_out = out_buf_size;
+
+    if (inflateInit2(&strm, -12) != Z_OK) {
+        return -1;
+    }
+
+    ret = inflate(&strm, Z_FINISH);
+    out_len = strm.next_out - out_buf;
+    /* the stream is released on every path once it was initialised */
+    inflateEnd(&strm);
+
+    if (ret != Z_STREAM_END && ret != Z_BUF_ERROR) {
+        return -1;
+    }
+    return (out_len == out_buf_size) ? 0 : -1;
+}
+
+/*
+ * Make s->cluster_cache hold the decompressed content of the compressed
+ * cluster described by the L2 entry cluster_offset.
+ *
+ * The entry encodes the file offset (cluster_offset_mask) and, in the bits
+ * above it, the compressed size. Nothing is done when the cache already
+ * holds that cluster.
+ *
+ * Returns 0 on success, -1 on a short read or decompression failure.
+ */
+static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset)
+{
+    uint64_t coffset = cluster_offset & s->cluster_offset_mask;
+    int csize;
+
+    if (s->cluster_cache_offset == coffset) {
+        /* already cached */
+        return 0;
+    }
+
+    csize = (cluster_offset >> (63 - s->cluster_bits)) & (s->cluster_size - 1);
+
+    if (bdrv_pread(s->hd, coffset, s->cluster_data, csize) != csize) {
+        return -1;
+    }
+    if (decompress_buffer(s->cluster_cache, s->cluster_size,
+                          s->cluster_data, csize) < 0) {
+        return -1;
+    }
+
+    s->cluster_cache_offset = coffset;
+    return 0;
+}
+
+/*
+ * Synchronous read path, compiled out by the "#if 0" below.
+ *
+ * It reads cluster by cluster: unallocated clusters come from the backing
+ * image (or are zero-filled), compressed clusters are served from the
+ * decompression cache, and plain clusters are read from the image file and
+ * decrypted when a crypt method is active.
+ */
+#if 0
+
+static int qcow_read(BlockDriverState *bs, int64_t sector_num,
+                     uint8_t *buf, int nb_sectors)
+{
+    BDRVQcowState *s = bs->opaque;
+    int ret, index_in_cluster, n;
+    uint64_t cluster_offset;
+
+    while (nb_sectors > 0) {
+        cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0);
+        index_in_cluster = sector_num & (s->cluster_sectors - 1);
+        n = s->cluster_sectors - index_in_cluster;
+        if (n > nb_sectors)
+            n = nb_sectors;
+        if (!cluster_offset) {
+            if (bs->backing_hd) {
+                /* read from the base image */
+                ret = bdrv_read(bs->backing_hd, sector_num, buf, n);
+                if (ret < 0)
+                    return -1;
+            } else {
+                memset(buf, 0, 512 * n);
+            }
+        } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
+            if (decompress_cluster(s, cluster_offset) < 0)
+                return -1;
+            memcpy(buf, s->cluster_cache + index_in_cluster * 512, 512 * n);
+        } else {
+            ret = bdrv_pread(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512);
+            if (ret != n * 512)
+                return -1;
+            if (s->crypt_method) {
+                encrypt_sectors(s, sector_num, buf, buf, n, 0,
+                                &s->aes_decrypt_key);
+            }
+        }
+        nb_sectors -= n;
+        sector_num += n;
+        buf += n * 512;
+    }
+    return 0;
+}
+#endif
+
+/*
+ * Synchronously write nb_sectors 512-byte sectors starting at sector_num.
+ *
+ * The request is split at cluster boundaries. For each piece a normal
+ * cluster is looked up or allocated; the data is written directly, or via
+ * s->cluster_data after encryption when a crypt method is active.
+ *
+ * Returns 0 on success, -1 if no cluster could be obtained or a write came
+ * up short. On success the compressed-cluster cache is invalidated.
+ */
+static int qcow_write(BlockDriverState *bs, int64_t sector_num,
+                     const uint8_t *buf, int nb_sectors)
+{
+    BDRVQcowState *s = bs->opaque;
+
+    while (nb_sectors > 0) {
+        const int first = sector_num & (s->cluster_sectors - 1);
+        int count = s->cluster_sectors - first;
+        const uint8_t *payload = buf;
+        uint64_t cluster_offset;
+
+        if (count > nb_sectors) {
+            count = nb_sectors;
+        }
+
+        cluster_offset = get_cluster_offset(bs, sector_num << 9, 1, 0,
+                                            first, first + count);
+        if (!cluster_offset) {
+            return -1;
+        }
+
+        if (s->crypt_method) {
+            /* encrypt into the scratch buffer and write that instead */
+            encrypt_sectors(s, sector_num, s->cluster_data, buf, count, 1,
+                            &s->aes_encrypt_key);
+            payload = s->cluster_data;
+        }
+
+        if (bdrv_pwrite(s->hd, cluster_offset + first * 512,
+                        payload, count * 512) != count * 512) {
+            return -1;
+        }
+
+        nb_sectors -= count;
+        sector_num += count;
+        buf += count * 512;
+    }
+
+    s->cluster_cache_offset = -1; /* disable compressed cache */
+    return 0;
+}
+
+/* State of one asynchronous QCOW request, advanced cluster by cluster. */
+typedef struct QCowAIOCB {
+    BlockDriverAIOCB common;     /* generic part: bs, completion callback, opaque */
+    int64_t sector_num;          /* first sector of the step being processed */
+    QEMUIOVector *qiov;          /* caller's I/O vector */
+    uint8_t *buf;                /* current position in the linear data buffer */
+    void *orig_buf;              /* bounce buffer, used when qiov has more than one element */
+    int nb_sectors;              /* sectors still to be processed */
+    int n;                       /* sectors covered by the current step */
+    uint64_t cluster_offset;     /* L2 entry of the cluster of the current step */
+    uint8_t *cluster_data;       /* scratch buffer used by the write path when encrypting */
+    struct iovec hd_iov;         /* single-element vector for the underlying request */
+    QEMUIOVector hd_qiov;
+    BlockDriverAIOCB *hd_aiocb;  /* request in flight on the underlying device, or NULL */
+} QCowAIOCB;
+
+/*
+ * Completion callback and state machine of an asynchronous read.
+ *
+ * Called once directly from qcow_aio_readv() (with acb->n == 0) and then
+ * each time an underlying request completes. Every pass post-processes the
+ * step that just finished (decryption of plain clusters), advances the
+ * request, and either completes it or starts the next step:
+ *   - unallocated cluster: read from the backing image, or zero-fill,
+ *   - compressed cluster: copy from the decompression cache,
+ *   - plain cluster: read from the image file.
+ * Steps that need no I/O loop back through 'redo' without waiting.
+ *
+ * At 'done' the caller's callback is invoked with ret: 0 on success, a
+ * negative value on failure. A step that cannot be started or decompressed
+ * is reported as -EIO rather than with the stale success value.
+ */
+static void qcow_aio_read_cb(void *opaque, int ret)
+{
+    QCowAIOCB *acb = opaque;
+    BlockDriverState *bs = acb->common.bs;
+    BDRVQcowState *s = bs->opaque;
+    int index_in_cluster;
+
+    acb->hd_aiocb = NULL;
+    if (ret < 0)
+        goto done;
+
+ redo:
+    /* post process the read buffer */
+    if (!acb->cluster_offset) {
+        /* nothing to do */
+    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
+        /* nothing to do */
+    } else {
+        if (s->crypt_method) {
+            encrypt_sectors(s, acb->sector_num, acb->buf, acb->buf,
+                            acb->n, 0,
+                            &s->aes_decrypt_key);
+        }
+    }
+
+    acb->nb_sectors -= acb->n;
+    acb->sector_num += acb->n;
+    acb->buf += acb->n * 512;
+
+    if (acb->nb_sectors == 0) {
+        /* request completed */
+        ret = 0;
+        goto done;
+    }
+
+    /* prepare next AIO request */
+    acb->cluster_offset = get_cluster_offset(bs, acb->sector_num << 9,
+                                             0, 0, 0, 0);
+    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
+    acb->n = s->cluster_sectors - index_in_cluster;
+    if (acb->n > acb->nb_sectors)
+        acb->n = acb->nb_sectors;
+
+    if (!acb->cluster_offset) {
+        if (bs->backing_hd) {
+            /* read from the base image */
+            acb->hd_iov.iov_base = (void *)acb->buf;
+            acb->hd_iov.iov_len = acb->n * 512;
+            qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
+            acb->hd_aiocb = bdrv_aio_readv(bs->backing_hd, acb->sector_num,
+                &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb);
+            if (acb->hd_aiocb == NULL) {
+                /* the request could not be submitted */
+                ret = -EIO;
+                goto done;
+            }
+        } else {
+            /* Note: in this case, no need to wait */
+            memset(acb->buf, 0, 512 * acb->n);
+            goto redo;
+        }
+    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
+        /* add AIO support for compressed blocks ? */
+        if (decompress_cluster(s, acb->cluster_offset) < 0) {
+            ret = -EIO;
+            goto done;
+        }
+        memcpy(acb->buf,
+               s->cluster_cache + index_in_cluster * 512, 512 * acb->n);
+        goto redo;
+    } else {
+        /* plain clusters must start on a sector boundary */
+        if ((acb->cluster_offset & 511) != 0) {
+            ret = -EIO;
+            goto done;
+        }
+        acb->hd_iov.iov_base = (void *)acb->buf;
+        acb->hd_iov.iov_len = acb->n * 512;
+        qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
+        acb->hd_aiocb = bdrv_aio_readv(s->hd,
+                            (acb->cluster_offset >> 9) + index_in_cluster,
+                            &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb);
+        if (acb->hd_aiocb == NULL) {
+            /* the request could not be submitted */
+            ret = -EIO;
+            goto done;
+        }
+    }
+
+    return;
+
+done:
+    /* copy the bounce buffer back into the caller's vector and free it */
+    if (acb->qiov->niov > 1) {
+        qemu_iovec_from_buffer(acb->qiov, acb->orig_buf, acb->qiov->size);
+        qemu_vfree(acb->orig_buf);
+    }
+    acb->common.cb(acb->common.opaque, ret);
+    qemu_aio_release(acb);
+}
+
+/*
+ * Start an asynchronous vectored read of nb_sectors sectors at sector_num.
+ *
+ * A qiov with more than one element is serviced through a temporary buffer
+ * obtained from qemu_blockalign() (kept in orig_buf); a single-element qiov
+ * is read into directly.  The request is driven by qcow_aio_read_cb(),
+ * which is entered once from here to issue the first chunk.
+ *
+ * Returns the request handle, or NULL if no AIOCB could be obtained.
+ */
+static BlockDriverAIOCB *qcow_aio_readv(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    QCowAIOCB *req = qemu_aio_get(bs, cb, opaque);
+
+    if (req == NULL) {
+        return NULL;
+    }
+
+    req->hd_aiocb = NULL;
+    req->sector_num = sector_num;
+    req->qiov = qiov;
+
+    if (qiov->niov > 1) {
+        /* scattered destination: read into one linear buffer first */
+        req->orig_buf = qemu_blockalign(bs, qiov->size);
+        req->buf = req->orig_buf;
+    } else {
+        req->buf = (uint8_t *)qiov->iov->iov_base;
+    }
+
+    req->nb_sectors = nb_sectors;
+    /* nothing transferred yet, so the callback's first advance is a no-op */
+    req->n = 0;
+    req->cluster_offset = 0;
+
+    qcow_aio_read_cb(req, 0);
+    return &req->common;
+}
+
+/*
+ * Completion callback and state machine for an asynchronous write.
+ *
+ * Each invocation accounts for the acb->n sectors written by the previous
+ * step (0 on the initial call from qcow_aio_writev()), then submits the
+ * next chunk, which never crosses a cluster boundary.  When encryption is
+ * active the chunk is encrypted into acb->cluster_data before submission.
+ *
+ * opaque: the QCowAIOCB describing the request.
+ * ret:    result of the previous step; a negative value aborts the request.
+ *
+ * On completion or error the caller's callback is invoked with the final
+ * status and the AIOCB is released.
+ */
+static void qcow_aio_write_cb(void *opaque, int ret)
+{
+    QCowAIOCB *acb = opaque;
+    BlockDriverState *bs = acb->common.bs;
+    BDRVQcowState *s = bs->opaque;
+    int index_in_cluster;
+    uint64_t cluster_offset;
+    const uint8_t *src_buf;
+
+    acb->hd_aiocb = NULL;
+
+    if (ret < 0)
+        goto done;
+
+    /* advance past the chunk that has just been written */
+    acb->nb_sectors -= acb->n;
+    acb->sector_num += acb->n;
+    acb->buf += acb->n * 512;
+
+    if (acb->nb_sectors == 0) {
+        /* request completed */
+        ret = 0;
+        goto done;
+    }
+
+    /* limit the next chunk to the remainder of the current cluster */
+    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
+    acb->n = s->cluster_sectors - index_in_cluster;
+    if (acb->n > acb->nb_sectors)
+        acb->n = acb->nb_sectors;
+    cluster_offset = get_cluster_offset(bs, acb->sector_num << 9, 1, 0,
+                                        index_in_cluster,
+                                        index_in_cluster + acb->n);
+    /* 0 means no cluster could be provided; data clusters are 512-aligned */
+    if (!cluster_offset || (cluster_offset & 511) != 0) {
+        ret = -EIO;
+        goto done;
+    }
+    if (s->crypt_method) {
+        if (!acb->cluster_data) {
+            acb->cluster_data = qemu_mallocz(s->cluster_size);
+            if (!acb->cluster_data) {
+                ret = -ENOMEM;
+                goto done;
+            }
+        }
+        encrypt_sectors(s, acb->sector_num, acb->cluster_data, acb->buf,
+                        acb->n, 1, &s->aes_encrypt_key);
+        src_buf = acb->cluster_data;
+    } else {
+        src_buf = acb->buf;
+    }
+
+    acb->hd_iov.iov_base = (void *)src_buf;
+    acb->hd_iov.iov_len = acb->n * 512;
+    qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
+    acb->hd_aiocb = bdrv_aio_writev(s->hd,
+                                    (cluster_offset >> 9) + index_in_cluster,
+                                    &acb->hd_qiov, acb->n,
+                                    qcow_aio_write_cb, acb);
+    if (acb->hd_aiocb == NULL) {
+        /* The write could not be submitted.  ret is still 0 here, so it
+           must be set explicitly or the failure is reported as success. */
+        ret = -EIO;
+        goto done;
+    }
+    return;
+
+done:
+    if (acb->qiov->niov > 1)
+        qemu_vfree(acb->orig_buf);
+    acb->common.cb(acb->common.opaque, ret);
+    qemu_aio_release(acb);
+}
+
+/*
+ * Start an asynchronous vectored write of nb_sectors sectors at sector_num.
+ *
+ * A qiov with more than one element is first gathered into a temporary
+ * buffer obtained from qemu_blockalign() (kept in orig_buf); a
+ * single-element qiov is written from directly.  The request is driven by
+ * qcow_aio_write_cb(), which is entered once from here.
+ *
+ * Returns the request handle, or NULL if no AIOCB could be obtained.
+ */
+static BlockDriverAIOCB *qcow_aio_writev(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BDRVQcowState *s = bs->opaque;
+    QCowAIOCB *req;
+
+    /* disable compressed cache */
+    s->cluster_cache_offset = -1;
+
+    req = qemu_aio_get(bs, cb, opaque);
+    if (req == NULL) {
+        return NULL;
+    }
+
+    req->hd_aiocb = NULL;
+    req->sector_num = sector_num;
+    req->qiov = qiov;
+
+    if (qiov->niov <= 1) {
+        req->buf = (uint8_t *)qiov->iov->iov_base;
+    } else {
+        /* scattered source: linearise it before writing */
+        req->orig_buf = qemu_blockalign(bs, qiov->size);
+        req->buf = req->orig_buf;
+        qemu_iovec_to_buffer(qiov, req->buf);
+    }
+
+    req->nb_sectors = nb_sectors;
+    /* nothing transferred yet, so the callback's first advance is a no-op */
+    req->n = 0;
+
+    qcow_aio_write_cb(req, 0);
+    return &req->common;
+}
+
+/*
+ * Cancel a qcow AIO request: cancel the request currently outstanding on
+ * the underlying file, if there is one, then release the AIOCB.
+ */
+static void qcow_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+    QCowAIOCB *req = (QCowAIOCB *)blockacb;
+
+    if (req->hd_aiocb != NULL) {
+        bdrv_aio_cancel(req->hd_aiocb);
+    }
+
+    qemu_aio_release(req);
+}
+
+/*
+ * Close a qcow image: free the driver's tables and buffers, then delete
+ * the underlying file's block driver state.
+ */
+static void qcow_close(BlockDriverState *bs)
+{
+    BDRVQcowState *state = bs->opaque;
+
+    /* lookup tables */
+    qemu_free(state->l1_table);
+    qemu_free(state->l2_cache);
+
+    /* data buffers */
+    qemu_free(state->cluster_cache);
+    qemu_free(state->cluster_data);
+
+    bdrv_delete(state->hd);
+}
+
+/*
+ * Create a new, empty qcow image file.
+ *
+ * filename:     path of the image to create (truncated if it exists).
+ * total_size:   virtual disk size in 512-byte sectors.
+ * backing_file: optional backing file name, or NULL.  The special name
+ *               "fat:" selects the backing-file geometry but is not
+ *               recorded in the header.
+ * flags:        BLOCK_FLAG_ENCRYPT marks the image as AES encrypted.
+ *
+ * The file consists of the header, the backing file name (if any) and a
+ * zero-filled L1 table starting at the next 8-byte boundary.
+ *
+ * Returns 0 on success, -1 if the file cannot be created or if any part
+ * of it cannot be written completely.
+ */
+static int qcow_create(const char *filename, int64_t total_size,
+                      const char *backing_file, int flags)
+{
+    int fd, header_size, backing_filename_len, l1_size, i, shift;
+    int ret = -1;
+    QCowHeader header;
+    uint64_t tmp;
+
+    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
+    if (fd < 0)
+        return -1;
+    memset(&header, 0, sizeof(header));
+    header.magic = cpu_to_be32(QCOW_MAGIC);
+    header.version = cpu_to_be32(QCOW_VERSION);
+    header.size = cpu_to_be64(total_size * 512);
+    header_size = sizeof(header);
+    backing_filename_len = 0;
+    if (backing_file) {
+        if (strcmp(backing_file, "fat:")) {
+            /* the name is stored right after the header */
+            header.backing_file_offset = cpu_to_be64(header_size);
+            backing_filename_len = strlen(backing_file);
+            header.backing_file_size = cpu_to_be32(backing_filename_len);
+            header_size += backing_filename_len;
+        } else {
+            /* special backing file for vvfat */
+            backing_file = NULL;
+        }
+        header.cluster_bits = 9; /* 512 byte cluster to avoid copying
+                                    unmodified sectors */
+        header.l2_bits = 12; /* 32 KB L2 tables */
+    } else {
+        header.cluster_bits = 12; /* 4 KB clusters */
+        header.l2_bits = 9; /* 4 KB L2 tables */
+    }
+    /* the L1 table starts at the next 8-byte boundary */
+    header_size = (header_size + 7) & ~7;
+    shift = header.cluster_bits + header.l2_bits;
+    /* one L1 entry per (1 << shift) bytes of virtual disk, rounded up */
+    l1_size = ((total_size * 512) + (1LL << shift) - 1) >> shift;
+
+    header.l1_table_offset = cpu_to_be64(header_size);
+    if (flags & BLOCK_FLAG_ENCRYPT) {
+        header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
+    } else {
+        header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
+    }
+
+    /* write all the data; a failed or short write aborts the creation so
+       that a truncated image is never reported as success */
+    if (write(fd, &header, sizeof(header)) != (ssize_t)sizeof(header))
+        goto out;
+    if (backing_file) {
+        if (write(fd, backing_file, backing_filename_len) !=
+            backing_filename_len)
+            goto out;
+    }
+    if (lseek(fd, header_size, SEEK_SET) < 0)
+        goto out;
+    tmp = 0;
+    for(i = 0;i < l1_size; i++) {
+        if (write(fd, &tmp, sizeof(tmp)) != (ssize_t)sizeof(tmp))
+            goto out;
+    }
+    ret = 0;
+
+out:
+    /* close() can report a deferred write error */
+    if (close(fd) != 0)
+        ret = -1;
+    return ret;
+}
+
+/*
+ * Discard all data stored in the image: zero the L1 table in memory and
+ * on disk, truncate the file right after the L1 table and clear the L2
+ * cache.
+ *
+ * Returns 0 on success, -1 if the L1 table cannot be written, or the
+ * negative result of bdrv_truncate().
+ */
+static int qcow_make_empty(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+    uint32_t l1_bytes = s->l1_size * sizeof(uint64_t);
+    int rc;
+
+    /* an all-zero L1 table references no L2 table at all */
+    memset(s->l1_table, 0, l1_bytes);
+    if (bdrv_pwrite(s->hd, s->l1_table_offset, s->l1_table, l1_bytes) < 0) {
+        return -1;
+    }
+
+    /* everything past the L1 table is now unreferenced */
+    rc = bdrv_truncate(s->hd, s->l1_table_offset + l1_bytes);
+    if (rc < 0) {
+        return rc;
+    }
+
+    /* forget every cached L2 table */
+    memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
+    memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
+    memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
+
+    return 0;
+}
+
+/* XXX: put compressed sectors first, then all the cluster aligned
+   tables to avoid losing bytes in alignment */
+/*
+ * Write exactly one cluster of data, compressed with zlib when that makes
+ * it smaller than a cluster, uncompressed otherwise.
+ *
+ * sector_num: first sector of the cluster to write.
+ * buf:        one cluster of source data.
+ * nb_sectors: must equal the number of sectors per cluster.
+ *
+ * Returns 0 on success, -EINVAL if nb_sectors is not one cluster, and -1
+ * on allocation, compression or I/O failure.
+ */
+static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num,
+                                 const uint8_t *buf, int nb_sectors)
+{
+    BDRVQcowState *s = bs->opaque;
+    z_stream strm;
+    int ret, out_len;
+    int status = -1;
+    uint8_t *out_buf;
+    uint64_t cluster_offset;
+
+    if (nb_sectors != s->cluster_sectors)
+        return -EINVAL;
+
+    out_buf = qemu_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
+    if (!out_buf)
+        return -1;
+
+    /* default compression level, small window (negative windowBits: raw
+       deflate stream without zlib header) */
+    memset(&strm, 0, sizeof(strm));
+    ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
+                       Z_DEFLATED, -12,
+                       9, Z_DEFAULT_STRATEGY);
+    if (ret != 0)
+        goto out;
+
+    strm.avail_in = s->cluster_size;
+    strm.next_in = (uint8_t *)buf;
+    /* output is capped at one cluster: anything larger is not worth it */
+    strm.avail_out = s->cluster_size;
+    strm.next_out = out_buf;
+
+    ret = deflate(&strm, Z_FINISH);
+    if (ret != Z_STREAM_END && ret != Z_OK) {
+        deflateEnd(&strm);
+        goto out;
+    }
+    out_len = strm.next_out - out_buf;
+
+    deflateEnd(&strm);
+
+    if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
+        /* could not compress: write normal cluster */
+        if (qcow_write(bs, sector_num, buf, s->cluster_sectors) < 0)
+            goto out;
+    } else {
+        cluster_offset = get_cluster_offset(bs, sector_num << 9, 2,
+                                            out_len, 0, 0);
+        /* 0 means no cluster could be allocated; writing there would
+           overwrite the image header */
+        if (cluster_offset == 0)
+            goto out;
+        cluster_offset &= s->cluster_offset_mask;
+        if (bdrv_pwrite(s->hd, cluster_offset, out_buf, out_len) != out_len)
+            goto out;
+    }
+
+    status = 0;
+out:
+    qemu_free(out_buf);
+    return status;
+}
+
+/* Flush the underlying image file of this qcow device. */
+static void qcow_flush(BlockDriverState *bs)
+{
+    BDRVQcowState *state = bs->opaque;
+
+    bdrv_flush(state->hd);
+}
+
+/* Report format information for the image: its cluster size.  Returns 0. */
+static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+    const BDRVQcowState *state = bs->opaque;
+
+    bdi->cluster_size = state->cluster_size;
+
+    return 0;
+}
+
+/* Driver description for the "qcow" image format. */
+static BlockDriver bdrv_qcow = {
+    .format_name           = "qcow",
+    .instance_size         = sizeof(BDRVQcowState),
+
+    /* image life cycle */
+    .bdrv_probe            = qcow_probe,
+    .bdrv_open             = qcow_open,
+    .bdrv_close            = qcow_close,
+    .bdrv_create           = qcow_create,
+    .bdrv_flush            = qcow_flush,
+    .bdrv_is_allocated     = qcow_is_allocated,
+    .bdrv_set_key          = qcow_set_key,
+    .bdrv_make_empty       = qcow_make_empty,
+
+    /* asynchronous I/O */
+    .bdrv_aio_readv        = qcow_aio_readv,
+    .bdrv_aio_writev       = qcow_aio_writev,
+    .bdrv_aio_cancel       = qcow_aio_cancel,
+    .aiocb_size            = sizeof(QCowAIOCB),
+
+    .bdrv_write_compressed = qcow_write_compressed,
+    .bdrv_get_info         = qcow_get_info,
+};
+
+/* Register the qcow driver with the block layer. */
+static void bdrv_qcow_init(void)
+{
+    bdrv_register(&bdrv_qcow);
+}
+
+/* hook bdrv_qcow_init() into the block module initialisation */
+block_init(bdrv_qcow_init);
diff --git a/block/qcow2.c b/block/qcow2.c
new file mode 100644
index 0000000..a6de9b6
--- /dev/null
+++ b/block/qcow2.c
@@ -0,0 +1,2931 @@
+/*
+ * Block driver for the QCOW version 2 format
+ *
+ * Copyright (c) 2004-2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+#include <zlib.h>
+#include "aes.h"
+
+/*
+  Differences with QCOW:
+
+  - Support for multiple incremental snapshots.
+  - Memory management by reference counts.
+  - Clusters which have a reference count of one have the bit
+    QCOW_OFLAG_COPIED to optimize write performance.
+  - Size of compressed clusters is stored in sectors to reduce bit usage
+    in the cluster offsets.
+  - Support for storing additional data (such as the VM state) in the
+    snapshots.
+  - If a backing store is used, the cluster size is not constrained
+    (could be backported to QCOW).
+  - L2 tables always have a size of one cluster.
+*/
+
+/* Debugging switches (uncomment to enable):
+   DEBUG_ALLOC  - run check_refcounts() when an image is opened
+   DEBUG_ALLOC2 - trace L1 table growth
+   DEBUG_EXT    - trace header extension parsing */
+//#define DEBUG_ALLOC
+//#define DEBUG_ALLOC2
+//#define DEBUG_EXT
+
+/* magic number ("QFI\xfb") and format version expected in the header */
+#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
+#define QCOW_VERSION 2
+
+/* values of the header's crypt_method field */
+#define QCOW_CRYPT_NONE 0
+#define QCOW_CRYPT_AES  1
+
+/* number of clusters the cluster_data buffer is sized for */
+#define QCOW_MAX_CRYPT_CLUSTERS 32
+
+/* indicate that the refcount of the referenced cluster is exactly one. */
+#define QCOW_OFLAG_COPIED     (1LL << 63)
+/* indicate that the cluster is compressed (they never have the copied flag) */
+#define QCOW_OFLAG_COMPRESSED (1LL << 62)
+
+#define REFCOUNT_SHIFT 1 /* refcount size is 2 bytes */
+
+/*
+ * Image header as stored at offset 0 of the file.  qcow_open() converts
+ * every multi-byte field from big-endian after reading it.
+ */
+typedef struct QCowHeader {
+    uint32_t magic;   /* must be QCOW_MAGIC */
+    uint32_t version; /* must be QCOW_VERSION */
+    uint64_t backing_file_offset; /* file offset of the backing file name,
+                                     0 if there is none */
+    uint32_t backing_file_size;   /* length of the backing file name */
+    uint32_t cluster_bits; /* log2 of the cluster size; 9..16 accepted */
+    uint64_t size; /* in bytes */
+    uint32_t crypt_method; /* QCOW_CRYPT_NONE or QCOW_CRYPT_AES */
+    uint32_t l1_size; /* XXX: save number of clusters instead ? */
+    uint64_t l1_table_offset;
+    uint64_t refcount_table_offset;
+    uint32_t refcount_table_clusters;
+    uint32_t nb_snapshots;
+    uint64_t snapshots_offset;
+} QCowHeader;
+
+
+/*
+ * Header of one header extension.  Both fields are big-endian on disk;
+ * len bytes of payload follow, padded to a multiple of 8 bytes.
+ */
+typedef struct {
+    uint32_t magic; /* extension type, one of QCOW_EXT_MAGIC_* */
+    uint32_t len;   /* payload length in bytes */
+} QCowExtension;
+/* terminates the list of extensions */
+#define  QCOW_EXT_MAGIC_END 0
+/* payload is the backing file format name (not NUL terminated) */
+#define  QCOW_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA
+
+
+/*
+ * Fixed-size part of a snapshot record; packed so that the structure has
+ * no padding.  Variable-length data follows it, as listed at the end.
+ * NOTE(review): presumably an on-disk, big-endian layout like QCowHeader;
+ * confirm against qcow_read_snapshots().
+ */
+typedef struct __attribute__((packed)) QCowSnapshotHeader {
+    /* header is 8 byte aligned */
+    uint64_t l1_table_offset;
+
+    uint32_t l1_size;
+    uint16_t id_str_size;
+    uint16_t name_size;
+
+    uint32_t date_sec;
+    uint32_t date_nsec;
+
+    uint64_t vm_clock_nsec;
+
+    uint32_t vm_state_size;
+    uint32_t extra_data_size; /* for extension */
+    /* extra data follows */
+    /* id_str follows */
+    /* name follows  */
+} QCowSnapshotHeader;
+
+/* number of L2 tables held in the in-memory L2 cache */
+#define L2_CACHE_SIZE 16
+
+/*
+ * In-memory description of one snapshot.  The fields mirror those of
+ * QCowSnapshotHeader, with the id and name held as separate strings.
+ */
+typedef struct QCowSnapshot {
+    uint64_t l1_table_offset;
+    uint32_t l1_size;
+    char *id_str;
+    char *name;
+    uint32_t vm_state_size;
+    uint32_t date_sec;
+    uint32_t date_nsec;
+    uint64_t vm_clock_nsec;
+} QCowSnapshot;
+
+/* Per-image driver state of an open qcow2 image; set up by qcow_open(). */
+typedef struct BDRVQcowState {
+    BlockDriverState *hd;   /* the underlying image file */
+    int cluster_bits;       /* from the header */
+    int cluster_size;       /* 1 << cluster_bits, in bytes */
+    int cluster_sectors;    /* cluster size in 512-byte sectors */
+    int l2_bits;            /* cluster_bits - 3: an L2 table is one cluster */
+    int l2_size;            /* 1 << l2_bits, entries per L2 table */
+    int l1_size;            /* number of entries in l1_table */
+    int l1_vm_state_index;  /* number of L1 entries needed to map the
+                               virtual disk size */
+    /* derived from cluster_bits in qcow_open(); presumably used to decode
+       compressed cluster entries -- confirm against decompress_cluster() */
+    int csize_shift;
+    int csize_mask;
+    uint64_t cluster_offset_mask;
+    uint64_t l1_table_offset; /* file offset of the L1 table */
+    uint64_t *l1_table;       /* L1 table, converted to host byte order */
+    uint64_t *l2_cache;       /* L2_CACHE_SIZE tables of l2_size entries,
+                                 kept as read from the file */
+    uint64_t l2_cache_offsets[L2_CACHE_SIZE]; /* file offset of the table
+                                                 held in each cache slot */
+    uint32_t l2_cache_counts[L2_CACHE_SIZE];  /* hit count of each slot */
+    uint8_t *cluster_cache;   /* buffer of one cluster */
+    uint8_t *cluster_data;    /* QCOW_MAX_CRYPT_CLUSTERS clusters plus one
+                                 sector */
+    uint64_t cluster_cache_offset; /* initialised to -1 by qcow_open() */
+
+    uint64_t *refcount_table;
+    uint64_t refcount_table_offset; /* from the header */
+    uint32_t refcount_table_size;   /* number of 8-byte entries */
+    uint64_t refcount_block_cache_offset;
+    uint16_t *refcount_block_cache;
+    int64_t free_cluster_index;
+    int64_t free_byte_offset;
+
+    uint32_t crypt_method; /* current crypt method, 0 if no key yet */
+    uint32_t crypt_method_header; /* crypt method recorded in the header */
+    AES_KEY aes_encrypt_key;
+    AES_KEY aes_decrypt_key;
+    uint64_t snapshots_offset; /* from the header */
+    int snapshots_size;
+    int nb_snapshots;          /* from the header */
+    QCowSnapshot *snapshots;
+} BDRVQcowState;
+
+/* forward declarations of helpers used before their definition */
+
+/* cluster data access */
+static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset);
+static int qcow_read(BlockDriverState *bs, int64_t sector_num,
+                     uint8_t *buf, int nb_sectors);
+/* snapshots */
+static int qcow_read_snapshots(BlockDriverState *bs);
+static void qcow_free_snapshots(BlockDriverState *bs);
+/* reference counts and cluster allocation */
+static int refcount_init(BlockDriverState *bs);
+static void refcount_close(BlockDriverState *bs);
+static int get_refcount(BlockDriverState *bs, int64_t cluster_index);
+static int update_cluster_refcount(BlockDriverState *bs,
+                                   int64_t cluster_index,
+                                   int addend);
+static void update_refcount(BlockDriverState *bs,
+                            int64_t offset, int64_t length,
+                            int addend);
+static int64_t alloc_clusters(BlockDriverState *bs, int64_t size);
+static int64_t alloc_bytes(BlockDriverState *bs, int size);
+static void free_clusters(BlockDriverState *bs,
+                          int64_t offset, int64_t size);
+static int check_refcounts(BlockDriverState *bs);
+
+/*
+ * Probe whether buf holds the start of a qcow2 image.
+ *
+ * Returns 100 when the buffer is large enough for a header and carries the
+ * qcow2 magic and version, 0 otherwise.  filename is not used.
+ */
+static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    const QCowHeader *hdr = (const void *)buf;
+
+    if (buf_size < sizeof(QCowHeader)) {
+        return 0;
+    }
+    if (be32_to_cpu(hdr->magic) != QCOW_MAGIC) {
+        return 0;
+    }
+    if (be32_to_cpu(hdr->version) != QCOW_VERSION) {
+        return 0;
+    }
+
+    return 100;
+}
+
+
+/*
+ * Parse the qcow2 header extensions stored between start_offset and
+ * end_offset and record what they describe in bs.
+ *
+ * Every extension is a QCowExtension header followed by its payload,
+ * padded to a multiple of 8 bytes.  Parsing stops at the end marker
+ * (magic 0) or once end_offset is reached.  Extensions with an unknown
+ * magic are skipped: they may come from a newer version of the format.
+ *
+ * Returns 0 on success, non-0 otherwise.
+ */
+static int qcow_read_extensions(BlockDriverState *bs, uint64_t start_offset,
+                                uint64_t end_offset)
+{
+    BDRVQcowState *s = bs->opaque;
+    QCowExtension ext;
+    uint64_t pos;
+
+#ifdef DEBUG_EXT
+    printf("qcow_read_extensions: start=%ld end=%ld\n", start_offset, end_offset);
+#endif
+
+    for (pos = start_offset; pos < end_offset; ) {
+
+#ifdef DEBUG_EXT
+        /* Sanity check */
+        if (pos > s->cluster_size)
+            printf("qcow_handle_extension: suspicious offset %lu\n", pos);
+
+        printf("attemting to read extended header in offset %lu\n", pos);
+#endif
+
+        /* fetch the extension header and convert it to host byte order */
+        if (bdrv_pread(s->hd, pos, &ext, sizeof(ext)) != sizeof(ext)) {
+            fprintf(stderr, "qcow_handle_extension: ERROR: pread fail from offset %llu\n",
+                    (unsigned long long)pos);
+            return 1;
+        }
+        be32_to_cpus(&ext.magic);
+        be32_to_cpus(&ext.len);
+        pos += sizeof(ext);
+#ifdef DEBUG_EXT
+        printf("ext.magic = 0x%x\n", ext.magic);
+#endif
+
+        if (ext.magic == QCOW_EXT_MAGIC_END) {
+            return 0;
+        }
+
+        if (ext.magic == QCOW_EXT_MAGIC_BACKING_FORMAT) {
+            /* the payload must fit, leaving room for the terminating NUL */
+            if (ext.len >= sizeof(bs->backing_format)) {
+                fprintf(stderr, "ERROR: ext_backing_format: len=%u too large"
+                        " (>=%zu)\n",
+                        ext.len, sizeof(bs->backing_format));
+                return 2;
+            }
+            if (bdrv_pread(s->hd, pos, bs->backing_format,
+                           ext.len) != ext.len) {
+                return 3;
+            }
+            bs->backing_format[ext.len] = '\0';
+#ifdef DEBUG_EXT
+            printf("Qcow2: Got format extension %s\n", bs->backing_format);
+#endif
+        }
+
+        /* step over the payload (known or unknown), padded to 8 bytes */
+        pos += ((ext.len + 7) & ~7);
+    }
+
+    return 0;
+}
+
+
+/*
+ * Open a qcow2 image: open the underlying file, read and validate the
+ * header, derive the geometry from it, load the L1 table, allocate the
+ * caches, initialise the refcount structures and read the header
+ * extensions, the backing file name and the snapshot table.
+ *
+ * Returns 0 on success, the negative result of bdrv_file_open() if the
+ * file cannot be opened, and -1 on any later failure.
+ */
+static int qcow_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVQcowState *s = bs->opaque;
+    int len, i, shift, ret;
+    QCowHeader header;
+    uint64_t ext_end;
+
+    /* Performance is terrible right now with cache=writethrough due mainly
+     * to reference count updates.  If the user does not explicitly specify
+     * a caching type, force to writeback caching.
+     */
+    if ((flags & BDRV_O_CACHE_DEF)) {
+        flags |= BDRV_O_CACHE_WB;
+        flags &= ~BDRV_O_CACHE_DEF;
+    }
+    ret = bdrv_file_open(&s->hd, filename, flags);
+    if (ret < 0)
+        return ret;
+    if (bdrv_pread(s->hd, 0, &header, sizeof(header)) != sizeof(header))
+        goto fail;
+    /* the header is stored big-endian */
+    be32_to_cpus(&header.magic);
+    be32_to_cpus(&header.version);
+    be64_to_cpus(&header.backing_file_offset);
+    be32_to_cpus(&header.backing_file_size);
+    be64_to_cpus(&header.size);
+    be32_to_cpus(&header.cluster_bits);
+    be32_to_cpus(&header.crypt_method);
+    be64_to_cpus(&header.l1_table_offset);
+    be32_to_cpus(&header.l1_size);
+    be64_to_cpus(&header.refcount_table_offset);
+    be32_to_cpus(&header.refcount_table_clusters);
+    be64_to_cpus(&header.snapshots_offset);
+    be32_to_cpus(&header.nb_snapshots);
+
+    /* sanity checks: format identification, size, cluster size between
+       512 bytes and 64 KB, known encryption method */
+    if (header.magic != QCOW_MAGIC || header.version != QCOW_VERSION)
+        goto fail;
+    if (header.size <= 1 ||
+        header.cluster_bits < 9 ||
+        header.cluster_bits > 16)
+        goto fail;
+    if (header.crypt_method > QCOW_CRYPT_AES)
+        goto fail;
+    s->crypt_method_header = header.crypt_method;
+    if (s->crypt_method_header)
+        bs->encrypted = 1;
+    /* derive the image geometry from the cluster size */
+    s->cluster_bits = header.cluster_bits;
+    s->cluster_size = 1 << s->cluster_bits;
+    s->cluster_sectors = 1 << (s->cluster_bits - 9);
+    s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
+    s->l2_size = 1 << s->l2_bits;
+    bs->total_sectors = header.size / 512;
+    s->csize_shift = (62 - (s->cluster_bits - 8));
+    s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
+    s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
+    s->refcount_table_offset = header.refcount_table_offset;
+    /* number of 8-byte entries in refcount_table_clusters clusters */
+    s->refcount_table_size =
+        header.refcount_table_clusters << (s->cluster_bits - 3);
+
+    s->snapshots_offset = header.snapshots_offset;
+    s->nb_snapshots = header.nb_snapshots;
+
+    /* read the level 1 table */
+    s->l1_size = header.l1_size;
+    shift = s->cluster_bits + s->l2_bits;
+    /* number of L1 entries needed to map header.size bytes, rounded up */
+    s->l1_vm_state_index = (header.size + (1LL << shift) - 1) >> shift;
+    /* the L1 table must contain at least enough entries to put
+       header.size bytes */
+    if (s->l1_size < s->l1_vm_state_index)
+        goto fail;
+    s->l1_table_offset = header.l1_table_offset;
+    s->l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
+    if (bdrv_pread(s->hd, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)) !=
+        s->l1_size * sizeof(uint64_t))
+        goto fail;
+    /* keep the L1 table in host byte order */
+    for(i = 0;i < s->l1_size; i++) {
+        be64_to_cpus(&s->l1_table[i]);
+    }
+    /* alloc L2 cache */
+    s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
+    s->cluster_cache = qemu_malloc(s->cluster_size);
+    /* one more sector for decompressed data alignment */
+    s->cluster_data = qemu_malloc(QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
+                                  + 512);
+    s->cluster_cache_offset = -1;
+
+    if (refcount_init(bs) < 0)
+        goto fail;
+
+    /* read qcow2 extensions */
+    /* they lie between the header and the backing file name, or within
+       the first cluster if there is no backing file */
+    if (header.backing_file_offset)
+        ext_end = header.backing_file_offset;
+    else
+        ext_end = s->cluster_size;
+    if (qcow_read_extensions(bs, sizeof(header), ext_end))
+        goto fail;
+
+    /* read the backing file name */
+    if (header.backing_file_offset != 0) {
+        len = header.backing_file_size;
+        /* names longer than 1023 bytes are truncated */
+        if (len > 1023)
+            len = 1023;
+        if (bdrv_pread(s->hd, header.backing_file_offset, bs->backing_file, len) != len)
+            goto fail;
+        bs->backing_file[len] = '\0';
+    }
+    if (qcow_read_snapshots(bs) < 0)
+        goto fail;
+
+#ifdef DEBUG_ALLOC
+    check_refcounts(bs);
+#endif
+    return 0;
+
+ fail:
+    /* common error exit: this runs no matter how far initialisation got */
+    qcow_free_snapshots(bs);
+    refcount_close(bs);
+    qemu_free(s->l1_table);
+    qemu_free(s->l2_cache);
+    qemu_free(s->cluster_cache);
+    qemu_free(s->cluster_data);
+    bdrv_delete(s->hd);
+    return -1;
+}
+
+/*
+ * Set the encryption key of the image.
+ *
+ * The key string is used as the raw 128-bit AES key: at most its first 16
+ * bytes are taken and shorter keys are padded with zero bytes.  The crypt
+ * method recorded in the header becomes the current one.
+ *
+ * Returns 0 on success, -1 if an AES key schedule cannot be set up.
+ */
+static int qcow_set_key(BlockDriverState *bs, const char *key)
+{
+    BDRVQcowState *s = bs->opaque;
+    uint8_t raw_key[16];
+    int key_len, pos;
+
+    memset(raw_key, 0, 16);
+
+    key_len = strlen(key);
+    if (key_len > 16) {
+        key_len = 16;
+    }
+
+    /* XXX: we could compress the chars to 7 bits to increase
+       entropy */
+    for (pos = 0; pos < key_len; pos++) {
+        raw_key[pos] = key[pos];
+    }
+
+    s->crypt_method = s->crypt_method_header;
+
+    if (AES_set_encrypt_key(raw_key, 128, &s->aes_encrypt_key) != 0) {
+        return -1;
+    }
+    if (AES_set_decrypt_key(raw_key, 128, &s->aes_decrypt_key) != 0) {
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Encrypt (enc != 0) or decrypt (enc == 0) nb_sectors 512-byte sectors
+ * from in_buf to out_buf with AES in CBC mode.  Each sector is processed
+ * on its own, with an IV built from its little-endian sector number.
+ *
+ * The crypt function is compatible with the linux cryptoloop
+ * algorithm for < 4 GB images. NOTE: out_buf == in_buf is
+ * supported
+ */
+static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
+                            uint8_t *out_buf, const uint8_t *in_buf,
+                            int nb_sectors, int enc,
+                            const AES_KEY *key)
+{
+    union {
+        uint64_t ll[2];
+        uint8_t b[16];
+    } iv;
+
+    while (nb_sectors-- > 0) {
+        /* the IV is rebuilt for every sector */
+        iv.ll[0] = cpu_to_le64(sector_num);
+        iv.ll[1] = 0;
+
+        AES_cbc_encrypt(in_buf, out_buf, 512, key, iv.b, enc);
+
+        in_buf += 512;
+        out_buf += 512;
+        sector_num++;
+    }
+}
+
+/*
+ * Copy the sectors [n_start, n_end) of the cluster starting at guest
+ * sector start_sect into the cluster at file offset cluster_offset.
+ *
+ * The data is read through qcow_read() into s->cluster_data, encrypted
+ * when a crypt method is active, and written to the underlying file.
+ *
+ * Returns 0 on success (also when the range is empty) or the negative
+ * result of the failing read or write.
+ */
+static int copy_sectors(BlockDriverState *bs, uint64_t start_sect,
+                        uint64_t cluster_offset, int n_start, int n_end)
+{
+    BDRVQcowState *s = bs->opaque;
+    const int count = n_end - n_start;
+    const uint64_t first_sect = start_sect + n_start;
+    int rc;
+
+    if (count <= 0) {
+        return 0;
+    }
+
+    rc = qcow_read(bs, first_sect, s->cluster_data, count);
+    if (rc < 0) {
+        return rc;
+    }
+
+    if (s->crypt_method) {
+        /* encrypt in place */
+        encrypt_sectors(s, first_sect, s->cluster_data, s->cluster_data,
+                        count, 1, &s->aes_encrypt_key);
+    }
+
+    rc = bdrv_write(s->hd, (cluster_offset >> 9) + n_start,
+                    s->cluster_data, count);
+
+    return rc < 0 ? rc : 0;
+}
+
+/* Empty the L2 cache: clear the cached tables, their offsets and counts. */
+static void l2_cache_reset(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+    const size_t tables_bytes = s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t);
+
+    memset(s->l2_cache, 0, tables_bytes);
+    memset(s->l2_cache_offsets, 0, sizeof(s->l2_cache_offsets));
+    memset(s->l2_cache_counts, 0, sizeof(s->l2_cache_counts));
+}
+
+/*
+ * Choose the L2 cache slot to reuse: the one with the lowest hit count,
+ * the lowest index winning ties.  Returns the index of that slot.
+ */
+static inline int l2_cache_new_entry(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+    int victim = 0;
+    uint32_t lowest = s->l2_cache_counts[0];
+    int slot;
+
+    for (slot = 1; slot < L2_CACHE_SIZE; slot++) {
+        if (s->l2_cache_counts[slot] < lowest) {
+            lowest = s->l2_cache_counts[slot];
+            victim = slot;
+        }
+    }
+
+    return victim;
+}
+
+/* Round offset up to the next multiple of n; n must be a power of two. */
+static int64_t align_offset(int64_t offset, int n)
+{
+    return (offset + n - 1) & ~(n - 1);
+}
+
+/*
+ * Grow the L1 table so that it holds at least min_size entries.
+ *
+ * A larger table is allocated in the image file, the current entries are
+ * copied into it, and the header is updated to point to it; the clusters
+ * of the old table are then freed.  The in-memory table is replaced only
+ * after both writes have succeeded.
+ *
+ * Returns 0 on success (also when the table is already large enough) and
+ * -EIO if the new table or the header cannot be written.  On failure the
+ * current L1 table stays in place and usable.
+ */
+static int grow_l1_table(BlockDriverState *bs, int min_size)
+{
+    BDRVQcowState *s = bs->opaque;
+    int new_l1_size, new_l1_size2, ret, i;
+    uint64_t *new_l1_table;
+    uint64_t new_l1_table_offset;
+    uint8_t data[12];
+
+    new_l1_size = s->l1_size;
+    if (min_size <= new_l1_size)
+        return 0;
+    /* grow by about 50% per step until large enough */
+    while (min_size > new_l1_size) {
+        new_l1_size = (new_l1_size * 3 + 1) / 2;
+    }
+#ifdef DEBUG_ALLOC2
+    printf("grow l1_table from %d to %d\n", s->l1_size, new_l1_size);
+#endif
+
+    new_l1_size2 = sizeof(uint64_t) * new_l1_size;
+    new_l1_table = qemu_mallocz(new_l1_size2);
+    memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t));
+
+    /* write new table (align to cluster) */
+    new_l1_table_offset = alloc_clusters(bs, new_l1_size2);
+
+    /* the table is big-endian on disk: swap for the write, then swap back */
+    for(i = 0; i < s->l1_size; i++)
+        new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
+    ret = bdrv_pwrite(s->hd, new_l1_table_offset, new_l1_table, new_l1_size2);
+    if (ret != new_l1_size2)
+        goto fail;
+    for(i = 0; i < s->l1_size; i++)
+        new_l1_table[i] = be64_to_cpu(new_l1_table[i]);
+
+    /* set new table */
+    /* l1_size (4 bytes) and l1_table_offset (8 bytes) are adjacent in the
+       header, so both are updated with a single 12-byte write */
+    cpu_to_be32w((uint32_t*)data, new_l1_size);
+    cpu_to_be64w((uint64_t*)(data + 4), new_l1_table_offset);
+    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, l1_size), data,
+                sizeof(data)) != sizeof(data))
+        goto fail;
+    qemu_free(s->l1_table);
+    free_clusters(bs, s->l1_table_offset, s->l1_size * sizeof(uint64_t));
+    s->l1_table_offset = new_l1_table_offset;
+    s->l1_table = new_l1_table;
+    s->l1_size = new_l1_size;
+    return 0;
+ fail:
+    /* Drop only what this call created.  s->l1_table is still the live
+       table and must not be freed here. */
+    qemu_free(new_l1_table);
+    free_clusters(bs, new_l1_table_offset, new_l1_size2);
+    return -EIO;
+}
+
+/*
+ * seek_l2_table
+ *
+ * Look up the L2 table stored at file offset l2_offset in the L2 cache.
+ *
+ * On a hit the slot's hit count is incremented; when it reaches
+ * 0xffffffff, the counts of all slots are halved.  Returns a pointer to
+ * the cached table, or NULL if the table is not in the cache.
+ *
+ */
+
+static uint64_t *seek_l2_table(BDRVQcowState *s, uint64_t l2_offset)
+{
+    int slot, k;
+
+    for (slot = 0; slot < L2_CACHE_SIZE; slot++) {
+        if (s->l2_cache_offsets[slot] != l2_offset) {
+            continue;
+        }
+
+        /* hit: bump the counter, scaling all of them down at the limit */
+        s->l2_cache_counts[slot]++;
+        if (s->l2_cache_counts[slot] == 0xffffffff) {
+            for (k = 0; k < L2_CACHE_SIZE; k++) {
+                s->l2_cache_counts[k] >>= 1;
+            }
+        }
+        return s->l2_cache + (slot << s->l2_bits);
+    }
+
+    return NULL;
+}
+
+/*
+ * l2_load
+ *
+ * Return the L2 table stored at file offset l2_offset.  The table is taken
+ * from the L2 cache when present; otherwise it is read from the image file
+ * into the least used cache slot.
+ *
+ * Returns a pointer to the L2 table on success, or NULL if the read from
+ * the image file failed.
+ */
+
+static uint64_t *l2_load(BlockDriverState *bs, uint64_t l2_offset)
+{
+    BDRVQcowState *s = bs->opaque;
+    const size_t table_bytes = s->l2_size * sizeof(uint64_t);
+    uint64_t *table;
+    int slot;
+
+    /* fast path: the table is already cached */
+    table = seek_l2_table(s, l2_offset);
+    if (table != NULL) {
+        return table;
+    }
+
+    /* miss: read the table into the least used slot */
+    slot = l2_cache_new_entry(bs);
+    table = s->l2_cache + (slot << s->l2_bits);
+    if (bdrv_pread(s->hd, l2_offset, table, table_bytes) != table_bytes) {
+        return NULL;
+    }
+
+    /* the slot is tagged only once its content is valid */
+    s->l2_cache_offsets[slot] = l2_offset;
+    s->l2_cache_counts[slot] = 1;
+
+    return table;
+}
+
+/*
+ * l2_allocate
+ *
+ * Allocate a new l2 entry in the file. If l1_index points to an already
+ * used entry in the L2 table (i.e. we are doing a copy on write for the L2
+ * table) copy the contents of the old L2 table into the newly allocated one.
+ * Otherwise the new table is initialized with zeros.
+ *
+ * Returns a pointer to the new table inside the L2 cache, or NULL if a
+ * read or write on the image file failed.  Note that the in-memory L1
+ * entry already points to the new table when such a failure is reported.
+ *
+ */
+
+static uint64_t *l2_allocate(BlockDriverState *bs, int l1_index)
+{
+    BDRVQcowState *s = bs->opaque;
+    int min_index;
+    uint64_t old_l2_offset, tmp;
+    uint64_t *l2_table, l2_offset;
+
+    old_l2_offset = s->l1_table[l1_index];
+
+    /* allocate a new l2 entry */
+
+    l2_offset = alloc_clusters(bs, s->l2_size * sizeof(uint64_t));
+
+    /* update the L1 entry */
+
+    s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
+
+    /* the L1 table is big-endian on disk */
+    tmp = cpu_to_be64(l2_offset | QCOW_OFLAG_COPIED);
+    if (bdrv_pwrite(s->hd, s->l1_table_offset + l1_index * sizeof(tmp),
+                    &tmp, sizeof(tmp)) != sizeof(tmp))
+        return NULL;
+
+    /* allocate a new entry in the l2 cache */
+
+    min_index = l2_cache_new_entry(bs);
+    l2_table = s->l2_cache + (min_index << s->l2_bits);
+
+    if (old_l2_offset == 0) {
+        /* if there was no old l2 table, clear the new table */
+        memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
+    } else {
+        /* if there was an old l2 table, read it from the disk */
+        if (bdrv_pread(s->hd, old_l2_offset,
+                       l2_table, s->l2_size * sizeof(uint64_t)) !=
+            s->l2_size * sizeof(uint64_t))
+            return NULL;
+    }
+    /* write the l2 table to the file */
+    if (bdrv_pwrite(s->hd, l2_offset,
+                    l2_table, s->l2_size * sizeof(uint64_t)) !=
+        s->l2_size * sizeof(uint64_t))
+        return NULL;
+
+    /* update the l2 cache entry */
+
+    s->l2_cache_offsets[min_index] = l2_offset;
+    s->l2_cache_counts[min_index] = 1;
+
+    return l2_table;
+}
+
+/* Number of clusters needed to hold size bytes, rounded up. */
+static int size_to_clusters(BDRVQcowState *s, int64_t size)
+{
+    int64_t rounded_up = size + (s->cluster_size - 1);
+
+    return rounded_up >> s->cluster_bits;
+}
+
+static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size,
+        uint64_t *l2_table, uint64_t start, uint64_t mask)
+{
+    int i;
+    uint64_t offset = be64_to_cpu(l2_table[0]) & ~mask;
+
+    if (!offset)
+        return 0;
+
+    for (i = start; i < start + nb_clusters; i++)
+        if (offset + i * cluster_size != (be64_to_cpu(l2_table[i]) & ~mask))
+            break;
+
+	return (i - start);
+}
+
+static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_table)
+{
+    int run;
+
+    /* length of the leading run of zero L2 entries, capped at nb_clusters */
+    for (run = 0; (uint64_t)run < nb_clusters && l2_table[run] == 0; run++)
+        ;
+    return run;
+}
+
+/*
+ * get_cluster_offset
+ *
+ * For a given offset of the disk image, return cluster offset in
+ * qcow2 file.
+ *
+ * on entry, *num is the number of contiguous sectors we'd like to
+ * access following offset.
+ *
+ * on exit, *num is the number of contiguous sectors we can read.
+ *
+ * Return the cluster offset (QCOW_OFLAG_COPIED cleared) if it is found,
+ * Return 0, otherwise.
+ *
+ */
+
+static uint64_t get_cluster_offset(BlockDriverState *bs,
+                                   uint64_t offset, int *num)
+{
+    BDRVQcowState *s = bs->opaque;
+    int l1_index, l2_index;
+    uint64_t l2_offset, *l2_table, cluster_offset;
+    int l1_bits, c;
+    int index_in_cluster, nb_available, nb_needed, nb_clusters;
+
+    index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1);
+    nb_needed = *num + index_in_cluster;
+
+    l1_bits = s->l2_bits + s->cluster_bits;
+
+    /* compute how many bytes there are between the offset and
+     * the end of the l1 entry
+     */
+
+    nb_available = (1 << l1_bits) - (offset & ((1 << l1_bits) - 1));
+
+    /* compute the number of available sectors */
+
+    nb_available = (nb_available >> 9) + index_in_cluster;
+
+    if (nb_needed > nb_available) {
+        nb_needed = nb_available;
+    }
+
+    cluster_offset = 0;
+
+    /* look up the l2 offset in the l1 table; past its end means unallocated */
+
+    l1_index = offset >> l1_bits;
+    if (l1_index >= s->l1_size)
+        goto out;
+
+    l2_offset = s->l1_table[l1_index];
+
+    /* seek the l2 table of the given l2 offset */
+
+    if (!l2_offset)
+        goto out;
+
+    /* load the l2 table in memory.  NOTE(review): on failure we return 0
+     * without updating *num, so the caller keeps the value it passed in. */
+    l2_offset &= ~QCOW_OFLAG_COPIED;
+    l2_table = l2_load(bs, l2_offset);
+    if (l2_table == NULL)
+        return 0;
+
+    /* find the cluster offset for the given disk offset */
+
+    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
+    cluster_offset = be64_to_cpu(l2_table[l2_index]);
+    nb_clusters = size_to_clusters(s, nb_needed << 9);
+
+    if (!cluster_offset) {
+        /* how many empty clusters ? */
+        c = count_contiguous_free_clusters(nb_clusters, &l2_table[l2_index]);
+    } else {
+        /* how many allocated clusters ? */
+        c = count_contiguous_clusters(nb_clusters, s->cluster_size,
+                &l2_table[l2_index], 0, QCOW_OFLAG_COPIED);
+    }
+
+   nb_available = (c * s->cluster_sectors);
+out:
+    if (nb_available > nb_needed)
+        nb_available = nb_needed;
+
+    *num = nb_available - index_in_cluster;
+
+    return cluster_offset & ~QCOW_OFLAG_COPIED;
+}
+
+/*
+ * free_any_clusters
+ *
+ * free clusters according to their type: compressed or not
+ *
+ */
+
+static void free_any_clusters(BlockDriverState *bs,
+                              uint64_t cluster_offset, int nb_clusters)
+{
+    BDRVQcowState *s = bs->opaque;
+
+    /* compressed: the length in 512-byte sectors is decoded from the entry
+     * itself, so nb_clusters is not used on this path */
+    if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
+        int nb_csectors;
+        nb_csectors = ((cluster_offset >> s->csize_shift) &
+                       s->csize_mask) + 1;
+        free_clusters(bs, (cluster_offset & s->cluster_offset_mask) & ~511,
+                      nb_csectors * 512);
+        return;
+    }
+    /* uncompressed: free nb_clusters whole clusters from cluster_offset */
+    free_clusters(bs, cluster_offset, nb_clusters << s->cluster_bits);
+
+    return;
+}
+
+/*
+ * get_cluster_table
+ *
+ * for a given disk offset, load (and allocate if needed)
+ * the l2 table.
+ *
+ * the l2 table offset in the qcow2 file and the cluster index
+ * in the l2 table are given to the caller.
+ *
+ */
+
+static int get_cluster_table(BlockDriverState *bs, uint64_t offset,
+                             uint64_t **new_l2_table,
+                             uint64_t *new_l2_offset,
+                             int *new_l2_index)
+{
+    BDRVQcowState *s = bs->opaque;
+    int l1_index, l2_index, ret;
+    uint64_t l2_offset, *l2_table;
+    /* returns 1 and fills the three out parameters on success, 0 on failure */
+    /* look up the l2 offset in the l1 table, growing the table if needed */
+
+    l1_index = offset >> (s->l2_bits + s->cluster_bits);
+    if (l1_index >= s->l1_size) {
+        ret = grow_l1_table(bs, l1_index + 1);
+        if (ret < 0)
+            return 0;
+    }
+    l2_offset = s->l1_table[l1_index];
+
+    /* an l2 table flagged COPIED is loaded and used in place; otherwise a new
+     * one is allocated by l2_allocate() after freeing the old one, if any */
+    if (l2_offset & QCOW_OFLAG_COPIED) {
+        /* load the l2 table in memory */
+        l2_offset &= ~QCOW_OFLAG_COPIED;
+        l2_table = l2_load(bs, l2_offset);
+        if (l2_table == NULL)
+            return 0;
+    } else {
+        if (l2_offset)
+            free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t));
+        l2_table = l2_allocate(bs, l1_index);
+        if (l2_table == NULL)
+            return 0;
+        l2_offset = s->l1_table[l1_index] & ~QCOW_OFLAG_COPIED;
+    }
+
+    /* compute the index of the cluster's entry within the l2 table */
+
+    l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
+
+    *new_l2_table = l2_table;
+    *new_l2_offset = l2_offset;
+    *new_l2_index = l2_index;
+
+    return 1;
+}
+
+/*
+ * alloc_compressed_cluster_offset
+ *
+ * For a given offset of the disk image, return cluster offset in
+ * qcow2 file.
+ *
+ * If the offset is not found, allocate a new compressed cluster.
+ *
+ * Return the cluster offset if successful,
+ * Return 0, otherwise.
+ *
+ */
+
+static uint64_t alloc_compressed_cluster_offset(BlockDriverState *bs,
+                                                uint64_t offset,
+                                                int compressed_size)
+{
+    BDRVQcowState *s = bs->opaque;
+    int l2_index, ret;
+    uint64_t l2_offset, *l2_table, cluster_offset;
+    int nb_csectors;
+
+    ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
+    if (ret == 0)
+        return 0;
+
+    cluster_offset = be64_to_cpu(l2_table[l2_index]);
+    if (cluster_offset & QCOW_OFLAG_COPIED)
+        return cluster_offset & ~QCOW_OFLAG_COPIED;
+
+    if (cluster_offset)
+        free_any_clusters(bs, cluster_offset, 1);
+
+    cluster_offset = alloc_bytes(bs, compressed_size);
+    nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) -
+                  (cluster_offset >> 9);
+
+    cluster_offset |= QCOW_OFLAG_COMPRESSED |
+                      ((uint64_t)nb_csectors << s->csize_shift);
+
+    /* update L2 table */
+
+    /* compressed clusters never have the copied flag */
+
+    l2_table[l2_index] = cpu_to_be64(cluster_offset);
+    if (bdrv_pwrite(s->hd,
+                    l2_offset + l2_index * sizeof(uint64_t),
+                    l2_table + l2_index,
+                    sizeof(uint64_t)) != sizeof(uint64_t))
+        return 0;
+
+    return cluster_offset;
+}
+
+typedef struct QCowL2Meta
+{
+    uint64_t offset;
+    int n_start;
+    int nb_available;
+    int nb_clusters;
+} QCowL2Meta;
+
+static int alloc_cluster_link_l2(BlockDriverState *bs, uint64_t cluster_offset,
+        QCowL2Meta *m)
+{
+    BDRVQcowState *s = bs->opaque;
+    int i, j = 0, l2_index, ret;
+    uint64_t *old_cluster, start_sect, l2_offset, *l2_table;
+
+    if (m->nb_clusters == 0)
+        return 0;
+
+    old_cluster = qemu_malloc(m->nb_clusters * sizeof(uint64_t));
+
+    /* copy content of unmodified sectors */
+    start_sect = (m->offset & ~(s->cluster_size - 1)) >> 9;
+    if (m->n_start) {
+        ret = copy_sectors(bs, start_sect, cluster_offset, 0, m->n_start);
+        if (ret < 0)
+            goto err;
+    }
+
+    if (m->nb_available & (s->cluster_sectors - 1)) {
+        uint64_t end = m->nb_available & ~(uint64_t)(s->cluster_sectors - 1);
+        ret = copy_sectors(bs, start_sect + end, cluster_offset + (end << 9),
+                m->nb_available - end, s->cluster_sectors);
+        if (ret < 0)
+            goto err;
+    }
+
+    ret = -EIO;
+    /* update L2 table */
+    if (!get_cluster_table(bs, m->offset, &l2_table, &l2_offset, &l2_index))
+        goto err;
+
+    for (i = 0; i < m->nb_clusters; i++) {
+        /* if two concurrent writes happen to the same unallocated cluster
+	 * each write allocates separate cluster and writes data concurrently.
+	 * The first one to complete updates l2 table with pointer to its
+	 * cluster the second one has to do RMW (which is done above by
+	 * copy_sectors()), update l2 table with its cluster pointer and free
+	 * old cluster. This is what this loop does */
+        if(l2_table[l2_index + i] != 0)
+            old_cluster[j++] = l2_table[l2_index + i];
+
+        l2_table[l2_index + i] = cpu_to_be64((cluster_offset +
+                    (i << s->cluster_bits)) | QCOW_OFLAG_COPIED);
+     }
+
+    if (bdrv_pwrite(s->hd, l2_offset + l2_index * sizeof(uint64_t),
+                l2_table + l2_index, m->nb_clusters * sizeof(uint64_t)) !=
+            m->nb_clusters * sizeof(uint64_t))
+        goto err;
+
+    for (i = 0; i < j; i++)
+        free_any_clusters(bs, be64_to_cpu(old_cluster[i]) & ~QCOW_OFLAG_COPIED,
+                          1);
+
+    ret = 0;
+err:
+    qemu_free(old_cluster);
+    return ret;
+ }
+
+/*
+ * alloc_cluster_offset
+ *
+ * For a given offset of the disk image, return cluster offset in
+ * qcow2 file.
+ *
+ * If the offset is not found, allocate a new cluster.
+ *
+ * Return the cluster offset if successful,
+ * Return 0, otherwise.
+ *
+ */
+
+static uint64_t alloc_cluster_offset(BlockDriverState *bs,
+                                     uint64_t offset,
+                                     int n_start, int n_end,
+                                     int *num, QCowL2Meta *m)
+{
+    BDRVQcowState *s = bs->opaque;
+    int l2_index, ret;
+    uint64_t l2_offset, *l2_table, cluster_offset;
+    int nb_clusters, i = 0;
+
+    ret = get_cluster_table(bs, offset, &l2_table, &l2_offset, &l2_index);
+    if (ret == 0)
+        return 0;
+
+    nb_clusters = size_to_clusters(s, n_end << 9);
+
+    nb_clusters = MIN(nb_clusters, s->l2_size - l2_index);
+
+    cluster_offset = be64_to_cpu(l2_table[l2_index]);
+
+    /* We keep all QCOW_OFLAG_COPIED clusters */
+
+    if (cluster_offset & QCOW_OFLAG_COPIED) {
+        nb_clusters = count_contiguous_clusters(nb_clusters, s->cluster_size,
+                &l2_table[l2_index], 0, 0);
+
+        cluster_offset &= ~QCOW_OFLAG_COPIED;
+        m->nb_clusters = 0;
+
+        goto out;
+    }
+
+    /* for the moment, multiple compressed clusters are not managed */
+
+    if (cluster_offset & QCOW_OFLAG_COMPRESSED)
+        nb_clusters = 1;
+
+    /* how many available clusters ? */
+
+    while (i < nb_clusters) {
+        i += count_contiguous_clusters(nb_clusters - i, s->cluster_size,
+                &l2_table[l2_index], i, 0);
+
+        if(be64_to_cpu(l2_table[l2_index + i]))
+            break;
+
+        i += count_contiguous_free_clusters(nb_clusters - i,
+                &l2_table[l2_index + i]);
+
+        cluster_offset = be64_to_cpu(l2_table[l2_index + i]);
+
+        if ((cluster_offset & QCOW_OFLAG_COPIED) ||
+                (cluster_offset & QCOW_OFLAG_COMPRESSED))
+            break;
+    }
+    nb_clusters = i;
+
+    /* allocate a new cluster */
+
+    cluster_offset = alloc_clusters(bs, nb_clusters * s->cluster_size);
+
+    /* save info needed for meta data update */
+    m->offset = offset;
+    m->n_start = n_start;
+    m->nb_clusters = nb_clusters;
+
+out:
+    m->nb_available = MIN(nb_clusters << (s->cluster_bits - 9), n_end);
+
+    *num = m->nb_available - n_start;
+
+    return cluster_offset;
+}
+
+static int qcow_is_allocated(BlockDriverState *bs, int64_t sector_num,
+                             int nb_sectors, int *pnum)
+{
+    uint64_t cluster_offset;
+
+    *pnum = nb_sectors;
+    cluster_offset = get_cluster_offset(bs, sector_num << 9, pnum);
+
+    return (cluster_offset != 0);
+}
+
+static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
+                             const uint8_t *buf, int buf_size)
+{
+    z_stream strm1, *strm = &strm1;
+    int ret, out_len;
+
+    memset(strm, 0, sizeof(*strm));
+
+    strm->next_in = (uint8_t *)buf;
+    strm->avail_in = buf_size;
+    strm->next_out = out_buf;
+    strm->avail_out = out_buf_size;
+
+    ret = inflateInit2(strm, -12);
+    if (ret != Z_OK)
+        return -1;
+    ret = inflate(strm, Z_FINISH);
+    out_len = strm->next_out - out_buf;
+    if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
+        out_len != out_buf_size) {
+        inflateEnd(strm);
+        return -1;
+    }
+    inflateEnd(strm);
+    return 0;
+}
+
+static int decompress_cluster(BDRVQcowState *s, uint64_t cluster_offset)
+{
+    int ret, csize, nb_csectors, sector_offset;
+    uint64_t coffset;
+
+    coffset = cluster_offset & s->cluster_offset_mask;
+    if (s->cluster_cache_offset != coffset) {
+        nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1;
+        sector_offset = coffset & 511;
+        csize = nb_csectors * 512 - sector_offset;
+        ret = bdrv_read(s->hd, coffset >> 9, s->cluster_data, nb_csectors);
+        if (ret < 0) {
+            return -1;
+        }
+        if (decompress_buffer(s->cluster_cache, s->cluster_size,
+                              s->cluster_data + sector_offset, csize) < 0) {
+            return -1;
+        }
+        s->cluster_cache_offset = coffset;
+    }
+    return 0;
+}
+
+/* clamp a read to the end of bs: zero-fill the tail, return sectors to read */
+static int backing_read1(BlockDriverState *bs,
+                         int64_t sector_num, uint8_t *buf, int nb_sectors)
+{
+    int readable = 0;   /* sectors of the request that exist in bs */
+
+    if (sector_num + nb_sectors <= bs->total_sectors)
+        return nb_sectors;
+    if (sector_num < bs->total_sectors)
+        readable = bs->total_sectors - sector_num;
+
+    memset(buf + readable * 512, 0, 512 * (nb_sectors - readable));
+    return readable;
+}
+
+static int qcow_read(BlockDriverState *bs, int64_t sector_num,
+                     uint8_t *buf, int nb_sectors)
+{
+    BDRVQcowState *s = bs->opaque;
+    int ret, index_in_cluster, n, n1;
+    uint64_t cluster_offset;
+
+    while (nb_sectors > 0) {
+        n = nb_sectors;
+        cluster_offset = get_cluster_offset(bs, sector_num << 9, &n);
+        index_in_cluster = sector_num & (s->cluster_sectors - 1);
+        if (!cluster_offset) {
+            if (bs->backing_hd) {
+                /* read from the base image */
+                n1 = backing_read1(bs->backing_hd, sector_num, buf, n);
+                if (n1 > 0) {
+                    ret = bdrv_read(bs->backing_hd, sector_num, buf, n1);
+                    if (ret < 0)
+                        return -1;
+                }
+            } else {
+                memset(buf, 0, 512 * n);
+            }
+        } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
+            if (decompress_cluster(s, cluster_offset) < 0)
+                return -1;
+            memcpy(buf, s->cluster_cache + index_in_cluster * 512, 512 * n);
+        } else {
+            ret = bdrv_pread(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512);
+            if (ret != n * 512)
+                return -1;
+            if (s->crypt_method) {
+                encrypt_sectors(s, sector_num, buf, buf, n, 0,
+                                &s->aes_decrypt_key);
+            }
+        }
+        nb_sectors -= n;
+        sector_num += n;
+        buf += n * 512;
+    }
+    return 0;
+}
+
+static int qcow_write(BlockDriverState *bs, int64_t sector_num,
+                     const uint8_t *buf, int nb_sectors)
+{
+    BDRVQcowState *s = bs->opaque;
+    int ret, index_in_cluster, n;
+    uint64_t cluster_offset;
+    int n_end;
+    QCowL2Meta l2meta;
+
+    while (nb_sectors > 0) {
+        index_in_cluster = sector_num & (s->cluster_sectors - 1);
+        n_end = index_in_cluster + nb_sectors;
+        if (s->crypt_method &&
+            n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors)
+            n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
+        cluster_offset = alloc_cluster_offset(bs, sector_num << 9,
+                                              index_in_cluster,
+                                              n_end, &n, &l2meta);
+        if (!cluster_offset)
+            return -1;
+        if (s->crypt_method) {
+            encrypt_sectors(s, sector_num, s->cluster_data, buf, n, 1,
+                            &s->aes_encrypt_key);
+            ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512,
+                              s->cluster_data, n * 512);
+        } else {
+            ret = bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512);
+        }
+        if (ret != n * 512 || alloc_cluster_link_l2(bs, cluster_offset, &l2meta) < 0) {
+            free_any_clusters(bs, cluster_offset, l2meta.nb_clusters);
+            return -1;
+        }
+        nb_sectors -= n;
+        sector_num += n;
+        buf += n * 512;
+    }
+    s->cluster_cache_offset = -1; /* disable compressed cache */
+    return 0;
+}
+
+typedef struct QCowAIOCB {
+    BlockDriverAIOCB common;
+    int64_t sector_num;
+    QEMUIOVector *qiov;
+    uint8_t *buf;
+    void *orig_buf;
+    int nb_sectors;
+    int n;
+    uint64_t cluster_offset;
+    uint8_t *cluster_data;
+    BlockDriverAIOCB *hd_aiocb;
+    struct iovec hd_iov;
+    QEMUIOVector hd_qiov;
+    QEMUBH *bh;
+    QCowL2Meta l2meta;
+} QCowAIOCB;
+
+static void qcow_aio_read_cb(void *opaque, int ret);
+static void qcow_aio_read_bh(void *opaque)
+{
+    QCowAIOCB *acb = opaque;
+    qemu_bh_delete(acb->bh);
+    acb->bh = NULL;
+    qcow_aio_read_cb(opaque, 0);
+}
+
+static int qcow_schedule_bh(QEMUBHFunc *cb, QCowAIOCB *acb)
+{
+    if (acb->bh)
+        return -EIO;
+
+    acb->bh = qemu_bh_new(cb, acb);
+    if (!acb->bh)
+        return -EIO;
+
+    qemu_bh_schedule(acb->bh);
+
+    return 0;
+}
+
+static void qcow_aio_read_cb(void *opaque, int ret)
+{
+    QCowAIOCB *acb = opaque;
+    BlockDriverState *bs = acb->common.bs;
+    BDRVQcowState *s = bs->opaque;
+    int index_in_cluster, n1;
+    /* Completion callback: finish the last chunk, then submit the next one. */
+    acb->hd_aiocb = NULL;
+    if (ret < 0)
+        goto done;
+
+    /* post process the read buffer */
+    if (!acb->cluster_offset) {
+        /* nothing to do */
+    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
+        /* nothing to do */
+    } else {
+        if (s->crypt_method) {
+            encrypt_sectors(s, acb->sector_num, acb->buf, acb->buf,
+                            acb->n, 0,
+                            &s->aes_decrypt_key);
+        }
+    }
+
+    acb->nb_sectors -= acb->n;
+    acb->sector_num += acb->n;
+    acb->buf += acb->n * 512;
+
+    if (acb->nb_sectors == 0) {
+        /* request completed */
+        ret = 0;
+        goto done;
+    }
+    /* prepare next AIO request; any failure from here on reports -EIO */
+    ret = -EIO;
+    acb->n = acb->nb_sectors;
+    acb->cluster_offset = get_cluster_offset(bs, acb->sector_num << 9, &acb->n);
+    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
+
+    if (!acb->cluster_offset) {
+        if (bs->backing_hd) {
+            /* read from the base image (backing_read1 zero-fills past its end) */
+            n1 = backing_read1(bs->backing_hd, acb->sector_num,
+                               acb->buf, acb->n);
+            if (n1 > 0) {
+                acb->hd_iov.iov_base = (void *)acb->buf;
+                acb->hd_iov.iov_len = acb->n * 512;
+                qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
+                acb->hd_aiocb = bdrv_aio_readv(bs->backing_hd, acb->sector_num,
+                                    &acb->hd_qiov, acb->n,
+                                    qcow_aio_read_cb, acb);
+                if (acb->hd_aiocb == NULL)
+                    goto done;
+            } else {
+                ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
+                if (ret < 0)
+                    goto done;
+            }
+        } else {
+            /* Note: in this case, no need to wait */
+            memset(acb->buf, 0, 512 * acb->n);
+            ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
+            if (ret < 0)
+                goto done;
+        }
+    } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
+        /* add AIO support for compressed blocks ? */
+        if (decompress_cluster(s, acb->cluster_offset) < 0)
+            goto done;
+        memcpy(acb->buf,
+               s->cluster_cache + index_in_cluster * 512, 512 * acb->n);
+        ret = qcow_schedule_bh(qcow_aio_read_bh, acb);
+        if (ret < 0)
+            goto done;
+    } else {
+        if ((acb->cluster_offset & 511) != 0) {
+            ret = -EIO;
+            goto done;
+        }
+
+        acb->hd_iov.iov_base = (void *)acb->buf;
+        acb->hd_iov.iov_len = acb->n * 512;
+        qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
+        acb->hd_aiocb = bdrv_aio_readv(s->hd,
+                            (acb->cluster_offset >> 9) + index_in_cluster,
+                            &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb);
+        if (acb->hd_aiocb == NULL)
+            goto done;
+    }
+
+    return;
+done:
+    if (acb->qiov->niov > 1) {
+        qemu_iovec_from_buffer(acb->qiov, acb->orig_buf, acb->qiov->size);
+        qemu_vfree(acb->orig_buf);
+    }
+    acb->common.cb(acb->common.opaque, ret);
+    qemu_aio_release(acb);
+}
+
+static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque, int is_write)
+{
+    QCowAIOCB *acb;
+
+    acb = qemu_aio_get(bs, cb, opaque);
+    if (!acb)
+        return NULL;
+    acb->hd_aiocb = NULL;
+    acb->sector_num = sector_num;
+    acb->qiov = qiov;
+    if (qiov->niov > 1) {
+        acb->buf = acb->orig_buf = qemu_blockalign(bs, qiov->size);
+        if (is_write)
+            qemu_iovec_to_buffer(qiov, acb->buf);
+    } else {
+        acb->buf = (uint8_t *)qiov->iov->iov_base;
+    }
+    acb->nb_sectors = nb_sectors;
+    acb->n = 0;
+    acb->cluster_offset = 0;
+    acb->l2meta.nb_clusters = 0;
+    return acb;
+}
+
+static BlockDriverAIOCB *qcow_aio_readv(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    QCowAIOCB *acb;
+
+    acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
+    if (!acb)
+        return NULL;
+
+    qcow_aio_read_cb(acb, 0);
+    return &acb->common;
+}
+
+static void qcow_aio_write_cb(void *opaque, int ret)
+{
+    QCowAIOCB *acb = opaque;
+    BlockDriverState *bs = acb->common.bs;
+    BDRVQcowState *s = bs->opaque;
+    int index_in_cluster;
+    const uint8_t *src_buf;
+    int n_end;
+    /* Completion callback: commit the chunk just written, submit the next. */
+    acb->hd_aiocb = NULL;
+
+    if (ret < 0)
+        goto done;
+    ret = -EIO; /* every error exit below must fail the request, not return 0 */
+    if (alloc_cluster_link_l2(bs, acb->cluster_offset, &acb->l2meta) < 0) {
+        free_any_clusters(bs, acb->cluster_offset, acb->l2meta.nb_clusters);
+        goto done;
+    }
+
+    acb->nb_sectors -= acb->n;
+    acb->sector_num += acb->n;
+    acb->buf += acb->n * 512;
+
+    if (acb->nb_sectors == 0) {
+        /* request completed */
+        ret = 0;
+        goto done;
+    }
+
+    index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
+    n_end = index_in_cluster + acb->nb_sectors;
+    if (s->crypt_method &&
+        n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors)
+        n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors;
+
+    acb->cluster_offset = alloc_cluster_offset(bs, acb->sector_num << 9,
+                                          index_in_cluster,
+                                          n_end, &acb->n, &acb->l2meta);
+    if (!acb->cluster_offset || (acb->cluster_offset & 511) != 0) {
+        ret = -EIO;
+        goto done;
+    }
+    if (s->crypt_method) {
+        if (!acb->cluster_data) {
+            acb->cluster_data = qemu_mallocz(QCOW_MAX_CRYPT_CLUSTERS *
+                                             s->cluster_size);
+        }
+        encrypt_sectors(s, acb->sector_num, acb->cluster_data, acb->buf,
+                        acb->n, 1, &s->aes_encrypt_key);
+        src_buf = acb->cluster_data;
+    } else {
+        src_buf = acb->buf;
+    }
+    acb->hd_iov.iov_base = (void *)src_buf;
+    acb->hd_iov.iov_len = acb->n * 512;
+    qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
+    acb->hd_aiocb = bdrv_aio_writev(s->hd,
+                                    (acb->cluster_offset >> 9) + index_in_cluster,
+                                    &acb->hd_qiov, acb->n,
+                                    qcow_aio_write_cb, acb);
+    if (acb->hd_aiocb == NULL)
+        goto done;
+
+    return;
+
+done:
+    if (acb->qiov->niov > 1)
+        qemu_vfree(acb->orig_buf);
+    acb->common.cb(acb->common.opaque, ret);
+    qemu_aio_release(acb);
+}
+
+static BlockDriverAIOCB *qcow_aio_writev(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BDRVQcowState *s = bs->opaque;
+    QCowAIOCB *acb;
+
+    s->cluster_cache_offset = -1; /* disable compressed cache */
+
+    acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
+    if (!acb)
+        return NULL;
+
+    qcow_aio_write_cb(acb, 0);
+    return &acb->common;
+}
+
+static void qcow_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+    QCowAIOCB *acb = (QCowAIOCB *)blockacb;
+    if (acb->hd_aiocb)
+        bdrv_aio_cancel(acb->hd_aiocb);
+    qemu_aio_release(acb);
+}
+
+static void qcow_close(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+    qemu_free(s->l1_table);
+    qemu_free(s->l2_cache);
+    qemu_free(s->cluster_cache);
+    qemu_free(s->cluster_data);
+    refcount_close(bs);
+    bdrv_delete(s->hd);
+}
+
+/* XXX: use std qcow open function ? */
+typedef struct QCowCreateState {
+    int cluster_size;
+    int cluster_bits;
+    uint16_t *refcount_block;
+    uint64_t *refcount_table;
+    int64_t l1_table_offset;
+    int64_t refcount_table_offset;
+    int64_t refcount_block_offset;
+} QCowCreateState;
+
+static void create_refcount_update(QCowCreateState *s,
+                                   int64_t offset, int64_t size)
+{
+    int refcount;
+    int64_t start, last, cluster_offset;
+    uint16_t *p;
+
+    start = offset & ~(s->cluster_size - 1);
+    last = (offset + size - 1)  & ~(s->cluster_size - 1);
+    for(cluster_offset = start; cluster_offset <= last;
+        cluster_offset += s->cluster_size) {
+        p = &s->refcount_block[cluster_offset >> s->cluster_bits];
+        refcount = be16_to_cpu(*p);
+        refcount++;
+        *p = cpu_to_be16(refcount);
+    }
+}
+
+static int qcow_create2(const char *filename, int64_t total_size,
+                        const char *backing_file, const char *backing_format,
+                        int flags)
+{
+    /* total_size is in 512-byte sectors; returns 0, or -1 if open() fails */
+    int fd, header_size, backing_filename_len, l1_size, i, shift, l2_bits;
+    int ref_clusters, backing_format_len = 0;
+    QCowHeader header;
+    uint64_t tmp, offset;
+    QCowCreateState s1, *s = &s1;
+    QCowExtension ext_bf = {0, 0};
+
+
+    memset(s, 0, sizeof(*s));
+
+    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
+    if (fd < 0)
+        return -1;
+    memset(&header, 0, sizeof(header));
+    header.magic = cpu_to_be32(QCOW_MAGIC);
+    header.version = cpu_to_be32(QCOW_VERSION);
+    header.size = cpu_to_be64(total_size * 512);
+    header_size = sizeof(header);
+    backing_filename_len = 0;
+    if (backing_file) {
+        if (backing_format) {
+            ext_bf.magic = QCOW_EXT_MAGIC_BACKING_FORMAT;
+            backing_format_len = strlen(backing_format);
+            ext_bf.len = (backing_format_len + 7) & ~7;
+            header_size += ((sizeof(ext_bf) + ext_bf.len + 7) & ~7);
+        }
+        header.backing_file_offset = cpu_to_be64(header_size);
+        backing_filename_len = strlen(backing_file);
+        header.backing_file_size = cpu_to_be32(backing_filename_len);
+        header_size += backing_filename_len;
+    }
+    s->cluster_bits = 12;  /* 4 KB clusters */
+    s->cluster_size = 1 << s->cluster_bits;
+    header.cluster_bits = cpu_to_be32(s->cluster_bits);
+    header_size = (header_size + 7) & ~7;
+    if (flags & BLOCK_FLAG_ENCRYPT) {
+        header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
+    } else {
+        header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
+    }
+    l2_bits = s->cluster_bits - 3;
+    shift = s->cluster_bits + l2_bits;
+    l1_size = (((total_size * 512) + (1LL << shift) - 1) >> shift);
+    offset = align_offset(header_size, s->cluster_size);
+    s->l1_table_offset = offset;
+    header.l1_table_offset = cpu_to_be64(s->l1_table_offset);
+    header.l1_size = cpu_to_be32(l1_size);
+    offset += align_offset(l1_size * sizeof(uint64_t), s->cluster_size);
+
+    s->refcount_table = qemu_mallocz(s->cluster_size);
+
+    s->refcount_table_offset = offset;
+    header.refcount_table_offset = cpu_to_be64(offset);
+    header.refcount_table_clusters = cpu_to_be32(1);
+    offset += s->cluster_size;
+    s->refcount_block_offset = offset;
+
+    /* count how many refcount blocks needed */
+    tmp = offset >> s->cluster_bits;
+    ref_clusters = (tmp >> (s->cluster_bits - REFCOUNT_SHIFT)) + 1;
+    for (i=0; i < ref_clusters; i++) {
+        s->refcount_table[i] = cpu_to_be64(offset);
+        offset += s->cluster_size;
+    }
+
+    s->refcount_block = qemu_mallocz(ref_clusters * s->cluster_size);
+
+    /* count one reference for every cluster used by the metadata above */
+    create_refcount_update(s, 0, header_size);
+    create_refcount_update(s, s->l1_table_offset, l1_size * sizeof(uint64_t));
+    create_refcount_update(s, s->refcount_table_offset, s->cluster_size);
+    create_refcount_update(s, s->refcount_block_offset, ref_clusters * s->cluster_size);
+
+    /* write all the data (NOTE(review): write()/lseek() are not checked) */
+    write(fd, &header, sizeof(header));
+    if (backing_file) {
+        if (backing_format_len) {
+            char zero[16];
+            int d = ext_bf.len - backing_format_len;
+
+            memset(zero, 0, sizeof(zero));
+            cpu_to_be32s(&ext_bf.magic);
+            cpu_to_be32s(&ext_bf.len);
+            write(fd, &ext_bf, sizeof(ext_bf));
+            write(fd, backing_format, backing_format_len);
+            if (d>0) {
+                write(fd, zero, d);
+            }
+        }
+        write(fd, backing_file, backing_filename_len);
+    }
+    lseek(fd, s->l1_table_offset, SEEK_SET);
+    tmp = 0;
+    for(i = 0;i < l1_size; i++) {
+        write(fd, &tmp, sizeof(tmp));
+    }
+    lseek(fd, s->refcount_table_offset, SEEK_SET);
+    write(fd, s->refcount_table, s->cluster_size);
+
+    lseek(fd, s->refcount_block_offset, SEEK_SET);
+    write(fd, s->refcount_block, ref_clusters * s->cluster_size);
+
+    qemu_free(s->refcount_table);
+    qemu_free(s->refcount_block);
+    close(fd);
+    return 0;
+}
+
+static int qcow_create(const char *filename, int64_t total_size,
+                       const char *backing_file, int flags)
+{
+    return qcow_create2(filename, total_size, backing_file, NULL, flags);
+}
+
+static int qcow_make_empty(BlockDriverState *bs)
+{
+#if 0
+    /* XXX: not correct */
+    BDRVQcowState *s = bs->opaque;
+    uint32_t l1_length = s->l1_size * sizeof(uint64_t);
+    int ret;
+
+    memset(s->l1_table, 0, l1_length);
+    if (bdrv_pwrite(s->hd, s->l1_table_offset, s->l1_table, l1_length) < 0)
+        return -1;
+    ret = bdrv_truncate(s->hd, s->l1_table_offset + l1_length);
+    if (ret < 0)
+        return ret;
+
+    l2_cache_reset(bs);
+#endif
+    return 0;
+}
+
+/* XXX: put compressed sectors first, then all the cluster aligned
+   tables to avoid losing bytes in alignment */
+/*
+ * Write one cluster of guest data in compressed form.
+ *
+ * bs:          block driver state of the qcow2 image
+ * sector_num:  first guest sector of the cluster to write
+ * buf:         cluster_size bytes of uncompressed data
+ * nb_sectors:  must be either 0 or exactly s->cluster_sectors
+ *
+ * nb_sectors == 0 is a special request: the underlying file is padded to a
+ * 512-byte boundary and nothing else is written.
+ *
+ * If the data does not shrink below one cluster, it is written as a normal
+ * (uncompressed) cluster through qcow_write() instead.
+ *
+ * Returns 0 on success, -EINVAL if nb_sectors is not a whole cluster, and
+ * -1 on compression, allocation or write failure.
+ */
+static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num,
+                                 const uint8_t *buf, int nb_sectors)
+{
+    BDRVQcowState *s = bs->opaque;
+    z_stream strm;
+    int ret, out_len;
+    uint8_t *out_buf;
+    uint64_t cluster_offset;
+
+    if (nb_sectors == 0) {
+        /* align end of file to a sector boundary to ease reading with
+           sector based I/Os */
+        cluster_offset = bdrv_getlength(s->hd);
+        cluster_offset = (cluster_offset + 511) & ~511;
+        bdrv_truncate(s->hd, cluster_offset);
+        return 0;
+    }
+
+    if (nb_sectors != s->cluster_sectors)
+        return -EINVAL;
+
+    out_buf = qemu_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
+
+    /* default compression level, 4 KB window, raw deflate stream (the
+       negative windowBits value suppresses the zlib header) */
+    memset(&strm, 0, sizeof(strm));
+    ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
+                       Z_DEFLATED, -12,
+                       9, Z_DEFAULT_STRATEGY);
+    if (ret != 0) {
+        qemu_free(out_buf);
+        return -1;
+    }
+
+    /* The output is capped at one cluster: if the compressed stream does
+       not fit, deflate() stops with Z_OK instead of Z_STREAM_END and the
+       uncompressed fallback below is taken. */
+    strm.avail_in = s->cluster_size;
+    strm.next_in = (uint8_t *)buf;
+    strm.avail_out = s->cluster_size;
+    strm.next_out = out_buf;
+
+    ret = deflate(&strm, Z_FINISH);
+    if (ret != Z_STREAM_END && ret != Z_OK) {
+        qemu_free(out_buf);
+        deflateEnd(&strm);
+        return -1;
+    }
+    out_len = strm.next_out - out_buf;
+
+    deflateEnd(&strm);
+
+    if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
+        /* could not compress: write normal cluster */
+        /* NOTE(review): the result of qcow_write() is not checked here */
+        qcow_write(bs, sector_num, buf, s->cluster_sectors);
+    } else {
+        cluster_offset = alloc_compressed_cluster_offset(bs, sector_num << 9,
+                                              out_len);
+        if (!cluster_offset) {
+            /* release the compression buffer on this error path too */
+            qemu_free(out_buf);
+            return -1;
+        }
+        cluster_offset &= s->cluster_offset_mask;
+        if (bdrv_pwrite(s->hd, cluster_offset, out_buf, out_len) != out_len) {
+            qemu_free(out_buf);
+            return -1;
+        }
+    }
+
+    qemu_free(out_buf);
+    return 0;
+}
+
+/* Flush the image by flushing the underlying block device (s->hd). */
+static void qcow_flush(BlockDriverState *bs)
+{
+    BDRVQcowState *state;
+
+    state = bs->opaque;
+    bdrv_flush(state->hd);
+}
+
+/*
+ * Fill 'bdi' with the cluster size and the byte offset of the VM state
+ * area. The offset is the L1 index of the VM state shifted left by
+ * (cluster_bits + l2_bits). Always returns 0.
+ */
+static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+    BDRVQcowState *state = bs->opaque;
+    int64_t vm_state_index;
+
+    vm_state_index = (int64_t)state->l1_vm_state_index;
+
+    bdi->cluster_size = state->cluster_size;
+    bdi->vm_state_offset =
+        vm_state_index << (state->cluster_bits + state->l2_bits);
+    return 0;
+}
+
+/*********************************************************/
+/* snapshot support */
+
+/* update the refcounts of snapshots and the copied flag */
+/*
+ * Walks the L1 table at 'l1_table_offset' ('l1_size' entries) together with
+ * every L2 table it references. For each referenced data cluster and each
+ * L2 table cluster the refcount is changed by 'addend' (or only read when
+ * addend == 0), and QCOW_OFLAG_COPIED is set on the entry exactly when the
+ * resulting refcount is 1. Modified L2 tables and the L1 table are written
+ * back to disk.
+ *
+ * If 'l1_table_offset' is the active L1 table, the in-memory s->l1_table is
+ * used (and updated); otherwise a temporary copy is read from disk.
+ *
+ * Returns 0 on success, -EIO on a read or write failure.
+ */
+static int update_snapshot_refcount(BlockDriverState *bs,
+                                    int64_t l1_table_offset,
+                                    int l1_size,
+                                    int addend)
+{
+    BDRVQcowState *s = bs->opaque;
+    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated;
+    int64_t old_offset, old_l2_offset;
+    int l2_size, i, j, l1_modified, l2_modified, nb_csectors, refcount;
+
+    /* L2 tables are rewritten on disk below, so drop the cached copies */
+    l2_cache_reset(bs);
+
+    l2_table = NULL;
+    l1_table = NULL;
+    l1_size2 = l1_size * sizeof(uint64_t);
+    l1_allocated = 0;
+    if (l1_table_offset != s->l1_table_offset) {
+        /* not the active L1 table: load a private, host-endian copy */
+        l1_table = qemu_malloc(l1_size2);
+        l1_allocated = 1;
+        if (bdrv_pread(s->hd, l1_table_offset,
+                       l1_table, l1_size2) != l1_size2)
+            goto fail;
+        for(i = 0;i < l1_size; i++)
+            be64_to_cpus(&l1_table[i]);
+    } else {
+        /* active L1 table: operate directly on the in-memory copy */
+        assert(l1_size == s->l1_size);
+        l1_table = s->l1_table;
+        l1_allocated = 0;
+    }
+
+    l2_size = s->l2_size * sizeof(uint64_t);
+    l2_table = qemu_malloc(l2_size);
+    l1_modified = 0;
+    for(i = 0; i < l1_size; i++) {
+        l2_offset = l1_table[i];
+        if (l2_offset) {
+            old_l2_offset = l2_offset;
+            l2_offset &= ~QCOW_OFLAG_COPIED;
+            l2_modified = 0;
+            if (bdrv_pread(s->hd, l2_offset, l2_table, l2_size) != l2_size)
+                goto fail;
+            /* L2 entries stay big-endian in l2_table; convert per entry */
+            for(j = 0; j < s->l2_size; j++) {
+                offset = be64_to_cpu(l2_table[j]);
+                if (offset != 0) {
+                    old_offset = offset;
+                    offset &= ~QCOW_OFLAG_COPIED;
+                    if (offset & QCOW_OFLAG_COMPRESSED) {
+                        /* compressed data: the entry encodes the number of
+                           512-byte sectors used; update every cluster that
+                           range touches */
+                        nb_csectors = ((offset >> s->csize_shift) &
+                                       s->csize_mask) + 1;
+                        if (addend != 0)
+                            update_refcount(bs, (offset & s->cluster_offset_mask) & ~511,
+                                            nb_csectors * 512, addend);
+                        /* compressed clusters are never modified */
+                        /* any value != 1 keeps the COPIED flag cleared */
+                        refcount = 2;
+                    } else {
+                        if (addend != 0) {
+                            refcount = update_cluster_refcount(bs, offset >> s->cluster_bits, addend);
+                        } else {
+                            refcount = get_refcount(bs, offset >> s->cluster_bits);
+                        }
+                    }
+
+                    if (refcount == 1) {
+                        offset |= QCOW_OFLAG_COPIED;
+                    }
+                    if (offset != old_offset) {
+                        l2_table[j] = cpu_to_be64(offset);
+                        l2_modified = 1;
+                    }
+                }
+            }
+            if (l2_modified) {
+                if (bdrv_pwrite(s->hd,
+                                l2_offset, l2_table, l2_size) != l2_size)
+                    goto fail;
+            }
+
+            /* now account for the cluster holding the L2 table itself */
+            if (addend != 0) {
+                refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend);
+            } else {
+                refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
+            }
+            if (refcount == 1) {
+                l2_offset |= QCOW_OFLAG_COPIED;
+            }
+            if (l2_offset != old_l2_offset) {
+                l1_table[i] = l2_offset;
+                l1_modified = 1;
+            }
+        }
+    }
+    if (l1_modified) {
+        /* convert to big-endian for the write, then back to host order */
+        for(i = 0; i < l1_size; i++)
+            cpu_to_be64s(&l1_table[i]);
+        /* NOTE(review): if this write fails, the table is left in
+           big-endian order; when l1_table is s->l1_table that affects the
+           in-memory active L1 table */
+        if (bdrv_pwrite(s->hd, l1_table_offset, l1_table,
+                        l1_size2) != l1_size2)
+            goto fail;
+        for(i = 0; i < l1_size; i++)
+            be64_to_cpus(&l1_table[i]);
+    }
+    if (l1_allocated)
+        qemu_free(l1_table);
+    qemu_free(l2_table);
+    return 0;
+ fail:
+    if (l1_allocated)
+        qemu_free(l1_table);
+    qemu_free(l2_table);
+    return -EIO;
+}
+
+/*
+ * Release the in-memory snapshot list: the name and id string of every
+ * entry, then the array itself. Leaves the state with no snapshots.
+ */
+static void qcow_free_snapshots(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+    int idx = 0;
+
+    while (idx < s->nb_snapshots) {
+        QCowSnapshot *entry = &s->snapshots[idx];
+
+        qemu_free(entry->name);
+        qemu_free(entry->id_str);
+        idx++;
+    }
+
+    qemu_free(s->snapshots);
+    s->snapshots = NULL;
+    s->nb_snapshots = 0;
+}
+
+/*
+ * Load the snapshot table from disk into s->snapshots.
+ *
+ * Reads s->nb_snapshots records starting at s->snapshots_offset. Each
+ * record is 8-byte aligned and consists of a QCowSnapshotHeader, optional
+ * extra data (skipped), the id string and the name. s->snapshots_size is
+ * set to the number of bytes the table occupies on disk.
+ *
+ * Returns 0 on success, -1 on a read failure; in that case everything
+ * loaded so far is freed via qcow_free_snapshots().
+ */
+static int qcow_read_snapshots(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+    QCowSnapshotHeader h;
+    QCowSnapshot *sn;
+    int i, id_str_size, name_size;
+    int64_t offset;
+    uint32_t extra_data_size;
+
+    if (!s->nb_snapshots) {
+        s->snapshots = NULL;
+        s->snapshots_size = 0;
+        return 0;
+    }
+
+    offset = s->snapshots_offset;
+    /* zero-filled so that the failure path can free partially read entries */
+    s->snapshots = qemu_mallocz(s->nb_snapshots * sizeof(QCowSnapshot));
+    for(i = 0; i < s->nb_snapshots; i++) {
+        offset = align_offset(offset, 8);
+        if (bdrv_pread(s->hd, offset, &h, sizeof(h)) != sizeof(h))
+            goto fail;
+        offset += sizeof(h);
+        sn = s->snapshots + i;
+        /* header fields are stored big-endian on disk */
+        sn->l1_table_offset = be64_to_cpu(h.l1_table_offset);
+        sn->l1_size = be32_to_cpu(h.l1_size);
+        sn->vm_state_size = be32_to_cpu(h.vm_state_size);
+        sn->date_sec = be32_to_cpu(h.date_sec);
+        sn->date_nsec = be32_to_cpu(h.date_nsec);
+        sn->vm_clock_nsec = be64_to_cpu(h.vm_clock_nsec);
+        extra_data_size = be32_to_cpu(h.extra_data_size);
+
+        id_str_size = be16_to_cpu(h.id_str_size);
+        name_size = be16_to_cpu(h.name_size);
+
+        /* extra data is not interpreted, only skipped */
+        offset += extra_data_size;
+
+        /* the strings are stored without terminator; add one in memory */
+        sn->id_str = qemu_malloc(id_str_size + 1);
+        if (bdrv_pread(s->hd, offset, sn->id_str, id_str_size) != id_str_size)
+            goto fail;
+        offset += id_str_size;
+        sn->id_str[id_str_size] = '\0';
+
+        sn->name = qemu_malloc(name_size + 1);
+        if (bdrv_pread(s->hd, offset, sn->name, name_size) != name_size)
+            goto fail;
+        offset += name_size;
+        sn->name[name_size] = '\0';
+    }
+    s->snapshots_size = offset - s->snapshots_offset;
+    return 0;
+ fail:
+    /* also resets s->snapshots to NULL and s->nb_snapshots to 0 */
+    qcow_free_snapshots(bs);
+    return -1;
+}
+
+/* add at the end of the file a new list of snapshots */
+/*
+ * Serializes s->snapshots into freshly allocated clusters, points the image
+ * header at the new table (snapshots_offset and nb_snapshots), and then
+ * releases the clusters of the previous table.
+ *
+ * Each record is 8-byte aligned: a QCowSnapshotHeader followed by the id
+ * string and the name, both without terminator. No extra data is written
+ * (extra_data_size stays 0 from the memset).
+ *
+ * Returns 0 on success, -1 on a write failure.
+ */
+static int qcow_write_snapshots(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+    QCowSnapshot *sn;
+    QCowSnapshotHeader h;
+    int i, name_size, id_str_size, snapshots_size;
+    uint64_t data64;
+    uint32_t data32;
+    int64_t offset, snapshots_offset;
+
+    /* compute the size of the snapshots */
+    offset = 0;
+    for(i = 0; i < s->nb_snapshots; i++) {
+        sn = s->snapshots + i;
+        offset = align_offset(offset, 8);
+        offset += sizeof(h);
+        offset += strlen(sn->id_str);
+        offset += strlen(sn->name);
+    }
+    snapshots_size = offset;
+
+    snapshots_offset = alloc_clusters(bs, snapshots_size);
+    offset = snapshots_offset;
+
+    for(i = 0; i < s->nb_snapshots; i++) {
+        sn = s->snapshots + i;
+        /* header fields are stored big-endian on disk */
+        memset(&h, 0, sizeof(h));
+        h.l1_table_offset = cpu_to_be64(sn->l1_table_offset);
+        h.l1_size = cpu_to_be32(sn->l1_size);
+        h.vm_state_size = cpu_to_be32(sn->vm_state_size);
+        h.date_sec = cpu_to_be32(sn->date_sec);
+        h.date_nsec = cpu_to_be32(sn->date_nsec);
+        h.vm_clock_nsec = cpu_to_be64(sn->vm_clock_nsec);
+
+        id_str_size = strlen(sn->id_str);
+        name_size = strlen(sn->name);
+        h.id_str_size = cpu_to_be16(id_str_size);
+        h.name_size = cpu_to_be16(name_size);
+        offset = align_offset(offset, 8);
+        if (bdrv_pwrite(s->hd, offset, &h, sizeof(h)) != sizeof(h))
+            goto fail;
+        offset += sizeof(h);
+        if (bdrv_pwrite(s->hd, offset, sn->id_str, id_str_size) != id_str_size)
+            goto fail;
+        offset += id_str_size;
+        if (bdrv_pwrite(s->hd, offset, sn->name, name_size) != name_size)
+            goto fail;
+        offset += name_size;
+    }
+
+    /* update the various header fields */
+    data64 = cpu_to_be64(snapshots_offset);
+    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, snapshots_offset),
+                    &data64, sizeof(data64)) != sizeof(data64))
+        goto fail;
+    data32 = cpu_to_be32(s->nb_snapshots);
+    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, nb_snapshots),
+                    &data32, sizeof(data32)) != sizeof(data32))
+        goto fail;
+
+    /* free the old snapshot table */
+    free_clusters(bs, s->snapshots_offset, s->snapshots_size);
+    s->snapshots_offset = snapshots_offset;
+    s->snapshots_size = snapshots_size;
+    return 0;
+ fail:
+    /* NOTE(review): the clusters allocated for the new table are not
+       released on this path */
+    return -1;
+}
+
+/*
+ * Produce an unused snapshot id: parse every existing id string as a
+ * decimal number, take the largest value seen (0 if there is none) and
+ * format that value plus one into 'id_str' (at most 'id_str_size' bytes).
+ */
+static void find_new_snapshot_id(BlockDriverState *bs,
+                                 char *id_str, int id_str_size)
+{
+    BDRVQcowState *s = bs->opaque;
+    int highest = 0;
+    int idx;
+
+    for (idx = 0; idx < s->nb_snapshots; idx++) {
+        QCowSnapshot *entry = &s->snapshots[idx];
+        int value = strtoul(entry->id_str, NULL, 10);
+
+        if (value > highest) {
+            highest = value;
+        }
+    }
+    snprintf(id_str, id_str_size, "%d", highest + 1);
+}
+
+/*
+ * Return the index of the first snapshot whose id string equals 'id_str',
+ * or -1 if no snapshot matches.
+ */
+static int find_snapshot_by_id(BlockDriverState *bs, const char *id_str)
+{
+    BDRVQcowState *s = bs->opaque;
+    int found = -1;
+    int idx = 0;
+
+    while (found < 0 && idx < s->nb_snapshots) {
+        if (strcmp(s->snapshots[idx].id_str, id_str) == 0) {
+            found = idx;
+        }
+        idx++;
+    }
+    return found;
+}
+
+/*
+ * Look up a snapshot by 'name', trying it first as an id string and then
+ * as a snapshot name. Returns the snapshot index, or -1 if nothing matches.
+ */
+static int find_snapshot_by_id_or_name(BlockDriverState *bs, const char *name)
+{
+    BDRVQcowState *s = bs->opaque;
+    int idx;
+
+    /* an id match takes precedence over a name match */
+    idx = find_snapshot_by_id(bs, name);
+    if (idx >= 0) {
+        return idx;
+    }
+
+    idx = 0;
+    while (idx < s->nb_snapshots) {
+        if (strcmp(s->snapshots[idx].name, name) == 0) {
+            return idx;
+        }
+        idx++;
+    }
+    return -1;
+}
+
+/* if no id is provided, a new one is constructed */
+/*
+ * Create a snapshot of the current image state described by 'sn_info'.
+ *
+ * Steps: pick or validate the id, increment the refcount of everything
+ * reachable from the active L1 table, write a copy of the active L1 table
+ * into newly allocated clusters, append the snapshot to the in-memory list
+ * and write the snapshot table to disk.
+ *
+ * Returns 0 on success, -ENOENT if a snapshot with the same id already
+ * exists, and -1 on any other failure. On failure the strings owned by the
+ * new entry are freed and the entry is not left in s->snapshots.
+ */
+static int qcow_snapshot_create(BlockDriverState *bs,
+                                QEMUSnapshotInfo *sn_info)
+{
+    BDRVQcowState *s = bs->opaque;
+    QCowSnapshot *snapshots1, sn1, *sn = &sn1;
+    int i, ret;
+    uint64_t *l1_table = NULL;
+
+    memset(sn, 0, sizeof(*sn));
+
+    if (sn_info->id_str[0] == '\0') {
+        /* compute a new id */
+        find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str));
+    }
+
+    /* check that the ID is unique */
+    if (find_snapshot_by_id(bs, sn_info->id_str) >= 0)
+        return -ENOENT;
+
+    sn->id_str = qemu_strdup(sn_info->id_str);
+    if (!sn->id_str)
+        goto fail;
+    sn->name = qemu_strdup(sn_info->name);
+    if (!sn->name)
+        goto fail;
+    sn->vm_state_size = sn_info->vm_state_size;
+    sn->date_sec = sn_info->date_sec;
+    sn->date_nsec = sn_info->date_nsec;
+    sn->vm_clock_nsec = sn_info->vm_clock_nsec;
+
+    /* the snapshot shares all clusters with the active image */
+    ret = update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1);
+    if (ret < 0)
+        goto fail;
+
+    /* create the L1 table of the snapshot */
+    sn->l1_table_offset = alloc_clusters(bs, s->l1_size * sizeof(uint64_t));
+    sn->l1_size = s->l1_size;
+
+    /* the on-disk copy is big-endian; s->l1_table is in host order */
+    l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
+    for(i = 0; i < s->l1_size; i++) {
+        l1_table[i] = cpu_to_be64(s->l1_table[i]);
+    }
+    if (bdrv_pwrite(s->hd, sn->l1_table_offset,
+                    l1_table, s->l1_size * sizeof(uint64_t)) !=
+        (s->l1_size * sizeof(uint64_t)))
+        goto fail;
+    qemu_free(l1_table);
+    l1_table = NULL;
+
+    /* grow the in-memory list by one entry and append the new snapshot */
+    snapshots1 = qemu_malloc((s->nb_snapshots + 1) * sizeof(QCowSnapshot));
+    if (s->snapshots) {
+        memcpy(snapshots1, s->snapshots, s->nb_snapshots * sizeof(QCowSnapshot));
+        qemu_free(s->snapshots);
+    }
+    s->snapshots = snapshots1;
+    s->snapshots[s->nb_snapshots++] = *sn;
+
+    if (qcow_write_snapshots(bs) < 0) {
+        /* Take the entry out of the list again: its strings are freed
+           below and must not stay reachable through s->snapshots. */
+        s->nb_snapshots--;
+        goto fail;
+    }
+#ifdef DEBUG_ALLOC
+    check_refcounts(bs);
+#endif
+    return 0;
+ fail:
+    /* NOTE(review): refcount changes and cluster allocations made above
+       are not rolled back here */
+    qemu_free(sn->id_str);
+    qemu_free(sn->name);
+    qemu_free(l1_table);
+    return -1;
+}
+
+/*
+ * Revert the active image to the snapshot selected by 'snapshot_id' (an id
+ * or a name).
+ *
+ * The references held through the active L1 table are dropped, the active
+ * L1 table is grown to the snapshot's size and overwritten with the
+ * snapshot's L1 table, and the references are taken again through the new
+ * contents.
+ *
+ * Returns 0 on success, -ENOENT if the snapshot does not exist and -EIO on
+ * any other failure.
+ */
+static int qcow_snapshot_goto(BlockDriverState *bs,
+                              const char *snapshot_id)
+{
+    BDRVQcowState *s = bs->opaque;
+    QCowSnapshot *snap;
+    int n, idx, l1_bytes;
+
+    idx = find_snapshot_by_id_or_name(bs, snapshot_id);
+    if (idx < 0) {
+        return -ENOENT;
+    }
+    snap = &s->snapshots[idx];
+
+    if (update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, -1) < 0) {
+        return -EIO;
+    }
+
+    if (grow_l1_table(bs, snap->l1_size) < 0) {
+        return -EIO;
+    }
+
+    s->l1_size = snap->l1_size;
+    l1_bytes = s->l1_size * sizeof(uint64_t);
+
+    /* load the snapshot's L1 table and store it as the active one; the
+       buffer stays big-endian until both transfers are done */
+    if (bdrv_pread(s->hd, snap->l1_table_offset,
+                   s->l1_table, l1_bytes) != l1_bytes) {
+        return -EIO;
+    }
+    if (bdrv_pwrite(s->hd, s->l1_table_offset,
+                    s->l1_table, l1_bytes) != l1_bytes) {
+        return -EIO;
+    }
+    for (n = 0; n < s->l1_size; n++) {
+        be64_to_cpus(&s->l1_table[n]);
+    }
+
+    if (update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1) < 0) {
+        return -EIO;
+    }
+
+#ifdef DEBUG_ALLOC
+    check_refcounts(bs);
+#endif
+    return 0;
+}
+
+/*
+ * Delete the snapshot selected by 'snapshot_id' (an id or a name).
+ *
+ * Drops the references held through the snapshot's L1 table, refreshes the
+ * copied flags of the active image, frees the snapshot's L1 table, removes
+ * the entry from the in-memory list and rewrites the snapshot table.
+ *
+ * Returns 0 on success, -ENOENT if the snapshot does not exist, or the
+ * negative result of the step that failed.
+ */
+static int qcow_snapshot_delete(BlockDriverState *bs, const char *snapshot_id)
+{
+    BDRVQcowState *s = bs->opaque;
+    QCowSnapshot *victim;
+    int idx, tail, status;
+
+    idx = find_snapshot_by_id_or_name(bs, snapshot_id);
+    if (idx < 0) {
+        return -ENOENT;
+    }
+    victim = &s->snapshots[idx];
+
+    status = update_snapshot_refcount(bs, victim->l1_table_offset,
+                                      victim->l1_size, -1);
+    if (status < 0) {
+        return status;
+    }
+
+    /* must update the copied flag on the current cluster offsets */
+    status = update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0);
+    if (status < 0) {
+        return status;
+    }
+    free_clusters(bs, victim->l1_table_offset,
+                  victim->l1_size * sizeof(uint64_t));
+
+    qemu_free(victim->id_str);
+    qemu_free(victim->name);
+
+    /* close the gap left by the removed entry */
+    tail = s->nb_snapshots - idx - 1;
+    memmove(victim, victim + 1, tail * sizeof(*victim));
+    s->nb_snapshots--;
+
+    status = qcow_write_snapshots(bs);
+    if (status < 0) {
+        /* XXX: restore snapshot if error ? */
+        return status;
+    }
+#ifdef DEBUG_ALLOC
+    check_refcounts(bs);
+#endif
+    return 0;
+}
+
+/*
+ * Build a newly allocated array describing every snapshot of the image.
+ *
+ * The array is stored in '*psn_tab'; the number of entries (s->nb_snapshots)
+ * is returned.
+ */
+static int qcow_snapshot_list(BlockDriverState *bs,
+                              QEMUSnapshotInfo **psn_tab)
+{
+    BDRVQcowState *s = bs->opaque;
+    QEMUSnapshotInfo *table;
+    int n;
+
+    table = qemu_mallocz(s->nb_snapshots * sizeof(QEMUSnapshotInfo));
+    for (n = 0; n < s->nb_snapshots; n++) {
+        QCowSnapshot *src = &s->snapshots[n];
+        QEMUSnapshotInfo *dst = &table[n];
+
+        /* bounded copies into the fixed-size fields of the info record */
+        pstrcpy(dst->id_str, sizeof(dst->id_str), src->id_str);
+        pstrcpy(dst->name, sizeof(dst->name), src->name);
+
+        dst->vm_state_size = src->vm_state_size;
+        dst->date_sec = src->date_sec;
+        dst->date_nsec = src->date_nsec;
+        dst->vm_clock_nsec = src->vm_clock_nsec;
+    }
+
+    *psn_tab = table;
+    return s->nb_snapshots;
+}
+
+/*********************************************************/
+/* refcount handling */
+
+/*
+ * Allocate the one-cluster refcount block cache and the in-memory refcount
+ * table, then load the table from disk (converted from big-endian) when it
+ * is not empty.
+ *
+ * Returns 0 on success and -ENOMEM if the table cannot be read completely.
+ */
+static int refcount_init(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+    int table_bytes, nread, n;
+
+    s->refcount_block_cache = qemu_malloc(s->cluster_size);
+
+    table_bytes = s->refcount_table_size * sizeof(uint64_t);
+    s->refcount_table = qemu_malloc(table_bytes);
+
+    if (!(s->refcount_table_size > 0)) {
+        return 0;
+    }
+
+    nread = bdrv_pread(s->hd, s->refcount_table_offset,
+                       s->refcount_table, table_bytes);
+    if (nread != table_bytes) {
+        return -ENOMEM;
+    }
+
+    for (n = 0; n < s->refcount_table_size; n++) {
+        be64_to_cpus(&s->refcount_table[n]);
+    }
+    return 0;
+}
+
+/* Free the refcount block cache and the in-memory refcount table. */
+static void refcount_close(BlockDriverState *bs)
+{
+    BDRVQcowState *state;
+
+    state = bs->opaque;
+
+    qemu_free(state->refcount_block_cache);
+    qemu_free(state->refcount_table);
+}
+
+
+/*
+ * Read the refcount block stored at 'refcount_block_offset' into the
+ * single-entry block cache and remember which block is cached.
+ *
+ * Returns 0 on success, -EIO if a full cluster could not be read (the
+ * recorded cache offset is left unchanged in that case).
+ */
+static int load_refcount_block(BlockDriverState *bs,
+                               int64_t refcount_block_offset)
+{
+    BDRVQcowState *s = bs->opaque;
+    int nread;
+
+    nread = bdrv_pread(s->hd, refcount_block_offset,
+                       s->refcount_block_cache, s->cluster_size);
+    if (nread == s->cluster_size) {
+        s->refcount_block_cache_offset = refcount_block_offset;
+        return 0;
+    }
+    return -EIO;
+}
+
+/*
+ * Return the reference count of cluster 'cluster_index'.
+ *
+ * Clusters beyond the refcount table, or whose refcount block is not
+ * allocated, have a count of 0. If the refcount block cannot be read, 1 is
+ * returned so that the cluster is treated as allocated.
+ */
+static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
+{
+    BDRVQcowState *s = bs->opaque;
+    int table_idx, entry_idx;
+    int64_t block_offset;
+
+    table_idx = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
+    if (table_idx >= s->refcount_table_size) {
+        return 0;
+    }
+
+    block_offset = s->refcount_table[table_idx];
+    if (!block_offset) {
+        return 0;
+    }
+
+    /* better than nothing: return allocated if read error */
+    if (block_offset != s->refcount_block_cache_offset &&
+        load_refcount_block(bs, block_offset) < 0) {
+        return 1;
+    }
+
+    entry_idx = cluster_index &
+        ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
+    return be16_to_cpu(s->refcount_block_cache[entry_idx]);
+}
+
+/*
+ * Find a run of free clusters large enough for 'size' bytes without
+ * touching any refcount.
+ *
+ * Scans forward from s->free_cluster_index; whenever a cluster with a
+ * non-zero refcount is met, the run is restarted after it. Returns the byte
+ * offset of the first cluster of the run and leaves s->free_cluster_index
+ * just past it.
+ */
+static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size)
+{
+    BDRVQcowState *s = bs->opaque;
+    int nb_clusters, still_needed;
+
+    nb_clusters = size_to_clusters(s, size);
+    still_needed = nb_clusters;
+    while (still_needed > 0) {
+        int64_t candidate = s->free_cluster_index++;
+
+        if (get_refcount(bs, candidate) != 0) {
+            /* cluster in use: start a new run after it */
+            still_needed = nb_clusters;
+        } else {
+            still_needed--;
+        }
+    }
+#ifdef DEBUG_ALLOC2
+    printf("alloc_clusters: size=%lld -> %lld\n",
+            size,
+            (s->free_cluster_index - nb_clusters) << s->cluster_bits);
+#endif
+    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
+}
+
+/*
+ * Allocate enough contiguous clusters for 'size' bytes and increment the
+ * refcount of each of them. Returns the byte offset of the allocation.
+ */
+static int64_t alloc_clusters(BlockDriverState *bs, int64_t size)
+{
+    int64_t start = alloc_clusters_noref(bs, size);
+
+    update_refcount(bs, start, size, 1);
+    return start;
+}
+
+/* only used to allocate compressed sectors. We try to allocate
+   contiguous sectors. size must be <= cluster_size */
+/*
+ * Sub-cluster allocator: hands out 'size' bytes starting at
+ * s->free_byte_offset, so that several allocations can share one cluster.
+ * s->free_byte_offset == 0 means there is no partially used cluster.
+ *
+ * Returns the byte offset of the allocated range.
+ */
+static int64_t alloc_bytes(BlockDriverState *bs, int size)
+{
+    BDRVQcowState *s = bs->opaque;
+    int64_t offset, cluster_offset;
+    int free_in_cluster;
+
+    assert(size > 0 && size <= s->cluster_size);
+    if (s->free_byte_offset == 0) {
+        s->free_byte_offset = alloc_clusters(bs, s->cluster_size);
+    }
+ redo:
+    /* bytes left between free_byte_offset and the end of its cluster */
+    free_in_cluster = s->cluster_size -
+        (s->free_byte_offset & (s->cluster_size - 1));
+    if (size <= free_in_cluster) {
+        /* enough space in current cluster */
+        offset = s->free_byte_offset;
+        s->free_byte_offset += size;
+        free_in_cluster -= size;
+        /* cluster completely used: forget it */
+        if (free_in_cluster == 0)
+            s->free_byte_offset = 0;
+        /* an allocation that does not start at the cluster boundary shares
+           the cluster with an earlier one, so add a reference for it */
+        if ((offset & (s->cluster_size - 1)) != 0)
+            update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
+    } else {
+        /* does not fit: get a new cluster and see whether it directly
+           follows the partially used one */
+        offset = alloc_clusters(bs, s->cluster_size);
+        cluster_offset = s->free_byte_offset & ~(s->cluster_size - 1);
+        if ((cluster_offset + s->cluster_size) == offset) {
+            /* we are lucky: contiguous data */
+            /* the range spans both clusters; the first one gains a
+               reference for this allocation */
+            offset = s->free_byte_offset;
+            update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
+            s->free_byte_offset += size;
+        } else {
+            /* not contiguous: restart from the new cluster */
+            s->free_byte_offset = offset;
+            goto redo;
+        }
+    }
+    return offset;
+}
+
+/*
+ * Drop one reference from every cluster touched by the byte range
+ * [offset, offset + size).
+ */
+static void free_clusters(BlockDriverState *bs,
+                          int64_t offset, int64_t size)
+{
+    const int drop_one_ref = -1;
+
+    update_refcount(bs, offset, size, drop_one_ref);
+}
+
+/*
+ * Enlarge the refcount table so that it holds at least 'min_size' entries.
+ *
+ * A larger table is written to newly found clusters, the image header is
+ * updated to point at it, the in-memory table is replaced, and only then
+ * are the refcounts of the new and old table clusters adjusted.
+ *
+ * Returns 0 on success (also when no growth is needed), -EIO on a write
+ * failure.
+ */
+static int grow_refcount_table(BlockDriverState *bs, int min_size)
+{
+    BDRVQcowState *s = bs->opaque;
+    int new_table_size, new_table_size2, refcount_table_clusters, i, ret;
+    uint64_t *new_table;
+    int64_t table_offset;
+    uint8_t data[12];
+    int old_table_size;
+    int64_t old_table_offset;
+
+    if (min_size <= s->refcount_table_size)
+        return 0;
+    /* compute new table size */
+    /* a cluster holds 2^(cluster_bits - 3) 8-byte entries; grow the cluster
+       count by about 1.5x per step until min_size fits */
+    refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);
+    for(;;) {
+        if (refcount_table_clusters == 0) {
+            refcount_table_clusters = 1;
+        } else {
+            refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
+        }
+        new_table_size = refcount_table_clusters << (s->cluster_bits - 3);
+        if (min_size <= new_table_size)
+            break;
+    }
+#ifdef DEBUG_ALLOC2
+    printf("grow_refcount_table from %d to %d\n",
+           s->refcount_table_size,
+           new_table_size);
+#endif
+    new_table_size2 = new_table_size * sizeof(uint64_t);
+    new_table = qemu_mallocz(new_table_size2);
+    memcpy(new_table, s->refcount_table,
+           s->refcount_table_size * sizeof(uint64_t));
+    /* big-endian for the disk write; converted back afterwards */
+    for(i = 0; i < s->refcount_table_size; i++)
+        cpu_to_be64s(&new_table[i]);
+    /* Note: we cannot update the refcount now to avoid recursion */
+    table_offset = alloc_clusters_noref(bs, new_table_size2);
+    ret = bdrv_pwrite(s->hd, table_offset, new_table, new_table_size2);
+    if (ret != new_table_size2)
+        goto fail;
+    for(i = 0; i < s->refcount_table_size; i++)
+        be64_to_cpus(&new_table[i]);
+
+    /* one 12-byte header update: 64-bit table offset followed by the 32-bit
+       cluster count (presumably the header field that directly follows
+       refcount_table_offset) */
+    cpu_to_be64w((uint64_t*)data, table_offset);
+    cpu_to_be32w((uint32_t*)(data + 8), refcount_table_clusters);
+    if (bdrv_pwrite(s->hd, offsetof(QCowHeader, refcount_table_offset),
+                    data, sizeof(data)) != sizeof(data))
+        goto fail;
+    qemu_free(s->refcount_table);
+    old_table_offset = s->refcount_table_offset;
+    old_table_size = s->refcount_table_size;
+    s->refcount_table = new_table;
+    s->refcount_table_size = new_table_size;
+    s->refcount_table_offset = table_offset;
+
+    /* the new table is in place, so its own clusters can be accounted now */
+    update_refcount(bs, table_offset, new_table_size2, 1);
+    free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t));
+    return 0;
+ fail:
+    /* NOTE(review): these clusters came from alloc_clusters_noref(), so
+       their refcount was never incremented before this decrement */
+    free_clusters(bs, table_offset, new_table_size2);
+    qemu_free(new_table);
+    return -EIO;
+}
+
+/* addend must be 1 or -1 */
+/* XXX: cache several refcount block clusters ? */
+/*
+ * Add 'addend' to the reference count of cluster 'cluster_index' and write
+ * the changed 16-bit entry to disk.
+ *
+ * When incrementing, the refcount table is grown and a new refcount block
+ * is created as needed. Decrementing a cluster that has no refcount table
+ * entry or no refcount block is an error.
+ *
+ * Returns the new refcount (0..0xffff) on success, or a negative errno
+ * value (-EINVAL, -EIO, or the result of grow_refcount_table()).
+ */
+static int update_cluster_refcount(BlockDriverState *bs,
+                                   int64_t cluster_index,
+                                   int addend)
+{
+    BDRVQcowState *s = bs->opaque;
+    int64_t offset, refcount_block_offset;
+    int ret, refcount_table_index, block_index, refcount;
+    uint64_t data64;
+
+    refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
+    if (refcount_table_index >= s->refcount_table_size) {
+        if (addend < 0)
+            return -EINVAL;
+        ret = grow_refcount_table(bs, refcount_table_index + 1);
+        if (ret < 0)
+            return ret;
+    }
+    refcount_block_offset = s->refcount_table[refcount_table_index];
+    if (!refcount_block_offset) {
+        if (addend < 0)
+            return -EINVAL;
+        /* create a new refcount block */
+        /* Note: we cannot update the refcount now to avoid recursion */
+        offset = alloc_clusters_noref(bs, s->cluster_size);
+        /* the cache buffer doubles as the zero-filled image of the block */
+        memset(s->refcount_block_cache, 0, s->cluster_size);
+        ret = bdrv_pwrite(s->hd, offset, s->refcount_block_cache, s->cluster_size);
+        if (ret != s->cluster_size)
+            return -EINVAL;
+        /* hook the new block into the table, in memory and on disk */
+        s->refcount_table[refcount_table_index] = offset;
+        data64 = cpu_to_be64(offset);
+        ret = bdrv_pwrite(s->hd, s->refcount_table_offset +
+                          refcount_table_index * sizeof(uint64_t),
+                          &data64, sizeof(data64));
+        if (ret != sizeof(data64))
+            return -EINVAL;
+
+        refcount_block_offset = offset;
+        s->refcount_block_cache_offset = offset;
+        /* the block is reachable now, so its own cluster can be counted */
+        update_refcount(bs, offset, s->cluster_size, 1);
+    } else {
+        if (refcount_block_offset != s->refcount_block_cache_offset) {
+            if (load_refcount_block(bs, refcount_block_offset) < 0)
+                return -EIO;
+        }
+    }
+    /* we can update the count and save it */
+    block_index = cluster_index &
+        ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
+    refcount = be16_to_cpu(s->refcount_block_cache[block_index]);
+    refcount += addend;
+    if (refcount < 0 || refcount > 0xffff)
+        return -EINVAL;
+    /* a cluster that became free lowers the allocator's search start */
+    if (refcount == 0 && cluster_index < s->free_cluster_index) {
+        s->free_cluster_index = cluster_index;
+    }
+    s->refcount_block_cache[block_index] = cpu_to_be16(refcount);
+    /* write back only the changed 2-byte entry */
+    if (bdrv_pwrite(s->hd,
+                    refcount_block_offset + (block_index << REFCOUNT_SHIFT),
+                    &s->refcount_block_cache[block_index], 2) != 2)
+        return -EIO;
+    return refcount;
+}
+
+/*
+ * Add 'addend' to the refcount of every cluster touched by the byte range
+ * [offset, offset + length). Does nothing when length <= 0. Results of the
+ * per-cluster updates are not reported.
+ */
+static void update_refcount(BlockDriverState *bs,
+                            int64_t offset, int64_t length,
+                            int addend)
+{
+    BDRVQcowState *s = bs->opaque;
+    int64_t first, final, pos;
+
+#ifdef DEBUG_ALLOC2
+    printf("update_refcount: offset=%lld size=%lld addend=%d\n",
+           offset, length, addend);
+#endif
+    if (length <= 0) {
+        return;
+    }
+
+    /* start offsets of the first and of the last cluster in the range */
+    first = offset & ~(s->cluster_size - 1);
+    final = (offset + length - 1) & ~(s->cluster_size - 1);
+
+    pos = first;
+    while (pos <= final) {
+        update_cluster_refcount(bs, pos >> s->cluster_bits, addend);
+        pos += s->cluster_size;
+    }
+}
+
+/*
+ * Increments, in the caller-supplied refcount table, the count of every
+ * cluster touched by the byte range [offset, offset + size).
+ * This is used to build a temporary refcount table from the L1 and L2
+ * tables, which can then be compared with the refcount table saved in the
+ * image.
+ *
+ * Clusters outside the table and counters that wrap around to 0 are
+ * reported on stderr.
+ *
+ * Returns the number of errors that were found.
+ */
+static int inc_refcounts(BlockDriverState *bs,
+                          uint16_t *refcount_table,
+                          int refcount_table_size,
+                          int64_t offset, int64_t size)
+{
+    BDRVQcowState *s = bs->opaque;
+    int64_t first, final, pos;
+    int cluster;
+    int nb_errors = 0;
+
+    if (size <= 0) {
+        return 0;
+    }
+
+    first = offset & ~(s->cluster_size - 1);
+    final = (offset + size - 1) & ~(s->cluster_size - 1);
+
+    pos = first;
+    while (pos <= final) {
+        cluster = pos >> s->cluster_bits;
+        if (cluster < 0 || cluster >= refcount_table_size) {
+            fprintf(stderr, "ERROR: invalid cluster offset=0x%" PRIx64 "\n",
+                    pos);
+            nb_errors++;
+        } else {
+            refcount_table[cluster]++;
+            if (refcount_table[cluster] == 0) {
+                fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
+                        "\n", pos);
+                nb_errors++;
+            }
+        }
+        pos += s->cluster_size;
+    }
+
+    return nb_errors;
+}
+
+/*
+ * Increases the refcount in the given refcount table for the all clusters
+ * referenced in the L2 table. While doing so, performs some checks on L2
+ * entries.
+ *
+ * bs:                   image being checked
+ * refcount_table:       temporary refcount table being built
+ * refcount_table_size:  number of entries in refcount_table
+ * l2_offset:            byte offset of the L2 table in the image file
+ * check_copied:         if non-zero, verify that QCOW_OFLAG_COPIED matches
+ *                       the refcount stored in the image
+ *
+ * Returns the number of errors found by the checks or -errno if an internal
+ * error occurred.
+ */
+static int check_refcounts_l2(BlockDriverState *bs,
+    uint16_t *refcount_table, int refcount_table_size, int64_t l2_offset,
+    int check_copied)
+{
+    BDRVQcowState *s = bs->opaque;
+    uint64_t *l2_table, offset;
+    int i, l2_size, nb_csectors, refcount;
+    int errors = 0;
+
+    /* Read L2 table from disk */
+    l2_size = s->l2_size * sizeof(uint64_t);
+    l2_table = qemu_malloc(l2_size);
+
+    if (bdrv_pread(s->hd, l2_offset, l2_table, l2_size) != l2_size)
+        goto fail;
+
+    /* Do the actual checks */
+    for(i = 0; i < s->l2_size; i++) {
+        offset = be64_to_cpu(l2_table[i]);
+        if (offset != 0) {
+            if (offset & QCOW_OFLAG_COMPRESSED) {
+                /* Compressed clusters don't have QCOW_OFLAG_COPIED */
+                if (offset & QCOW_OFLAG_COPIED) {
+                    fprintf(stderr, "ERROR: cluster %" PRId64 ": "
+                        "copied flag must never be set for compressed "
+                        "clusters\n", offset >> s->cluster_bits);
+                    offset &= ~QCOW_OFLAG_COPIED;
+                    errors++;
+                }
+
+                /* Mark cluster as used */
+                /* the entry encodes how many 512-byte sectors the
+                   compressed data occupies */
+                nb_csectors = ((offset >> s->csize_shift) &
+                               s->csize_mask) + 1;
+                offset &= s->cluster_offset_mask;
+                errors += inc_refcounts(bs, refcount_table,
+                              refcount_table_size,
+                              offset & ~511, nb_csectors * 512);
+            } else {
+                /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */
+                if (check_copied) {
+                    uint64_t entry = offset;
+                    offset &= ~QCOW_OFLAG_COPIED;
+                    refcount = get_refcount(bs, offset >> s->cluster_bits);
+                    if ((refcount == 1) != ((entry & QCOW_OFLAG_COPIED) != 0)) {
+                        fprintf(stderr, "ERROR OFLAG_COPIED: offset=%"
+                            PRIx64 " refcount=%d\n", entry, refcount);
+                        errors++;
+                    }
+                }
+
+                /* Mark cluster as used */
+                offset &= ~QCOW_OFLAG_COPIED;
+                errors += inc_refcounts(bs, refcount_table,
+                              refcount_table_size,
+                              offset, s->cluster_size);
+
+                /* Correct offsets are cluster aligned */
+                if (offset & (s->cluster_size - 1)) {
+                    fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
+                        "properly aligned; L2 entry corrupted.\n", offset);
+                    errors++;
+                }
+            }
+        }
+    }
+
+    qemu_free(l2_table);
+    return errors;
+
+fail:
+    /* report the function that actually failed */
+    fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
+    qemu_free(l2_table);
+    return -EIO;
+}
+
+/*
+ * Increases the refcount for the L1 table, its L2 tables and all referenced
+ * clusters in the given refcount table. While doing so, performs some checks
+ * on L1 and L2 entries.
+ *
+ * Returns the number of errors found by the checks or -errno if an internal
+ * error occurred.
+ */
+static int check_refcounts_l1(BlockDriverState *bs,
+                              uint16_t *refcount_table,
+                              int refcount_table_size,
+                              int64_t l1_table_offset, int l1_size,
+                              int check_copied)
+{
+    BDRVQcowState *s = bs->opaque;
+    uint64_t l1_bytes = l1_size * sizeof(uint64_t);
+    uint64_t *l1_table;
+    int idx;
+    int errors = 0;
+
+    /* The clusters holding the L1 table itself count as referenced */
+    errors += inc_refcounts(bs, refcount_table, refcount_table_size,
+                            l1_table_offset, l1_bytes);
+
+    /* Load the table and convert every entry to host byte order */
+    l1_table = qemu_malloc(l1_bytes);
+    if (bdrv_pread(s->hd, l1_table_offset, l1_table, l1_bytes) != l1_bytes) {
+        goto fail;
+    }
+    for (idx = 0; idx < l1_size; idx++) {
+        be64_to_cpus(&l1_table[idx]);
+    }
+
+    /* Walk the entries; an entry of 0 has no L2 table behind it */
+    for (idx = 0; idx < l1_size; idx++) {
+        uint64_t entry = l1_table[idx];
+        uint64_t l2_offset = entry & ~QCOW_OFLAG_COPIED;
+        int ret;
+
+        if (entry == 0) {
+            continue;
+        }
+
+        /* QCOW_OFLAG_COPIED must be set exactly when the refcount is 1 */
+        if (check_copied) {
+            int refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
+            int copied = (entry & QCOW_OFLAG_COPIED) != 0;
+
+            if ((refcount == 1) != copied) {
+                fprintf(stderr, "ERROR OFLAG_COPIED: l2_offset=%" PRIx64
+                    " refcount=%d\n", entry, refcount);
+                errors++;
+            }
+        }
+
+        /* The L2 table occupies one cluster */
+        errors += inc_refcounts(bs, refcount_table, refcount_table_size,
+                                l2_offset, s->cluster_size);
+
+        /* A table offset with low bits set points into a cluster */
+        if (l2_offset & (s->cluster_size - 1)) {
+            fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
+                "cluster aligned; L1 entry corrupted\n", l2_offset);
+            errors++;
+        }
+
+        /* Descend into the L2 table; a negative result aborts the walk */
+        ret = check_refcounts_l2(bs, refcount_table, refcount_table_size,
+                                 l2_offset, check_copied);
+        if (ret < 0) {
+            goto fail;
+        }
+        errors += ret;
+    }
+
+    qemu_free(l1_table);
+    return errors;
+
+fail:
+    fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
+    qemu_free(l1_table);
+    return -EIO;
+}
+
+/*
+ * Checks an image for refcount consistency.
+ *
+ * Returns 0 if no errors are found, the number of errors in case the image is
+ * detected as corrupted, and -errno when an internal error occurred.
+ */
+static int check_refcounts(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+    int64_t size;
+    int nb_clusters, refcount1, refcount2, i;
+    QCowSnapshot *sn;
+    uint16_t *refcount_table;
+    int ret, errors = 0;
+
+    size = bdrv_getlength(s->hd);
+    if (size < 0) {
+        /* The length query failed, so no cluster count can be derived */
+        return (int)size;
+    }
+    nb_clusters = size_to_clusters(s, size);
+
+    /* Reference counts recomputed from the metadata, one per cluster */
+    refcount_table = qemu_mallocz(nb_clusters * sizeof(uint16_t));
+
+    /* header */
+    errors += inc_refcounts(bs, refcount_table, nb_clusters,
+                  0, s->cluster_size);
+
+    /* current L1 table */
+    ret = check_refcounts_l1(bs, refcount_table, nb_clusters,
+                       s->l1_table_offset, s->l1_size, 1);
+    if (ret < 0) {
+        goto fail;
+    }
+    errors += ret;
+
+    /* snapshots: their L1 tables are walked without the OFLAG_COPIED
+     * check, but errors and I/O failures still have to be reported */
+    for(i = 0; i < s->nb_snapshots; i++) {
+        sn = s->snapshots + i;
+        ret = check_refcounts_l1(bs, refcount_table, nb_clusters,
+                                 sn->l1_table_offset, sn->l1_size, 0);
+        if (ret < 0) {
+            goto fail;
+        }
+        errors += ret;
+    }
+    errors += inc_refcounts(bs, refcount_table, nb_clusters,
+                  s->snapshots_offset, s->snapshots_size);
+
+    /* refcount data */
+    errors += inc_refcounts(bs, refcount_table, nb_clusters,
+                  s->refcount_table_offset,
+                  s->refcount_table_size * sizeof(uint64_t));
+    for(i = 0; i < s->refcount_table_size; i++) {
+        int64_t offset;
+        offset = s->refcount_table[i];
+        if (offset != 0) {
+            errors += inc_refcounts(bs, refcount_table, nb_clusters,
+                          offset, s->cluster_size);
+        }
+    }
+
+    /* compare ref counts: stored value versus recomputed value */
+    for(i = 0; i < nb_clusters; i++) {
+        refcount1 = get_refcount(bs, i);
+        refcount2 = refcount_table[i];
+        if (refcount1 != refcount2) {
+            fprintf(stderr, "ERROR cluster %d refcount=%d reference=%d\n",
+                   i, refcount1, refcount2);
+            errors++;
+        }
+    }
+
+    qemu_free(refcount_table);
+
+    return errors;
+
+fail:
+    /* Internal error: release the scratch table before propagating it */
+    qemu_free(refcount_table);
+    return ret;
+}
+
+static int qcow_check(BlockDriverState *bs)
+{
+    /* The image check consists solely of the refcount consistency check */
+    int result = check_refcounts(bs);
+
+    return result;
+}
+
+#if 0
+/* Debug helper: prints runs of consecutive clusters that share the same
+ * refcount as "<first cluster>: refcount=<r> nb=<run length>". */
+static void dump_refcounts(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+    int64_t nb_clusters, k, k1, size;
+    int refcount;
+
+    size = bdrv_getlength(s->hd);
+    nb_clusters = size_to_clusters(s, size);
+    for(k = 0; k < nb_clusters;) {
+        /* k1 remembers where the current run starts */
+        k1 = k;
+        refcount = get_refcount(bs, k);
+        k++;
+        while (k < nb_clusters && get_refcount(bs, k) == refcount)
+            k++;
+        /* Report the start of the run (k is already one past its end) and
+         * use the int64_t format macro instead of assuming long long */
+        printf("%" PRId64 ": refcount=%d nb=%" PRId64 "\n",
+               k1, refcount, k - k1);
+    }
+}
+#endif
+
+/*
+ * Writes size bytes from buf at byte position pos of the image, with the
+ * device temporarily marked growable for the duration of the write.
+ *
+ * Returns the result of bdrv_pwrite() so that a failed write is reported
+ * to the caller instead of being masked as success (this mirrors
+ * qcow_get_buffer, which returns the bdrv_pread() result).
+ */
+static int qcow_put_buffer(BlockDriverState *bs, const uint8_t *buf,
+                           int64_t pos, int size)
+{
+    int growable = bs->growable;
+    int ret;
+
+    bs->growable = 1;
+    ret = bdrv_pwrite(bs, pos, buf, size);
+    /* restore the caller-visible setting whatever the outcome */
+    bs->growable = growable;
+
+    return ret;
+}
+
+/*
+ * Reads size bytes into buf from byte position pos of the image and
+ * returns whatever bdrv_pread() reports.
+ */
+static int qcow_get_buffer(BlockDriverState *bs, uint8_t *buf,
+                           int64_t pos, int size)
+{
+    const int saved_growable = bs->growable;
+    int nread;
+
+    /* The device is flagged growable only while the read is in flight.
+     * NOTE(review): presumably the data lives past the nominal image
+     * size - confirm against the block layer. */
+    bs->growable = 1;
+    nread = bdrv_pread(bs, pos, buf, size);
+    bs->growable = saved_growable;
+
+    return nread;
+}
+
+/* Driver description for the "qcow2" format: maps the block layer's
+ * callbacks onto the qcow_* functions of this file. */
+static BlockDriver bdrv_qcow2 = {
+    .format_name	= "qcow2",
+    .instance_size	= sizeof(BDRVQcowState),
+    .bdrv_probe		= qcow_probe,
+    .bdrv_open		= qcow_open,
+    .bdrv_close		= qcow_close,
+    .bdrv_create	= qcow_create,
+    .bdrv_flush		= qcow_flush,
+    .bdrv_is_allocated	= qcow_is_allocated,
+    .bdrv_set_key	= qcow_set_key,
+    .bdrv_make_empty	= qcow_make_empty,
+
+    /* asynchronous I/O entry points */
+    .bdrv_aio_readv	= qcow_aio_readv,
+    .bdrv_aio_writev	= qcow_aio_writev,
+    .bdrv_aio_cancel	= qcow_aio_cancel,
+    .aiocb_size		= sizeof(QCowAIOCB),
+    .bdrv_write_compressed = qcow_write_compressed,
+
+    /* snapshot management */
+    .bdrv_snapshot_create = qcow_snapshot_create,
+    .bdrv_snapshot_goto	= qcow_snapshot_goto,
+    .bdrv_snapshot_delete = qcow_snapshot_delete,
+    .bdrv_snapshot_list	= qcow_snapshot_list,
+    .bdrv_get_info	= qcow_get_info,
+
+    /* byte-granular buffer access with the device marked growable */
+    .bdrv_put_buffer    = qcow_put_buffer,
+    .bdrv_get_buffer    = qcow_get_buffer,
+
+    .bdrv_create2 = qcow_create2,
+    /* refcount consistency check */
+    .bdrv_check = qcow_check,
+};
+
+/* Hands the qcow2 driver description to bdrv_register(). */
+static void bdrv_qcow2_init(void)
+{
+    bdrv_register(&bdrv_qcow2);
+}
+
+/* NOTE(review): block_init() presumably arranges for the function above to
+ * be run during start-up - confirm against its definition. */
+block_init(bdrv_qcow2_init);
diff --git a/block/raw-posix.c b/block/raw-posix.c
new file mode 100644
index 0000000..f3a9476
--- /dev/null
+++ b/block/raw-posix.c
@@ -0,0 +1,1438 @@
+/*
+ * Block driver for RAW files (posix)
+ *
+ * Copyright (c) 2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "qemu-timer.h"
+#include "qemu-char.h"
+#include "block_int.h"
+#include "module.h"
+#ifdef CONFIG_AIO
+#include "posix-aio-compat.h"
+#endif
+
+#ifdef CONFIG_COCOA
+#include <paths.h>
+#include <sys/param.h>
+#include <IOKit/IOKitLib.h>
+#include <IOKit/IOBSD.h>
+#include <IOKit/storage/IOMediaBSDClient.h>
+#include <IOKit/storage/IOMedia.h>
+#include <IOKit/storage/IOCDMedia.h>
+//#include <IOKit/storage/IOCDTypes.h>
+#include <CoreFoundation/CoreFoundation.h>
+#endif
+
+#ifdef __sun__
+#define _POSIX_PTHREAD_SEMANTICS 1
+#include <signal.h>
+#include <sys/dkio.h>
+#endif
+#ifdef __linux__
+#include <sys/ioctl.h>
+#include <linux/cdrom.h>
+#include <linux/fd.h>
+#endif
+#ifdef __FreeBSD__
+#include <signal.h>
+#include <sys/disk.h>
+#include <sys/cdio.h>
+#endif
+
+#ifdef __OpenBSD__
+#include <sys/ioctl.h>
+#include <sys/disklabel.h>
+#include <sys/dkio.h>
+#endif
+
+#ifdef __DragonFly__
+#include <sys/ioctl.h>
+#include <sys/diskslice.h>
+#endif
+
+//#define DEBUG_FLOPPY
+
+//#define DEBUG_BLOCK
+#if defined(DEBUG_BLOCK)
+#define DEBUG_BLOCK_PRINT(formatCstr, ...) do { if (qemu_log_enabled()) \
+    { qemu_log(formatCstr, ## __VA_ARGS__); qemu_log_flush(); } } while (0)
+#else
+#define DEBUG_BLOCK_PRINT(formatCstr, ...)
+#endif
+
+/* OS X does not have O_DSYNC */
+#ifndef O_DSYNC
+#define O_DSYNC O_SYNC
+#endif
+
+/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */
+#ifndef O_DIRECT
+#define O_DIRECT O_DSYNC
+#endif
+
+/* Values stored in BDRVRawState.type. raw_open() uses FTYPE_FILE;
+ * NOTE(review): FTYPE_CD / FTYPE_FD presumably denote CD-ROM and floppy
+ * host devices - confirm against the host device code. */
+#define FTYPE_FILE   0
+#define FTYPE_CD     1
+#define FTYPE_FD     2
+
+/* Size in bytes of the aligned bounce buffer that raw_open() allocates
+ * when BDRV_O_NOCACHE is requested */
+#define ALIGNED_BUFFER_SIZE (32 * 512)
+
+/* if the FD is not accessed during that time (in ms), we try to
+   reopen it to see if the disk has been changed */
+#define FD_OPEN_TIMEOUT 1000
+
+/* Per-device state of the raw driver (reached through bs->opaque). */
+typedef struct BDRVRawState {
+    /* descriptor obtained from open(); raw_close() sets it to -1 */
+    int fd;
+    /* one of the FTYPE_* constants */
+    int type;
+    /* incremented on each failed lseek(), reset to 0 after a successful one */
+    unsigned int lseek_err_cnt;
+#if defined(__linux__)
+    /* linux floppy specific */
+    int fd_open_flags;
+    int64_t fd_open_time;
+    int64_t fd_error_time;
+    int fd_got_error;
+    int fd_media_changed;
+#endif
+#if defined(__FreeBSD__)
+    int cd_open_flags;
+#endif
+    /* bounce buffer of ALIGNED_BUFFER_SIZE bytes, allocated by raw_open()
+     * only for BDRV_O_NOCACHE; NULL otherwise */
+    uint8_t* aligned_buf;
+} BDRVRawState;
+
+static int posix_aio_init(void);
+
+static int fd_open(BlockDriverState *bs);
+
+#if defined(__FreeBSD__)
+static int cd_open(BlockDriverState *bs);
+#endif
+
+static int raw_is_inserted(BlockDriverState *bs);
+
+/*
+ * Opens filename as a plain file and stores the descriptor in the driver
+ * state.
+ *
+ * flags is a combination of BDRV_O_* values selecting access mode, file
+ * creation and caching policy. Returns 0 on success or a negative errno
+ * value on failure.
+ */
+static int raw_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVRawState *s = bs->opaque;
+    int fd, open_flags, ret;
+
+    /* The AIO entry points dereference the global AIO state set up here,
+     * so a failed initialisation must not be ignored */
+    ret = posix_aio_init();
+    if (ret < 0)
+        return ret;
+
+    s->lseek_err_cnt = 0;
+
+    open_flags = O_BINARY;
+    if ((flags & BDRV_O_ACCESS) == O_RDWR) {
+        open_flags |= O_RDWR;
+    } else {
+        open_flags |= O_RDONLY;
+        bs->read_only = 1;
+    }
+    if (flags & BDRV_O_CREAT)
+        open_flags |= O_CREAT | O_TRUNC;
+
+    /* Use O_DSYNC for write-through caching, no flags for write-back caching,
+     * and O_DIRECT for no caching. */
+    if ((flags & BDRV_O_NOCACHE))
+        open_flags |= O_DIRECT;
+    else if (!(flags & BDRV_O_CACHE_WB))
+        open_flags |= O_DSYNC;
+
+    s->type = FTYPE_FILE;
+
+    fd = open(filename, open_flags, 0644);
+    if (fd < 0) {
+        ret = -errno;
+        /* a read-only file system is reported as an access error */
+        if (ret == -EROFS)
+            ret = -EACCES;
+        return ret;
+    }
+    s->fd = fd;
+    s->aligned_buf = NULL;
+    if ((flags & BDRV_O_NOCACHE)) {
+        /* O_DIRECT transfers go through this aligned bounce buffer when
+         * the caller's buffer, offset or length is not sector aligned */
+        s->aligned_buf = qemu_blockalign(bs, ALIGNED_BUFFER_SIZE);
+        if (s->aligned_buf == NULL) {
+            ret = -errno;
+            close(fd);
+            return ret;
+        }
+    }
+    return 0;
+}
+
+/* XXX: use host sector size if necessary with:
+#ifdef DIOCGSECTORSIZE
+        {
+            unsigned int sectorsize = 512;
+            if (!ioctl(fd, DIOCGSECTORSIZE, &sectorsize) &&
+                sectorsize > bufsize)
+                bufsize = sectorsize;
+        }
+#endif
+#ifdef CONFIG_COCOA
+        u_int32_t   blockSize = 512;
+        if ( !ioctl( fd, DKIOCGETBLOCKSIZE, &blockSize ) && blockSize > bufsize) {
+            bufsize = blockSize;
+        }
+#endif
+*/
+
+/*
+ * offset and count are in bytes, but must be multiples of 512 for files
+ * opened with O_DIRECT. buf must be aligned to 512 bytes then.
+ *
+ * This function may be called without alignment if the caller ensures
+ * that O_DIRECT is not in effect.
+ */
+/*
+ * Returns the number of bytes read (possibly fewer than count) or a
+ * negative errno value, matching the convention of raw_pwrite_aligned.
+ */
+static int raw_pread_aligned(BlockDriverState *bs, int64_t offset,
+                     uint8_t *buf, int count)
+{
+    BDRVRawState *s = bs->opaque;
+    int ret;
+
+    ret = fd_open(bs);
+    if (ret < 0)
+        return ret;
+
+    /* A negative offset means "read at the current file position" */
+    if (offset >= 0 && lseek(s->fd, offset, SEEK_SET) == (off_t)-1) {
+        ++(s->lseek_err_cnt);
+        /* log only the first few consecutive failures */
+        if(s->lseek_err_cnt <= 10) {
+            DEBUG_BLOCK_PRINT("raw_pread(%d:%s, %" PRId64 ", %p, %d) [%" PRId64
+                              "] lseek failed : %d = %s\n",
+                              s->fd, bs->filename, offset, buf, count,
+                              bs->total_sectors, errno, strerror(errno));
+        }
+        /* same error code as the write path uses for a failed seek */
+        return -EIO;
+    }
+    s->lseek_err_cnt=0;
+
+    ret = read(s->fd, buf, count);
+    if (ret == count)
+        goto label__raw_read__success;
+
+    DEBUG_BLOCK_PRINT("raw_pread(%d:%s, %" PRId64 ", %p, %d) [%" PRId64
+                      "] read failed %d : %d = %s\n",
+                      s->fd, bs->filename, offset, buf, count,
+                      bs->total_sectors, ret, errno, strerror(errno));
+
+    /* Try harder for CDrom. */
+    if (bs->type == BDRV_TYPE_CDROM) {
+        lseek(s->fd, offset, SEEK_SET);
+        ret = read(s->fd, buf, count);
+        if (ret == count)
+            goto label__raw_read__success;
+        lseek(s->fd, offset, SEEK_SET);
+        ret = read(s->fd, buf, count);
+        if (ret == count)
+            goto label__raw_read__success;
+
+        DEBUG_BLOCK_PRINT("raw_pread(%d:%s, %" PRId64 ", %p, %d) [%" PRId64
+                          "] retry read failed %d : %d = %s\n",
+                          s->fd, bs->filename, offset, buf, count,
+                          bs->total_sectors, ret, errno, strerror(errno));
+    }
+
+label__raw_read__success:
+
+    /* read() reports failure as -1; hand the errno value to the caller */
+    return  (ret < 0) ? -errno : ret;
+}
+
+/*
+ * offset and count are in bytes, but must be multiples of 512 for files
+ * opened with O_DIRECT. buf must be aligned to 512 bytes then.
+ *
+ * This function may be called without alignment if the caller ensures
+ * that O_DIRECT is not in effect.
+ */
+/*
+ * Returns the number of bytes written (possibly fewer than count) or a
+ * negative errno value.
+ */
+static int raw_pwrite_aligned(BlockDriverState *bs, int64_t offset,
+                      const uint8_t *buf, int count)
+{
+    BDRVRawState *s = bs->opaque;
+    int ret;
+
+    /* fd_open() already returns a negative error code; errno may be stale
+     * (even 0) at this point, so pass the code through unchanged as the
+     * read path does */
+    ret = fd_open(bs);
+    if (ret < 0)
+        return ret;
+
+    /* A negative offset means "write at the current file position" */
+    if (offset >= 0 && lseek(s->fd, offset, SEEK_SET) == (off_t)-1) {
+        ++(s->lseek_err_cnt);
+        /* log only the first few consecutive failures, like the read path */
+        if(s->lseek_err_cnt <= 10) {
+            DEBUG_BLOCK_PRINT("raw_pwrite(%d:%s, %" PRId64 ", %p, %d) [%"
+                              PRId64 "] lseek failed : %d = %s\n",
+                              s->fd, bs->filename, offset, buf, count,
+                              bs->total_sectors, errno, strerror(errno));
+        }
+        return -EIO;
+    }
+    s->lseek_err_cnt = 0;
+
+    ret = write(s->fd, buf, count);
+    if (ret == count)
+        goto label__raw_write__success;
+
+    DEBUG_BLOCK_PRINT("raw_pwrite(%d:%s, %" PRId64 ", %p, %d) [%" PRId64
+                      "] write failed %d : %d = %s\n",
+                      s->fd, bs->filename, offset, buf, count,
+                      bs->total_sectors, ret, errno, strerror(errno));
+
+label__raw_write__success:
+
+    return  (ret < 0) ? -errno : ret;
+}
+
+
+/*
+ * offset and count are in bytes and possibly not aligned. For files opened
+ * with O_DIRECT, necessary alignments are ensured before calling
+ * raw_pread_aligned to do the actual read.
+ */
+/*
+ * Returns the number of bytes copied into buf (short at end of file) or
+ * the negative value reported by raw_pread_aligned.
+ */
+static int raw_pread(BlockDriverState *bs, int64_t offset,
+                     uint8_t *buf, int count)
+{
+    BDRVRawState *s = bs->opaque;
+    int size, ret, shift, sum;
+
+    /* bytes delivered to the caller so far */
+    sum = 0;
+
+    /* aligned_buf is only set for files opened without host caching */
+    if (s->aligned_buf != NULL)  {
+
+        if (offset & 0x1ff) {
+            /* align offset on a 512 bytes boundary */
+
+            shift = offset & 0x1ff;
+            size = (shift + count + 0x1ff) & ~0x1ff;
+            if (size > ALIGNED_BUFFER_SIZE)
+                size = ALIGNED_BUFFER_SIZE;
+            ret = raw_pread_aligned(bs, offset - shift, s->aligned_buf, size);
+            if (ret < 0)
+                return ret;
+
+            /* hand over only the tail of the first sector */
+            size = 512 - shift;
+            if (size > count)
+                size = count;
+            memcpy(buf, s->aligned_buf + shift, size);
+
+            buf += size;
+            offset += size;
+            count -= size;
+            sum += size;
+
+            if (count == 0)
+                return sum;
+        }
+        if (count & 0x1ff || (uintptr_t) buf & 0x1ff) {
+
+            /* read on aligned buffer */
+
+            while (count) {
+
+                size = (count + 0x1ff) & ~0x1ff;
+                if (size > ALIGNED_BUFFER_SIZE)
+                    size = ALIGNED_BUFFER_SIZE;
+
+                ret = raw_pread_aligned(bs, offset, s->aligned_buf, size);
+                if (ret < 0)
+                    return ret;
+                /* 0 bytes means end of file: stop here, otherwise count
+                 * never shrinks and the loop would spin forever */
+                if (ret == 0)
+                    break;
+
+                size = ret;
+                if (size > count)
+                    size = count;
+
+                memcpy(buf, s->aligned_buf, size);
+
+                buf += size;
+                offset += size;
+                count -= size;
+                sum += size;
+            }
+
+            return sum;
+        }
+    }
+
+    /* Direct path. Do not add sum to an error code: that would turn a
+     * failure into a bogus (possibly positive) byte count */
+    ret = raw_pread_aligned(bs, offset, buf, count);
+    if (ret < 0)
+        return ret;
+    return ret + sum;
+}
+
+/*
+ * Sector-based front end to raw_pread: a complete transfer is reported as
+ * 0, any other raw_pread result is returned unchanged.
+ */
+static int raw_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
+{
+    const int nbytes = nb_sectors * 512;
+    int got = raw_pread(bs, sector_num * 512, buf, nbytes);
+
+    return (got == nbytes) ? 0 : got;
+}
+
+/*
+ * offset and count are in bytes and possibly not aligned. For files opened
+ * with O_DIRECT, necessary alignments are ensured before calling
+ * raw_pwrite_aligned to do the actual write.
+ */
+/*
+ * Returns the number of bytes taken from buf or the negative value
+ * reported by the aligned helpers.
+ */
+static int raw_pwrite(BlockDriverState *bs, int64_t offset,
+                      const uint8_t *buf, int count)
+{
+    BDRVRawState *s = bs->opaque;
+    int size, ret, shift, sum;
+
+    /* bytes consumed from the caller's buffer so far */
+    sum = 0;
+
+    /* aligned_buf is only set for files opened without host caching */
+    if (s->aligned_buf != NULL) {
+
+        if (offset & 0x1ff) {
+            /* align offset on a 512 bytes boundary */
+            /* read-modify-write of the first, partially covered sector */
+            shift = offset & 0x1ff;
+            ret = raw_pread_aligned(bs, offset - shift, s->aligned_buf, 512);
+            if (ret < 0)
+                return ret;
+
+            size = 512 - shift;
+            if (size > count)
+                size = count;
+            memcpy(s->aligned_buf + shift, buf, size);
+
+            ret = raw_pwrite_aligned(bs, offset - shift, s->aligned_buf, 512);
+            if (ret < 0)
+                return ret;
+
+            buf += size;
+            offset += size;
+            count -= size;
+            sum += size;
+
+            if (count == 0)
+                return sum;
+        }
+        if (count & 0x1ff || (uintptr_t) buf & 0x1ff) {
+
+            /* whole sectors, bounced through the aligned buffer */
+            while ((size = (count & ~0x1ff)) != 0) {
+
+                if (size > ALIGNED_BUFFER_SIZE)
+                    size = ALIGNED_BUFFER_SIZE;
+
+                memcpy(s->aligned_buf, buf, size);
+
+                ret = raw_pwrite_aligned(bs, offset, s->aligned_buf, size);
+                if (ret < 0)
+                    return ret;
+                /* no progress: report the short count rather than spin;
+                 * falling through would break the count < 512 assumption
+                 * of the tail handling below */
+                if (ret == 0)
+                    return sum;
+
+                buf += ret;
+                offset += ret;
+                count -= ret;
+                sum += ret;
+            }
+            /* here, count < 512 because (count & ~0x1ff) == 0 */
+            if (count) {
+                /* read-modify-write of the last, partially covered sector */
+                ret = raw_pread_aligned(bs, offset, s->aligned_buf, 512);
+                if (ret < 0)
+                    return ret;
+                 memcpy(s->aligned_buf, buf, count);
+
+                 ret = raw_pwrite_aligned(bs, offset, s->aligned_buf, 512);
+                 if (ret < 0)
+                     return ret;
+                 if (count < ret)
+                     ret = count;
+
+                 sum += ret;
+            }
+            return sum;
+        }
+    }
+
+    /* Direct path. Do not add sum to an error code: that would turn a
+     * failure into a bogus (possibly positive) byte count */
+    ret = raw_pwrite_aligned(bs, offset, buf, count);
+    if (ret < 0)
+        return ret;
+    return ret + sum;
+}
+
+/*
+ * Sector-based front end to raw_pwrite: a complete transfer is reported as
+ * 0, any other raw_pwrite result is returned unchanged.
+ */
+static int raw_write(BlockDriverState *bs, int64_t sector_num,
+                     const uint8_t *buf, int nb_sectors)
+{
+    const int nbytes = nb_sectors * 512;
+    int written = raw_pwrite(bs, sector_num * 512, buf, nbytes);
+
+    if (written != nbytes) {
+        return written;
+    }
+    return 0;
+}
+
+#ifdef CONFIG_AIO
+/***********************************************************/
+/* Unix AIO using POSIX AIO */
+
+/* One outstanding asynchronous request of the raw driver. */
+typedef struct RawAIOCB {
+    /* generic part; carries the completion callback and its opaque value */
+    BlockDriverAIOCB common;
+    /* request descriptor handed to the qemu_paio_* functions */
+    struct qemu_paiocb aiocb;
+    /* link in the singly linked list headed by PosixAioState.first_aio */
+    struct RawAIOCB *next;
+    int ret;
+} RawAIOCB;
+
+/* Global bookkeeping for the POSIX AIO emulation. */
+typedef struct PosixAioState
+{
+    /* read and write ends of the pipe the signal handler writes to */
+    int rfd, wfd;
+    /* head of the list of requests that have not completed yet */
+    RawAIOCB *first_aio;
+} PosixAioState;
+
+/*
+ * Handler registered for the read end of the notification pipe: drains the
+ * pipe, then completes every queued request that is no longer in progress.
+ *
+ * opaque is the PosixAioState passed at registration time.
+ */
+static void posix_aio_read(void *opaque)
+{
+    PosixAioState *s = opaque;
+    RawAIOCB *acb, **pacb;
+    int ret;
+    ssize_t len;
+
+    /* read all bytes from signal pipe */
+    for (;;) {
+        char bytes[16];
+
+        len = read(s->rfd, bytes, sizeof(bytes));
+        if (len == -1 && errno == EINTR)
+            continue; /* try again */
+        if (len == sizeof(bytes))
+            continue; /* more to read */
+        break;
+    }
+
+    /* Outer loop: restart the scan from the list head after each completed
+     * request. NOTE(review): presumably because the callback may modify
+     * the list - confirm. */
+    for(;;) {
+        pacb = &s->first_aio;
+        for(;;) {
+            acb = *pacb;
+            /* end of list reached without finding a finished request */
+            if (!acb)
+                goto the_end;
+            ret = qemu_paio_error(&acb->aiocb);
+            if (ret == ECANCELED) {
+                /* remove the request */
+                /* cancelled requests are dropped without calling back */
+                *pacb = acb->next;
+                qemu_aio_release(acb);
+            } else if (ret != EINPROGRESS) {
+                /* end of aio */
+                if (ret == 0) {
+                    /* success only if the full byte count was transferred */
+                    ret = qemu_paio_return(&acb->aiocb);
+                    if (ret == acb->aiocb.aio_nbytes)
+                        ret = 0;
+                    else
+                        ret = -EINVAL;
+                } else {
+                    /* positive error number becomes a negative result */
+                    ret = -ret;
+                }
+                /* remove the request */
+                *pacb = acb->next;
+                /* call the callback */
+                acb->common.cb(acb->common.opaque, ret);
+                qemu_aio_release(acb);
+                break;
+            } else {
+                /* still in progress: look at the next request */
+                pacb = &acb->next;
+            }
+        }
+    }
+ the_end: ;
+}
+
+/* Returns 1 while at least one request is still queued, 0 otherwise. */
+static int posix_aio_flush(void *opaque)
+{
+    const PosixAioState *state = opaque;
+
+    return state->first_aio != NULL;
+}
+
+static PosixAioState *posix_aio_state;
+
+/*
+ * Signal handler installed for SIGUSR2 by posix_aio_init(): writes one
+ * byte to the notification pipe and calls qemu_service_io().
+ */
+static void aio_signal_handler(int signum)
+{
+    /* A handler can run between a failing call and the interrupted code's
+     * inspection of errno, so it must leave errno as it found it */
+    int saved_errno = errno;
+
+    if (posix_aio_state) {
+        char byte = 0;
+        ssize_t len;
+
+        /* The pipe is non-blocking; if it is full the write fails, but then
+         * a wake-up is already pending, so the result needs no handling */
+        len = write(posix_aio_state->wfd, &byte, sizeof(byte));
+        (void)len;
+    }
+
+    qemu_service_io();
+
+    errno = saved_errno;
+}
+
+/*
+ * One-time set-up of the POSIX AIO emulation: installs the SIGUSR2
+ * handler, creates the notification pipe, registers its read end with the
+ * AIO fd handlers and initialises the qemu_paio layer.
+ *
+ * Returns 0 on success (also when already initialised) or a negative errno
+ * value if the pipe cannot be created.
+ */
+static int posix_aio_init(void)
+{
+    struct sigaction act;
+    PosixAioState *s;
+    int fds[2];
+    struct qemu_paioinit ai;
+  
+    if (posix_aio_state)
+        return 0;
+
+    s = qemu_malloc(sizeof(PosixAioState));
+
+    sigfillset(&act.sa_mask);
+    act.sa_flags = 0; /* do not restart syscalls to interrupt select() */
+    act.sa_handler = aio_signal_handler;
+    sigaction(SIGUSR2, &act, NULL);
+
+    s->first_aio = NULL;
+    if (pipe(fds) == -1) {
+        /* capture errno before fprintf() can overwrite it */
+        int err = errno;
+
+        fprintf(stderr, "failed to create pipe\n");
+        /* the state was never published, so release it here */
+        qemu_free(s);
+        return -err;
+    }
+
+    s->rfd = fds[0];
+    s->wfd = fds[1];
+
+    /* neither the handler's write nor the drain loop's read may block */
+    fcntl(s->rfd, F_SETFL, O_NONBLOCK);
+    fcntl(s->wfd, F_SETFL, O_NONBLOCK);
+
+    qemu_aio_set_fd_handler(s->rfd, posix_aio_read, NULL, posix_aio_flush, s);
+
+    memset(&ai, 0, sizeof(ai));
+    ai.aio_threads = 64;
+    ai.aio_num = 64;
+    qemu_paio_init(&ai);
+
+    /* publish last: the signal handler does nothing while this is NULL */
+    posix_aio_state = s;
+
+    return 0;
+}
+
+/*
+ * Allocates a request for the given sector range, fills in its descriptor
+ * and puts it at the head of the list of pending requests.
+ *
+ * Returns the request, or NULL if fd_open() fails or no request could be
+ * obtained.
+ */
+static RawAIOCB *raw_aio_setup(BlockDriverState *bs, int64_t sector_num,
+        QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BDRVRawState *s = bs->opaque;
+    struct qemu_paiocb *req;
+    RawAIOCB *acb;
+
+    if (fd_open(bs) < 0) {
+        return NULL;
+    }
+
+    acb = qemu_aio_get(bs, cb, opaque);
+    if (acb == NULL) {
+        return NULL;
+    }
+
+    req = &acb->aiocb;
+    req->aio_fildes = s->fd;
+    req->ev_signo = SIGUSR2;
+    req->aio_iov = qiov->iov;
+    req->aio_niov = qiov->niov;
+    req->aio_nbytes = nb_sectors * 512;
+    req->aio_offset = sector_num * 512;
+
+    /*
+     * With O_DIRECT the buffer has to be sector aligned; the flag asks the
+     * low level code to take care of that if it is not the case already.
+     */
+    req->aio_flags = s->aligned_buf ? QEMU_AIO_SECTOR_ALIGNED : 0;
+
+    /* push onto the front of the pending list */
+    acb->next = posix_aio_state->first_aio;
+    posix_aio_state->first_aio = acb;
+
+    return acb;
+}
+
+/*
+ * Unlinks acb from the list of pending requests and releases it; prints a
+ * diagnostic if the request is not on the list.
+ */
+static void raw_aio_remove(RawAIOCB *acb)
+{
+    RawAIOCB **link = &posix_aio_state->first_aio;
+
+    /* advance to the link that points at acb, or to the end of the list */
+    while (*link != NULL && *link != acb) {
+        link = &(*link)->next;
+    }
+
+    if (*link == NULL) {
+        fprintf(stderr, "raw_aio_remove: aio request not found!\n");
+        return;
+    }
+
+    *link = acb->next;
+    qemu_aio_release(acb);
+}
+
+/*
+ * Queues an asynchronous vectored read. Returns the generic request
+ * handle, or NULL if the request could not be set up or submitted.
+ */
+static BlockDriverAIOCB *raw_aio_readv(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    RawAIOCB *acb = raw_aio_setup(bs, sector_num, qiov, nb_sectors,
+                                  cb, opaque);
+
+    if (acb == NULL) {
+        return NULL;
+    }
+    if (qemu_paio_read(&acb->aiocb) >= 0) {
+        return &acb->common;
+    }
+
+    /* submission failed: take the request off the pending list again */
+    raw_aio_remove(acb);
+    return NULL;
+}
+
+/*
+ * Queues an asynchronous vectored write. Returns the generic request
+ * handle, or NULL if the request could not be set up or submitted.
+ */
+static BlockDriverAIOCB *raw_aio_writev(BlockDriverState *bs,
+        int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    RawAIOCB *acb = raw_aio_setup(bs, sector_num, qiov, nb_sectors,
+                                  cb, opaque);
+
+    if (acb == NULL) {
+        return NULL;
+    }
+    if (qemu_paio_write(&acb->aiocb) >= 0) {
+        return &acb->common;
+    }
+
+    /* submission failed: take the request off the pending list again */
+    raw_aio_remove(acb);
+    return NULL;
+}
+
+/*
+ * Cancels a request. If the lower layer reports that it cannot be
+ * cancelled, spins until it is no longer in progress; in either case the
+ * request is then removed from the pending list.
+ */
+static void raw_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+    RawAIOCB *acb = (RawAIOCB *)blockacb;
+    int status = qemu_paio_cancel(acb->aiocb.aio_fildes, &acb->aiocb);
+
+    if (status == QEMU_PAIO_NOTCANCELED) {
+        /* fail safe: the request is already running, so wait it out */
+        while (qemu_paio_error(&acb->aiocb) == EINPROGRESS) {
+            /* busy wait */
+        }
+    }
+
+    raw_aio_remove(acb);
+}
+#else /* CONFIG_AIO */
+/* Built without CONFIG_AIO: nothing to set up, always reports success. */
+static int posix_aio_init(void)
+{
+    return 0;
+}
+#endif /* CONFIG_AIO */
+
+
+/*
+ * Closes the descriptor (if open) and releases the bounce buffer. Both
+ * fields are reset so the state cannot be mistaken for an open device.
+ */
+static void raw_close(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+    if (s->fd >= 0) {
+        close(s->fd);
+        s->fd = -1;
+        if (s->aligned_buf != NULL) {
+            qemu_free(s->aligned_buf);
+            /* raw_pread/raw_pwrite test this pointer against NULL; do not
+             * leave a dangling value behind */
+            s->aligned_buf = NULL;
+        }
+    }
+}
+
+/*
+ * Sets the file length to offset bytes. Only FTYPE_FILE is supported
+ * (-ENOTSUP otherwise); returns 0 on success or -errno from ftruncate().
+ */
+static int raw_truncate(BlockDriverState *bs, int64_t offset)
+{
+    BDRVRawState *state = bs->opaque;
+
+    if (state->type != FTYPE_FILE) {
+        return -ENOTSUP;
+    }
+
+    return (ftruncate(state->fd, offset) < 0) ? -errno : 0;
+}
+
+#ifdef __OpenBSD__
+/*
+ * OpenBSD: returns the size in bytes, taken from the disklabel for
+ * character/block devices and from fstat() otherwise; -1 on failure.
+ */
+static int64_t raw_getlength(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+    struct disklabel dl;
+    struct stat st;
+
+    if (fstat(s->fd, &st) != 0) {
+        return -1;
+    }
+
+    /* ordinary files: the stat size is the answer */
+    if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode)) {
+        return st.st_size;
+    }
+
+    /* devices: sector size times the size of the opened partition */
+    if (ioctl(s->fd, DIOCGDINFO, &dl) != 0) {
+        return -1;
+    }
+    return (uint64_t)dl.d_secsize *
+        dl.d_partitions[DISKPART(st.st_rdev)].p_size;
+}
+#else /* !__OpenBSD__ */
+/*
+ * Returns the size of the underlying file or device in bytes, or a
+ * negative value if fd_open() fails. Which of the probing strategies below
+ * is compiled in depends on the host platform macros.
+ */
+static int64_t  raw_getlength(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+    /* NOTE(review): fd is copied before fd_open() is called; if fd_open()
+     * can replace s->fd this copy would be stale - confirm */
+    int fd = s->fd;
+    int64_t size;
+#ifdef HOST_BSD
+    struct stat sb;
+#ifdef __FreeBSD__
+    int reopened = 0;
+#endif
+#endif
+#ifdef __sun__
+    struct dk_minfo minfo;
+    int rv;
+#endif
+    int ret;
+
+    ret = fd_open(bs);
+    if (ret < 0)
+        return ret;
+
+#ifdef HOST_BSD
+#ifdef __FreeBSD__
+again:
+#endif
+    /* device node: ask the driver, fall back to seeking to the end */
+    if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) {
+#ifdef DIOCGMEDIASIZE
+	if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size))
+#elif defined(DIOCGPART)
+        {
+                struct partinfo pi;
+                if (ioctl(fd, DIOCGPART, &pi) == 0)
+                        size = pi.media_size;
+                else
+                        size = 0;
+        }
+        if (size == 0)
+#endif
+#ifdef CONFIG_COCOA
+        size = LONG_LONG_MAX;
+#else
+        size = lseek(fd, 0LL, SEEK_END);
+#endif
+#ifdef __FreeBSD__
+        switch(s->type) {
+        case FTYPE_CD:
+            /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */
+            if (size == 2048LL * (unsigned)-1)
+                size = 0;
+            /* XXX no disc?  maybe we need to reopen... */
+            /* retry at most once, guarded by the reopened flag */
+            if (size <= 0 && !reopened && cd_open(bs) >= 0) {
+                reopened = 1;
+                goto again;
+            }
+        }
+#endif
+    } else
+#endif
+#ifdef __sun__
+    /*
+     * use the DKIOCGMEDIAINFO ioctl to read the size.
+     */
+    rv = ioctl ( fd, DKIOCGMEDIAINFO, &minfo );
+    if ( rv != -1 ) {
+        size = minfo.dki_lbsize * minfo.dki_capacity;
+    } else /* there are reports that lseek on some devices
+              fails, but irc discussion said that contingency
+              on contingency was overkill */
+#endif
+    {
+        /* generic case: the end-of-file position is the size */
+        size = lseek(fd, 0, SEEK_END);
+    }
+    return size;
+}
+#endif
+
+/*
+ * Creates (or truncates) filename and sizes it to total_size sectors of
+ * 512 bytes.
+ *
+ * Neither flags nor a backing file are supported (-ENOTSUP). Returns 0 on
+ * success, -EIO if the file cannot be opened, or -errno if resizing or
+ * closing the file fails.
+ */
+static int raw_create(const char *filename, int64_t total_size,
+                      const char *backing_file, int flags)
+{
+    int fd;
+    int ret = 0;
+
+    if (flags || backing_file)
+        return -ENOTSUP;
+
+    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY,
+              0644);
+    if (fd < 0)
+        return -EIO;
+
+    /* A failed resize would leave an image of the wrong size behind;
+     * report it instead of claiming success */
+    if (ftruncate(fd, total_size * 512) != 0)
+        ret = -errno;
+    /* always close; keep the first error if there already is one */
+    if (close(fd) != 0 && ret == 0)
+        ret = -errno;
+    return ret;
+}
+
+/* Flushes the file to stable storage with fsync(); the result is not
+ * examined because the callback returns void. */
+static void raw_flush(BlockDriverState *bs)
+{
+    BDRVRawState *state = bs->opaque;
+
+    fsync(state->fd);
+}
+
+/* Driver description for plain files ("raw"): maps the block layer's
+ * callbacks onto the raw_* functions of this file. */
+static BlockDriver bdrv_raw = {
+    .format_name = "raw",
+    .instance_size = sizeof(BDRVRawState),
+    .bdrv_probe = NULL, /* no probe for protocols */
+    .bdrv_open = raw_open,
+    .bdrv_read = raw_read,
+    .bdrv_write = raw_write,
+    .bdrv_close = raw_close,
+    .bdrv_create = raw_create,
+    .bdrv_flush = raw_flush,
+
+    /* asynchronous entry points exist only when built with CONFIG_AIO */
+#ifdef CONFIG_AIO
+    .bdrv_aio_readv = raw_aio_readv,
+    .bdrv_aio_writev = raw_aio_writev,
+    .bdrv_aio_cancel = raw_aio_cancel,
+    .aiocb_size = sizeof(RawAIOCB),
+#endif
+
+    .bdrv_truncate = raw_truncate,
+    .bdrv_getlength = raw_getlength,
+};
+
+/***********************************************/
+/* host device */
+
+#ifdef CONFIG_COCOA
+static kern_return_t FindEjectableCDMedia( io_iterator_t *mediaIterator );
+static kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex maxPathSize );
+
+/*
+ * Create an iterator over the ejectable CD media known to the I/O Kit.
+ *
+ * mediaIterator: receives the iterator; it is set to 0 whenever no
+ *                iterator could be created, so callers can test it
+ *                before using or releasing it.
+ *
+ * Returns KERN_SUCCESS, the error code of the failing I/O Kit call, or
+ * KERN_FAILURE when no matching dictionary could be built.
+ */
+kern_return_t FindEjectableCDMedia( io_iterator_t *mediaIterator )
+{
+    kern_return_t       kernResult;
+    mach_port_t     masterPort;
+    CFMutableDictionaryRef  classesToMatch;
+
+    /* Never hand back an indeterminate iterator on the error paths */
+    *mediaIterator = 0;
+
+    kernResult = IOMasterPort( MACH_PORT_NULL, &masterPort );
+    if ( KERN_SUCCESS != kernResult ) {
+        printf( "IOMasterPort returned %d\n", kernResult );
+        /* masterPort is not valid, so there is nothing to look up */
+        return kernResult;
+    }
+
+    classesToMatch = IOServiceMatching( kIOCDMediaClass );
+    if ( classesToMatch == NULL ) {
+        printf( "IOServiceMatching returned a NULL dictionary.\n" );
+        return KERN_FAILURE;
+    }
+    /* Restrict the match to media that can be ejected */
+    CFDictionarySetValue( classesToMatch, CFSTR( kIOMediaEjectableKey ), kCFBooleanTrue );
+
+    kernResult = IOServiceGetMatchingServices( masterPort, classesToMatch, mediaIterator );
+    if ( KERN_SUCCESS != kernResult )
+    {
+        printf( "IOServiceGetMatchingServices returned %d\n", kernResult );
+    }
+
+    return kernResult;
+}
+
+/*
+ * Build the raw device path (_PATH_DEV followed by "r" and the BSD name)
+ * of the next media object yielded by mediaIterator.
+ *
+ * mediaIterator: iterator to take the next media object from
+ * bsdPath:       output buffer; left as an empty string unless the
+ *                complete path could be produced
+ * maxPathSize:   size of bsdPath in bytes
+ *
+ * Returns KERN_SUCCESS when bsdPath holds a complete path, KERN_FAILURE
+ * otherwise.
+ */
+kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex maxPathSize )
+{
+    io_object_t     nextMedia;
+    kern_return_t   kernResult = KERN_FAILURE;
+
+    /* Without room for at least the terminator nothing can be stored */
+    if ( maxPathSize <= 0 )
+        return kernResult;
+    *bsdPath = '\0';
+    nextMedia = IOIteratorNext( mediaIterator );
+    if ( nextMedia )
+    {
+        CFTypeRef   bsdPathAsCFString;
+    bsdPathAsCFString = IORegistryEntryCreateCFProperty( nextMedia, CFSTR( kIOBSDNameKey ), kCFAllocatorDefault, 0 );
+        if ( bsdPathAsCFString ) {
+            int devPathLength;
+            /* Bounded replacement for strcpy()/strcat() of the prefix */
+            devPathLength = snprintf( bsdPath, maxPathSize, "%sr", _PATH_DEV );
+            if ( devPathLength > 0 && devPathLength < maxPathSize &&
+                 CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) {
+                kernResult = KERN_SUCCESS;
+            } else {
+                /* Do not leave a bare prefix behind: callers that test
+                   bsdPath[0] would mistake it for a device name */
+                *bsdPath = '\0';
+            }
+            CFRelease( bsdPathAsCFString );
+        }
+        IOObjectRelease( nextMedia );
+    }
+
+    return kernResult;
+}
+
+#endif
+
+/*
+ * Open a host device (CD-ROM, floppy, generic SCSI, ...) as a block device.
+ *
+ * bs:       block driver state; bs->opaque is the BDRVRawState to fill in
+ * filename: device path; with CONFIG_COCOA a name starting with
+ *           "/dev/cdrom" is replaced by the path of an ejectable CD
+ * flags:    BDRV_O_* open flags
+ *
+ * Returns 0 on success or a negative errno value when open() fails
+ * (-EROFS is reported as -EACCES).
+ */
+static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVRawState *s = bs->opaque;
+    int fd, open_flags, ret;
+
+    posix_aio_init();
+
+#ifdef CONFIG_COCOA
+    if (strstart(filename, "/dev/cdrom", NULL)) {
+        kern_return_t kernResult;
+        io_iterator_t mediaIterator;
+        char bsdPath[ MAXPATHLEN ];
+        int fd; /* shadows the outer fd; only used for the probe below */
+
+        /* NOTE(review): the first result is overwritten without being
+           checked, and mediaIterator has no initializer here */
+        kernResult = FindEjectableCDMedia( &mediaIterator );
+        kernResult = GetBSDPath( mediaIterator, bsdPath, sizeof( bsdPath ) );
+
+        if ( bsdPath[ 0 ] != '\0' ) {
+            /* NOTE(review): strcat() does not check the space left in bsdPath */
+            strcat(bsdPath,"s0");
+            /* some CDs don't have a partition 0 */
+            fd = open(bsdPath, O_RDONLY | O_BINARY | O_LARGEFILE);
+            if (fd < 0) {
+                /* fall back to the name ending in '1' instead of '0' */
+                bsdPath[strlen(bsdPath)-1] = '1';
+            } else {
+                close(fd);
+            }
+            filename = bsdPath;
+        }
+
+        if ( mediaIterator )
+            IOObjectRelease( mediaIterator );
+    }
+#endif
+    open_flags = O_BINARY;
+    if ((flags & BDRV_O_ACCESS) == O_RDWR) {
+        open_flags |= O_RDWR;
+    } else {
+        open_flags |= O_RDONLY;
+        bs->read_only = 1;
+    }
+    /* Use O_DSYNC for write-through caching, no flags for write-back caching,
+     * and O_DIRECT for no caching. */
+    if ((flags & BDRV_O_NOCACHE))
+        open_flags |= O_DIRECT;
+    else if (!(flags & BDRV_O_CACHE_WB))
+        open_flags |= O_DSYNC;
+
+    /* Default type; the platform sections below refine it from the name */
+    s->type = FTYPE_FILE;
+#if defined(__linux__)
+    if (strstart(filename, "/dev/cd", NULL)) {
+        /* open will not fail even if no CD is inserted */
+        open_flags |= O_NONBLOCK;
+        s->type = FTYPE_CD;
+    } else if (strstart(filename, "/dev/fd", NULL)) {
+        s->type = FTYPE_FD;
+        /* saved before O_NONBLOCK is added; reused when reopening the floppy */
+        s->fd_open_flags = open_flags;
+        /* open will not fail even if no floppy is inserted */
+        open_flags |= O_NONBLOCK;
+#ifdef CONFIG_AIO
+    } else if (strstart(filename, "/dev/sg", NULL)) {
+        bs->sg = 1;
+#endif
+    }
+#endif
+#if defined(__FreeBSD__)
+    if (strstart(filename, "/dev/cd", NULL) ||
+        strstart(filename, "/dev/acd", NULL)) {
+        s->type = FTYPE_CD;
+        /* kept so that cd_open() can reopen the drive with the same flags */
+        s->cd_open_flags = open_flags;
+    }
+#endif
+    s->fd = -1;
+    fd = open(filename, open_flags, 0644);
+    if (fd < 0) {
+        ret = -errno;
+        if (ret == -EROFS)
+            ret = -EACCES;
+        return ret;
+    }
+    s->fd = fd;
+#if defined(__FreeBSD__)
+    /* make sure the door isn't locked at this time */
+    if (s->type == FTYPE_CD)
+        ioctl (s->fd, CDIOCALLOW);
+#endif
+#if defined(__linux__)
+    /* close fd so that we can reopen it as needed */
+    if (s->type == FTYPE_FD) {
+        close(s->fd);
+        s->fd = -1;
+        s->fd_media_changed = 1;
+    }
+#endif
+    return 0;
+}
+
+#if defined(__linux__)
+/* Note: we do not have a reliable method to detect if the floppy is
+   present. The current method is to try to open the floppy at every
+   I/O and to keep it opened during a few hundreds of ms.
+
+   Returns 0 when the device is not a floppy or the floppy descriptor is
+   open, -EIO when the floppy could not be opened (or a previous failure
+   is less than FD_OPEN_TIMEOUT old). */
+static int fd_open(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+    int last_media_present;
+
+    if (s->type != FTYPE_FD)
+        return 0;
+    /* remember whether a descriptor was open on entry */
+    last_media_present = (s->fd >= 0);
+    /* drop a descriptor that has been open for FD_OPEN_TIMEOUT or longer */
+    if (s->fd >= 0 &&
+        (qemu_get_clock(rt_clock) - s->fd_open_time) >= FD_OPEN_TIMEOUT) {
+        close(s->fd);
+        s->fd = -1;
+#ifdef DEBUG_FLOPPY
+        printf("Floppy closed\n");
+#endif
+    }
+    if (s->fd < 0) {
+        /* after a recent failure, do not retry the open yet */
+        if (s->fd_got_error &&
+            (qemu_get_clock(rt_clock) - s->fd_error_time) < FD_OPEN_TIMEOUT) {
+#ifdef DEBUG_FLOPPY
+            printf("No floppy (open delayed)\n");
+#endif
+            return -EIO;
+        }
+        s->fd = open(bs->filename, s->fd_open_flags);
+        if (s->fd < 0) {
+            s->fd_error_time = qemu_get_clock(rt_clock);
+            s->fd_got_error = 1;
+            /* a descriptor was open on entry and the reopen failed */
+            if (last_media_present)
+                s->fd_media_changed = 1;
+#ifdef DEBUG_FLOPPY
+            printf("No floppy\n");
+#endif
+            return -EIO;
+        }
+#ifdef DEBUG_FLOPPY
+        printf("Floppy opened\n");
+#endif
+    }
+    /* no descriptor on entry, but one is open now */
+    if (!last_media_present)
+        s->fd_media_changed = 1;
+    s->fd_open_time = qemu_get_clock(rt_clock);
+    s->fd_got_error = 0;
+    return 0;
+}
+
+/*
+ * Report whether a medium is present: 1 if so, 0 otherwise.
+ * CD drives are queried with CDROM_DRIVE_STATUS, floppies by trying to
+ * open them; every other type always counts as inserted.
+ */
+static int raw_is_inserted(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+
+    if (s->type == FTYPE_CD) {
+        int status = ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT);
+
+        return status == CDS_DISC_OK;
+    }
+    if (s->type == FTYPE_FD)
+        return fd_open(bs) >= 0;
+    return 1;
+}
+
+/*
+ * Return and clear the floppy "media changed" flag, or -ENOTSUP for any
+ * other device type.  Currently only used by fdc.c, but a CD version
+ * would be good too.
+ */
+static int raw_media_changed(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+    int changed;
+
+    if (s->type != FTYPE_FD)
+        return -ENOTSUP;
+
+    /* XXX: there is no true media changed indication; a floppy swapped
+       without any read attempt in between goes unnoticed */
+    fd_open(bs);
+    changed = s->fd_media_changed;
+    s->fd_media_changed = 0;
+#ifdef DEBUG_FLOPPY
+    printf("Floppy changed=%d\n", changed);
+#endif
+    return changed;
+}
+
+/*
+ * Eject the medium or close the tray.
+ *
+ * eject_flag: non-zero to eject, zero to close the CD tray.  Floppies are
+ *             sent FDEJECT regardless of the flag.
+ *
+ * Returns 0 for CD and floppy devices (ioctl failures are only reported
+ * with perror()), -ENOTSUP for every other type.
+ */
+static int raw_eject(BlockDriverState *bs, int eject_flag)
+{
+    BDRVRawState *s = bs->opaque;
+
+    switch(s->type) {
+    case FTYPE_CD:
+        if (eject_flag) {
+            if (ioctl (s->fd, CDROMEJECT, NULL) < 0)
+                perror("CDROMEJECT");
+        } else {
+            /* name the request that actually failed */
+            if (ioctl (s->fd, CDROMCLOSETRAY, NULL) < 0)
+                perror("CDROMCLOSETRAY");
+        }
+        break;
+    case FTYPE_FD:
+        {
+            int fd;
+            /* drop the cached descriptor before reopening the device */
+            if (s->fd >= 0) {
+                close(s->fd);
+                s->fd = -1;
+            }
+            fd = open(bs->filename, s->fd_open_flags | O_NONBLOCK);
+            if (fd >= 0) {
+                if (ioctl(fd, FDEJECT, 0) < 0)
+                    perror("FDEJECT");
+                close(fd);
+            }
+        }
+        break;
+    default:
+        return -ENOTSUP;
+    }
+    return 0;
+}
+
+/*
+ * Lock or unlock the CD-ROM door.  Returns 0 for CD drives (the ioctl
+ * result is ignored) and -ENOTSUP for every other device type.
+ */
+static int raw_set_locked(BlockDriverState *bs, int locked)
+{
+    BDRVRawState *s = bs->opaque;
+
+    if (s->type != FTYPE_CD)
+        return -ENOTSUP;
+
+    /* Note: an error can happen if the distribution automatically
+       mounts the CD-ROM, so a failure is deliberately not reported */
+    (void) ioctl (s->fd, CDROM_LOCKDOOR, locked);
+    return 0;
+}
+
+/* Forward an ioctl request to the device descriptor; returns ioctl()'s result. */
+static int raw_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
+{
+    BDRVRawState *state = bs->opaque;
+    int result = ioctl(state->fd, req, buf);
+
+    return result;
+}
+
+#ifdef CONFIG_AIO
+/*
+ * Submit an ioctl request through qemu_paio_ioctl().
+ *
+ * req/buf:   ioctl command and argument buffer
+ * cb/opaque: passed on to qemu_aio_get() for the new request
+ *
+ * Returns the common part of the new request, or NULL when the device
+ * cannot be opened, no request could be obtained, or submission fails.
+ */
+static BlockDriverAIOCB *raw_aio_ioctl(BlockDriverState *bs,
+        unsigned long int req, void *buf,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BDRVRawState *s = bs->opaque;
+    RawAIOCB *acb;
+
+    if (fd_open(bs) < 0)
+        return NULL;
+
+    acb = qemu_aio_get(bs, cb, opaque);
+    if (!acb)
+        return NULL;
+    acb->aiocb.aio_fildes = s->fd;
+    acb->aiocb.ev_signo = SIGUSR2;
+    acb->aiocb.aio_offset = 0;
+    acb->aiocb.aio_flags = 0;
+
+    /* push the request onto the head of the pending list */
+    acb->next = posix_aio_state->first_aio;
+    posix_aio_state->first_aio = acb;
+
+    acb->aiocb.aio_ioctl_buf = buf;
+    acb->aiocb.aio_ioctl_cmd = req;
+    if (qemu_paio_ioctl(&acb->aiocb) < 0) {
+        /* submission failed: undo the list insertion above */
+        raw_aio_remove(acb);
+        return NULL;
+    }
+
+    return &acb->common;
+}
+#endif
+
+#elif defined(__FreeBSD__)
+
+/*
+ * Sanity check used by the I/O operations: 0 when a descriptor is open,
+ * -EIO otherwise.
+ */
+static int fd_open(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+
+    return (s->fd >= 0) ? 0 : -EIO;
+}
+
+/*
+ * Close and reopen a CD device with the flags saved in cd_open_flags.
+ * Returns 0 on success or for non-CD devices, -EIO (with s->fd set to -1)
+ * when the reopen fails.
+ */
+static int cd_open(BlockDriverState *bs)
+{
+#if defined(__FreeBSD__)
+    BDRVRawState *s = bs->opaque;
+    int new_fd;
+
+    if (s->type != FTYPE_CD)
+        return 0;
+
+    /* XXX force reread of possibly changed/newly loaded disc,
+     * FreeBSD seems to not notice sometimes... */
+    if (s->fd >= 0)
+        close (s->fd);
+    new_fd = open(bs->filename, s->cd_open_flags, 0644);
+    if (new_fd < 0) {
+        s->fd = -1;
+        return -EIO;
+    }
+    s->fd = new_fd;
+    /* make sure the door isn't locked at this time */
+    ioctl (s->fd, CDIOCALLOW);
+#endif
+    return 0;
+}
+
+/*
+ * Report whether a medium is present.  A CD counts as inserted when
+ * raw_getlength() is positive; everything else always reports 1.
+ */
+static int raw_is_inserted(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+
+    if (s->type == FTYPE_CD)
+        return (raw_getlength(bs) > 0);
+
+    /* XXX floppies (FTYPE_FD) are not handled and count as inserted */
+    return 1;
+}
+
+/* Media-change detection is not implemented in this variant: always -ENOTSUP. */
+static int raw_media_changed(BlockDriverState *bs)
+{
+    return -ENOTSUP;
+}
+
+/*
+ * Eject (eject_flag != 0) or close (eject_flag == 0) the CD tray, then
+ * reopen the device.  Returns 0 on success; -ENOTSUP for non-CD devices,
+ * when no descriptor is open, or when the reopen fails.  Failures of the
+ * eject/close ioctl itself are only reported with perror().
+ */
+static int raw_eject(BlockDriverState *bs, int eject_flag)
+{
+    BDRVRawState *s = bs->opaque;
+
+    /* XXX floppies (FTYPE_FD) are not handled */
+    if (s->type != FTYPE_CD)
+        return -ENOTSUP;
+    if (s->fd < 0)
+        return -ENOTSUP;
+
+    /* unlock the door first; the result is not checked */
+    (void) ioctl (s->fd, CDIOCALLOW);
+    if (eject_flag) {
+        if (ioctl (s->fd, CDIOCEJECT) < 0)
+            perror("CDIOCEJECT");
+    } else {
+        if (ioctl (s->fd, CDIOCCLOSE) < 0)
+            perror("CDIOCCLOSE");
+    }
+
+    return (cd_open(bs) < 0) ? -ENOTSUP : 0;
+}
+
+/*
+ * Lock (CDIOCPREVENT) or unlock (CDIOCALLOW) the CD door.  Returns 0 for
+ * a CD with an open descriptor (the ioctl result is ignored), -ENOTSUP
+ * otherwise.
+ */
+static int raw_set_locked(BlockDriverState *bs, int locked)
+{
+    BDRVRawState *s = bs->opaque;
+
+    if (s->type != FTYPE_CD)
+        return -ENOTSUP;
+    if (s->fd < 0)
+        return -ENOTSUP;
+
+    /* Note: an error can happen if the distribution automatically
+       mounts the CD-ROM, so a failure is deliberately not reported */
+    (void) ioctl (s->fd, (locked ? CDIOCPREVENT : CDIOCALLOW));
+    return 0;
+}
+
+/* Pass-through ioctls are not implemented in this variant: always -ENOTSUP. */
+static int raw_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
+{
+    return -ENOTSUP;
+}
+#else /* !linux && !FreeBSD */
+
+/* Fallback for hosts that are neither Linux nor FreeBSD: nothing to reopen, always 0. */
+static int fd_open(BlockDriverState *bs)
+{
+    return 0;
+}
+
+/* Fallback: no media detection, the medium always counts as inserted. */
+static int raw_is_inserted(BlockDriverState *bs)
+{
+    return 1;
+}
+
+/* Fallback: media-change detection is unavailable, always -ENOTSUP. */
+static int raw_media_changed(BlockDriverState *bs)
+{
+    return -ENOTSUP;
+}
+
+/* Fallback: ejecting is unavailable, always -ENOTSUP. */
+static int raw_eject(BlockDriverState *bs, int eject_flag)
+{
+    return -ENOTSUP;
+}
+
+/* Fallback: door locking is unavailable, always -ENOTSUP. */
+static int raw_set_locked(BlockDriverState *bs, int locked)
+{
+    return -ENOTSUP;
+}
+
+/* Fallback: pass-through ioctls are unavailable, always -ENOTSUP. */
+static int raw_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
+{
+    return -ENOTSUP;
+}
+
+/* Fallback: asynchronous ioctls are unavailable; no request is ever created. */
+static BlockDriverAIOCB *raw_aio_ioctl(BlockDriverState *bs,
+        unsigned long int req, void *buf,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    return NULL;
+}
+#endif /* !linux && !FreeBSD */
+
+#if defined(__linux__) || defined(__FreeBSD__)
+/*
+ * "Create" an image on an existing host block device: nothing is written,
+ * the device is only checked for type and size.
+ *
+ * total_size is in 512-byte sectors; flags and backing_file must be 0/NULL.
+ *
+ * Returns 0 on success, -ENOTSUP for unsupported options, -EIO when the
+ * device cannot be opened, examined, or is not a block device, and
+ * -ENOSPC when the seek to its end yields less than the requested size.
+ */
+static int hdev_create(const char *filename, int64_t total_size,
+                       const char *backing_file, int flags)
+{
+    struct stat info;
+    int status = 0;
+    int dev_fd;
+
+    if (flags || backing_file)
+        return -ENOTSUP;
+
+    dev_fd = open(filename, O_WRONLY | O_BINARY);
+    if (dev_fd < 0)
+        return -EIO;
+
+    if (fstat(dev_fd, &info) < 0 || !S_ISBLK(info.st_mode)) {
+        status = -EIO;
+    } else if (lseek(dev_fd, 0, SEEK_END) < total_size * 512) {
+        status = -ENOSPC;
+    }
+
+    close(dev_fd);
+    return status;
+}
+
+#else  /* !(linux || freebsd) */
+
+/* Creating images on host devices is only implemented for Linux and
+   FreeBSD; elsewhere the request is always rejected with -ENOTSUP. */
+static int hdev_create(const char *filename, int64_t total_size,
+                       const char *backing_file, int flags)
+{
+    return -ENOTSUP;
+}
+#endif
+
+/*
+ * Driver table for "host_device".  It shares the read/write/close/flush
+ * callbacks with the raw file driver, uses hdev_open/hdev_create, and adds
+ * the removable-media and ioctl entry points.  No bdrv_truncate callback
+ * is set.
+ */
+static BlockDriver bdrv_host_device = {
+    .format_name	= "host_device",
+    .instance_size	= sizeof(BDRVRawState),
+    .bdrv_open		= hdev_open,
+    .bdrv_close		= raw_close,
+    .bdrv_create        = hdev_create,
+    .bdrv_flush		= raw_flush,
+
+#ifdef CONFIG_AIO
+    .bdrv_aio_readv	= raw_aio_readv,
+    .bdrv_aio_writev	= raw_aio_writev,
+    .bdrv_aio_cancel	= raw_aio_cancel,
+    .aiocb_size		= sizeof(RawAIOCB),
+#endif
+
+    .bdrv_read          = raw_read,
+    .bdrv_write         = raw_write,
+    .bdrv_getlength	= raw_getlength,
+
+    /* removable device support */
+    .bdrv_is_inserted	= raw_is_inserted,
+    .bdrv_media_changed	= raw_media_changed,
+    .bdrv_eject		= raw_eject,
+    .bdrv_set_locked	= raw_set_locked,
+    /* generic scsi device */
+    .bdrv_ioctl		= raw_ioctl,
+#ifdef CONFIG_AIO
+    .bdrv_aio_ioctl	= raw_aio_ioctl,
+#endif
+};
+
+/* Register the "raw" and "host_device" drivers; passed to block_init() below. */
+static void bdrv_raw_init(void)
+{
+    bdrv_register(&bdrv_raw);
+    bdrv_register(&bdrv_host_device);
+}
+
+block_init(bdrv_raw_init);
diff --git a/block/raw-win32.c b/block/raw-win32.c
new file mode 100644
index 0000000..ab3abd6
--- /dev/null
+++ b/block/raw-win32.c
@@ -0,0 +1,394 @@
+/*
+ * Block driver for RAW files (win32)
+ *
+ * Copyright (c) 2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "qemu-timer.h"
+#include "block_int.h"
+#include "module.h"
+#include <windows.h>
+#include <winioctl.h>
+
+#define FTYPE_FILE 0
+#define FTYPE_CD     1
+#define FTYPE_HARDDISK 2
+
+/* Per-image state of the win32 raw and host_device drivers. */
+typedef struct BDRVRawState {
+    HANDLE hfile;        /* handle returned by CreateFile() */
+    int type;            /* one of the FTYPE_* constants */
+    char drive_path[16]; /* format: "d:\" */
+} BDRVRawState;
+
+/*
+ * Set the size of the file behind descriptor fd to length bytes.  The
+ * current file position is saved first and restored afterwards.
+ *
+ * Returns 0 on success, -1 on failure (including lengths that need more
+ * than 32 bits when GetVersion() has its top bit set).
+ */
+int qemu_ftruncate64(int fd, int64_t length)
+{
+    LARGE_INTEGER li;
+    LONG high;
+    HANDLE h;
+    BOOL res;
+
+    if ((GetVersion() & 0x80000000UL) && (length >> 32) != 0)
+	return -1;
+
+    h = (HANDLE)_get_osfhandle(fd);
+
+    /* get current position, ftruncate do not change position */
+    li.HighPart = 0;
+    li.LowPart = SetFilePointer (h, 0, &li.HighPart, FILE_CURRENT);
+    if (li.LowPart == 0xffffffffUL && GetLastError() != NO_ERROR)
+	return -1;
+
+    high = length >> 32;
+    /* SetFilePointer() returns the low 32 bits of the new position, so 0
+       is a legitimate result (e.g. truncating to length 0).  Failure is
+       0xffffffff together with an error code, as in the check above. */
+    if (SetFilePointer(h, (DWORD) length, &high, FILE_BEGIN) == 0xffffffffUL &&
+        GetLastError() != NO_ERROR)
+	return -1;
+    res = SetEndOfFile(h);
+
+    /* back to old position */
+    SetFilePointer(h, li.LowPart, &li.HighPart, FILE_BEGIN);
+    return res ? 0 : -1;
+}
+
+/*
+ * Issue FSCTL_SET_SPARSE on the file behind descriptor fd.  Returns the
+ * DeviceIoControl() result converted to int.
+ */
+static int set_sparse(int fd)
+{
+    HANDLE handle = (HANDLE)_get_osfhandle(fd);
+    DWORD bytes_returned;
+    BOOL ok;
+
+    ok = DeviceIoControl(handle, FSCTL_SET_SPARSE,
+                         NULL, 0, NULL, 0, &bytes_returned, NULL);
+    return (int) ok;
+}
+
+/*
+ * Open a plain image file with CreateFile().
+ *
+ * flags: BDRV_O_* flags selecting read/write access, creation and the
+ *        caching mode.
+ *
+ * Returns 0 on success, -EACCES when access is denied, -1 on any other
+ * failure.
+ */
+static int raw_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVRawState *s = bs->opaque;
+    int access_flags = GENERIC_READ;
+    int create_flags = OPEN_EXISTING;
+    DWORD overlapped = FILE_ATTRIBUTE_NORMAL;
+
+    s->type = FTYPE_FILE;
+
+    if ((flags & BDRV_O_ACCESS) == O_RDWR)
+        access_flags |= GENERIC_WRITE;
+    if (flags & BDRV_O_CREAT)
+        create_flags = CREATE_ALWAYS;
+
+    /* no caching: unbuffered and write-through; otherwise write-through
+       unless write-back caching was requested */
+    if (flags & BDRV_O_NOCACHE) {
+        overlapped |= FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH;
+    } else if (!(flags & BDRV_O_CACHE_WB)) {
+        overlapped |= FILE_FLAG_WRITE_THROUGH;
+    }
+
+    s->hfile = CreateFile(filename, access_flags,
+                          FILE_SHARE_READ, NULL,
+                          create_flags, overlapped, NULL);
+    if (s->hfile != INVALID_HANDLE_VALUE)
+        return 0;
+
+    return (GetLastError() == ERROR_ACCESS_DENIED) ? -EACCES : -1;
+}
+
+/*
+ * Read nb_sectors 512-byte sectors starting at sector_num into buf.
+ *
+ * Returns 0 when the whole request was transferred, -EIO when ReadFile()
+ * reports failure, and otherwise the (short) number of bytes transferred.
+ */
+static int raw_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
+{
+    BDRVRawState *s = bs->opaque;
+    OVERLAPPED ov;
+    DWORD ret_count;
+    int ret;
+    int64_t offset = sector_num * 512;
+    int count = nb_sectors * 512;
+
+    /* the OVERLAPPED structure only carries the file offset here */
+    memset(&ov, 0, sizeof(ov));
+    ov.Offset = offset;
+    ov.OffsetHigh = offset >> 32;
+    ret = ReadFile(s->hfile, buf, count, &ret_count, &ov);
+    /* After a failed call ret_count is not a status: it can be 0, which
+       callers would take for success.  Report an error instead. */
+    if (!ret)
+        return -EIO;
+    if (ret_count == count)
+        ret_count = 0;
+    return ret_count;
+}
+
+/*
+ * Write nb_sectors 512-byte sectors from buf starting at sector_num.
+ *
+ * Returns 0 when the whole request was transferred, -EIO when WriteFile()
+ * reports failure, and otherwise the (short) number of bytes transferred.
+ */
+static int raw_write(BlockDriverState *bs, int64_t sector_num,
+                     const uint8_t *buf, int nb_sectors)
+{
+    BDRVRawState *s = bs->opaque;
+    OVERLAPPED ov;
+    DWORD ret_count;
+    int ret;
+    int64_t offset = sector_num * 512;
+    int count = nb_sectors * 512;
+
+    /* the OVERLAPPED structure only carries the file offset here */
+    memset(&ov, 0, sizeof(ov));
+    ov.Offset = offset;
+    ov.OffsetHigh = offset >> 32;
+    ret = WriteFile(s->hfile, buf, count, &ret_count, &ov);
+    /* After a failed call ret_count is not a status: it can be 0, which
+       callers would take for success.  Report an error instead. */
+    if (!ret)
+        return -EIO;
+    if (ret_count == count)
+        ret_count = 0;
+    return ret_count;
+}
+
+/* Call FlushFileBuffers() on the image's file handle. */
+static void raw_flush(BlockDriverState *bs)
+{
+    BDRVRawState *state = bs->opaque;
+
+    FlushFileBuffers(state->hfile);
+}
+
+/* Close the image's file handle. */
+static void raw_close(BlockDriverState *bs)
+{
+    BDRVRawState *state = bs->opaque;
+
+    CloseHandle(state->hfile);
+}
+
+/*
+ * Set the size of the image file to offset bytes.
+ *
+ * Returns 0 on success, -EIO when positioning or SetEndOfFile() fails.
+ */
+static int raw_truncate(BlockDriverState *bs, int64_t offset)
+{
+    BDRVRawState *s = bs->opaque;
+    LONG low, high;
+
+    low = offset;
+    high = offset >> 32;
+    /* SetFilePointer() returns the low 32 bits of the new position, so 0
+       is a legitimate result (e.g. truncating to offset 0).  Failure is
+       0xffffffff together with an error code. */
+    if (SetFilePointer(s->hfile, low, &high, FILE_BEGIN) == 0xffffffffUL &&
+        GetLastError() != NO_ERROR)
+	return -EIO;
+    if (!SetEndOfFile(s->hfile))
+        return -EIO;
+    return 0;
+}
+
+/*
+ * Return the size of the image in bytes, or -EIO when it cannot be
+ * determined.  The method depends on the device type: GetFileSize() for
+ * files, GetDiskFreeSpaceEx() on the drive path for CDs, and
+ * IOCTL_DISK_GET_DRIVE_GEOMETRY_EX for hard disks.
+ */
+static int64_t raw_getlength(BlockDriverState *bs)
+{
+    BDRVRawState *s = bs->opaque;
+    LARGE_INTEGER l;
+    ULARGE_INTEGER available, total, total_free;
+    DISK_GEOMETRY_EX dg;
+    DWORD count;
+    BOOL status;
+
+    switch(s->type) {
+    case FTYPE_FILE:
+        l.LowPart = GetFileSize(s->hfile, (PDWORD)&l.HighPart);
+        if (l.LowPart == 0xffffffffUL && GetLastError() != NO_ERROR)
+            return -EIO;
+        break;
+    case FTYPE_CD:
+        if (!GetDiskFreeSpaceEx(s->drive_path, &available, &total, &total_free))
+            return -EIO;
+        l.QuadPart = total.QuadPart;
+        break;
+    case FTYPE_HARDDISK:
+        status = DeviceIoControl(s->hfile, IOCTL_DISK_GET_DRIVE_GEOMETRY_EX,
+                                 NULL, 0, &dg, sizeof(dg), &count, NULL);
+        /* without a geometry there is no size to report; do not fall
+           through and return the uninitialized value of l */
+        if (status == 0)
+            return -EIO;
+        l = dg.DiskSize;
+        break;
+    default:
+        return -EIO;
+    }
+    return l.QuadPart;
+}
+
+/*
+ * Create (or truncate) a plain image file and size it to total_size
+ * 512-byte sectors.  set_sparse() is called on the new file first; its
+ * result is not checked.
+ *
+ * Returns 0 on success, -ENOTSUP when flags or a backing file is given,
+ * -EIO when the file cannot be opened or resized.
+ */
+static int raw_create(const char *filename, int64_t total_size,
+                      const char *backing_file, int flags)
+{
+    int fd;
+    int ret = 0;
+
+    if (flags || backing_file)
+        return -ENOTSUP;
+
+    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY,
+              0644);
+    if (fd < 0)
+        return -EIO;
+    set_sparse(fd);
+    /* Report a failed resize instead of claiming the image was created */
+    if (ftruncate(fd, total_size * 512) != 0)
+        ret = -EIO;
+    close(fd);
+    return ret;
+}
+
+/* Driver table for the "raw" format on win32: plain files accessed through
+   the synchronous raw_* callbacks of this file. */
+static BlockDriver bdrv_raw = {
+    .format_name	= "raw",
+    .instance_size	= sizeof(BDRVRawState),
+    .bdrv_open		= raw_open,
+    .bdrv_close		= raw_close,
+    .bdrv_create	= raw_create,
+    .bdrv_flush		= raw_flush,
+    .bdrv_read		= raw_read,
+    .bdrv_write		= raw_write,
+    .bdrv_truncate	= raw_truncate,
+    .bdrv_getlength	= raw_getlength,
+};
+
+/***********************************************/
+/* host device */
+
+/*
+ * Find the first logical drive of type DRIVE_CDROM and write its device
+ * name ("\\.\X:") into cdrom_name.
+ *
+ * Returns 0 when a CD-ROM drive was found, -1 otherwise.
+ */
+static int find_cdrom(char *cdrom_name, int cdrom_name_size)
+{
+    char drives[256];
+    char *drive;
+
+    /* the list is a sequence of NUL-terminated names ended by an empty one */
+    memset(drives, 0, sizeof(drives));
+    GetLogicalDriveStrings(sizeof(drives), drives);
+
+    for (drive = drives; drive[0] != '\0'; drive += lstrlen(drive) + 1) {
+        if (GetDriveType(drive) == DRIVE_CDROM) {
+            snprintf(cdrom_name, cdrom_name_size, "\\\\.\\%c:", drive[0]);
+            return 0;
+        }
+    }
+    return -1;
+}
+
+/*
+ * Classify filename as FTYPE_FILE, FTYPE_CD or FTYPE_HARDDISK.
+ *
+ * Only names starting with "\\.\" or "//./" are treated as devices.  For
+ * those that are not "PhysicalDrive..." names, s->drive_path is set to
+ * "<letter>:\" and the type is derived from GetDriveType().
+ */
+static int find_device_type(BlockDriverState *bs, const char *filename)
+{
+    BDRVRawState *s = bs->opaque;
+    const char *rest;
+
+    if (!strstart(filename, "\\\\.\\", &rest) &&
+        !strstart(filename, "//./", &rest))
+        return FTYPE_FILE;
+
+    if (stristart(rest, "PhysicalDrive", NULL))
+        return FTYPE_HARDDISK;
+
+    snprintf(s->drive_path, sizeof(s->drive_path), "%c:\\", rest[0]);
+    switch (GetDriveType(s->drive_path)) {
+    case DRIVE_REMOVABLE:
+    case DRIVE_FIXED:
+        return FTYPE_HARDDISK;
+    case DRIVE_CDROM:
+        return FTYPE_CD;
+    default:
+        return FTYPE_FILE;
+    }
+}
+
+/*
+ * Open a host device with CreateFile().
+ *
+ * filename: a name starting with "/dev/cdrom" selects the first CD-ROM
+ *           drive; a bare drive letter ("d:") is turned into "\\.\d:";
+ *           anything else is used unchanged.
+ * flags:    BDRV_O_* flags selecting read/write access and caching mode.
+ *
+ * Returns 0 on success, -ENOENT when no CD-ROM drive is found, -EACCES
+ * when access is denied, -1 on any other failure.
+ */
+static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVRawState *s = bs->opaque;
+    int access_flags = GENERIC_READ;
+    int create_flags = OPEN_EXISTING;
+    DWORD overlapped = FILE_ATTRIBUTE_NORMAL;
+    char device_name[64];
+
+    if (strstart(filename, "/dev/cdrom", NULL)) {
+        if (find_cdrom(device_name, sizeof(device_name)) < 0)
+            return -ENOENT;
+        filename = device_name;
+    } else if (((filename[0] >= 'a' && filename[0] <= 'z') ||
+                (filename[0] >= 'A' && filename[0] <= 'Z')) &&
+               filename[1] == ':' && filename[2] == '\0') {
+        /* transform drive letters into device name */
+        snprintf(device_name, sizeof(device_name), "\\\\.\\%c:", filename[0]);
+        filename = device_name;
+    }
+    s->type = find_device_type(bs, filename);
+
+    if ((flags & BDRV_O_ACCESS) == O_RDWR)
+        access_flags |= GENERIC_WRITE;
+
+    /* no caching: unbuffered and write-through; otherwise write-through
+       unless write-back caching was requested */
+    if (flags & BDRV_O_NOCACHE) {
+        overlapped |= FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH;
+    } else if (!(flags & BDRV_O_CACHE_WB)) {
+        overlapped |= FILE_FLAG_WRITE_THROUGH;
+    }
+
+    s->hfile = CreateFile(filename, access_flags,
+                          FILE_SHARE_READ, NULL,
+                          create_flags, overlapped, NULL);
+    if (s->hfile != INVALID_HANDLE_VALUE)
+        return 0;
+
+    return (GetLastError() == ERROR_ACCESS_DENIED) ? -EACCES : -1;
+}
+
+#if 0
+/***********************************************/
+/* removable device additional commands */
+
+/* NOTE(review): this section is compiled out.  As written raw_eject()
+   would not build: it uses 's' and 'lpBytesReturned' without declaring
+   them (only 'ret_count' is declared) and has no return statement after
+   the DeviceIoControl() calls. */
+
+static int raw_is_inserted(BlockDriverState *bs)
+{
+    return 1;
+}
+
+static int raw_media_changed(BlockDriverState *bs)
+{
+    return -ENOTSUP;
+}
+
+static int raw_eject(BlockDriverState *bs, int eject_flag)
+{
+    DWORD ret_count;
+
+    if (s->type == FTYPE_FILE)
+        return -ENOTSUP;
+    if (eject_flag) {
+        DeviceIoControl(s->hfile, IOCTL_STORAGE_EJECT_MEDIA,
+                        NULL, 0, NULL, 0, &lpBytesReturned, NULL);
+    } else {
+        DeviceIoControl(s->hfile, IOCTL_STORAGE_LOAD_MEDIA,
+                        NULL, 0, NULL, 0, &lpBytesReturned, NULL);
+    }
+}
+
+static int raw_set_locked(BlockDriverState *bs, int locked)
+{
+    return -ENOTSUP;
+}
+#endif
+
+/* Driver table for "host_device" on win32.  It shares every callback with
+   the raw file driver except bdrv_open; no create, truncate or
+   removable-media callbacks are set. */
+static BlockDriver bdrv_host_device = {
+    .format_name	= "host_device",
+    .instance_size	= sizeof(BDRVRawState),
+    .bdrv_open		= hdev_open,
+    .bdrv_close		= raw_close,
+    .bdrv_flush		= raw_flush,
+
+    .bdrv_read		= raw_read,
+    .bdrv_write	        = raw_write,
+    .bdrv_getlength	= raw_getlength,
+};
+
+/* Register the "raw" and "host_device" drivers; passed to block_init() below. */
+static void bdrv_raw_init(void)
+{
+    bdrv_register(&bdrv_raw);
+    bdrv_register(&bdrv_host_device);
+    /* no value is returned: the function is declared void */
+}
+
+block_init(bdrv_raw_init);
diff --git a/block/vmdk.c b/block/vmdk.c
new file mode 100644
index 0000000..13866e9
--- /dev/null
+++ b/block/vmdk.c
@@ -0,0 +1,833 @@
+/*
+ * Block driver for the VMDK format
+ *
+ * Copyright (c) 2004 Fabrice Bellard
+ * Copyright (c) 2005 Filip Navara
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+
+/*
+ * Image magics, compared against the first four bytes of the file after a
+ * be32_to_cpu() conversion (see vmdk_probe/vmdk_open): i.e. the file starts
+ * with the bytes "COWD" (VMDK3) or "KDMV" (VMDK4).
+ */
+#define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D')
+#define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V')
+
+/*
+ * Header of a VMDK3 image as read by vmdk_open() from file offset 4 (right
+ * after the magic).  vmdk_open() converts the fields it uses with
+ * le32_to_cpu(); only disk_sectors, granularity and l1dir_offset are
+ * consumed in this file.
+ */
+typedef struct {
+    uint32_t version;
+    uint32_t flags;
+    uint32_t disk_sectors;      /* becomes bs->total_sectors */
+    uint32_t granularity;       /* becomes cluster_sectors */
+    uint32_t l1dir_offset;      /* shifted left by 9 to get the L1 table byte offset */
+    uint32_t l1dir_size;
+    uint32_t file_sectors;
+    uint32_t cylinders;
+    uint32_t heads;
+    uint32_t sectors_per_track;
+} VMDK3Header;
+
+/*
+ * Header of a VMDK4 image, read from / written to file offset 4 (right
+ * after the magic).  The structure is packed because it is transferred
+ * to and from disk as-is; multi-byte fields are converted with the
+ * le32/le64 helpers by the code that uses them.  The *_offset fields are
+ * shifted left by 9 (i.e. multiplied by 512) wherever they are turned into
+ * byte offsets.
+ */
+typedef struct {
+    uint32_t version;
+    uint32_t flags;
+    int64_t capacity;           /* becomes bs->total_sectors */
+    int64_t granularity;        /* becomes cluster_sectors */
+    int64_t desc_offset;        /* vmdk_create stores 1 here */
+    int64_t desc_size;          /* vmdk_create stores 20 here */
+    int32_t num_gtes_per_gte;   /* becomes l2_size (entries per L2 table) */
+    int64_t rgd_offset;         /* location of the table loaded as l1_table */
+    int64_t gd_offset;          /* location of the table loaded as l1_backup_table */
+    int64_t grain_offset;       /* vmdk_create truncates the new file to this point */
+    char filler[1];
+    char check_bytes[4];        /* vmdk_create stores 0x0a 0x20 0x0d 0x0a */
+} __attribute__((packed)) VMDK4Header;
+
+/* Number of L2 tables kept in the in-memory cache (see get_cluster_offset). */
+#define L2_CACHE_SIZE 16
+
+/* Per-image driver state, reached through bs->opaque. */
+typedef struct BDRVVmdkState {
+    BlockDriverState *hd;               /* underlying file, opened with bdrv_file_open() */
+    int64_t l1_table_offset;            /* byte offset of the L1 table in hd */
+    int64_t l1_backup_table_offset;     /* byte offset of the backup L1 table, 0 if none */
+    uint32_t *l1_table;                 /* L1 table, converted to host byte order */
+    uint32_t *l1_backup_table;          /* backup L1 table, host byte order (may be NULL) */
+    unsigned int l1_size;               /* number of L1 entries */
+    uint32_t l1_entry_sectors;          /* sectors covered by one L1 entry (l2_size * cluster_sectors) */
+
+    unsigned int l2_size;               /* number of entries in one L2 table */
+    uint32_t *l2_cache;                 /* L2_CACHE_SIZE tables of l2_size entries, kept as read from disk */
+    uint32_t l2_cache_offsets[L2_CACHE_SIZE];   /* L1 entry value identifying each cached table */
+    uint32_t l2_cache_counts[L2_CACHE_SIZE];    /* hit counters used to pick the eviction victim */
+
+    unsigned int cluster_sectors;       /* sectors per cluster (grain) */
+    uint32_t parent_cid;                /* "parentCID" value read from the descriptor at open time */
+    int is_parent;                      /* set when the image was opened as another image's parent */
+} BDRVVmdkState;
+
+/*
+ * Description of a freshly allocated cluster, filled in by
+ * get_cluster_offset() and consumed by vmdk_L2update() to patch the
+ * on-disk L2 table(s).
+ */
+typedef struct VmdkMetaData {
+    uint32_t offset;            /* L2 entry value to store, already cpu_to_le32()-converted */
+    unsigned int l1_index;      /* index of the covering L1 entry */
+    unsigned int l2_index;      /* index of the entry inside its L2 table */
+    unsigned int l2_offset;     /* location of the L2 table; multiplied by 512 to get bytes */
+    int valid;                  /* non-zero when the fields above describe a pending update */
+} VmdkMetaData;
+
+/*
+ * Bookkeeping for the most recent cluster allocation; written by
+ * get_cluster_offset() and read by get_whole_cluster().
+ */
+typedef struct ActiveBDRVState{
+    BlockDriverState *hd;            // active image handler
+    uint64_t cluster_offset;         // current write offset (shifted left by 9 before use as a byte offset)
+}ActiveBDRVState;
+
+/* Single file-wide instance, shared by every open VMDK image. */
+static ActiveBDRVState activeBDRV;
+
+
+/*
+ * Format probe.
+ *
+ * buf/buf_size: the first bytes of the candidate image.
+ * filename:     unused.
+ *
+ * Returns 100 when the buffer starts with the VMDK3 or VMDK4 magic and 0
+ * otherwise (including when fewer than 4 bytes are available).
+ */
+static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    uint32_t magic;
+
+    if (buf_size < 4)
+        return 0;
+    /*
+     * Copy the bytes out instead of dereferencing a casted pointer: buf is
+     * a const byte buffer with no uint32_t alignment guarantee.
+     */
+    memcpy(&magic, buf, sizeof(magic));
+    magic = be32_to_cpu(magic);
+    if (magic == VMDK3_MAGIC ||
+        magic == VMDK4_MAGIC)
+        return 100;
+    else
+        return 0;
+}
+
+/* When defined, vmdk_is_cid_valid() really compares parent CIDs. */
+#define CHECK_CID 1
+
+#define SECTOR_SIZE 512
+/* Parenthesised so the macro stays a single operand inside larger expressions. */
+#define DESC_SIZE (20 * SECTOR_SIZE)	// 20 sectors of 512 bytes each
+#define HEADER_SIZE 512   			// first sector of 512 bytes
+
+/*
+ * Read a content ID from the text descriptor stored at byte offset 0x200
+ * of the image file.
+ *
+ * parent: non-zero to read the "parentCID" field, zero to read "CID".
+ *
+ * Returns the parsed hexadecimal value, or 0 when the descriptor cannot be
+ * read, the field is absent, or its value does not parse.
+ */
+static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent)
+{
+    BDRVVmdkState *s = bs->opaque;
+    char desc[DESC_SIZE + 1];   /* +1: room for a terminator after the raw data */
+    uint32_t cid = 0;           /* defined result when the field is missing */
+    const char *p_name, *cid_str;
+    size_t cid_str_size;
+
+    /* the descriptor offset = 0x200 */
+    if (bdrv_pread(s->hd, 0x200, desc, DESC_SIZE) != DESC_SIZE)
+        return 0;
+    /* The on-disk text need not be NUL-terminated; strstr/sscanf require it. */
+    desc[DESC_SIZE] = '\0';
+
+    /* sizeof counts the literal's NUL, which stands in for the '=' to skip */
+    if (parent) {
+        cid_str = "parentCID";
+        cid_str_size = sizeof("parentCID");
+    } else {
+        cid_str = "CID";
+        cid_str_size = sizeof("CID");
+    }
+
+    p_name = strstr(desc, cid_str);
+    /* only parse when skipping "KEY=" keeps us inside the terminated text */
+    if (p_name != NULL && p_name + cid_str_size <= desc + DESC_SIZE) {
+        sscanf(p_name + cid_str_size, "%x", &cid);
+    }
+
+    return cid;
+}
+
+/*
+ * Rewrite the "CID" value in the text descriptor stored at byte offset
+ * 0x200 of the image file.
+ *
+ * The new value is printed right after "CID=" and the text starting at
+ * "parentCID" is re-appended behind it, so the descriptor must contain a
+ * "parentCID" entry; without one the rest of the descriptor would be lost.
+ *
+ * Returns 0 on success and -1 when the descriptor cannot be read or
+ * written, or when it has no usable "parentCID"/"CID" entries.
+ */
+static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid)
+{
+    BDRVVmdkState *s = bs->opaque;
+    char desc[DESC_SIZE + 1];   /* +1: room for a terminator after the raw data */
+    char tmp_desc[DESC_SIZE];
+    char *p_name, *tmp_str;
+
+    /* the descriptor offset = 0x200 */
+    if (bdrv_pread(s->hd, 0x200, desc, DESC_SIZE) != DESC_SIZE)
+        return -1;
+    /* The on-disk text need not be NUL-terminated; strstr requires it. */
+    desc[DESC_SIZE] = '\0';
+
+    tmp_str = strstr(desc,"parentCID");
+    if (tmp_str == NULL)
+        return -1;              /* nothing to re-append: refuse to truncate */
+    pstrcpy(tmp_desc, sizeof(tmp_desc), tmp_str);
+    if ((p_name = strstr(desc,"CID")) != NULL) {
+        /* sizeof counts the literal's NUL, which stands in for the '=' */
+        p_name += sizeof("CID");
+        if (p_name >= desc + DESC_SIZE)
+            return -1;          /* no room left for a value */
+        /* all string work is bounded by the DESC_SIZE bytes written back */
+        snprintf(p_name, DESC_SIZE - (p_name - desc), "%x\n", cid);
+        pstrcat(desc, DESC_SIZE, tmp_desc);
+    }
+
+    if (bdrv_pwrite(s->hd, 0x200, desc, DESC_SIZE) != DESC_SIZE)
+        return -1;
+    return 0;
+}
+
+/*
+ * Check that the parentCID remembered for this image still equals the CID
+ * currently stored in its parent image.
+ *
+ * Returns 1 when the CIDs match, when the image has no parent, or when
+ * CHECK_CID is not defined; returns 0 on a mismatch.
+ */
+static int vmdk_is_cid_valid(BlockDriverState *bs)
+{
+#ifdef CHECK_CID
+    BDRVVmdkState *state = bs->opaque;
+    BlockDriverState *parent = state->hd->backing_hd;
+
+    /* the parent's own CID is only read when a parent exists */
+    if (parent != NULL && state->parent_cid != vmdk_read_cid(parent, 0))
+        return 0;       /* CID not valid */
+#endif
+    return 1;           /* CID valid */
+}
+
+/*
+ * Create the image `filename` as a snapshot on top of `backing_file`.
+ *
+ * The first 512-byte sector (header) of the backing file is copied, the
+ * new file is extended to the header's grain offset, a new descriptor is
+ * written at offset 0x200 whose CID and parentCID both equal the backing
+ * file's CID, and both grain directory tables are copied over.
+ *
+ * Returns 0 on success and -1 on any failure, including a backing
+ * descriptor without a parsable "CID" entry.  The partially written
+ * output file is not removed on failure.
+ */
+static int vmdk_snapshot_create(const char *filename, const char *backing_file)
+{
+    int snp_fd, p_fd;
+    uint32_t p_cid;
+    /* NULL-initialised so the single failure path may free both safely */
+    char *p_name, *gd_buf = NULL, *rgd_buf = NULL;
+    const char *real_filename, *temp_str;
+    VMDK4Header header;
+    uint32_t gde_entries, gd_size;
+    int64_t gd_offset, rgd_offset, capacity, gt_size;
+    /* p_desc has one extra byte for a terminator after the raw data */
+    char p_desc[DESC_SIZE + 1], s_desc[DESC_SIZE], hdr[HEADER_SIZE];
+    static const char desc_template[] =
+    "# Disk DescriptorFile\n"
+    "version=1\n"
+    "CID=%x\n"
+    "parentCID=%x\n"
+    "createType=\"monolithicSparse\"\n"
+    "parentFileNameHint=\"%s\"\n"
+    "\n"
+    "# Extent description\n"
+    "RW %u SPARSE \"%s\"\n"
+    "\n"
+    "# The Disk Data Base \n"
+    "#DDB\n"
+    "\n";
+
+    snp_fd = open(filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE, 0644);
+    if (snp_fd < 0)
+        return -1;
+    p_fd = open(backing_file, O_RDONLY | O_BINARY | O_LARGEFILE);
+    if (p_fd < 0) {
+        close(snp_fd);
+        return -1;
+    }
+
+    /* read the header */
+    if (lseek(p_fd, 0x0, SEEK_SET) == -1)
+        goto fail;
+    if (read(p_fd, hdr, HEADER_SIZE) != HEADER_SIZE)
+        goto fail;
+
+    /* write the header */
+    if (lseek(snp_fd, 0x0, SEEK_SET) == -1)
+        goto fail;
+    if (write(snp_fd, hdr, HEADER_SIZE) == -1)
+        goto fail;
+
+    memset(&header, 0, sizeof(header));
+    memcpy(&header,&hdr[4], sizeof(header)); // skip the VMDK4_MAGIC
+
+    /* reserve the metadata area; a failure here used to go unnoticed */
+    if (ftruncate(snp_fd, header.grain_offset << 9) < 0)
+        goto fail;
+    /* the descriptor offset = 0x200 */
+    if (lseek(p_fd, 0x200, SEEK_SET) == -1)
+        goto fail;
+    if (read(p_fd, p_desc, DESC_SIZE) != DESC_SIZE)
+        goto fail;
+    /* The on-disk text need not be NUL-terminated; strstr/sscanf require it. */
+    p_desc[DESC_SIZE] = '\0';
+
+    /*
+     * The parent's CID becomes both CID and parentCID of the snapshot, so
+     * it must actually be found and parsed; sizeof("CID") skips "CID=".
+     */
+    p_name = strstr(p_desc,"CID");
+    if (p_name == NULL || p_name + sizeof("CID") > p_desc + DESC_SIZE)
+        goto fail;
+    if (sscanf(p_name + sizeof("CID"),"%x",&p_cid) != 1)
+        goto fail;
+
+    /* strip any directory or drive prefix from the extent name */
+    real_filename = filename;
+    if ((temp_str = strrchr(real_filename, '\\')) != NULL)
+        real_filename = temp_str + 1;
+    if ((temp_str = strrchr(real_filename, '/')) != NULL)
+        real_filename = temp_str + 1;
+    if ((temp_str = strrchr(real_filename, ':')) != NULL)
+        real_filename = temp_str + 1;
+
+    snprintf(s_desc, sizeof(s_desc), desc_template, p_cid, p_cid, backing_file,
+             (uint32_t)header.capacity, real_filename);
+
+    /* write the descriptor */
+    if (lseek(snp_fd, 0x200, SEEK_SET) == -1)
+        goto fail;
+    if (write(snp_fd, s_desc, strlen(s_desc)) == -1)
+        goto fail;
+
+    gd_offset = header.gd_offset * SECTOR_SIZE;     // offset of GD table
+    rgd_offset = header.rgd_offset * SECTOR_SIZE;   // offset of RGD table
+    capacity = header.capacity * SECTOR_SIZE;       // Extent size
+    /*
+     * Each GDE span 32M disk, means:
+     * 512 GTE per GT, each GTE points to grain
+     */
+    gt_size = (int64_t)header.num_gtes_per_gte * header.granularity * SECTOR_SIZE;
+    if (!gt_size)
+        goto fail;
+    gde_entries = (uint32_t)(capacity / gt_size);  // number of gde/rgde
+    gd_size = gde_entries * sizeof(uint32_t);
+
+    /* write RGD */
+    rgd_buf = qemu_malloc(gd_size);
+    if (lseek(p_fd, rgd_offset, SEEK_SET) == -1)
+        goto fail;
+    if (read(p_fd, rgd_buf, gd_size) != gd_size)
+        goto fail;
+    if (lseek(snp_fd, rgd_offset, SEEK_SET) == -1)
+        goto fail;
+    if (write(snp_fd, rgd_buf, gd_size) == -1)
+        goto fail;
+    qemu_free(rgd_buf);
+    rgd_buf = NULL;     /* already released: keep the failure path from freeing it again */
+
+    /* write GD */
+    gd_buf = qemu_malloc(gd_size);
+    if (lseek(p_fd, gd_offset, SEEK_SET) == -1)
+        goto fail;
+    if (read(p_fd, gd_buf, gd_size) != gd_size)
+        goto fail;
+    if (lseek(snp_fd, gd_offset, SEEK_SET) == -1)
+        goto fail;
+    if (write(snp_fd, gd_buf, gd_size) == -1)
+        goto fail;
+    qemu_free(gd_buf);
+
+    close(p_fd);
+    close(snp_fd);
+    return 0;
+
+    fail:
+    /* either pointer may be NULL here */
+    qemu_free(gd_buf);
+    qemu_free(rgd_buf);
+    close(p_fd);
+    close(snp_fd);
+    return -1;
+}
+
+/* Close the backing (parent) image attached to bs, if there is one. */
+static void vmdk_parent_close(BlockDriverState *bs)
+{
+    BlockDriverState *parent = bs->backing_hd;
+
+    if (!parent)
+        return;
+    bdrv_close(parent);
+}
+
+/*
+ * Set to 1 only while vmdk_parent_open() is inside bdrv_open() for a
+ * parent image; vmdk_open() tests it to force read-only mode and to mark
+ * the image as a parent.
+ */
+static int parent_open = 0;
+
+/*
+ * Open the parent image named by the "parentFileNameHint" entry of the
+ * descriptor (stored at byte offset 0x200), if the entry exists.
+ *
+ * The name is stored in s->hd->backing_file.  When stat() cannot find it
+ * as given, it is combined with `filename` through path_combine().  The
+ * parent is attached as s->hd->backing_hd and opened read-only.
+ *
+ * Returns 0 when there is no parent or it was opened, and -1 on a read
+ * error, a malformed or over-long name, or a failed open (s->hd is closed
+ * in the last case).
+ */
+static int vmdk_parent_open(BlockDriverState *bs, const char * filename)
+{
+    BDRVVmdkState *s = bs->opaque;
+    char *p_name, *end_name;
+    char desc[DESC_SIZE + 1];   /* +1: room for a terminator after the raw data */
+    char parent_img_name[1024];
+    struct stat file_buf;
+    int ret;
+
+    /* the descriptor offset = 0x200 */
+    if (bdrv_pread(s->hd, 0x200, desc, DESC_SIZE) != DESC_SIZE)
+        return -1;
+    /* The on-disk text need not be NUL-terminated; strstr/strchr require it. */
+    desc[DESC_SIZE] = '\0';
+
+    p_name = strstr(desc,"parentFileNameHint");
+    if (p_name == NULL)
+        return 0;       /* no parent */
+
+    /*
+     * Skip `parentFileNameHint="`: sizeof counts the literal's NUL (standing
+     * in for '=') and the extra 1 skips the opening quote.  Make sure that
+     * much text is actually there first.
+     */
+    if (strlen(p_name) < sizeof("parentFileNameHint") + 1)
+        return -1;
+    p_name += sizeof("parentFileNameHint") + 1;
+    if ((end_name = strchr(p_name,'\"')) == NULL)
+        return -1;
+    if ((end_name - p_name) > sizeof (s->hd->backing_file) - 1)
+        return -1;
+
+    pstrcpy(s->hd->backing_file, end_name - p_name + 1, p_name);
+    if (stat(s->hd->backing_file, &file_buf) != 0) {
+        path_combine(parent_img_name, sizeof(parent_img_name),
+                     filename, s->hd->backing_file);
+    } else {
+        pstrcpy(parent_img_name, sizeof(parent_img_name),
+                s->hd->backing_file);
+    }
+
+    s->hd->backing_hd = bdrv_new("");
+    if (!s->hd->backing_hd) {
+        bdrv_close(s->hd);
+        return -1;
+    }
+
+    parent_open = 1;
+    ret = bdrv_open(s->hd->backing_hd, parent_img_name, BDRV_O_RDONLY);
+    /*
+     * Reset the flag on failure as well; otherwise every later vmdk_open()
+     * would be treated as a parent open.
+     */
+    parent_open = 0;
+    if (ret < 0) {
+        bdrv_close(s->hd);
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Open a VMDK3 or VMDK4 image.
+ *
+ * Opens the underlying file, reads the magic and the matching header,
+ * derives the table geometry, opens the parent image (VMDK4 only), loads
+ * the L1 table and, when present, the backup L1 table, and allocates the
+ * L2 cache.
+ *
+ * Returns 0 on success, the bdrv_file_open() error if the file cannot be
+ * opened, and -1 on any other failure.
+ */
+static int vmdk_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVVmdkState *s = bs->opaque;
+    uint32_t magic;
+    int l1_size, i, ret;
+
+    if (parent_open)
+        // Parent must be opened as RO.
+        flags = BDRV_O_RDONLY;
+
+    ret = bdrv_file_open(&s->hd, filename, flags);
+    if (ret < 0)
+        return ret;
+    if (bdrv_pread(s->hd, 0, &magic, sizeof(magic)) != sizeof(magic))
+        goto fail;
+
+    magic = be32_to_cpu(magic);
+    if (magic == VMDK3_MAGIC) {
+        VMDK3Header header;
+
+        if (bdrv_pread(s->hd, sizeof(magic), &header, sizeof(header)) != sizeof(header))
+            goto fail;
+        s->cluster_sectors = le32_to_cpu(header.granularity);
+        s->l2_size = 1 << 9;
+        s->l1_size = 1 << 6;
+        bs->total_sectors = le32_to_cpu(header.disk_sectors);
+        s->l1_table_offset = le32_to_cpu(header.l1dir_offset) << 9;
+        s->l1_backup_table_offset = 0;
+        s->l1_entry_sectors = s->l2_size * s->cluster_sectors;
+        /*
+         * Same sanity check as the VMDK4 branch: this value (and
+         * cluster_sectors, which is zero whenever the product is) is used
+         * as a divisor on every read and write.
+         */
+        if (s->l1_entry_sectors == 0)
+            goto fail;
+    } else if (magic == VMDK4_MAGIC) {
+        VMDK4Header header;
+
+        if (bdrv_pread(s->hd, sizeof(magic), &header, sizeof(header)) != sizeof(header))
+            goto fail;
+        bs->total_sectors = le64_to_cpu(header.capacity);
+        s->cluster_sectors = le64_to_cpu(header.granularity);
+        s->l2_size = le32_to_cpu(header.num_gtes_per_gte);
+        s->l1_entry_sectors = s->l2_size * s->cluster_sectors;
+        if (s->l1_entry_sectors <= 0)
+            goto fail;
+        s->l1_size = (bs->total_sectors + s->l1_entry_sectors - 1)
+            / s->l1_entry_sectors;
+        s->l1_table_offset = le64_to_cpu(header.rgd_offset) << 9;
+        s->l1_backup_table_offset = le64_to_cpu(header.gd_offset) << 9;
+
+        if (parent_open)
+            s->is_parent = 1;
+        else
+            s->is_parent = 0;
+
+        // try to open parent images, if exist
+        if (vmdk_parent_open(bs, filename) != 0)
+            goto fail;
+        // remember the parentCID recorded in this image's descriptor
+        s->parent_cid = vmdk_read_cid(bs,1);
+    } else {
+        goto fail;
+    }
+
+    /* read the L1 table */
+    l1_size = s->l1_size * sizeof(uint32_t);
+    s->l1_table = qemu_malloc(l1_size);
+    if (bdrv_pread(s->hd, s->l1_table_offset, s->l1_table, l1_size) != l1_size)
+        goto fail;
+    for(i = 0; i < s->l1_size; i++) {
+        le32_to_cpus(&s->l1_table[i]);
+    }
+
+    if (s->l1_backup_table_offset) {
+        s->l1_backup_table = qemu_malloc(l1_size);
+        if (bdrv_pread(s->hd, s->l1_backup_table_offset, s->l1_backup_table, l1_size) != l1_size)
+            goto fail;
+        for(i = 0; i < s->l1_size; i++) {
+            le32_to_cpus(&s->l1_backup_table[i]);
+        }
+    }
+
+    s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint32_t));
+    return 0;
+ fail:
+    /* tables that were never allocated are passed to qemu_free() as-is */
+    qemu_free(s->l1_backup_table);
+    qemu_free(s->l1_table);
+    qemu_free(s->l2_cache);
+    bdrv_delete(s->hd);
+    return -1;
+}
+
+static uint64_t get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
+                                   uint64_t offset, int allocate);
+
+/*
+ * Copy-on-write helper, called by get_cluster_offset() when a cluster is
+ * being allocated.
+ *
+ * If the image has a parent and the parent maps the cluster containing
+ * `offset`, the parent's cluster is read and written to the location
+ * recorded in the file-wide activeBDRV state (the most recently allocated
+ * cluster of the active image).
+ *
+ * cluster_offset is unused.  `allocate` is forwarded to the lookup in the
+ * parent.
+ *
+ * Returns 0 on success or when there is nothing to copy, and -1 on a CID
+ * mismatch, an I/O error, or when the parent's cluster does not fit the
+ * local buffer.
+ *
+ * NOTE(review): when the parent's cluster is smaller than this image's,
+ * the tail of whole_grain is written out uninitialised; the code appears
+ * to assume equal cluster sizes -- confirm.
+ */
+static int get_whole_cluster(BlockDriverState *bs, uint64_t cluster_offset,
+                             uint64_t offset, int allocate)
+{
+    uint64_t parent_cluster_offset;
+    BDRVVmdkState *s = bs->opaque;
+    uint8_t  whole_grain[s->cluster_sectors*512];        // 128 sectors * 512 bytes each = grain size 64KB
+
+    // we will be here if it's first write on non-exist grain(cluster).
+    // try to read from parent image, if exist
+    if (s->hd->backing_hd) {
+        BDRVVmdkState *ps = s->hd->backing_hd->opaque;
+
+        if (!vmdk_is_cid_valid(bs))
+            return -1;
+
+        parent_cluster_offset = get_cluster_offset(s->hd->backing_hd, NULL, offset, allocate);
+
+        if (parent_cluster_offset) {
+            BDRVVmdkState *act_s = activeBDRV.hd->opaque;
+
+            /*
+             * whole_grain is sized from this image's cluster size; a
+             * larger parent cluster would overrun it.
+             */
+            if (ps->cluster_sectors > s->cluster_sectors)
+                return -1;
+
+            if (bdrv_pread(ps->hd, parent_cluster_offset, whole_grain, ps->cluster_sectors*512) != ps->cluster_sectors*512)
+                return -1;
+
+            //Write grain only into the active image
+            if (bdrv_pwrite(act_s->hd, activeBDRV.cluster_offset << 9, whole_grain, sizeof(whole_grain)) != sizeof(whole_grain))
+                return -1;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Store the L2 entry described by m_data in the on-disk L2 table and, when
+ * the image has a backup L1 table, in the matching backup L2 table too.
+ *
+ * m_data->l2_offset is overwritten with the backup table's location in
+ * the second case.
+ *
+ * Returns 0 on success and -1 when a write fails.
+ */
+static int vmdk_L2update(BlockDriverState *bs, VmdkMetaData *m_data)
+{
+    BDRVVmdkState *s = bs->opaque;
+    const size_t entry_size = sizeof(m_data->offset);
+    int64_t entry_pos;
+
+    /* primary L2 table */
+    entry_pos = ((int64_t)m_data->l2_offset * 512) + (m_data->l2_index * entry_size);
+    if (bdrv_pwrite(s->hd, entry_pos, &(m_data->offset), entry_size) != entry_size)
+        return -1;
+
+    if (s->l1_backup_table_offset == 0)
+        return 0;
+
+    /* backup L2 table, located through the backup L1 table */
+    m_data->l2_offset = s->l1_backup_table[m_data->l1_index];
+    entry_pos = ((int64_t)m_data->l2_offset * 512) + (m_data->l2_index * entry_size);
+    if (bdrv_pwrite(s->hd, entry_pos, &(m_data->offset), entry_size) != entry_size)
+        return -1;
+
+    return 0;
+}
+
+/*
+ * Translate a guest byte offset into the byte offset, inside s->hd, of the
+ * cluster that holds it.
+ *
+ * bs:       VMDK image to look the offset up in.
+ * m_data:   optional; its `valid` flag is cleared on entry and, when the
+ *           allocation branch runs to completion, the structure is filled
+ *           in so that the caller can update the on-disk L2 table(s) with
+ *           vmdk_L2update().
+ * offset:   guest offset in bytes.
+ * allocate: non-zero to allocate a cluster when none is mapped.
+ *
+ * Returns the cluster's byte offset, or 0 when the offset lies beyond the
+ * L1 table, the L1 or L2 entry is empty and nothing was allocated, or an
+ * L2 table read or the copy from the parent image failed.
+ *
+ * NOTE(review): the results of bdrv_getlength() and bdrv_truncate() are
+ * not checked.
+ * NOTE(review): when s->is_parent is set and `allocate` is non-zero,
+ * nothing is allocated, yet m_data is still marked valid (with offset 0)
+ * and 0 is returned -- confirm that callers rely on the return value only.
+ */
+static uint64_t get_cluster_offset(BlockDriverState *bs, VmdkMetaData *m_data,
+                                   uint64_t offset, int allocate)
+{
+    BDRVVmdkState *s = bs->opaque;
+    unsigned int l1_index, l2_offset, l2_index;
+    int min_index, i, j;
+    uint32_t min_count, *l2_table, tmp = 0;
+    uint64_t cluster_offset;
+
+    if (m_data)
+        m_data->valid = 0;
+
+    /* locate the L2 table through the L1 table */
+    l1_index = (offset >> 9) / s->l1_entry_sectors;
+    if (l1_index >= s->l1_size)
+        return 0;
+    l2_offset = s->l1_table[l1_index];
+    if (!l2_offset)
+        return 0;
+    /* look the L2 table up in the cache, keyed by its L1 entry value */
+    for(i = 0; i < L2_CACHE_SIZE; i++) {
+        if (l2_offset == s->l2_cache_offsets[i]) {
+            /* increment the hit count */
+            if (++s->l2_cache_counts[i] == 0xffffffff) {
+                /* halve every counter so that none of them wraps around */
+                for(j = 0; j < L2_CACHE_SIZE; j++) {
+                    s->l2_cache_counts[j] >>= 1;
+                }
+            }
+            l2_table = s->l2_cache + (i * s->l2_size);
+            goto found;
+        }
+    }
+    /* not found: load a new entry in the least used one */
+    min_index = 0;
+    min_count = 0xffffffff;
+    for(i = 0; i < L2_CACHE_SIZE; i++) {
+        if (s->l2_cache_counts[i] < min_count) {
+            min_count = s->l2_cache_counts[i];
+            min_index = i;
+        }
+    }
+    l2_table = s->l2_cache + (min_index * s->l2_size);
+    if (bdrv_pread(s->hd, (int64_t)l2_offset * 512, l2_table, s->l2_size * sizeof(uint32_t)) !=
+                                                                        s->l2_size * sizeof(uint32_t))
+        return 0;
+
+    s->l2_cache_offsets[min_index] = l2_offset;
+    s->l2_cache_counts[min_index] = 1;
+ found:
+    /* cached tables are kept as read from disk, hence the conversion */
+    l2_index = ((offset >> 9) / s->cluster_sectors) % s->l2_size;
+    cluster_offset = le32_to_cpu(l2_table[l2_index]);
+
+    if (!cluster_offset) {
+        if (!allocate)
+            return 0;
+        // Avoid the L2 tables update for the images that have snapshots.
+        if (!s->is_parent) {
+            /* the new cluster is appended at the current end of the file */
+            cluster_offset = bdrv_getlength(s->hd);
+            bdrv_truncate(s->hd, cluster_offset + (s->cluster_sectors << 9));
+
+            /* from here on cluster_offset is counted in 512-byte sectors */
+            cluster_offset >>= 9;
+            tmp = cpu_to_le32(cluster_offset);
+            /* only the cached copy is updated here, not the on-disk table */
+            l2_table[l2_index] = tmp;
+            // Save the active image state
+            activeBDRV.cluster_offset = cluster_offset;
+            activeBDRV.hd = bs;
+        }
+        /* First of all we write grain itself, to avoid race condition
+         * that may to corrupt the image.
+         * This problem may occur because of insufficient space on host disk
+         * or inappropriate VM shutdown.
+         */
+        if (get_whole_cluster(bs, cluster_offset, offset, allocate) == -1)
+            return 0;
+
+        if (m_data) {
+            m_data->offset = tmp;
+            m_data->l1_index = l1_index;
+            m_data->l2_index = l2_index;
+            m_data->l2_offset = l2_offset;
+            m_data->valid = 1;
+        }
+    }
+    /* convert from sectors back to a byte offset */
+    cluster_offset <<= 9;
+    return cluster_offset;
+}
+
+/*
+ * Report whether the cluster holding sector_num is mapped in this image.
+ *
+ * *pnum receives the number of sectors, starting at sector_num and capped
+ * at nb_sectors, that lie in that same cluster.
+ *
+ * Returns 1 when the cluster is mapped and 0 otherwise.
+ */
+static int vmdk_is_allocated(BlockDriverState *bs, int64_t sector_num,
+                             int nb_sectors, int *pnum)
+{
+    BDRVVmdkState *s = bs->opaque;
+    uint64_t mapped = get_cluster_offset(bs, NULL, sector_num << 9, 0);
+    int first = sector_num % s->cluster_sectors;    /* position inside the cluster */
+    int in_cluster = s->cluster_sectors - first;    /* sectors up to the cluster end */
+
+    *pnum = (in_cluster > nb_sectors) ? nb_sectors : in_cluster;
+    return (mapped != 0);
+}
+
+/*
+ * Read nb_sectors sectors starting at sector_num into buf.
+ *
+ * The request is served cluster by cluster: mapped clusters are read from
+ * the image file; unmapped ones are read from the parent image when there
+ * is one (after a CID check) and are zero-filled otherwise.
+ *
+ * Returns 0 on success and -1 on an I/O error or a CID mismatch.
+ */
+static int vmdk_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
+{
+    BDRVVmdkState *s = bs->opaque;
+    int index_in_cluster, n;
+    uint64_t cluster_offset;
+
+    for (; nb_sectors > 0; nb_sectors -= n, sector_num += n, buf += n * 512) {
+        cluster_offset = get_cluster_offset(bs, NULL, sector_num << 9, 0);
+        index_in_cluster = sector_num % s->cluster_sectors;
+
+        /* never cross a cluster boundary within one step */
+        n = s->cluster_sectors - index_in_cluster;
+        if (n > nb_sectors)
+            n = nb_sectors;
+
+        if (cluster_offset) {
+            /* mapped: the data lives in this image */
+            if(bdrv_pread(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512) != n * 512)
+                return -1;
+            continue;
+        }
+
+        if (!s->hd->backing_hd) {
+            /* unmapped and no parent: reads as zeroes */
+            memset(buf, 0, 512 * n);
+            continue;
+        }
+
+        /* unmapped: try to read from parent image */
+        if (!vmdk_is_cid_valid(bs))
+            return -1;
+        if (bdrv_read(s->hd->backing_hd, sector_num, buf, n) < 0)
+            return -1;
+    }
+    return 0;
+}
+
+/*
+ * Write nb_sectors sectors from buf starting at sector_num.
+ *
+ * The request is served cluster by cluster; clusters are allocated on
+ * demand through get_cluster_offset().  When a cluster was newly
+ * allocated, its data is written first and the on-disk L2 table(s) are
+ * updated afterwards.  After the first successful step the CID in the
+ * descriptor is replaced with the current time.
+ *
+ * Returns 0 on success and -1 when sector_num lies beyond the end of the
+ * disk, no cluster could be obtained, or an I/O operation failed.
+ *
+ * NOTE(review): cid_update is a function-level static, so the CID is
+ * rewritten once per process rather than once per opened image, and the
+ * result of vmdk_write_cid() is ignored.
+ */
+static int vmdk_write(BlockDriverState *bs, int64_t sector_num,
+                     const uint8_t *buf, int nb_sectors)
+{
+    BDRVVmdkState *s = bs->opaque;
+    VmdkMetaData m_data;
+    int index_in_cluster, n;
+    uint64_t cluster_offset;
+    static int cid_update = 0;
+
+    if (sector_num > bs->total_sectors) {
+        fprintf(stderr,
+                "(VMDK) Wrong offset: sector_num=0x%" PRIx64
+                " total_sectors=0x%" PRIx64 "\n",
+                sector_num, bs->total_sectors);
+        return -1;
+    }
+
+    while (nb_sectors > 0) {
+        /*
+         * Modulo, as in vmdk_read() and vmdk_is_allocated(): a mask with
+         * (cluster_sectors - 1) is only right for power-of-two sizes, and
+         * the size comes straight from the image header.
+         */
+        index_in_cluster = sector_num % s->cluster_sectors;
+        n = s->cluster_sectors - index_in_cluster;
+        if (n > nb_sectors)
+            n = nb_sectors;
+        cluster_offset = get_cluster_offset(bs, &m_data, sector_num << 9, 1);
+        if (!cluster_offset)
+            return -1;
+
+        if (bdrv_pwrite(s->hd, cluster_offset + index_in_cluster * 512, buf, n * 512) != n * 512)
+            return -1;
+        if (m_data.valid) {
+            /* update L2 tables */
+            if (vmdk_L2update(bs, &m_data) == -1)
+                return -1;
+        }
+        nb_sectors -= n;
+        sector_num += n;
+        buf += n * 512;
+
+        // update CID on the first write every time the virtual disk is opened
+        if (!cid_update) {
+            vmdk_write_cid(bs, time(NULL));
+            cid_update++;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Create a new VMDK4 image.
+ *
+ * filename:     image to create (truncated if it already exists).
+ * total_size:   stored as the header's capacity and in the descriptor's
+ *               extent line.
+ * backing_file: when non-NULL the work is delegated to
+ *               vmdk_snapshot_create().
+ * flags:        BLOCK_FLAG_COMPAT6 selects virtualHWVersion 6 instead of 4.
+ *
+ * The file receives the magic, the header, two grain directories whose
+ * entries point at consecutive grain tables, and the text descriptor.
+ * All layout arithmetic is done in host byte order; values are converted
+ * to little-endian only when they are stored.
+ *
+ * Returns 0 on success and -1 when the file cannot be created or any
+ * write, seek or truncate fails (the partial file is left in place).
+ */
+static int vmdk_create(const char *filename, int64_t total_size,
+                       const char *backing_file, int flags)
+{
+    int fd, i;
+    VMDK4Header header;
+    uint32_t tmp, entry, magic, grains, gd_size, gt_size, gt_count;
+    /* host-order copies of the header fields used for layout arithmetic */
+    const int64_t granularity = 128;
+    const int32_t num_gtes_per_gte = 512;
+    int64_t desc_offset, desc_size, rgd_offset, gd_offset, grain_offset;
+    static const char desc_template[] =
+        "# Disk DescriptorFile\n"
+        "version=1\n"
+        "CID=%x\n"
+        "parentCID=ffffffff\n"
+        "createType=\"monolithicSparse\"\n"
+        "\n"
+        "# Extent description\n"
+        "RW %" PRId64 " SPARSE \"%s\"\n"
+        "\n"
+        "# The Disk Data Base \n"
+        "#DDB\n"
+        "\n"
+        "ddb.virtualHWVersion = \"%d\"\n"
+        "ddb.geometry.cylinders = \"%" PRId64 "\"\n"
+        "ddb.geometry.heads = \"16\"\n"
+        "ddb.geometry.sectors = \"63\"\n"
+        "ddb.adapterType = \"ide\"\n";
+    char desc[1024];
+    const char *real_filename, *temp_str;
+
+    /* images with a backing file are created as snapshots */
+    if (backing_file) {
+        return vmdk_snapshot_create(filename, backing_file);
+    }
+
+    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
+              0644);
+    if (fd < 0)
+        return -1;
+
+    /* sizes, in sectors, of one grain table and of one grain directory */
+    grains = (total_size + granularity - 1) / granularity;
+    gt_size = ((num_gtes_per_gte * sizeof(uint32_t)) + 511) >> 9;
+    gt_count = (grains + num_gtes_per_gte - 1) / num_gtes_per_gte;
+    gd_size = (gt_count * sizeof(uint32_t) + 511) >> 9;
+
+    /* file layout, in sectors */
+    desc_offset = 1;
+    desc_size = 20;
+    rgd_offset = desc_offset + desc_size;
+    gd_offset = rgd_offset + gd_size + (gt_size * gt_count);
+    grain_offset =
+       ((gd_offset + gd_size + (gt_size * gt_count) +
+         granularity - 1) / granularity) *
+        granularity;
+
+    magic = cpu_to_be32(VMDK4_MAGIC);
+    memset(&header, 0, sizeof(header));
+    header.version = cpu_to_le32(1);
+    header.flags = cpu_to_le32(3); /* ?? */
+    header.capacity = cpu_to_le64(total_size);
+    header.granularity = cpu_to_le64(granularity);
+    header.num_gtes_per_gte = cpu_to_le32(num_gtes_per_gte);
+    header.desc_offset = cpu_to_le64(desc_offset);
+    header.desc_size = cpu_to_le64(desc_size);
+    header.rgd_offset = cpu_to_le64(rgd_offset);
+    header.gd_offset = cpu_to_le64(gd_offset);
+    header.grain_offset = cpu_to_le64(grain_offset);
+
+    header.check_bytes[0] = 0xa;
+    header.check_bytes[1] = 0x20;
+    header.check_bytes[2] = 0xd;
+    header.check_bytes[3] = 0xa;
+
+    /* write all the data */
+    if (write(fd, &magic, sizeof(magic)) != (ssize_t)sizeof(magic))
+        goto fail;
+    if (write(fd, &header, sizeof(header)) != (ssize_t)sizeof(header))
+        goto fail;
+
+    if (ftruncate(fd, grain_offset << 9) < 0)
+        goto fail;
+
+    /* write grain directory */
+    if (lseek(fd, rgd_offset << 9, SEEK_SET) == -1)
+        goto fail;
+    for (i = 0, tmp = rgd_offset + gd_size;
+         i < gt_count; i++, tmp += gt_size) {
+        entry = cpu_to_le32(tmp);
+        if (write(fd, &entry, sizeof(entry)) != (ssize_t)sizeof(entry))
+            goto fail;
+    }
+
+    /* write backup grain directory */
+    if (lseek(fd, gd_offset << 9, SEEK_SET) == -1)
+        goto fail;
+    for (i = 0, tmp = gd_offset + gd_size;
+         i < gt_count; i++, tmp += gt_size) {
+        entry = cpu_to_le32(tmp);
+        if (write(fd, &entry, sizeof(entry)) != (ssize_t)sizeof(entry))
+            goto fail;
+    }
+
+    /* compose the descriptor; the extent name carries no directory part */
+    real_filename = filename;
+    if ((temp_str = strrchr(real_filename, '\\')) != NULL)
+        real_filename = temp_str + 1;
+    if ((temp_str = strrchr(real_filename, '/')) != NULL)
+        real_filename = temp_str + 1;
+    if ((temp_str = strrchr(real_filename, ':')) != NULL)
+        real_filename = temp_str + 1;
+    snprintf(desc, sizeof(desc), desc_template, (unsigned int)time(NULL),
+             total_size, real_filename,
+             (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4),
+             total_size / (int64_t)(63 * 16));
+
+    /* write the descriptor */
+    if (lseek(fd, desc_offset << 9, SEEK_SET) == -1)
+        goto fail;
+    if (write(fd, desc, strlen(desc)) != (ssize_t)strlen(desc))
+        goto fail;
+
+    close(fd);
+    return 0;
+
+ fail:
+    close(fd);
+    return -1;
+}
+
+/*
+ * Release everything vmdk_open() set up: the L1 table, the backup L1
+ * table, the L2 cache, the parent image (if any) and the underlying file.
+ */
+static void vmdk_close(BlockDriverState *bs)
+{
+    BDRVVmdkState *s = bs->opaque;
+
+    qemu_free(s->l1_table);
+    /* allocated by vmdk_open() when the image has a backup table */
+    qemu_free(s->l1_backup_table);
+    qemu_free(s->l2_cache);
+    // try to close parent image, if exist
+    vmdk_parent_close(s->hd);
+    bdrv_delete(s->hd);
+}
+
+/* Flush callback: forwards the request to the underlying image file. */
+static void vmdk_flush(BlockDriverState *bs)
+{
+    bdrv_flush(((BDRVVmdkState *)bs->opaque)->hd);
+}
+
+/* Driver table for the "vmdk" format; all callbacks are defined above. */
+static BlockDriver bdrv_vmdk = {
+    .format_name        = "vmdk",
+    .instance_size      = sizeof(BDRVVmdkState),
+
+    /* detection, creation, open / close */
+    .bdrv_probe         = vmdk_probe,
+    .bdrv_create        = vmdk_create,
+    .bdrv_open          = vmdk_open,
+    .bdrv_close         = vmdk_close,
+
+    /* data path */
+    .bdrv_read          = vmdk_read,
+    .bdrv_write         = vmdk_write,
+    .bdrv_flush         = vmdk_flush,
+    .bdrv_is_allocated  = vmdk_is_allocated,
+};
+
+/* Module initialiser: registers the VMDK driver table defined above. */
+static void bdrv_vmdk_init(void)
+{
+    bdrv_register(&bdrv_vmdk);
+}
+
+/* Hand the initialiser to the block_init() hook (declared outside this file). */
+block_init(bdrv_vmdk_init);
diff --git a/block/vpc.c b/block/vpc.c
new file mode 100644
index 0000000..211ae5c
--- /dev/null
+++ b/block/vpc.c
@@ -0,0 +1,606 @@
+/*
+ * Block driver for Conectix/Microsoft Virtual PC images
+ *
+ * Copyright (c) 2005 Alex Beregszaszi
+ * Copyright (c) 2009 Kevin Wolf <kwolf@suse.de>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+
+/**************************************************************/
+
+#define HEADER_SIZE 512
+
+//#define CACHE
+
+enum vhd_type {
+    VHD_FIXED           = 2,
+    VHD_DYNAMIC         = 3,
+    VHD_DIFFERENCING    = 4,
+};
+
+// Seconds since Jan 1, 2000 0:00:00 (UTC)
+#define VHD_TIMESTAMP_BASE 946684800
+
+// always big-endian
+struct vhd_footer {
+    char        creator[8]; // "conectix"
+    uint32_t    features;
+    uint32_t    version;
+
+    // Offset of next header structure, 0xFFFFFFFF if none
+    uint64_t    data_offset;
+
+    // Seconds since Jan 1, 2000 0:00:00 (UTC)
+    uint32_t    timestamp;
+
+    char        creator_app[4]; // "vpc "
+    uint16_t    major;
+    uint16_t    minor;
+    char        creator_os[4]; // "Wi2k"
+
+    uint64_t    orig_size;
+    uint64_t    size;
+
+    uint16_t    cyls;
+    uint8_t     heads;
+    uint8_t     secs_per_cyl;
+
+    uint32_t    type;
+
+    // Checksum of the Hard Disk Footer ("one's complement of the sum of all
+    // the bytes in the footer without the checksum field")
+    uint32_t    checksum;
+
+    // UUID used to identify a parent hard disk (backing file)
+    uint8_t     uuid[16];
+
+    uint8_t     in_saved_state;
+};
+
+struct vhd_dyndisk_header {
+    char        magic[8]; // "cxsparse"
+
+    // Offset of next header structure, 0xFFFFFFFF if none
+    uint64_t    data_offset;
+
+    // Offset of the Block Allocation Table (BAT)
+    uint64_t    table_offset;
+
+    uint32_t    version;
+    uint32_t    max_table_entries; // 32bit/entry
+
+    // 2 MB by default, must be a power of two
+    uint32_t    block_size;
+
+    uint32_t    checksum;
+    uint8_t     parent_uuid[16];
+    uint32_t    parent_timestamp;
+    uint32_t    reserved;
+
+    // Backing file name (in UTF-16)
+    uint8_t     parent_name[512];
+
+    struct {
+        uint32_t    platform;
+        uint32_t    data_space;
+        uint32_t    data_length;
+        uint32_t    reserved;
+        uint64_t    data_offset;
+    } parent_locator[8];
+};
+
+typedef struct BDRVVPCState {
+    BlockDriverState *hd;
+
+    uint8_t footer_buf[HEADER_SIZE];
+    uint64_t free_data_block_offset;
+    int max_table_entries;
+    uint32_t *pagetable;
+    uint64_t bat_offset;
+    uint64_t last_bitmap_offset;
+
+    uint32_t block_size;
+    uint32_t bitmap_size;
+
+#ifdef CACHE
+    uint8_t *pageentry_u8;
+    uint32_t *pageentry_u32;
+    uint16_t *pageentry_u16;
+
+    uint64_t last_bitmap;
+#endif
+} BDRVVPCState;
+
+/* Returns the one's complement of the sum of the first size bytes of buf. */
+static uint32_t vpc_checksum(uint8_t* buf, size_t size)
+{
+    const uint8_t *end = buf + size;
+    uint32_t sum = 0;
+
+    while (buf != end)
+        sum += *buf++;
+    return ~sum;
+}
+
+
+/* Scores 100 when buf starts with the 8-byte "conectix" cookie, 0 otherwise. */
+static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename)
+{
+    int match = buf_size >= 8 && strncmp((char *)buf, "conectix", 8) == 0;
+    return match ? 100 : 0;
+}
+
+static int vpc_open(BlockDriverState *bs, const char *filename, int flags)
+{
+    BDRVVPCState *s = bs->opaque;
+    int ret, i;
+    struct vhd_footer* footer;
+    struct vhd_dyndisk_header* dyndisk_header;
+    uint8_t buf[HEADER_SIZE];
+    uint32_t checksum;
+
+    ret = bdrv_file_open(&s->hd, filename, flags);
+    if (ret < 0)
+        return ret;
+
+    if (bdrv_pread(s->hd, 0, s->footer_buf, HEADER_SIZE) != HEADER_SIZE)
+        goto fail;
+
+    footer = (struct vhd_footer*) s->footer_buf;
+    if (strncmp(footer->creator, "conectix", 8))
+        goto fail;
+
+    checksum = be32_to_cpu(footer->checksum);
+    footer->checksum = 0;
+    if (vpc_checksum(s->footer_buf, HEADER_SIZE) != checksum)
+        fprintf(stderr, "block-vpc: The header checksum of '%s' is "
+            "incorrect.\n", filename);
+
+    // The visible size of a image in Virtual PC depends on the geometry
+    // rather than on the size stored in the footer (the size in the footer
+    // is too large usually)
+    bs->total_sectors = (int64_t)
+        be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl;
+
+    if (bdrv_pread(s->hd, be64_to_cpu(footer->data_offset), buf, HEADER_SIZE)
+            != HEADER_SIZE)
+        goto fail;
+
+    dyndisk_header = (struct vhd_dyndisk_header*) buf;
+
+    if (strncmp(dyndisk_header->magic, "cxsparse", 8))
+        goto fail;
+
+
+    s->block_size = be32_to_cpu(dyndisk_header->block_size);
+    s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511;
+
+    s->max_table_entries = be32_to_cpu(dyndisk_header->max_table_entries);
+    s->pagetable = qemu_malloc(s->max_table_entries * 4);
+
+    s->bat_offset = be64_to_cpu(dyndisk_header->table_offset);
+    if (bdrv_pread(s->hd, s->bat_offset, s->pagetable,
+            s->max_table_entries * 4) != s->max_table_entries * 4)
+	    goto fail;
+
+    s->free_data_block_offset =
+        (s->bat_offset + (s->max_table_entries * 4) + 511) & ~511;
+
+    for (i = 0; i < s->max_table_entries; i++) {
+        be32_to_cpus(&s->pagetable[i]);
+        if (s->pagetable[i] != 0xFFFFFFFF) {
+            int64_t next = (512 * (int64_t) s->pagetable[i]) +
+                s->bitmap_size + s->block_size;
+
+            if (next> s->free_data_block_offset)
+                s->free_data_block_offset = next;
+        }
+    }
+
+    s->last_bitmap_offset = (int64_t) -1;
+
+#ifdef CACHE
+    s->pageentry_u8 = qemu_malloc(512);
+    s->pageentry_u32 = s->pageentry_u8;
+    s->pageentry_u16 = s->pageentry_u8;
+    s->last_pagetable = -1;
+#endif
+
+    return 0;
+ fail:
+    bdrv_delete(s->hd);
+    return -1;
+}
+
+/*
+ * Returns the absolute byte offset of the given sector in the image file.
+ * If the sector is not allocated, -1 is returned instead.
+ *
+ * The parameter write must be 1 if the offset will be used for a write
+ * operation (the block bitmaps is updated then), 0 otherwise.
+ */
+static inline int64_t get_sector_offset(BlockDriverState *bs,
+    int64_t sector_num, int write)
+{
+    BDRVVPCState *s = bs->opaque;
+    uint64_t offset = sector_num * 512;
+    uint64_t bitmap_offset, block_offset;
+    uint32_t pagetable_index, pageentry_index;
+
+    pagetable_index = offset / s->block_size;
+    pageentry_index = (offset % s->block_size) / 512;
+
+    if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff)
+        return -1; // not allocated
+
+    bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index];
+    block_offset = bitmap_offset + s->bitmap_size + (512 * pageentry_index);
+
+    // We must ensure that we don't write to any sectors which are marked as
+    // unused in the bitmap. We get away with setting all bits in the block
+    // bitmap each time we write to a new block. This might cause Virtual PC to
+    // miss sparse read optimization, but it's not a problem in terms of
+    // correctness.
+    if (write && (s->last_bitmap_offset != bitmap_offset)) {
+        uint8_t bitmap[s->bitmap_size];
+
+        s->last_bitmap_offset = bitmap_offset;
+        memset(bitmap, 0xff, s->bitmap_size);
+        bdrv_pwrite(s->hd, bitmap_offset, bitmap, s->bitmap_size);
+    }
+
+//    printf("sector: %" PRIx64 ", index: %x, offset: %x, bioff: %" PRIx64 ", bloff: %" PRIx64 "\n",
+//	sector_num, pagetable_index, pageentry_index,
+//	bitmap_offset, block_offset);
+
+// disabled by reason
+#if 0
+#ifdef CACHE
+    if (bitmap_offset != s->last_bitmap)
+    {
+	lseek(s->fd, bitmap_offset, SEEK_SET);
+
+	s->last_bitmap = bitmap_offset;
+
+	// Scary! Bitmap is stored as big endian 32bit entries,
+	// while we used to look it up byte by byte
+	read(s->fd, s->pageentry_u8, 512);
+	for (i = 0; i < 128; i++)
+	    be32_to_cpus(&s->pageentry_u32[i]);
+    }
+
+    if ((s->pageentry_u8[pageentry_index / 8] >> (pageentry_index % 8)) & 1)
+	return -1;
+#else
+    lseek(s->fd, bitmap_offset + (pageentry_index / 8), SEEK_SET);
+
+    read(s->fd, &bitmap_entry, 1);
+
+    if ((bitmap_entry >> (pageentry_index % 8)) & 1)
+	return -1; // not allocated
+#endif
+#endif
+
+    return block_offset;
+}
+
+/*
+ * Writes the cached footer (s->footer_buf) at s->free_data_block_offset,
+ * which is meant to be the end of the image file. This is needed when the
+ * file grows, as the new block overwrites the old footer
+ *
+ * Returns 0 on success and < 0 on error
+ */
+static int rewrite_footer(BlockDriverState* bs)
+{
+    BDRVVPCState *state = bs->opaque;
+    int written;
+
+    written = bdrv_pwrite(state->hd, state->free_data_block_offset,
+                          state->footer_buf, HEADER_SIZE);
+
+    // Only the sign is examined; any non-negative result counts as success
+    return (written < 0) ? written : 0;
+}
+
+/*
+ * Allocates a new block. This involves writing a new footer and updating
+ * the Block Allocation Table to use the space at the old end of the image
+ * file (overwriting the old footer). In-memory state is rolled back on error.
+ * Returns the sectors' offset in the image file on success and < 0 on error
+ */
+static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num)
+{
+    BDRVVPCState *s = bs->opaque;
+    int64_t bat_offset;
+    uint32_t index, bat_value;
+    int ret;
+    uint8_t bitmap[s->bitmap_size];
+
+    // Check if sector_num is valid (the last sector is total_sectors - 1)
+    if ((sector_num < 0) || (sector_num >= bs->total_sectors))
+        return -1;
+    // The BAT slot must exist in the loaded table and still be unallocated
+    index = (sector_num * 512) / s->block_size;
+    if (index >= s->max_table_entries || s->pagetable[index] != 0xFFFFFFFF)
+        return -1;
+
+    // Initialize the block's bitmap before any in-memory state is changed
+    memset(bitmap, 0xff, s->bitmap_size);
+    ret = bdrv_pwrite(s->hd, s->free_data_block_offset, bitmap,
+        s->bitmap_size);
+    if (ret < 0)
+        return -1;
+    // Write entry into in-memory BAT
+    s->pagetable[index] = s->free_data_block_offset / 512;
+
+    // Write new footer (the old one will be overwritten)
+    s->free_data_block_offset += s->block_size + s->bitmap_size;
+    ret = rewrite_footer(bs);
+    if (ret < 0)
+        goto fail;
+
+    // Write BAT entry to disk (the table is stored big-endian)
+    bat_offset = s->bat_offset + (4 * index);
+    bat_value = cpu_to_be32(s->pagetable[index]);
+    ret = bdrv_pwrite(s->hd, bat_offset, &bat_value, 4);
+    if (ret < 0)
+        goto fail;
+    return get_sector_offset(bs, sector_num, 0);
+fail:
+    s->pagetable[index] = 0xFFFFFFFF;
+    s->free_data_block_offset -= (s->block_size + s->bitmap_size);
+    return -1;
+}
+
+/*
+ * Reads nb_sectors sectors into buf, one sector at a time. Sectors for which
+ * get_sector_offset() returns -1 are filled with zeros. Returns 0 or -1.
+ */
+static int vpc_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
+{
+    BDRVVPCState *s = bs->opaque;
+
+    for (; nb_sectors > 0; nb_sectors--, sector_num++, buf += 512) {
+        int64_t offset = get_sector_offset(bs, sector_num, 0);
+
+        if (offset == -1) {
+            memset(buf, 0, 512);
+            continue;
+        }
+
+        // Anything but a complete sector aborts the whole request
+        if (bdrv_pread(s->hd, offset, buf, 512) != 512)
+            return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Writes nb_sectors sectors from buf, one sector at a time. A sector for
+ * which get_sector_offset() returns -1 gets its block allocated first.
+ * Returns 0 on success and -1 on error.
+ */
+static int vpc_write(BlockDriverState *bs, int64_t sector_num,
+    const uint8_t *buf, int nb_sectors)
+{
+    BDRVVPCState *s = bs->opaque;
+
+    for (; nb_sectors > 0; nb_sectors--, sector_num++, buf += 512) {
+        // write=1 asks get_sector_offset() to update the block bitmap
+        int64_t offset = get_sector_offset(bs, sector_num, 1);
+
+        if (offset == -1) {
+            offset = alloc_block(bs, sector_num);
+            if (offset < 0)
+                return -1;
+        }
+
+        // Anything but a complete sector aborts the whole request
+        if (bdrv_pwrite(s->hd, offset, buf, 512) != 512)
+            return -1;
+    }
+
+    return 0;
+}
+
+
+/*
+ * Computes the number of cylinders, heads and sectors per cylinder for a
+ * disk of total_sectors sectors, following the VHD specification's algorithm.
+ *
+ * The geometry need not match total_sectors exactly: the division by the
+ * sectors per cylinder truncates, while the cylinder count is rounded up.
+ *
+ * Returns 0 on success, -EFBIG if total_sectors exceeds 65535 * 16 * 255
+ */
+static int calculate_geometry(int64_t total_sectors, uint16_t* cyls,
+    uint8_t* heads, uint8_t* secs_per_cyl)
+{
+    uint8_t spt, nheads;
+    uint32_t cyl_heads;
+
+    if (total_sectors > 65535 * 16 * 255)
+        return -EFBIG;
+
+    if (total_sectors > 65535 * 16 * 63) {
+        spt = 255;
+        nheads = 16;
+        cyl_heads = total_sectors / spt;
+    } else {
+        spt = 17;
+        cyl_heads = total_sectors / spt;
+        nheads = (cyl_heads + 1023) / 1024;
+        if (nheads < 4)
+            nheads = 4;
+
+        if (cyl_heads >= (nheads * 1024) || nheads > 16) {
+            spt = 31;
+            nheads = 16;
+            cyl_heads = total_sectors / spt;
+        }
+
+        if (cyl_heads >= (nheads * 1024)) {
+            spt = 63;
+            nheads = 16;
+            cyl_heads = total_sectors / spt;
+        }
+    }
+
+    // Note: Rounding up deviates from the Virtual PC behaviour
+    // However, we need this to avoid truncating images in qemu-img convert
+    *cyls = (cyl_heads + nheads - 1) / nheads;
+    *heads = nheads;
+    *secs_per_cyl = spt;
+    return 0;
+}
+
+/*
+ * Creates an empty dynamic VHD image (footer copy, dynamic disk header, BAT
+ * of unallocated entries, footer). Returns 0, -ENOTSUP, -EFBIG or -EIO.
+ */
+static int vpc_create(const char *filename, int64_t total_sectors,
+    const char *backing_file, int flags)
+{
+    uint8_t buf[1024];
+    struct vhd_footer* footer = (struct vhd_footer*) buf;
+    struct vhd_dyndisk_header* dyndisk_header =
+        (struct vhd_dyndisk_header*) buf;
+    int fd, i, ret = -EIO;
+    uint16_t cyls;
+    uint8_t heads;
+    uint8_t secs_per_cyl;
+    size_t block_size, num_bat_entries;
+
+    if (backing_file != NULL)
+        return -ENOTSUP;
+
+    // Calculate matching total_size and geometry. This is done before open()
+    // so that a request that is too large does not create or truncate a file
+    if (calculate_geometry(total_sectors, &cyls, &heads, &secs_per_cyl))
+        return -EFBIG;
+    total_sectors = (int64_t) cyls * heads * secs_per_cyl;
+
+    fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
+    if (fd < 0)
+        return -EIO;
+
+    // Prepare the Hard Disk Footer
+    memset(buf, 0, 1024);
+    memcpy(footer->creator, "conectix", 8);
+    // TODO Check if "qemu" creator_app is ok for VPC
+    memcpy(footer->creator_app, "qemu", 4);
+    memcpy(footer->creator_os, "Wi2k", 4);
+
+    footer->features = cpu_to_be32(0x02);
+    footer->version = cpu_to_be32(0x00010000);
+    footer->data_offset = cpu_to_be64(HEADER_SIZE);
+    footer->timestamp = cpu_to_be32(time(NULL) - VHD_TIMESTAMP_BASE);
+
+    // Version of Virtual PC 2007
+    footer->major = cpu_to_be16(0x0005);
+    footer->minor = cpu_to_be16(0x0003);
+
+    footer->orig_size = cpu_to_be64(total_sectors * 512);
+    footer->size = cpu_to_be64(total_sectors * 512);
+
+    footer->cyls = cpu_to_be16(cyls);
+    footer->heads = heads;
+    footer->secs_per_cyl = secs_per_cyl;
+
+    footer->type = cpu_to_be32(VHD_DYNAMIC);
+    // TODO uuid is missing
+
+    footer->checksum = cpu_to_be32(vpc_checksum(buf, HEADER_SIZE));
+
+    // Write the footer (twice: at the beginning and at the end)
+    block_size = 0x200000;
+    num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512);
+
+    if (write(fd, buf, HEADER_SIZE) != HEADER_SIZE)
+        goto fail;
+
+    if (lseek(fd, 1536 + ((num_bat_entries * 4 + 511) & ~511), SEEK_SET) < 0)
+        goto fail;
+    if (write(fd, buf, HEADER_SIZE) != HEADER_SIZE)
+        goto fail;
+
+    // Write the initial BAT
+    if (lseek(fd, 3 * 512, SEEK_SET) < 0)
+        goto fail;
+    memset(buf, 0xFF, 512);
+    for (i = 0; i < (num_bat_entries * 4 + 511) / 512; i++)
+        if (write(fd, buf, 512) != 512)
+            goto fail;
+
+    // Prepare the Dynamic Disk Header
+    memset(buf, 0, 1024);
+    memcpy(dyndisk_header->magic, "cxsparse", 8);
+
+    dyndisk_header->data_offset = cpu_to_be64(0xFFFFFFFF);
+    dyndisk_header->table_offset = cpu_to_be64(3 * 512);
+    dyndisk_header->version = cpu_to_be32(0x00010000);
+    dyndisk_header->block_size = cpu_to_be32(block_size);
+    dyndisk_header->max_table_entries = cpu_to_be32(num_bat_entries);
+
+    dyndisk_header->checksum = cpu_to_be32(vpc_checksum(buf, 1024));
+
+    // Write the header
+    if (lseek(fd, 512, SEEK_SET) < 0)
+        goto fail;
+    if (write(fd, buf, 1024) != 1024)
+        goto fail;
+    ret = 0;
+ fail:
+    close(fd);
+    return ret;
+}
+
+static void vpc_close(BlockDriverState *bs)
+{
+    BDRVVPCState *s = bs->opaque;
+    qemu_free(s->pagetable); // BAT copy allocated in vpc_open
+#ifdef CACHE
+    qemu_free(s->pageentry_u8);
+#endif
+    bdrv_delete(s->hd); // image file opened via bdrv_file_open in vpc_open
+}
+
+static BlockDriver bdrv_vpc = {
+    .format_name	= "vpc",
+    .instance_size	= sizeof(BDRVVPCState),
+    .bdrv_probe		= vpc_probe,
+    .bdrv_open		= vpc_open,
+    .bdrv_read		= vpc_read,
+    .bdrv_write		= vpc_write,
+    .bdrv_close		= vpc_close,
+    .bdrv_create	= vpc_create,
+};
+
+static void bdrv_vpc_init(void)
+{
+    bdrv_register(&bdrv_vpc);
+}
+
+block_init(bdrv_vpc_init);
diff --git a/block/vvfat.c b/block/vvfat.c
new file mode 100644
index 0000000..2a8feb3
--- /dev/null
+++ b/block/vvfat.c
@@ -0,0 +1,2855 @@
+/* vim:set shiftwidth=4 ts=8: */
+/*
+ * QEMU Block driver for virtual VFAT (shadows a local directory)
+ *
+ * Copyright (c) 2004,2005 Johannes E. Schindelin
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <sys/stat.h>
+#include <dirent.h>
+#include "qemu-common.h"
+#include "block_int.h"
+#include "module.h"
+
+#ifndef S_IWGRP
+#define S_IWGRP 0
+#endif
+#ifndef S_IWOTH
+#define S_IWOTH 0
+#endif
+
+/* TODO: add ":bootsector=blabla.img:" */
+/* LATER TODO: add automatic boot sector generation from
+    BOOTEASY.ASM and Ranish Partition Manager
+    Note that DOS assumes the system files to be the first files in the
+    file system (test if the boot sector still relies on that fact)! */
+/* MAYBE TODO: write block-visofs.c */
+/* TODO: call try_commit() only after a timeout */
+
+/* #define DEBUG */
+
+#ifdef DEBUG
+
+#define DLOG(a) a
+
+#undef stderr
+#define stderr STDERR
+FILE* stderr = NULL;
+
+static void checkpoint(void);
+
+#ifdef __MINGW32__
+void nonono(const char* file, int line, const char* msg) {
+    fprintf(stderr, "Nonono! %s:%d %s\n", file, line, msg);
+    exit(-5);
+}
+#undef assert
+#define assert(a) do {if (!(a)) nonono(__FILE__, __LINE__, #a);}while(0)
+#endif
+
+#else
+
+#define DLOG(a)
+
+#endif
+
+/* dynamic array functions */
+typedef struct array_t {
+    char* pointer;
+    unsigned int size,next,item_size;
+} array_t;
+
+static inline void array_init(array_t* array,unsigned int item_size)
+{
+    array->pointer = NULL;
+    array->size=0;
+    array->next=0;
+    array->item_size=item_size;
+}
+
+static inline void array_free(array_t* array)
+{
+    free(array->pointer); /* free(NULL) is a no-op, so no guard is needed */
+    array->pointer = NULL; /* no dangling pointer: a second call is harmless */
+    array->size=array->next=0;
+}
+
+/* does not automatically grow */
+static inline void* array_get(array_t* array,unsigned int index) {
+    assert(index < array->next);
+    return array->pointer + index * array->item_size;
+}
+
+static inline int array_ensure_allocated(array_t* array, int index)
+{
+    if((index + 1) * array->item_size > array->size) {
+	int new_size = (index + 32) * array->item_size;
+	array->pointer = qemu_realloc(array->pointer, new_size);
+	if (!array->pointer)
+	    return -1;
+	array->size = new_size;
+	array->next = index + 1;
+    }
+
+    return 0;
+}
+
+static inline void* array_get_next(array_t* array) {
+    unsigned int next = array->next;
+    void* result;
+
+    if (array_ensure_allocated(array, next) < 0)
+	return NULL;
+
+    array->next = next + 1;
+    result = array_get(array, next);
+
+    return result;
+}
+
+static inline void* array_insert(array_t* array,unsigned int index,unsigned int count) {
+    if((array->next+count)*array->item_size>array->size) {
+	int increment=count*array->item_size;
+	array->pointer=qemu_realloc(array->pointer,array->size+increment);
+	if(!array->pointer)
+            return NULL;
+	array->size+=increment;
+    }
+    memmove(array->pointer+(index+count)*array->item_size,
+		array->pointer+index*array->item_size,
+		(array->next-index)*array->item_size);
+    array->next+=count;
+    return array->pointer+index*array->item_size;
+}
+
+/* this performs a "roll": the count elements starting at index_from are moved
+ * so that they start at index_to; the order of all other elements is preserved.
+ * returns 0 on success, -1 if array is NULL or either index is out of range. */
+static inline int array_roll(array_t* array,int index_to,int index_from,int count)
+{
+    char* buf;
+    char* from;
+    char* to;
+    int is;
+
+    if(!array ||
+	    index_to<0 || index_to>=array->next ||
+	    index_from<0 || index_from>=array->next)
+	return -1;
+
+    if(index_to==index_from)
+	return 0;
+    /* keep a copy of the moved elements while the others are shifted */
+    is=array->item_size;
+    from=array->pointer+index_from*is;
+    to=array->pointer+index_to*is;
+    buf=qemu_malloc(is*count);
+    memcpy(buf,from,is*count);
+    /* shift the elements between the two positions over the vacated slots */
+    if(index_to<index_from)
+	memmove(to+is*count,to,from-to);
+    else
+	memmove(from,from+is*count,to-from);
+    /* NOTE(review): writes [index_to, index_to+count); count itself is never range-checked */
+    memcpy(to,buf,is*count);
+
+    free(buf);
+    return 0;
+}
+
+static inline int array_remove_slice(array_t* array,int index, int count)
+{
+    assert(index >= 0 && count > 0 && index + count <= array->next);
+    /* roll the slice to the tail [next-count, next) so that array_roll stays
+     * inside the used part of the array, then cut the tail off */
+    if(array_roll(array,array->next-count,index,count))
+        return -1;
+    array->next -= count;
+    return 0;
+}
+
+static int array_remove(array_t* array,int index)
+{
+    return array_remove_slice(array, index, 1);
+}
+
+/* return the index for a given member */
+static int array_index(array_t* array, void* pointer)
+{
+    size_t offset = (char*)pointer - array->pointer;
+    assert((offset % array->item_size) == 0);
+    assert(offset/array->item_size < array->next);
+    return offset/array->item_size;
+}
+
+/* These structures are used to fake a disk and the VFAT filesystem.
+ * For this reason we need to use __attribute__((packed)). */
+
+typedef struct bootsector_t {
+    uint8_t jump[3];
+    uint8_t name[8];
+    uint16_t sector_size;
+    uint8_t sectors_per_cluster;
+    uint16_t reserved_sectors;
+    uint8_t number_of_fats;
+    uint16_t root_entries;
+    uint16_t total_sectors16;
+    uint8_t media_type;
+    uint16_t sectors_per_fat;
+    uint16_t sectors_per_track;
+    uint16_t number_of_heads;
+    uint32_t hidden_sectors;
+    uint32_t total_sectors;
+    union {
+        struct {
+	    uint8_t drive_number;
+	    uint8_t current_head;
+	    uint8_t signature;
+	    uint32_t id;
+	    uint8_t volume_label[11];
+	} __attribute__((packed)) fat16;
+	struct {
+	    uint32_t sectors_per_fat;
+	    uint16_t flags;
+	    uint8_t major,minor;
+	    uint32_t first_cluster_of_root_directory;
+	    uint16_t info_sector;
+	    uint16_t backup_boot_sector;
+	    uint16_t ignored;
+	} __attribute__((packed)) fat32;
+    } u;
+    uint8_t fat_type[8];
+    uint8_t ignored[0x1c0];
+    uint8_t magic[2];
+} __attribute__((packed)) bootsector_t;
+
+typedef struct {
+    uint8_t head;
+    uint8_t sector;
+    uint8_t cylinder;
+} mbr_chs_t;
+
+typedef struct partition_t {
+    uint8_t attributes; /* 0x80 = bootable */
+    mbr_chs_t start_CHS;
+    uint8_t   fs_type; /* 0x1 = FAT12, 0x6 = FAT16, 0xe = FAT16_LBA, 0xb = FAT32, 0xc = FAT32_LBA */
+    mbr_chs_t end_CHS;
+    uint32_t start_sector_long;
+    uint32_t length_sector_long;
+} __attribute__((packed)) partition_t;
+
+typedef struct mbr_t {
+    uint8_t ignored[0x1b8];
+    uint32_t nt_id;
+    uint8_t ignored2[2];
+    partition_t partition[4];
+    uint8_t magic[2];
+} __attribute__((packed)) mbr_t;
+
+typedef struct direntry_t {
+    uint8_t name[8];
+    uint8_t extension[3];
+    uint8_t attributes;
+    uint8_t reserved[2];
+    uint16_t ctime;
+    uint16_t cdate;
+    uint16_t adate;
+    uint16_t begin_hi;
+    uint16_t mtime;
+    uint16_t mdate;
+    uint16_t begin;
+    uint32_t size;
+} __attribute__((packed)) direntry_t;
+
+/* this structure is used to transparently access the files */
+
+typedef struct mapping_t {
+    /* begin is the first cluster, end is the last+1 */
+    uint32_t begin,end;
+    /* as s->directory is growable, no pointer may be used here */
+    unsigned int dir_index;
+    /* the clusters of a file may be in any order; this points to the first */
+    int first_mapping_index;
+    union {
+	/* offset is
+	 * - the offset in the file (in clusters) for a file, or
+	 * - the next cluster of the directory for a directory, and
+	 * - the address of the buffer for a faked entry
+	 */
+	struct {
+	    uint32_t offset;
+	} file;
+	struct {
+	    int parent_mapping_index;
+	    int first_dir_index;
+	} dir;
+    } info;
+    /* path contains the full path, i.e. it always starts with s->path */
+    char* path;
+
+    enum { MODE_UNDEFINED = 0, MODE_NORMAL = 1, MODE_MODIFIED = 2,
+	MODE_DIRECTORY = 4, MODE_FAKED = 8,
+	MODE_DELETED = 16, MODE_RENAMED = 32 } mode;
+    int read_only;
+} mapping_t;
+
+#ifdef DEBUG
+static void print_direntry(const struct direntry_t*);
+static void print_mapping(const struct mapping_t* mapping);
+#endif
+
+/* here begins the real VVFAT driver */
+
+typedef struct BDRVVVFATState {
+    BlockDriverState* bs; /* pointer to parent */
+    unsigned int first_sectors_number; /* 1 for a single partition, 0x40 for a disk with partition table */
+    unsigned char first_sectors[0x40*0x200];
+
+    int fat_type; /* 16 or 32 */
+    array_t fat,directory,mapping;
+
+    unsigned int cluster_size;
+    unsigned int sectors_per_cluster;
+    unsigned int sectors_per_fat;
+    unsigned int sectors_of_root_directory;
+    uint32_t last_cluster_of_root_directory;
+    unsigned int faked_sectors; /* how many sectors are faked before file data */
+    uint32_t sector_count; /* total number of sectors of the partition */
+    uint32_t cluster_count; /* total number of clusters of this partition */
+    uint32_t max_fat_value;
+
+    int current_fd;
+    mapping_t* current_mapping;
+    unsigned char* cluster; /* points to current cluster */
+    unsigned char* cluster_buffer; /* points to a buffer to hold temp data */
+    unsigned int current_cluster;
+
+    /* write support */
+    BlockDriverState* write_target;
+    char* qcow_filename;
+    BlockDriverState* qcow;
+    void* fat2;
+    char* used_clusters;
+    array_t commits;
+    const char* path;
+    int downcase_short_names;
+} BDRVVVFATState;
+
+/* take the sector position spos and convert it to Cylinder/Head/Sector position
+ * if the position is outside the specified geometry, fill maximum value for CHS
+ * and return 1 to signal overflow.
+ */
+static int sector2CHS(BlockDriverState* bs, mbr_chs_t * chs, int spos){
+    int head,sector;
+    sector   = spos % (bs->secs);  spos/= bs->secs;
+    head     = spos % (bs->heads); spos/= bs->heads;
+    if(spos >= bs->cyls){
+        /* Overflow,
+        it happens if 32bit sector positions are used, while CHS is only 24bit.
+        Windows/Dos is said to take 1023/255/63 as nonrepresentable CHS */
+        chs->head     = 0xFF;
+        chs->sector   = 0xFF;
+        chs->cylinder = 0xFF;
+        return 1;
+    }
+    chs->head     = (uint8_t)head;
+    chs->sector   = (uint8_t)( (sector+1) | ((spos>>8)<<6) );
+    chs->cylinder = (uint8_t)spos;
+    return 0;
+}
+
+static void init_mbr(BDRVVVFATState* s)
+{
+    /* TODO: if the files mbr.img and bootsect.img exist, use them */
+    mbr_t* real_mbr=(mbr_t*)s->first_sectors;
+    partition_t* partition=&(real_mbr->partition[0]);
+    int lba;
+
+    memset(s->first_sectors,0,512);
+
+    /* Win NT Disk Signature */
+    real_mbr->nt_id= cpu_to_le32(0xbe1afdfa);
+
+    partition->attributes=0x80; /* bootable */
+
+    /* LBA is used when partition is outside the CHS geometry */
+    lba = sector2CHS(s->bs, &partition->start_CHS, s->first_sectors_number-1);
+    lba|= sector2CHS(s->bs, &partition->end_CHS,   s->sector_count);
+
+    /*LBA partitions are identified only by start/length_sector_long not by CHS*/
+    partition->start_sector_long =cpu_to_le32(s->first_sectors_number-1);
+    partition->length_sector_long=cpu_to_le32(s->sector_count - s->first_sectors_number+1);
+
+    /* FAT12/FAT16/FAT32 */
+    /* DOS uses different types when partition is LBA,
+       probably to prevent older versions from using CHS on them */
+    partition->fs_type= s->fat_type==12 ? 0x1:
+                        s->fat_type==16 ? (lba?0xe:0x06):
+                         /*fat_tyoe==32*/ (lba?0xc:0x0b);
+
+    real_mbr->magic[0]=0x55; real_mbr->magic[1]=0xaa;
+}
+
+/* direntry functions */
+
+/* dest must hold 260 bytes (129 chars + terminator); pads with 0xff up to next multiple of 26 */
+static inline int short2long_name(char* dest,const char* src)
+{
+    int i;
+    int len; /* bytes written for the name itself, excluding the terminator */
+    for(i=0;i<129 && src[i];i++) { /* widen each char to 16 bits, low byte first */
+        dest[2*i]=src[i];
+	dest[2*i+1]=0;
+    }
+    len=2*i;
+    dest[2*i]=dest[2*i+1]=0; /* 16-bit terminator; index 259 at most */
+    for(i=2*i+2;(i%26);i++)
+	dest[i]=0xff;
+    return len;
+}
+
+static inline direntry_t* create_long_filename(BDRVVVFATState* s,const char* filename)
+{
+    char buffer[260]; /* short2long_name() fills up to 260 bytes for long names */
+    int length=short2long_name(buffer,filename),
+        number_of_entries=(length+25)/26,i; /* 26 name bytes per entry */
+    direntry_t* entry;
+    /* reserve the entries; numbers count down, the first created one gets 0x40 */
+    for(i=0;i<number_of_entries;i++) {
+	entry=array_get_next(&(s->directory));
+	entry->attributes=0xf;
+	entry->reserved[0]=0;
+	entry->begin=0;
+	entry->name[0]=(number_of_entries-i)|(i==0?0x40:0);
+    }
+    for(i=0;i<26*number_of_entries;i++) { /* name bytes go to entry offsets 1-10, 14-25, 28-31 */
+	int offset=(i%26);
+	if(offset<10) offset=1+offset;
+	else if(offset<22) offset=14+offset-10;
+	else offset=28+offset-22;
+	entry=array_get(&(s->directory),s->directory.next-1-(i/26));
+	entry->name[offset]=buffer[i];
+    }
+    return array_get(&(s->directory),s->directory.next-number_of_entries);
+}
+
+static char is_free(const direntry_t* direntry)
+{
+    return direntry->name[0]==0xe5 || direntry->name[0]==0x00;
+}
+
+static char is_volume_label(const direntry_t* direntry)
+{
+    return direntry->attributes == 0x28;
+}
+
+static char is_long_name(const direntry_t* direntry)
+{
+    return direntry->attributes == 0xf;
+}
+
+/*
+ * Non-zero if the entry is neither a volume label, nor a long-name entry,
+ * nor free.
+ */
+static char is_short_name(const direntry_t* direntry)
+{
+    if (is_volume_label(direntry))
+        return 0;
+    if (is_long_name(direntry))
+        return 0;
+    return !is_free(direntry);
+}
+
+/*
+ * Non-zero if attribute bit 0x10 is set and the first name byte is not
+ * 0xe5.
+ */
+static char is_directory(const direntry_t* direntry)
+{
+    if (!(direntry->attributes & 0x10))
+        return 0;
+    return direntry->name[0] != 0xe5;
+}
+
+/* Non-zero if the entry is a short-name entry whose name starts with '.'. */
+static inline char is_dot(const direntry_t* direntry)
+{
+    if (!is_short_name(direntry))
+        return 0;
+    return direntry->name[0] == '.';
+}
+
+/* Non-zero if the entry is a short-name entry that is not a directory. */
+static char is_file(const direntry_t* direntry)
+{
+    if (!is_short_name(direntry))
+        return 0;
+    return !is_directory(direntry);
+}
+
+/*
+ * Return the first cluster of a directory entry, combining the
+ * little-endian 16-bit fields begin (low half) and begin_hi (high half).
+ */
+static inline uint32_t begin_of_direntry(const direntry_t* direntry)
+{
+    /*
+     * Widen before shifting: a 16-bit value promoted to int and shifted
+     * left by 16 overflows a signed int when bit 15 is set.
+     */
+    uint32_t low = le16_to_cpu(direntry->begin);
+    uint32_t high = le16_to_cpu(direntry->begin_hi);
+
+    return low | (high << 16);
+}
+
+/* Return the size field of a directory entry in host byte order. */
+static inline uint32_t filesize_of_direntry(const direntry_t* direntry)
+{
+    uint32_t size = le32_to_cpu(direntry->size);
+
+    return size;
+}
+
+/*
+ * Store the first cluster number in a directory entry: the low 16 bits go
+ * to begin, the high 16 bits to begin_hi, both little-endian.
+ */
+static void set_begin_of_direntry(direntry_t* direntry, uint32_t begin)
+{
+    uint32_t high_half = (begin >> 16) & 0xffff;
+    uint32_t low_half = begin & 0xffff;
+
+    direntry->begin_hi = cpu_to_le16(high_half);
+    direntry->begin = cpu_to_le16(low_half);
+}
+
+/* fat functions */
+
+/*
+ * Compute the 8-bit checksum over the 11 bytes of a short name (8 name
+ * bytes followed by 3 extension bytes): for each byte, rotate the running
+ * sum right by one bit and add the byte.
+ */
+static inline uint8_t fat_chksum(const direntry_t* entry)
+{
+    uint8_t chksum=0;
+    int i;
+
+    for(i=0;i<11;i++) {
+        unsigned char c;
+
+        /* bytes 0..7 come from name[], bytes 8..10 from extension[] */
+        c = (i < 8) ? entry->name[i] : entry->extension[i-8];
+        chksum=(((chksum&0xfe)>>1)|((chksum&0x01)?0x80:0)) + c;
+    }
+
+    return chksum;
+}
+
+/*
+ * Convert time to local time and pack it into a little-endian 16-bit field.
+ * With return_time != 0 the result is the time (seconds/2 | minutes<<5 |
+ * hours<<11), otherwise the date (day | month<<5 | (year-1980)<<9).
+ */
+static uint16_t fat_datetime(time_t time,int return_time) {
+    int packed;
+#ifdef _WIN32
+    struct tm* when = localtime(&time); /* this is not thread safe */
+#else
+    struct tm storage;
+    struct tm* when = &storage;
+
+    localtime_r(&time, when);
+#endif
+    if (return_time) {
+        packed = (when->tm_sec/2) | (when->tm_min<<5) | (when->tm_hour<<11);
+    } else {
+        /* tm_mon is 0-based, tm_year counts from 1900 */
+        packed = (when->tm_mday) | ((when->tm_mon+1)<<5)
+            | ((when->tm_year-80)<<9);
+    }
+    return cpu_to_le16(packed);
+}
+
+/*
+ * Store value as the FAT entry for cluster in s->fat.
+ *
+ * FAT32 and FAT16 entries are whole little-endian 32/16-bit array items.
+ * FAT12 entries are 12 bits wide and packed two to every three bytes, so
+ * they are written with byte-level read-modify-write.
+ */
+static inline void fat_set(BDRVVVFATState* s,unsigned int cluster,uint32_t value)
+{
+    if(s->fat_type==32) {
+	uint32_t* entry=array_get(&(s->fat),cluster);
+	*entry=cpu_to_le32(value);
+    } else if(s->fat_type==16) {
+	uint16_t* entry=array_get(&(s->fat),cluster);
+	*entry=cpu_to_le16(value&0xffff);
+    } else {
+	/* byte offset of the 12-bit entry (init_fat() uses item size 1) */
+	int offset = (cluster*3/2);
+	unsigned char* p = array_get(&(s->fat), offset);
+        switch (cluster&1) {
+	case 0:
+		/* even cluster: all of p[0] plus the low nibble of p[1] */
+		p[0] = value&0xff;
+		p[1] = (p[1]&0xf0) | ((value>>8)&0xf);
+		break;
+	case 1:
+		/* odd cluster: the high nibble of p[0] plus all of p[1] */
+		p[0] = (p[0]&0xf) | ((value&0xf)<<4);
+		p[1] = (value>>4);
+		break;
+	}
+    }
+}
+
+/*
+ * Return the FAT entry for cluster from s->fat in host byte order.
+ * Any fat_type other than 32 or 16 is read as packed 12-bit entries.
+ */
+static inline uint32_t fat_get(BDRVVVFATState* s,unsigned int cluster)
+{
+    switch (s->fat_type) {
+    case 32: {
+        uint32_t* wide = array_get(&(s->fat), cluster);
+        return le32_to_cpu(*wide);
+    }
+    case 16: {
+        uint16_t* narrow = array_get(&(s->fat), cluster);
+        return le16_to_cpu(*narrow);
+    }
+    default: {
+        /* two 12-bit entries share three bytes */
+        const uint8_t* bytes = (uint8_t*)(s->fat.pointer) + cluster*3/2;
+        int pair = bytes[0] | (bytes[1] << 8);
+
+        if (cluster & 1)
+            pair >>= 4;
+        return pair & 0x0fff;
+    }
+    }
+}
+
+/*
+ * Return -1 if fat_entry is greater than s->max_fat_value - 8, i.e. falls
+ * into the range treated as end-of-chain, and 0 otherwise.
+ */
+static inline int fat_eof(BDRVVVFATState* s,uint32_t fat_entry)
+{
+    return (fat_entry>s->max_fat_value-8) ? -1 : 0;
+}
+
+/*
+ * Allocate and zero the in-memory FAT (s->fat) for s->sectors_per_fat
+ * sectors and set s->max_fat_value for the configured FAT type.
+ * FAT12 uses byte-sized array items; FAT16/FAT32 use 2/4-byte items.
+ */
+static inline void init_fat(BDRVVVFATState* s)
+{
+    if (s->fat_type != 12) {
+        array_init(&(s->fat),(s->fat_type==32?4:2));
+        array_ensure_allocated(&(s->fat),
+                s->sectors_per_fat * 0x200 / s->fat.item_size - 1);
+    } else {
+        array_init(&(s->fat),1);
+        array_ensure_allocated(&(s->fat),
+                s->sectors_per_fat * 0x200 * 3 / 2 - 1);
+    }
+    memset(s->fat.pointer,0,s->fat.size);
+
+    if (s->fat_type == 12)
+        s->max_fat_value=0xfff;
+    else if (s->fat_type == 16)
+        s->max_fat_value=0xffff;
+    else if (s->fat_type == 32)
+        s->max_fat_value=0x0fffffff;
+    else
+        s->max_fat_value=0; /* error... */
+}
+
+/* TODO: in create_short_filename, 0xe5->0x05 is not yet handled! */
+/* TODO: in parse_short_filename, 0x05->0xe5 is not yet handled! */
+/*
+ * Append the directory entries for filename to s->directory and return the
+ * short (8.3) entry.
+ *
+ * If is_dot is non-zero, only a short entry is created: 11 spaces with
+ * filename copied verbatim over the start.  Otherwise the long-name
+ * entries are created first, followed by a short entry derived from
+ * filename:
+ *  - up to 8 characters before the last '.' and up to 3 characters after,
+ *  - unwanted characters replaced by '_', lower case converted to upper,
+ *  - the name is altered until it differs from every non-long-name entry
+ *    between directory_start and the new entry,
+ *  - the checksum of the final short name is stored in reserved[1] of the
+ *    preceding long-name entries.
+ */
+static inline direntry_t* create_short_and_long_name(BDRVVVFATState* s,
+	unsigned int directory_start, const char* filename, int is_dot)
+{
+    /* long_index: where the long-name entries of this file will start */
+    int i,j,long_index=s->directory.next;
+    direntry_t* entry = NULL;
+    direntry_t* entry_long = NULL;
+
+    if(is_dot) {
+	entry=array_get_next(&(s->directory));
+	memset(entry->name,0x20,11);
+	memcpy(entry->name,filename,strlen(filename));
+	return entry;
+    }
+
+    entry_long=create_long_filename(s,filename);
+
+    /* find the last '.' (not at index 0); i becomes the base name length */
+    i = strlen(filename);
+    for(j = i - 1; j>0  && filename[j]!='.';j--);
+    if (j > 0)
+	i = (j > 8 ? 8 : j);
+    else if (i > 8)
+	i = 8;
+
+    entry=array_get_next(&(s->directory));
+    memset(entry->name,0x20,11);
+    memcpy(entry->name, filename, i);
+
+    /* copy up to three characters following the last '.' */
+    if(j > 0)
+	for (i = 0; i < 3 && filename[j+1+i]; i++)
+	    entry->extension[i] = filename[j+1+i];
+
+    /* upcase & remove unwanted characters */
+    /* indexes 8..10 of name[] address the extension bytes */
+    for(i=10;i>=0;i--) {
+	/* skip the padding spaces at the end of the extension / name */
+	if(i==10 || i==7) for(;i>0 && entry->name[i]==' ';i--);
+	if(entry->name[i]<=' ' || entry->name[i]>0x7f
+		|| strchr(".*?<>|\":/\\[];,+='",entry->name[i]))
+	    entry->name[i]='_';
+        else if(entry->name[i]>='a' && entry->name[i]<='z')
+            entry->name[i]+='A'-'a';
+    }
+
+    /* mangle duplicates */
+    while(1) {
+	direntry_t* entry1=array_get(&(s->directory),directory_start);
+	/* NOTE(review): this j shadows the outer j, which is not used below */
+	int j;
+
+	for(;entry1<entry;entry1++)
+	    if(!is_long_name(entry1) && !memcmp(entry1->name,entry->name,11))
+		break; /* found dupe */
+	if(entry1==entry) /* no dupe found */
+	    break;
+
+	/* use all 8 characters of name */
+	/* trailing spaces from name[6] downwards become '~' */
+	if(entry->name[7]==' ') {
+	    int j;
+	    for(j=6;j>0 && entry->name[j]==' ';j--)
+		entry->name[j]='~';
+	}
+
+	/* increment number */
+	/* decimal counter at the end of the name: '9' wraps to '0' and the */
+	/* carry moves left; a non-digit position is started at '0' */
+	for(j=7;j>0 && entry->name[j]=='9';j--)
+	    entry->name[j]='0';
+	if(j>0) {
+	    if(entry->name[j]<'0' || entry->name[j]>'9')
+	        entry->name[j]='0';
+	    else
+	        entry->name[j]++;
+	}
+    }
+
+    /* calculate checksum; propagate to long name */
+    if(entry_long) {
+        uint8_t chksum=fat_chksum(entry);
+
+	/* calculate anew, because realloc could have taken place */
+	entry_long=array_get(&(s->directory),long_index);
+	while(entry_long<entry && is_long_name(entry_long)) {
+	    entry_long->reserved[1]=chksum;
+	    entry_long++;
+	}
+    }
+
+    return entry;
+}
+
+/*
+ * Read a directory. (the index of the corresponding mapping must be passed).
+ *
+ * For every entry of the host directory mapping->path that can be stat()ed,
+ * a directory entry (long and short name) is appended to s->directory.
+ * Subdirectories and non-empty files additionally get a new mapping in
+ * s->mapping, which takes ownership of the allocated path string.  "." and
+ * ".." are skipped in the root directory (first cluster 0).
+ *
+ * Afterwards the directory is zero-padded to a cluster boundary (the root
+ * directory to at least ROOT_ENTRIES entries), mapping->end is set to the
+ * first cluster after the directory, and the first cluster is written into
+ * the mapping's own directory entry.
+ *
+ * Returns 0 on success, -1 if the directory cannot be opened, and -2 if a
+ * file larger than 2GB is found.
+ */
+static int read_directory(BDRVVVFATState* s, int mapping_index)
+{
+    mapping_t* mapping = array_get(&(s->mapping), mapping_index);
+    direntry_t* direntry;
+    const char* dirname = mapping->path;
+    int first_cluster = mapping->begin;
+    int parent_index = mapping->info.dir.parent_mapping_index;
+    mapping_t* parent_mapping = (mapping_t*)
+        (parent_index >= 0 ? array_get(&(s->mapping), parent_index) : NULL);
+    int first_cluster_of_parent = parent_mapping ? parent_mapping->begin : -1;
+
+    DIR* dir=opendir(dirname);
+    struct dirent* entry;
+    int i;
+
+    assert(mapping->mode & MODE_DIRECTORY);
+
+    if(!dir) {
+        mapping->end = mapping->begin;
+        return -1;
+    }
+
+    i = mapping->info.dir.first_dir_index =
+            first_cluster == 0 ? 0 : s->directory.next;
+
+    /* actually read the directory, and allocate the mappings */
+    while((entry=readdir(dir))) {
+        unsigned int length=strlen(dirname)+2+strlen(entry->d_name);
+        char* buffer;
+        direntry_t* direntry;
+        struct stat st;
+        int is_dot=!strcmp(entry->d_name,".");
+        int is_dotdot=!strcmp(entry->d_name,"..");
+
+        if(first_cluster == 0 && (is_dotdot || is_dot))
+            continue;
+
+        buffer=(char*)qemu_malloc(length);
+        snprintf(buffer,length,"%s/%s",dirname,entry->d_name);
+
+        if(stat(buffer,&st)<0) {
+            free(buffer);
+            continue;
+        }
+
+        /* create directory entry for this file */
+        direntry=create_short_and_long_name(s, i, entry->d_name,
+                is_dot || is_dotdot);
+        direntry->attributes=(S_ISDIR(st.st_mode)?0x10:0x20);
+        direntry->reserved[0]=direntry->reserved[1]=0;
+        direntry->ctime=fat_datetime(st.st_ctime,1);
+        direntry->cdate=fat_datetime(st.st_ctime,0);
+        direntry->adate=fat_datetime(st.st_atime,0);
+        direntry->begin_hi=0;
+        direntry->mtime=fat_datetime(st.st_mtime,1);
+        direntry->mdate=fat_datetime(st.st_mtime,0);
+        if(is_dotdot)
+            set_begin_of_direntry(direntry, first_cluster_of_parent);
+        else if(is_dot)
+            set_begin_of_direntry(direntry, first_cluster);
+        else
+            direntry->begin=0; /* do that later */
+        if (st.st_size > 0x7fffffff) {
+            fprintf(stderr, "File %s is larger than 2GB\n", buffer);
+            free(buffer);
+            /* do not leak the directory stream on the error path */
+            closedir(dir);
+            return -2;
+        }
+        direntry->size=cpu_to_le32(S_ISDIR(st.st_mode)?0:st.st_size);
+
+        /* create mapping for this file */
+        if(!is_dot && !is_dotdot && (S_ISDIR(st.st_mode) || st.st_size)) {
+            s->current_mapping=(mapping_t*)array_get_next(&(s->mapping));
+            s->current_mapping->begin=0;
+            s->current_mapping->end=st.st_size;
+            /*
+             * we get the direntry of the most recent direntry, which
+             * contains the short name and all the relevant information.
+             */
+            s->current_mapping->dir_index=s->directory.next-1;
+            s->current_mapping->first_mapping_index = -1;
+            if (S_ISDIR(st.st_mode)) {
+                s->current_mapping->mode = MODE_DIRECTORY;
+                s->current_mapping->info.dir.parent_mapping_index =
+                    mapping_index;
+            } else {
+                s->current_mapping->mode = MODE_UNDEFINED;
+                s->current_mapping->info.file.offset = 0;
+            }
+            /* the mapping now owns the path string */
+            s->current_mapping->path=buffer;
+            s->current_mapping->read_only =
+                (st.st_mode & (S_IWUSR | S_IWGRP | S_IWOTH)) == 0;
+        } else {
+            /* no mapping took ownership of the path: release it */
+            free(buffer);
+        }
+    }
+    closedir(dir);
+
+    /* fill with zeroes up to the end of the cluster */
+    while(s->directory.next%(0x10*s->sectors_per_cluster)) {
+        direntry_t* direntry=array_get_next(&(s->directory));
+        memset(direntry,0,sizeof(direntry_t));
+    }
+
+/* TODO: if there are more entries, bootsector has to be adjusted! */
+#define ROOT_ENTRIES (0x02 * 0x10 * s->sectors_per_cluster)
+    if (mapping_index == 0 && s->directory.next < ROOT_ENTRIES) {
+        /* root directory */
+        int cur = s->directory.next;
+        array_ensure_allocated(&(s->directory), ROOT_ENTRIES - 1);
+        memset(array_get(&(s->directory), cur), 0,
+                (ROOT_ENTRIES - cur) * sizeof(direntry_t));
+    }
+
+     /* reget the mapping, since s->mapping was possibly realloc()ed */
+    mapping = (mapping_t*)array_get(&(s->mapping), mapping_index);
+    first_cluster += (s->directory.next - mapping->info.dir.first_dir_index)
+        * 0x20 / s->cluster_size;
+    mapping->end = first_cluster;
+
+    direntry = (direntry_t*)array_get(&(s->directory), mapping->dir_index);
+    set_begin_of_direntry(direntry, mapping->begin);
+
+    return 0;
+}
+
+/*
+ * Convert an absolute sector number into a cluster number: subtract the
+ * s->faked_sectors preceding the data area, then divide by the cluster
+ * size in sectors.
+ */
+static inline uint32_t sector2cluster(BDRVVVFATState* s,off_t sector_num)
+{
+    return (sector_num - s->faked_sectors) / s->sectors_per_cluster;
+}
+
+/*
+ * Convert a cluster number into the absolute number of its first sector;
+ * the inverse of sector2cluster().
+ */
+static inline off_t cluster2sector(BDRVVVFATState* s, uint32_t cluster_num)
+{
+    return s->faked_sectors
+        + s->sectors_per_cluster * cluster_num;
+}
+
+/*
+ * Return the index of sector_num within its cluster, measured from the
+ * end of the first sectors and the two FAT copies.
+ */
+static inline uint32_t sector_offset_in_cluster(BDRVVVFATState* s,off_t sector_num)
+{
+    return (sector_num - s->first_sectors_number - 2*s->sectors_per_fat)
+        % s->sectors_per_cluster;
+}
+
+#ifdef DBG
+/*
+ * Debug helper (only built with DBG): return the directory entry that
+ * mapping->dir_index refers to, or 0 if the mapping's mode is still
+ * MODE_UNDEFINED.
+ */
+static direntry_t* get_direntry_for_mapping(BDRVVVFATState* s,mapping_t* mapping)
+{
+    if(mapping->mode==MODE_UNDEFINED)
+	return 0;
+    return (direntry_t*)(s->directory.pointer+sizeof(direntry_t)*mapping->dir_index);
+}
+#endif
+
+/*
+ * Build the complete in-memory image of the virtual FAT volume for the
+ * host directory dirname.
+ *
+ * Steps:
+ *  - clear the first sectors, allocate the cluster buffer and compute
+ *    s->sectors_per_fat,
+ *  - create the volume label entry and the root directory mapping,
+ *  - walk s->mapping (which grows while directories are read), assigning
+ *    consecutive clusters to every directory and file and chaining them in
+ *    the FAT,
+ *  - fill in the boot sector.
+ *
+ * Returns 0 on success, -1 if a directory cannot be read, and -EINVAL if
+ * the contents need more clusters than the volume has.
+ */
+static int init_directories(BDRVVVFATState* s,
+        const char* dirname)
+{
+    bootsector_t* bootsector;
+    mapping_t* mapping;
+    unsigned int i;
+    unsigned int cluster;
+
+    memset(&(s->first_sectors[0]),0,0x40*0x200);
+
+    s->cluster_size=s->sectors_per_cluster*0x200;
+    s->cluster_buffer=qemu_malloc(s->cluster_size);
+
+    /*
+     * The formula: sc = spf+1+spf*spc*(512*8/fat_type),
+     * where sc is sector_count,
+     * spf is sectors_per_fat,
+     * spc is sectors_per_clusters, and
+     * fat_type = 12, 16 or 32.
+     */
+    i = 1+s->sectors_per_cluster*0x200*8/s->fat_type;
+    s->sectors_per_fat=(s->sector_count+i)/i; /* round up */
+
+    array_init(&(s->mapping),sizeof(mapping_t));
+    array_init(&(s->directory),sizeof(direntry_t));
+
+    /* add volume label */
+    {
+        direntry_t* entry=array_get_next(&(s->directory));
+        entry->attributes=0x28; /* archive | volume label */
+        /*
+         * The label covers all 11 bytes of name + extension and is padded
+         * with a space, matching the boot sector label written below.
+         * snprintf() would store a NUL in the last byte instead.
+         */
+        memcpy(entry->name,"QEMU VVFAT ",11);
+    }
+
+    /* Now build FAT, and write back information into directory */
+    init_fat(s);
+
+    s->faked_sectors=s->first_sectors_number+s->sectors_per_fat*2;
+    s->cluster_count=sector2cluster(s, s->sector_count);
+
+    /* mapping 0 is the root directory */
+    mapping = array_get_next(&(s->mapping));
+    mapping->begin = 0;
+    mapping->dir_index = 0;
+    mapping->info.dir.parent_mapping_index = -1;
+    mapping->first_mapping_index = -1;
+    mapping->path = strdup(dirname);
+    /* strip one trailing '/' */
+    i = strlen(mapping->path);
+    if (i > 0 && mapping->path[i - 1] == '/')
+        mapping->path[i - 1] = '\0';
+    mapping->mode = MODE_DIRECTORY;
+    mapping->read_only = 0;
+    s->path = mapping->path;
+
+    /* s->mapping.next grows whenever read_directory() adds mappings */
+    for (i = 0, cluster = 0; i < s->mapping.next; i++) {
+        /* MS-DOS expects the FAT to be 0 for the root directory
+         * (except for the media byte). */
+        /* LATER TODO: still true for FAT32? */
+        int fix_fat = (i != 0);
+        mapping = array_get(&(s->mapping), i);
+
+        if (mapping->mode & MODE_DIRECTORY) {
+            mapping->begin = cluster;
+            if(read_directory(s, i)) {
+                fprintf(stderr, "Could not read directory %s\n",
+                        mapping->path);
+                return -1;
+            }
+            /* reget: s->mapping may have been reallocated */
+            mapping = array_get(&(s->mapping), i);
+        } else {
+            assert(mapping->mode == MODE_UNDEFINED);
+            mapping->mode=MODE_NORMAL;
+            mapping->begin = cluster;
+            if (mapping->end > 0) {
+                direntry_t* direntry = array_get(&(s->directory),
+                        mapping->dir_index);
+
+                /* end held the file size in bytes until now */
+                mapping->end = cluster + 1 + (mapping->end-1)/s->cluster_size;
+                set_begin_of_direntry(direntry, mapping->begin);
+            } else {
+                mapping->end = cluster + 1;
+                fix_fat = 0;
+            }
+        }
+
+        assert(mapping->begin < mapping->end);
+
+        /* next free cluster */
+        cluster = mapping->end;
+
+        if(cluster > s->cluster_count) {
+            fprintf(stderr,"Directory does not fit in FAT%d (capacity %s)\n",
+                    s->fat_type,
+                    s->fat_type == 12 ? s->sector_count == 2880 ? "1.44 MB"
+                                                                : "2.88 MB"
+                                      : "504MB");
+            return -EINVAL;
+        }
+
+        /* fix fat for entry */
+        if (fix_fat) {
+            int j;
+            for(j = mapping->begin; j < mapping->end - 1; j++)
+                fat_set(s, j, j+1);
+            fat_set(s, mapping->end - 1, s->max_fat_value);
+        }
+    }
+
+    mapping = array_get(&(s->mapping), 0);
+    s->sectors_of_root_directory = mapping->end * s->sectors_per_cluster;
+    s->last_cluster_of_root_directory = mapping->end;
+
+    /* the FAT signature */
+    fat_set(s,0,s->max_fat_value);
+    fat_set(s,1,s->max_fat_value);
+
+    s->current_mapping = NULL;
+
+    /* the boot sector is the last of the first sectors */
+    bootsector=(bootsector_t*)(s->first_sectors+(s->first_sectors_number-1)*0x200);
+    bootsector->jump[0]=0xeb;
+    bootsector->jump[1]=0x3e;
+    bootsector->jump[2]=0x90;
+    memcpy(bootsector->name,"QEMU    ",8);
+    bootsector->sector_size=cpu_to_le16(0x200);
+    bootsector->sectors_per_cluster=s->sectors_per_cluster;
+    bootsector->reserved_sectors=cpu_to_le16(1);
+    bootsector->number_of_fats=0x2; /* number of FATs */
+    bootsector->root_entries=cpu_to_le16(s->sectors_of_root_directory*0x10);
+    bootsector->total_sectors16=s->sector_count>0xffff?0:cpu_to_le16(s->sector_count);
+    bootsector->media_type=(s->fat_type!=12?0xf8:s->sector_count==5760?0xf9:0xf8); /* media descriptor */
+    s->fat.pointer[0] = bootsector->media_type;
+    bootsector->sectors_per_fat=cpu_to_le16(s->sectors_per_fat);
+    bootsector->sectors_per_track=cpu_to_le16(s->bs->secs);
+    bootsector->number_of_heads=cpu_to_le16(s->bs->heads);
+    bootsector->hidden_sectors=cpu_to_le32(s->first_sectors_number==1?0:0x3f);
+    bootsector->total_sectors=cpu_to_le32(s->sector_count>0xffff?s->sector_count:0);
+
+    /* LATER TODO: if FAT32, this is wrong */
+    bootsector->u.fat16.drive_number=s->fat_type==12?0:0x80; /* assume this is hda (TODO) */
+    bootsector->u.fat16.current_head=0;
+    bootsector->u.fat16.signature=0x29;
+    bootsector->u.fat16.id=cpu_to_le32(0xfabe1afd);
+
+    memcpy(bootsector->u.fat16.volume_label,"QEMU VVFAT ",11);
+    memcpy(bootsector->fat_type,(s->fat_type==12?"FAT12   ":s->fat_type==16?"FAT16   ":"FAT32   "),8);
+    bootsector->magic[0]=0x55; bootsector->magic[1]=0xaa;
+
+    return 0;
+}
+
+#ifdef DEBUG
+static BDRVVVFATState *vvv = NULL;
+#endif
+
+static int enable_write_target(BDRVVVFATState *s);
+static int is_consistent(BDRVVVFATState *s);
+
+/*
+ * Open a virtual FAT volume.
+ *
+ * dirname must start with "fat:" and may contain these colon-delimited
+ * options before the host directory:
+ *   :floppy:  FAT12, one first sector (no MBR), 2 sectors per cluster,
+ *             geometry 80/2/36
+ *   :32: / :16: / :12:  select the FAT type (:12: also sets 2880 sectors)
+ *   :rw:      enable the write target and clear bs->read_only
+ * The host directory is whatever follows the last ':' (with a workaround
+ * for DOS drive names such as "c:").
+ *
+ * The defaults are FAT16, 16 sectors per cluster, geometry 1024/16/63 and
+ * read-only access.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+static int vvfat_open(BlockDriverState *bs, const char* dirname, int flags)
+{
+    BDRVVVFATState *s = bs->opaque;
+    int floppy = 0;
+    int i;
+
+#ifdef DEBUG
+    vvv = s;
+#endif
+
+DLOG(if (stderr == NULL) {
+    stderr = fopen("vvfat.log", "a");
+    setbuf(stderr, NULL);
+})
+
+    s->bs = bs;
+
+    s->fat_type=16;
+    /* LATER TODO: if FAT32, adjust */
+    s->sectors_per_cluster=0x10;
+    /* 504MB disk*/
+    bs->cyls=1024; bs->heads=16; bs->secs=63;
+
+    s->current_cluster=0xffffffff;
+
+    s->first_sectors_number=0x40;
+    /* read only is the default for safety */
+    bs->read_only = 1;
+    s->qcow = s->write_target = NULL;
+    s->qcow_filename = NULL;
+    s->fat2 = NULL;
+    s->downcase_short_names = 1;
+
+    if (!strstart(dirname, "fat:", NULL))
+	return -1;
+
+    if (strstr(dirname, ":floppy:")) {
+	floppy = 1;
+	s->fat_type = 12;
+	s->first_sectors_number = 1;
+	s->sectors_per_cluster=2;
+	bs->cyls = 80; bs->heads = 2; bs->secs = 36;
+    }
+
+    s->sector_count=bs->cyls*bs->heads*bs->secs;
+
+    if (strstr(dirname, ":32:")) {
+	fprintf(stderr, "Big fat greek warning: FAT32 has not been tested. You are welcome to do so!\n");
+	s->fat_type = 32;
+    } else if (strstr(dirname, ":16:")) {
+	s->fat_type = 16;
+    } else if (strstr(dirname, ":12:")) {
+	s->fat_type = 12;
+	s->sector_count=2880;
+    }
+
+    if (strstr(dirname, ":rw:")) {
+	if (enable_write_target(s))
+	    return -1;
+	bs->read_only = 0;
+    }
+
+    /* the "fat:" prefix checked above guarantees that a ':' exists */
+    i = strrchr(dirname, ':') - dirname;
+    assert(i >= 3);
+    if (dirname[i-2] == ':' && qemu_isalpha(dirname[i-1]))
+	/* workaround for DOS drive names */
+	dirname += i-1;
+    else
+	dirname += i+1;
+
+    bs->total_sectors=bs->cyls*bs->heads*bs->secs;
+
+    if(init_directories(s, dirname))
+	return -1;
+
+    /* shrink sector_count to a whole number of clusters */
+    s->sector_count = s->faked_sectors + s->sectors_per_cluster*s->cluster_count;
+
+    /* only the non-floppy layout (0x40 first sectors) gets an MBR */
+    if(s->first_sectors_number==0x40)
+	init_mbr(s);
+
+    /* for some reason or other, MS-DOS does not like to know about CHS... */
+    if (floppy)
+	bs->heads = bs->cyls = bs->secs = 0;
+
+    //    assert(is_consistent(s));
+    return 0;
+}
+
+/*
+ * Forget the current mapping, closing its file descriptor if one is open
+ * (0 is used as the "no descriptor" value), and invalidate the cached
+ * cluster.
+ */
+static inline void vvfat_close_current_file(BDRVVVFATState *s)
+{
+    if (s->current_mapping != NULL) {
+        int fd = s->current_fd;
+
+        s->current_mapping = NULL;
+        if (fd) {
+            close(fd);
+            s->current_fd = 0;
+        }
+    }
+    s->current_cluster = -1;
+}
+
+/* mappings between index1 and index2-1 are supposed to be ordered
+ * return value is the index of the last mapping for which end>cluster_num
+ *
+ * This is a binary search over s->mapping on the half-open index range
+ * [index1, index2).  The result can be index2 itself, so the caller has to
+ * check it against s->mapping.next before using it.
+ */
+static inline int find_mapping_for_cluster_aux(BDRVVVFATState* s,int cluster_num,int index1,int index2)
+{
+    /* NOTE(review): this initial value is overwritten before it is read */
+    int index3=index1+1;
+    while(1) {
+	mapping_t* mapping;
+	index3=(index1+index2)/2;
+	mapping=array_get(&(s->mapping),index3);
+	assert(mapping->begin < mapping->end);
+	if(mapping->begin>=cluster_num) {
+	    /* the probe starts at or after cluster_num: go left */
+	    assert(index2!=index3 || index2==0);
+	    if(index2==index3)
+		return index1;
+	    index2=index3;
+	} else {
+	    /* the probe starts before cluster_num: go right */
+	    if(index1==index3)
+		return mapping->end<=cluster_num ? index2 : index1;
+	    index1=index3;
+	}
+	assert(index1<=index2);
+	DLOG(mapping=array_get(&(s->mapping),index1);
+	assert(mapping->begin<=cluster_num);
+	assert(index2 >= s->mapping.next ||
+		((mapping = array_get(&(s->mapping),index2)) &&
+		mapping->end>cluster_num)));
+    }
+}
+
+/*
+ * Return the mapping whose cluster range [begin, end) contains
+ * cluster_num, or NULL if there is none.
+ */
+static inline mapping_t* find_mapping_for_cluster(BDRVVVFATState* s,int cluster_num)
+{
+    mapping_t* found = NULL;
+    int idx = find_mapping_for_cluster_aux(s,cluster_num,0,s->mapping.next);
+
+    if (idx < s->mapping.next) {
+        mapping_t* candidate = array_get(&(s->mapping), idx);
+
+        if (candidate->begin <= cluster_num) {
+            assert(candidate->begin<=cluster_num && candidate->end>cluster_num);
+            found = candidate;
+        }
+    }
+    return found;
+}
+
+/*
+ * Look up a mapping by comparing path with mapping->path.  The mappings
+ * are sorted by cluster, not by path, so this is a linear scan: O(n).
+ * Only mappings with a negative first_mapping_index are considered.
+ * Returns NULL if no mapping matches.
+ */
+static inline mapping_t* find_mapping_for_path(BDRVVVFATState* s,
+        const char* path)
+{
+    int idx = 0;
+
+    while (idx < s->mapping.next) {
+        mapping_t* candidate = array_get(&(s->mapping), idx);
+
+        if (candidate->first_mapping_index < 0
+                && strcmp(path, candidate->path) == 0)
+            return candidate;
+        idx++;
+    }
+
+    return NULL;
+}
+
+/*
+ * Make mapping the current mapping, opening its file read-only unless the
+ * current mapping already refers to the same path.
+ * Returns 0 on success and -1 if mapping is NULL or open() fails.
+ */
+static int open_file(BDRVVVFATState* s,mapping_t* mapping)
+{
+    int fd;
+
+    if (!mapping)
+        return -1;
+
+    /* same path as the current mapping: nothing to do */
+    if (s->current_mapping
+            && !strcmp(s->current_mapping->path, mapping->path))
+        return 0;
+
+    /* open file */
+    fd = open(mapping->path, O_RDONLY | O_BINARY | O_LARGEFILE);
+    if (fd < 0)
+        return -1;
+
+    vvfat_close_current_file(s);
+    s->current_fd = fd;
+    s->current_mapping = mapping;
+    return 0;
+}
+
+/*
+ * Make s->cluster point at the contents of cluster cluster_num.
+ *
+ * Directory clusters are served straight out of s->directory; file
+ * clusters are read from the host file into s->cluster_buffer.  Nothing is
+ * done if cluster_num is already the current cluster.
+ *
+ * Returns 0 on success, -2 if no file could be opened for the cluster,
+ * -3 if seeking fails and -1 if reading fails.
+ */
+static inline int read_cluster(BDRVVVFATState *s,int cluster_num)
+{
+    if(s->current_cluster != cluster_num) {
+	int result=0;
+	off_t offset;
+	assert(!s->current_mapping || s->current_fd || (s->current_mapping->mode & MODE_DIRECTORY));
+	/* is the cluster outside the range of the current mapping? */
+	if(!s->current_mapping
+		|| s->current_mapping->begin>cluster_num
+		|| s->current_mapping->end<=cluster_num) {
+	    /* binary search of mappings for file */
+	    mapping_t* mapping=find_mapping_for_cluster(s,cluster_num);
+
+	    assert(!mapping || (cluster_num>=mapping->begin && cluster_num<mapping->end));
+
+	    if (mapping && mapping->mode & MODE_DIRECTORY) {
+		vvfat_close_current_file(s);
+		s->current_mapping = mapping;
+		/* also entered via goto when the current mapping is a directory */
+read_cluster_directory:
+		offset = s->cluster_size*(cluster_num-s->current_mapping->begin);
+		s->cluster = (unsigned char*)s->directory.pointer+offset
+			+ 0x20*s->current_mapping->info.dir.first_dir_index;
+		assert(((s->cluster-(unsigned char*)s->directory.pointer)%s->cluster_size)==0);
+		assert((char*)s->cluster+s->cluster_size <= s->directory.pointer+s->directory.next*s->directory.item_size);
+		s->current_cluster = cluster_num;
+		return 0;
+	    }
+
+	    /* open_file() also fails when no mapping was found (NULL) */
+	    if(open_file(s,mapping))
+		return -2;
+	} else if (s->current_mapping->mode & MODE_DIRECTORY)
+	    goto read_cluster_directory;
+
+	assert(s->current_fd);
+
+	offset=s->cluster_size*(cluster_num-s->current_mapping->begin)+s->current_mapping->info.file.offset;
+	if(lseek(s->current_fd, offset, SEEK_SET)!=offset)
+	    return -3;
+	s->cluster=s->cluster_buffer;
+	/* NOTE(review): a short read is accepted; the rest of the buffer */
+	/* keeps whatever it held before */
+	result=read(s->current_fd,s->cluster,s->cluster_size);
+	if(result<0) {
+	    s->current_cluster = -1;
+	    return -1;
+	}
+	s->current_cluster = cluster_num;
+    }
+    return 0;
+}
+
+#ifdef DEBUG
+/*
+ * Debug helper: dump len bytes starting at address to stderr, 16 bytes
+ * per line, as hex followed by the printable characters ('.' for the
+ * others).
+ */
+static void hexdump(const void* address, uint32_t len)
+{
+    const unsigned char* bytes = address;
+    int row, col;
+
+    for (row = 0; row < len; row += 16) {
+        /* hex column, padded to 16 cells on the last line */
+        for (col = 0; col < 16; col++) {
+            if (row + col < len)
+                fprintf(stderr, "%02x ", bytes[row + col]);
+            else
+                fprintf(stderr, "   ");
+        }
+        fprintf(stderr, " ");
+        /* character column */
+        for (col = 0; col < 16 && row + col < len; col++) {
+            unsigned char c = bytes[row + col];
+
+            fprintf(stderr, "%c", (c < ' ' || c > 0x7f) ? '.' : c);
+        }
+        fprintf(stderr, "\n");
+    }
+}
+
+/*
+ * Debug helper: print one directory entry to stderr.  For a long-name
+ * entry the low bytes of its name characters are printed; otherwise the
+ * 11 name bytes plus attributes, first cluster and size.
+ *
+ * NOTE(review): the pointer is printed through an (int) cast and, for a
+ * NULL direntry, no newline is written after the prefix.
+ */
+static void print_direntry(const direntry_t* direntry)
+{
+    int j = 0;
+    char buffer[1024];
+
+    fprintf(stderr, "direntry 0x%x: ", (int)direntry);
+    if(!direntry)
+	return;
+    if(is_long_name(direntry)) {
+	unsigned char* c=(unsigned char*)direntry;
+	int i;
+	/* the three byte ranges of a long-name entry that hold characters */
+	for(i=1;i<11 && c[i] && c[i]!=0xff;i+=2)
+/* append c to buffer, replacing bytes below ' ' by 0xb0 */
+#define ADD_CHAR(c) {buffer[j] = (c); if (buffer[j] < ' ') buffer[j] = 0xb0; j++;}
+	    ADD_CHAR(c[i]);
+	for(i=14;i<26 && c[i] && c[i]!=0xff;i+=2)
+	    ADD_CHAR(c[i]);
+	for(i=28;i<32 && c[i] && c[i]!=0xff;i+=2)
+	    ADD_CHAR(c[i]);
+	buffer[j] = 0;
+	fprintf(stderr, "%s\n", buffer);
+    } else {
+	int i;
+	for(i=0;i<11;i++)
+	    ADD_CHAR(direntry->name[i]);
+	buffer[j] = 0;
+	fprintf(stderr,"%s attributes=0x%02x begin=%d size=%d\n",
+		buffer,
+		direntry->attributes,
+		begin_of_direntry(direntry),le32_to_cpu(direntry->size));
+    }
+}
+
+/*
+ * Debug helper: print the fields of a mapping to stderr, followed by the
+ * directory-specific or file-specific part of mapping->info depending on
+ * the mode.
+ *
+ * NOTE(review): the pointer is printed through an (int) cast.
+ */
+static void print_mapping(const mapping_t* mapping)
+{
+    fprintf(stderr, "mapping (0x%x): begin, end = %d, %d, dir_index = %d, first_mapping_index = %d, name = %s, mode = 0x%x, " , (int)mapping, mapping->begin, mapping->end, mapping->dir_index, mapping->first_mapping_index, mapping->path, mapping->mode);
+    if (mapping->mode & MODE_DIRECTORY)
+	fprintf(stderr, "parent_mapping_index = %d, first_dir_index = %d\n", mapping->info.dir.parent_mapping_index, mapping->info.dir.first_dir_index);
+    else
+	fprintf(stderr, "offset = %d\n", mapping->info.file.offset);
+}
+#endif
+
+/*
+ * Read nb_sectors 512-byte sectors starting at sector_num into buf.
+ *
+ * Each sector is taken from the first source that applies:
+ *  1. the qcow overlay (s->qcow), if one exists and reports the sector as
+ *     allocated,
+ *  2. the first sectors (s->first_sectors),
+ *  3. the FAT: both on-disk copies are served from the single s->fat,
+ *  4. the data area, through read_cluster(); sectors that cannot be read
+ *     are returned as zeroes.
+ *
+ * Returns 0 on success, and -1 if a sector lies beyond s->sector_count or
+ * a read from the overlay fails.
+ */
+static int vvfat_read(BlockDriverState *bs, int64_t sector_num,
+                    uint8_t *buf, int nb_sectors)
+{
+    BDRVVVFATState *s = bs->opaque;
+    int i;
+
+    for(i=0;i<nb_sectors;i++,sector_num++) {
+	if (sector_num >= s->sector_count)
+	   return -1;
+	if (s->qcow) {
+	    int n;
+	    if (s->qcow->drv->bdrv_is_allocated(s->qcow,
+			sector_num, nb_sectors-i, &n)) {
+DLOG(fprintf(stderr, "sectors %d+%d allocated\n", (int)sector_num, n));
+		if (s->qcow->drv->bdrv_read(s->qcow, sector_num, buf+i*0x200, n))
+		    return -1;
+		/* n sectors were handled; the loop header adds the last one */
+		i += n - 1;
+		sector_num += n - 1;
+		continue;
+	    }
+DLOG(fprintf(stderr, "sector %d not allocated\n", (int)sector_num));
+	}
+	if(sector_num<s->faked_sectors) {
+	    if(sector_num<s->first_sectors_number)
+		memcpy(buf+i*0x200,&(s->first_sectors[sector_num*0x200]),0x200);
+	    /* first FAT copy */
+	    else if(sector_num-s->first_sectors_number<s->sectors_per_fat)
+		memcpy(buf+i*0x200,&(s->fat.pointer[(sector_num-s->first_sectors_number)*0x200]),0x200);
+	    /* second FAT copy: same data as the first */
+	    else if(sector_num-s->first_sectors_number-s->sectors_per_fat<s->sectors_per_fat)
+		memcpy(buf+i*0x200,&(s->fat.pointer[(sector_num-s->first_sectors_number-s->sectors_per_fat)*0x200]),0x200);
+	} else {
+	    uint32_t sector=sector_num-s->faked_sectors,
+	    sector_offset_in_cluster=(sector%s->sectors_per_cluster),
+	    cluster_num=sector/s->sectors_per_cluster;
+	    if(read_cluster(s, cluster_num) != 0) {
+		/* LATER TODO: strict: return -1; */
+		memset(buf+i*0x200,0,0x200);
+		continue;
+	    }
+	    memcpy(buf+i*0x200,s->cluster+sector_offset_in_cluster*0x200,0x200);
+	}
+    }
+    return 0;
+}
+
+/* LATER TODO: statify all functions */
+
+/*
+ * Idea of the write support (use snapshot):
+ *
+ * 1. check if all data is consistent, recording renames, modifications,
+ *    new files and directories (in s->commits).
+ *
+ * 2. if the data is not consistent, stop committing
+ *
+ * 3. handle renames, and create new files and directories (do not yet
+ *    write their contents)
+ *
+ * 4. walk the directories, fixing the mapping and direntries, and marking
+ *    the handled mappings as not deleted
+ *
+ * 5. commit the contents of the files
+ *
+ * 6. handle deleted files and directories
+ *
+ */
+
+/*
+ * One pending action recorded in s->commits.  Which member of param is
+ * valid depends on action.
+ */
+typedef struct commit_t {
+    /* path, freed by clear_commits(); NULL for ACTION_WRITEOUT */
+    char* path;
+    union {
+	struct { uint32_t cluster; } rename;
+	struct { int dir_index; uint32_t modified_offset; } writeout;
+	struct { uint32_t first_cluster; } new_file;
+	struct { uint32_t cluster; } mkdir;
+    } param;
+    /* DELETEs and RMDIRs are handled differently: see handle_deletes() */
+    enum {
+	ACTION_RENAME, ACTION_WRITEOUT, ACTION_NEW_FILE, ACTION_MKDIR
+    } action;
+} commit_t;
+
+/*
+ * Drop all scheduled commits: free the path of every commit that has one
+ * (all actions except ACTION_WRITEOUT) and empty s->commits.
+ */
+static void clear_commits(BDRVVVFATState* s)
+{
+    int idx = 0;
+DLOG(fprintf(stderr, "clear_commits (%d commits)\n", s->commits.next));
+    while (idx < s->commits.next) {
+        commit_t* pending = array_get(&(s->commits), idx);
+
+        idx++;
+        assert(pending->path || pending->action == ACTION_WRITEOUT);
+        if (pending->action == ACTION_WRITEOUT) {
+            /* writeouts never carry a path */
+            assert(pending->path == NULL);
+            continue;
+        }
+        assert(pending->path);
+        free(pending->path);
+    }
+    s->commits.next = 0;
+}
+
+/*
+ * Append an ACTION_RENAME commit for the given cluster.  The commit stores
+ * new_path itself (no copy); clear_commits() frees it later.
+ */
+static void schedule_rename(BDRVVVFATState* s,
+        uint32_t cluster, char* new_path)
+{
+    commit_t* pending = array_get_next(&(s->commits));
+
+    pending->action = ACTION_RENAME;
+    pending->param.rename.cluster = cluster;
+    pending->path = new_path;
+}
+
+/*
+ * Append an ACTION_WRITEOUT commit for the directory entry dir_index,
+ * recording modified_offset.  Writeout commits carry no path.
+ */
+static void schedule_writeout(BDRVVVFATState* s,
+        int dir_index, uint32_t modified_offset)
+{
+    commit_t* pending = array_get_next(&(s->commits));
+
+    pending->action = ACTION_WRITEOUT;
+    pending->param.writeout.modified_offset = modified_offset;
+    pending->param.writeout.dir_index = dir_index;
+    pending->path = NULL;
+}
+
+/*
+ * Append an ACTION_NEW_FILE commit for path starting at first_cluster.
+ * The commit stores path itself (no copy); clear_commits() frees it later.
+ */
+static void schedule_new_file(BDRVVVFATState* s,
+        char* path, uint32_t first_cluster)
+{
+    commit_t* pending = array_get_next(&(s->commits));
+
+    pending->action = ACTION_NEW_FILE;
+    pending->param.new_file.first_cluster = first_cluster;
+    pending->path = path;
+}
+
+/*
+ * Append an ACTION_MKDIR commit for path at the given cluster.
+ * The commit stores path itself (no copy); clear_commits() frees it later.
+ */
+static void schedule_mkdir(BDRVVVFATState* s, uint32_t cluster, char* path)
+{
+    commit_t* pending = array_get_next(&(s->commits));
+
+    pending->action = ACTION_MKDIR;
+    pending->param.mkdir.cluster = cluster;
+    pending->path = path;
+}
+
+/*
+ * Accumulator used while parsing the directory entries that make up one
+ * file name.
+ */
+typedef struct {
+    /*
+     * Since the sequence number is at most 0x3f, and the filename
+     * length is at most 13 times the sequence number, the maximal
+     * filename length is 0x3f * 13 bytes.
+     */
+    unsigned char name[0x3f * 13 + 1];
+    /* checksum: lfn_init() sets 0x100, which no 8-bit checksum can equal */
+    /* len: length of name in bytes */
+    int checksum, len;
+    /* sequence number of the long-name entry parsed last; 0 after init */
+    int sequence_number;
+} long_file_name;
+
+/*
+ * Reset lfn to the "no long name seen" state: zero length and sequence
+ * number, and a checksum of 0x100.
+ */
+static void lfn_init(long_file_name* lfn)
+{
+    lfn->checksum = 0x100;
+    lfn->len = 0;
+    lfn->sequence_number = 0;
+}
+
+/*
+ * Feed one directory entry into the long-name accumulator lfn.
+ *
+ * return 0 if parsed successfully, > 0 if no long name, < 0 if error
+ *
+ * An entry with bit 0x40 set in its first byte starts a new name and
+ * supplies the sequence number and checksum.  Every following entry must
+ * carry the next lower sequence number and the same checksum.
+ *
+ * Errors: -1 unexpected or invalid sequence number, -2 checksum mismatch,
+ * -3 non-zero byte at offset 12, 26 or 27, -4 unsupported character.
+ */
+static int parse_long_name(long_file_name* lfn,
+        const direntry_t* direntry)
+{
+    int i, j, offset;
+    const unsigned char* pointer = (const unsigned char*)direntry;
+
+    if (!is_long_name(direntry))
+        return 1;
+
+    if (pointer[0] & 0x40) {
+        lfn->sequence_number = pointer[0] & 0x3f;
+        lfn->checksum = pointer[13];
+        lfn->name[0] = 0;
+        lfn->name[lfn->sequence_number * 13] = 0;
+    } else if ((pointer[0] & 0x3f) != --lfn->sequence_number)
+        return -1;
+    else if (pointer[13] != lfn->checksum)
+        return -2;
+    else if (pointer[12] || pointer[26] || pointer[27])
+        return -3;
+
+    /*
+     * Sequence numbers start at 1.  A value below 1 would make the offset
+     * computed below negative and the loop would write before lfn->name.
+     */
+    if (lfn->sequence_number < 1)
+        return -1;
+
+    offset = 13 * (lfn->sequence_number - 1);
+    /* the 13 characters sit at entry bytes 1..10, 14..25 and 28..31 */
+    for (i = 0, j = 1; i < 13; i++, j+=2) {
+        if (j == 11)
+            j = 14;
+        else if (j == 26)
+            j = 28;
+
+        if (pointer[j+1] == 0)
+            lfn->name[offset + i] = pointer[j];
+        /* a 0xff high byte (padding) is only accepted in the 0x40 entry */
+        else if (pointer[j+1] != 0xff || (pointer[0] & 0x40) == 0)
+            return -4;
+        else
+            lfn->name[offset + i] = 0;
+    }
+
+    if (pointer[0] & 0x40)
+        lfn->len = offset + strlen((char*)lfn->name + offset);
+
+    return 0;
+}
+
+/*
+ * Convert the 8.3 name of direntry into "name.ext" form in lfn->name and
+ * set lfn->len.  Trailing spaces of the name and the extension are
+ * dropped; characters are lower-cased if s->downcase_short_names is set.
+ *
+ * returns 0 if successful, >0 if no short_name, and <0 on error
+ * (-1: invalid character in the name, -2: invalid character in the
+ * extension)
+ */
+static int parse_short_name(BDRVVVFATState* s,
+	long_file_name* lfn, direntry_t* direntry)
+{
+    int i, j;
+
+    if (!is_short_name(direntry))
+	return 1;
+
+    /* j: index of the last non-space name character, -1 if there is none */
+    for (j = 7; j >= 0 && direntry->name[j] == ' '; j--);
+    for (i = 0; i <= j; i++) {
+	if (direntry->name[i] <= ' ' || direntry->name[i] > 0x7f)
+	    return -1;
+	else if (s->downcase_short_names)
+	    lfn->name[i] = qemu_tolower(direntry->name[i]);
+	else
+	    lfn->name[i] = direntry->name[i];
+    }
+
+    /* j: index of the last non-space extension character, -1 if none */
+    for (j = 2; j >= 0 && direntry->extension[j] == ' '; j--);
+    if (j >= 0) {
+	lfn->name[i++] = '.';
+	lfn->name[i + j + 1] = '\0';
+	/* the extension is copied back to front */
+	for (;j >= 0; j--) {
+	    if (direntry->extension[j] <= ' ' || direntry->extension[j] > 0x7f)
+		return -2;
+	    else if (s->downcase_short_names)
+		lfn->name[i + j] = qemu_tolower(direntry->extension[j]);
+	    else
+		lfn->name[i + j] = direntry->extension[j];
+	}
+    } else
+	/* j is -1 here, so this terminates the name at index i */
+	lfn->name[i + j + 1] = '\0';
+
+    lfn->len = strlen((char*)lfn->name);
+
+    return 0;
+}
+
+/*
+ * Return the FAT entry for cluster as seen in s->fat2.
+ *
+ * Clusters below s->last_cluster_of_root_directory are not looked up:
+ * they are reported as one consecutive chain ending in s->max_fat_value.
+ */
+static inline uint32_t modified_fat_get(BDRVVVFATState* s,
+        unsigned int cluster)
+{
+    if (cluster < s->last_cluster_of_root_directory)
+        return (cluster + 1 == s->last_cluster_of_root_directory)
+            ? s->max_fat_value : cluster + 1;
+
+    switch (s->fat_type) {
+    case 32: {
+        uint32_t* wide = ((uint32_t*)s->fat2) + cluster;
+        return le32_to_cpu(*wide);
+    }
+    case 16: {
+        uint16_t* narrow = ((uint16_t*)s->fat2) + cluster;
+        return le16_to_cpu(*narrow);
+    }
+    default: {
+        /* packed 12-bit entries: two entries share three bytes */
+        const uint8_t* bytes = s->fat2 + cluster*3/2;
+        int pair = bytes[0] | (bytes[1] << 8);
+
+        if (cluster & 1)
+            pair >>= 4;
+        return pair & 0x0fff;
+    }
+    }
+}
+
+/*
+ * Ask the qcow overlay whether any sector of cluster_num is allocated in it.
+ *
+ * Returns 0 when there is no overlay (s->qcow == NULL) or when no sector of
+ * the cluster is allocated; otherwise the first non-zero value returned by
+ * bdrv_is_allocated() (the remaining sectors are then not queried).
+ */
+static inline int cluster_was_modified(BDRVVVFATState* s, uint32_t cluster_num)
+{
+    int sector, pnum;
+
+    if (s->qcow == NULL)
+	return 0;
+
+    for (sector = 0; sector < s->sectors_per_cluster; sector++) {
+	int allocated = s->qcow->drv->bdrv_is_allocated(s->qcow,
+		cluster2sector(s, cluster_num) + sector, 1, &pnum);
+
+	if (allocated)
+	    return allocated;
+    }
+
+    return 0;
+}
+
+/*
+ * Return the part of path after its last '/', or path itself when it
+ * contains no '/'.  The result points into path; nothing is allocated.
+ */
+static const char* get_basename(const char* path)
+{
+    const char* last_slash = strrchr(path, '/');
+
+    return last_slash ? last_slash + 1 : path;
+}
+
+/*
+ * Flags kept per cluster in the array s->used_clusters:
+ *
+ * - USED_DIRECTORY (value 1): the cluster is part of a directory,
+ * - USED_FILE (value 2): the cluster is part of a file,
+ * - USED_ANY (value 3): mask covering both of the above,
+ * - USED_ALLOCATED (value 4): the cluster was modified.
+ *
+ * If any cluster is allocated, but not part of a file or directory, this
+ * driver refuses to commit.
+ */
+typedef enum {
+     USED_DIRECTORY = 1, USED_FILE = 2, USED_ANY = 3, USED_ALLOCATED = 4
+} used_t;
+
+/*
+ * get_cluster_count_for_direntry() not only determines how many clusters
+ * are occupied by direntry, but also if it was renamed or modified.
+ *
+ * A file is thought to be renamed *only* if there already was a file with
+ * exactly the same first cluster, but a different name.
+ *
+ * Further, the files/directories handled by this function are
+ * assumed to be *not* deleted (and *only* those).
+ *
+ * Side effects: clears MODE_DELETED on the mapping found for the first
+ * cluster, may schedule rename / new-file / writeout commits, and marks
+ * every visited cluster as USED_FILE in s->used_clusters.
+ *
+ * Returns the number of clusters in the chain, or 0 if direntry starts at
+ * cluster 0 or a cluster of the chain is already marked USED_ANY.
+ * NOTE(review): the error paths "return -1" / "return -2", but the return
+ * type is uint32_t, so the caller sees them as very large values -- confirm
+ * that every caller only compares the result against an expected count.
+ */
+static uint32_t get_cluster_count_for_direntry(BDRVVVFATState* s,
+	direntry_t* direntry, const char* path)
+{
+    /*
+     * This is a little bit tricky:
+     * IF the guest OS just inserts a cluster into the file chain,
+     * and leaves the rest alone, (i.e. the original file had clusters
+     * 15 -> 16, but now has 15 -> 32 -> 16), then the following happens:
+     *
+     * - do_commit will write the cluster into the file at the given
+     *   offset, but
+     *
+     * - the cluster which is overwritten should be moved to a later
+     *   position in the file.
+     *
+     * I am not aware that any OS does something as braindead, but this
+     * situation could happen anyway when not committing for a long time.
+     * Just to be sure that this does not bite us, detect it, and copy the
+     * contents of the clusters to-be-overwritten into the qcow.
+     */
+    int copy_it = 0;
+    int was_modified = 0;
+    int32_t ret = 0;
+
+    uint32_t cluster_num = begin_of_direntry(direntry);
+    uint32_t offset = 0;	/* byte offset of cluster_num within the file */
+    int first_mapping_index = -1;
+    mapping_t* mapping = NULL;
+    const char* basename2 = NULL;
+
+    vvfat_close_current_file(s);
+
+    /* the root directory */
+    if (cluster_num == 0)
+	return 0;
+
+    /* write support */
+    if (s->qcow) {
+	basename2 = get_basename(path);
+
+	mapping = find_mapping_for_cluster(s, cluster_num);
+
+	if (mapping) {
+	    const char* basename;
+
+	    /* the mapping still exists, so take back the "deleted" mark */
+	    assert(mapping->mode & MODE_DELETED);
+	    mapping->mode &= ~MODE_DELETED;
+
+	    basename = get_basename(mapping->path);
+
+	    assert(mapping->mode & MODE_NORMAL);
+
+	    /* rename */
+	    if (strcmp(basename, basename2))
+		schedule_rename(s, cluster_num, strdup(path));
+	} else if (is_file(direntry))
+	    /* new file */
+	    schedule_new_file(s, strdup(path), cluster_num);
+	else {
+	    assert(0);
+	    return 0;
+	}
+    }
+
+    /* follow the cluster chain in the modified FAT until end-of-file */
+    while(1) {
+	if (s->qcow) {
+	    if (!copy_it && cluster_was_modified(s, cluster_num)) {
+		/* look the mapping up again if cluster_num left its range */
+		if (mapping == NULL ||
+			mapping->begin > cluster_num ||
+			mapping->end <= cluster_num)
+		mapping = find_mapping_for_cluster(s, cluster_num);
+
+
+		if (mapping &&
+			(mapping->mode & MODE_DIRECTORY) == 0) {
+
+		    /* was modified in qcow */
+		    if (offset != mapping->info.file.offset + s->cluster_size
+			    * (cluster_num - mapping->begin)) {
+			/* offset of this cluster in file chain has changed */
+			assert(0);
+			copy_it = 1;
+		    } else if (offset == 0) {
+			const char* basename = get_basename(mapping->path);
+
+			if (strcmp(basename, basename2))
+			    copy_it = 1;
+			first_mapping_index = array_index(&(s->mapping), mapping);
+		    }
+
+		    if (mapping->first_mapping_index != first_mapping_index
+			    && mapping->info.file.offset > 0) {
+			assert(0);
+			copy_it = 1;
+		    }
+
+		    /* need to write out? */
+		    if (!was_modified && is_file(direntry)) {
+			/* only the first modified cluster schedules a writeout */
+			was_modified = 1;
+			schedule_writeout(s, mapping->dir_index, offset);
+		    }
+		}
+	    }
+
+	    if (copy_it) {
+		int i, dummy;
+		/*
+		 * This is horribly inefficient, but that is okay, since
+		 * it is rarely executed, if at all.
+		 */
+		/* note: this inner "offset" is a sector number and shadows the
+		 * byte offset declared at the top of the function */
+		int64_t offset = cluster2sector(s, cluster_num);
+
+		vvfat_close_current_file(s);
+		/*
+		 * NOTE(review): the allocation test uses sector offset + i, but
+		 * the read and the write below both use sector offset (without
+		 * + i) -- confirm whether offset + i was intended there too.
+		 */
+		for (i = 0; i < s->sectors_per_cluster; i++)
+		    if (!s->qcow->drv->bdrv_is_allocated(s->qcow,
+				offset + i, 1, &dummy)) {
+			if (vvfat_read(s->bs,
+				    offset, s->cluster_buffer, 1))
+			    return -1;
+			if (s->qcow->drv->bdrv_write(s->qcow,
+				    offset, s->cluster_buffer, 1))
+			    return -2;
+		    }
+	    }
+	}
+
+	ret++;
+	/* a cluster that is already claimed makes the whole chain invalid */
+	if (s->used_clusters[cluster_num] & USED_ANY)
+	    return 0;
+	s->used_clusters[cluster_num] = USED_FILE;
+
+	cluster_num = modified_fat_get(s, cluster_num);
+
+	if (fat_eof(s, cluster_num))
+	    return ret;
+	else if (cluster_num < 2 || cluster_num > s->max_fat_value - 16)
+	    return -1;
+
+	offset += s->cluster_size;
+    }
+}
+
+/*
+ * This function looks at the modified data (qcow).
+ * It returns 0 upon inconsistency or error, and the number of clusters
+ * used by the directory, its subdirectories and their files.
+ *
+ * cluster_num is the first cluster of the directory (0 is passed for the
+ * root directory), path the host path the directory corresponds to.
+ *
+ * Side effects: clears MODE_DELETED on the directory's mapping, schedules
+ * rename / mkdir commits, marks every visited directory cluster as
+ * USED_DIRECTORY and recurses into subdirectories and files.
+ *
+ * All failure paths leave through the single "fail" label so that the
+ * temporary cluster buffer is always released.
+ */
+static int check_directory_consistency(BDRVVVFATState *s,
+	int cluster_num, const char* path)
+{
+    int ret = 0;
+    unsigned char* cluster = qemu_malloc(s->cluster_size);
+    direntry_t* direntries = (direntry_t*)cluster;
+    mapping_t* mapping = find_mapping_for_cluster(s, cluster_num);
+
+    long_file_name lfn;
+    int path_len = strlen(path);
+    char path2[PATH_MAX];
+
+    assert(path_len < PATH_MAX); /* len was tested before! */
+    /* path2 = path + "/"; entry names are appended behind the slash below */
+    pstrcpy(path2, sizeof(path2), path);
+    path2[path_len] = '/';
+    path2[path_len + 1] = '\0';
+
+    if (mapping) {
+	const char* basename = get_basename(mapping->path);
+	const char* basename2 = get_basename(path);
+
+	assert(mapping->mode & MODE_DIRECTORY);
+
+	/* the directory still exists, so take back the "deleted" mark */
+	assert(mapping->mode & MODE_DELETED);
+	mapping->mode &= ~MODE_DELETED;
+
+	if (strcmp(basename, basename2))
+	    schedule_rename(s, cluster_num, strdup(path));
+    } else
+	/* new directory */
+	schedule_mkdir(s, cluster_num, strdup(path));
+
+    lfn_init(&lfn);
+    do {
+	int i;
+	int subret = 0;
+
+	ret++;
+
+	if (s->used_clusters[cluster_num] & USED_ANY) {
+	    fprintf(stderr, "cluster %d used more than once\n", (int)cluster_num);
+	    goto fail;
+	}
+	s->used_clusters[cluster_num] = USED_DIRECTORY;
+
+DLOG(fprintf(stderr, "read cluster %d (sector %d)\n", (int)cluster_num, (int)cluster2sector(s, cluster_num)));
+	subret = vvfat_read(s->bs, cluster2sector(s, cluster_num), cluster,
+		s->sectors_per_cluster);
+	if (subret) {
+	    fprintf(stderr, "Error fetching direntries\n");
+	    goto fail;
+	}
+
+	/* 0x10 directory entries fit into one sector */
+	for (i = 0; i < 0x10 * s->sectors_per_cluster; i++) {
+	    int cluster_count = 0;
+
+DLOG(fprintf(stderr, "check direntry %d: \n", i); print_direntry(direntries + i));
+	    if (is_volume_label(direntries + i) || is_dot(direntries + i) ||
+		    is_free(direntries + i))
+		continue;
+
+	    subret = parse_long_name(&lfn, direntries + i);
+	    if (subret < 0) {
+		fprintf(stderr, "Error in long name\n");
+		goto fail;
+	    }
+	    if (subret == 0 || is_free(direntries + i))
+		continue;
+
+	    /* no matching long name collected: fall back to the short name */
+	    if (fat_chksum(direntries+i) != lfn.checksum) {
+		subret = parse_short_name(s, &lfn, direntries + i);
+		if (subret < 0) {
+		    fprintf(stderr, "Error in short name (%d)\n", subret);
+		    goto fail;
+		}
+		if (subret > 0 || !strcmp((char*)lfn.name, ".")
+			|| !strcmp((char*)lfn.name, ".."))
+		    continue;
+	    }
+	    lfn.checksum = 0x100; /* cannot use long name twice */
+
+	    if (path_len + 1 + lfn.len >= PATH_MAX) {
+		fprintf(stderr, "Name too long: %s/%s\n", path, lfn.name);
+		goto fail;
+	    }
+            pstrcpy(path2 + path_len + 1, sizeof(path2) - path_len - 1,
+                    (char*)lfn.name);
+
+	    if (is_directory(direntries + i)) {
+		if (begin_of_direntry(direntries + i) == 0) {
+		    DLOG(fprintf(stderr, "invalid begin for directory: %s\n", path2); print_direntry(direntries + i));
+		    goto fail;
+		}
+		cluster_count = check_directory_consistency(s,
+			begin_of_direntry(direntries + i), path2);
+		if (cluster_count == 0) {
+		    DLOG(fprintf(stderr, "problem in directory %s:\n", path2); print_direntry(direntries + i));
+		    goto fail;
+		}
+	    } else if (is_file(direntries + i)) {
+		/* check file size with FAT */
+		cluster_count = get_cluster_count_for_direntry(s, direntries + i, path2);
+		if (cluster_count !=
+			(le32_to_cpu(direntries[i].size) + s->cluster_size
+			 - 1) / s->cluster_size) {
+		    DLOG(fprintf(stderr, "Cluster count mismatch\n"));
+		    goto fail;
+		}
+	    } else
+		assert(0); /* cluster_count = 0; */
+
+	    ret += cluster_count;
+	}
+
+	cluster_num = modified_fat_get(s, cluster_num);
+    } while(!fat_eof(s, cluster_num));
+
+    free(cluster);
+    return ret;
+
+fail:
+    free(cluster);
+    return 0;
+}
+
+/*
+ * Check that the guest-visible file system (FAT plus directory tree) is in
+ * a state that can be committed.
+ *
+ * Returns the number of used clusters (non-zero) on success and 0 on
+ * failure or inconsistency.
+ *
+ * Side effects: refreshes s->fat2, clears the USED_ANY bits in
+ * s->used_clusters, clears the commit list and re-marks mappings as
+ * deleted before walking the tree.
+ */
+static int is_consistent(BDRVVVFATState* s)
+{
+    int i, check;
+    int used_clusters_count = 0;
+
+DLOG(checkpoint());
+    /*
+     * - get modified FAT
+     * - compare the two FATs (TODO)
+     * - get buffer for marking used clusters
+     * - recurse direntries from root (using bs->bdrv_read to make
+     *    sure to get the new data)
+     *   - check that the FAT agrees with the size
+     *   - count the number of clusters occupied by this directory and
+     *     its files
+     * - check that the cumulative used cluster count agrees with the
+     *   FAT
+     * - if all is fine, return number of used clusters
+     */
+    if (s->fat2 == NULL) {
+	/* first use: allocate the buffer that holds the modified FAT */
+	int size = 0x200 * s->sectors_per_fat;
+	s->fat2 = qemu_malloc(size);
+	memcpy(s->fat2, s->fat.pointer, size);
+    }
+    check = vvfat_read(s->bs,
+	    s->first_sectors_number, s->fat2, s->sectors_per_fat);
+    if (check) {
+	fprintf(stderr, "Could not copy fat\n");
+	return 0;
+    }
+    assert (s->used_clusters);
+    /* forget the file/directory marks of the previous run */
+    for (i = 0; i < sector2cluster(s, s->sector_count); i++)
+	s->used_clusters[i] &= ~USED_ANY;
+
+    clear_commits(s);
+
+    /* mark every mapped file/directory as deleted.
+     * (check_directory_consistency() will unmark those still present). */
+    if (s->qcow)
+	for (i = 0; i < s->mapping.next; i++) {
+	    mapping_t* mapping = array_get(&(s->mapping), i);
+	    if (mapping->first_mapping_index < 0)
+		mapping->mode |= MODE_DELETED;
+	}
+
+    used_clusters_count = check_directory_consistency(s, 0, s->path);
+    if (used_clusters_count <= 0) {
+	DLOG(fprintf(stderr, "problem in directory\n"));
+	return 0;
+    }
+
+    /*
+     * "check" is reused as a counter: it starts at the number of clusters
+     * below last_cluster_of_root_directory and is incremented for every
+     * later cluster with a non-zero entry in the modified FAT.
+     */
+    check = s->last_cluster_of_root_directory;
+    for (i = check; i < sector2cluster(s, s->sector_count); i++) {
+	if (modified_fat_get(s, i)) {
+	    if(!s->used_clusters[i]) {
+		DLOG(fprintf(stderr, "FAT was modified (%d), but cluster is not used?\n", i));
+		return 0;
+	    }
+	    check++;
+	}
+
+	if (s->used_clusters[i] == USED_ALLOCATED) {
+	    /* allocated, but not used... */
+	    DLOG(fprintf(stderr, "unused, modified cluster: %d\n", i));
+	    return 0;
+	}
+    }
+
+    /* the FAT and the directory walk must agree on the cluster count */
+    if (check != used_clusters_count)
+	return 0;
+
+    return used_clusters_count;
+}
+
+/*
+ * Shift the mapping indices stored inside the mappings after entries were
+ * inserted into or removed from s->mapping: every first_mapping_index and,
+ * for directories, every info.dir.parent_mapping_index that is >= offset
+ * has adjust added to it.
+ */
+static inline void adjust_mapping_indices(BDRVVVFATState* s,
+	int offset, int adjust)
+{
+    int i;
+
+    for (i = 0; i < s->mapping.next; i++) {
+	mapping_t* mapping = array_get(&(s->mapping), i);
+
+/*
+ * The macro relies on the local names mapping, offset and adjust.
+ * It is not #undef'ed, so it stays defined after this function.
+ */
+#define ADJUST_MAPPING_INDEX(name) \
+	if (mapping->name >= offset) \
+	    mapping->name += adjust
+
+	ADJUST_MAPPING_INDEX(first_mapping_index);
+	if (mapping->mode & MODE_DIRECTORY)
+	    ADJUST_MAPPING_INDEX(info.dir.parent_mapping_index);
+    }
+}
+
+/*
+ * insert or update mapping
+ *
+ * Makes sure s->mapping contains a mapping that covers exactly the cluster
+ * range [begin, end) and returns it.  A mapping that starts before begin is
+ * cut off at begin; an existing mapping that starts at begin is reused,
+ * otherwise a new one (with path == NULL) is inserted and all stored
+ * mapping indices are adjusted.
+ */
+static mapping_t* insert_mapping(BDRVVVFATState* s,
+	uint32_t begin, uint32_t end)
+{
+    /*
+     * - find mapping where mapping->begin >= begin,
+     * - if mapping->begin > begin: insert
+     *   - adjust all references to mappings!
+     * - else: adjust
+     * - replace name
+     */
+    int index = find_mapping_for_cluster_aux(s, begin, 0, s->mapping.next);
+    mapping_t* mapping = NULL;
+    /* base address of the array before any insertion, see the end */
+    mapping_t* first_mapping = array_get(&(s->mapping), 0);
+
+    if (index < s->mapping.next && (mapping = array_get(&(s->mapping), index))
+	    && mapping->begin < begin) {
+	/* truncate the predecessor and continue with the slot behind it */
+	mapping->end = begin;
+	index++;
+	mapping = array_get(&(s->mapping), index);
+    }
+    if (index >= s->mapping.next || mapping->begin > begin) {
+	mapping = array_insert(&(s->mapping), index, 1);
+	mapping->path = NULL;
+	adjust_mapping_indices(s, index, +1);
+    }
+
+    mapping->begin = begin;
+    mapping->end = end;
+
+DLOG(mapping_t* next_mapping;
+assert(index + 1 >= s->mapping.next ||
+((next_mapping = array_get(&(s->mapping), index + 1)) &&
+ next_mapping->begin >= end)));
+
+    /*
+     * If the array storage is now at a different address, recompute
+     * s->current_mapping from its index relative to the old base.
+     */
+    if (s->current_mapping && first_mapping != (mapping_t*)s->mapping.pointer)
+	s->current_mapping = array_get(&(s->mapping),
+		s->current_mapping - first_mapping);
+
+    return mapping;
+}
+
+/*
+ * Remove the mapping at mapping_index from s->mapping.
+ *
+ * The path string is freed only when the mapping's first_mapping_index is
+ * negative.  Afterwards the indices stored in the remaining mappings are
+ * shifted and s->current_mapping is re-derived if the array storage is at
+ * a new address.  Always returns 0.
+ */
+static int remove_mapping(BDRVVVFATState* s, int mapping_index)
+{
+    mapping_t* victim = array_get(&(s->mapping), mapping_index);
+    mapping_t* old_base = array_get(&(s->mapping), 0);
+
+    if (victim->first_mapping_index < 0)
+	free(victim->path);
+
+    array_remove(&(s->mapping), mapping_index);
+
+    /* every mapping behind the removed one moved down by one slot */
+    adjust_mapping_indices(s, mapping_index, -1);
+
+    if (s->current_mapping && old_base != (mapping_t*)s->mapping.pointer) {
+	s->current_mapping = array_get(&(s->mapping),
+		s->current_mapping - old_base);
+    }
+
+    return 0;
+}
+
+/*
+ * Shift the directory indices stored in the mappings: every dir_index and,
+ * for directories, every info.dir.first_dir_index that is >= offset gets
+ * adjust added to it.
+ */
+static void adjust_dirindices(BDRVVVFATState* s, int offset, int adjust)
+{
+    int n;
+
+    for (n = 0; n < s->mapping.next; n++) {
+	mapping_t* m = array_get(&(s->mapping), n);
+
+	if (m->dir_index >= offset)
+	    m->dir_index += adjust;
+
+	if (!(m->mode & MODE_DIRECTORY))
+	    continue;
+
+	if (m->info.dir.first_dir_index >= offset)
+	    m->info.dir.first_dir_index += adjust;
+    }
+}
+
+/*
+ * Make room for count entries at position dir_index of s->directory and
+ * shift the directory indices stored in the mappings accordingly.
+ * Returns the first new entry, or NULL if the array could not be grown
+ * (in which case no index is touched).
+ */
+static direntry_t* insert_direntries(BDRVVVFATState* s,
+	int dir_index, int count)
+{
+    direntry_t* slot = array_insert(&(s->directory), dir_index, count);
+
+    if (slot != NULL)
+	adjust_dirindices(s, dir_index, count);
+
+    return slot;
+}
+
+/*
+ * Remove count entries starting at dir_index from s->directory and shift
+ * the directory indices stored in the mappings accordingly.
+ * Returns 0 on success, otherwise the non-zero result of
+ * array_remove_slice() (in which case no index is touched).
+ */
+static int remove_direntries(BDRVVVFATState* s, int dir_index, int count)
+{
+    int err = array_remove_slice(&(s->directory), dir_index, count);
+
+    if (!err)
+	adjust_dirindices(s, dir_index, -count);
+
+    return err;
+}
+
+/*
+ * Adapt the mappings of the cluster chain starting at first cluster
+ * (i.e. if a file starts at first_cluster, the chain is followed according
+ * to the modified fat, and the corresponding entries in s->mapping are
+ * adjusted)
+ *
+ * dir_index is the index of the owning entry in s->directory; the first
+ * mapping becomes MODE_DIRECTORY when dir_index <= 0 or the entry is a
+ * directory, MODE_NORMAL otherwise.  A mapping starting exactly at
+ * first_cluster must already exist (asserted).  Always returns 0.
+ */
+static int commit_mappings(BDRVVVFATState* s,
+	uint32_t first_cluster, int dir_index)
+{
+    mapping_t* mapping = find_mapping_for_cluster(s, first_cluster);
+    direntry_t* direntry = array_get(&(s->directory), dir_index);
+    uint32_t cluster = first_cluster;
+
+    vvfat_close_current_file(s);
+
+    assert(mapping);
+    assert(mapping->begin == first_cluster);
+    mapping->first_mapping_index = -1;
+    mapping->dir_index = dir_index;
+    mapping->mode = (dir_index <= 0 || is_directory(direntry)) ?
+	MODE_DIRECTORY : MODE_NORMAL;
+
+    while (!fat_eof(s, cluster)) {
+	uint32_t c, c1;
+
+	/* advance c to the last cluster of the contiguous run; c1 follows it */
+	for (c = cluster, c1 = modified_fat_get(s, c); c + 1 == c1;
+		c = c1, c1 = modified_fat_get(s, c1));
+
+	c++;
+	if (c > mapping->end) {
+	    /* the run grew: drop the following mappings that begin below c */
+	    int index = array_index(&(s->mapping), mapping);
+	    int i, max_i = s->mapping.next - index;
+	    for (i = 1; i < max_i && mapping[i].begin < c; i++);
+	    while (--i > 0)
+		remove_mapping(s, index + 1);
+	}
+	assert(mapping == array_get(&(s->mapping), s->mapping.next - 1)
+		|| mapping[1].begin >= c);
+	mapping->end = c;
+
+	if (!fat_eof(s, c1)) {
+	    /* the chain continues elsewhere: find or create its mapping */
+	    int i = find_mapping_for_cluster_aux(s, c1, 0, s->mapping.next);
+	    mapping_t* next_mapping = i >= s->mapping.next ? NULL :
+		array_get(&(s->mapping), i);
+
+	    if (next_mapping == NULL || next_mapping->begin > c1) {
+		int i1 = array_index(&(s->mapping), mapping);
+
+		next_mapping = insert_mapping(s, c1, c1+1);
+
+		/* an insertion in front of mapping moved it up by one slot */
+		if (c1 < c)
+		    i1++;
+		mapping = array_get(&(s->mapping), i1);
+	    }
+
+	    /* the continuation inherits the properties of its predecessor */
+	    next_mapping->dir_index = mapping->dir_index;
+	    next_mapping->first_mapping_index =
+		mapping->first_mapping_index < 0 ?
+		array_index(&(s->mapping), mapping) :
+		mapping->first_mapping_index;
+	    next_mapping->path = mapping->path;
+	    next_mapping->mode = mapping->mode;
+	    next_mapping->read_only = mapping->read_only;
+	    if (mapping->mode & MODE_DIRECTORY) {
+		next_mapping->info.dir.parent_mapping_index =
+			mapping->info.dir.parent_mapping_index;
+		next_mapping->info.dir.first_dir_index =
+			mapping->info.dir.first_dir_index +
+			0x10 * s->sectors_per_cluster *
+			(mapping->end - mapping->begin);
+	    } else
+		/*
+		 * NOTE(review): this advances info.file.offset by a cluster
+		 * count, while get_cluster_count_for_direntry() compares it
+		 * against a byte offset (multiples of s->cluster_size) --
+		 * confirm which unit is intended.
+		 */
+		next_mapping->info.file.offset = mapping->info.file.offset +
+			mapping->end - mapping->begin;
+
+	    mapping = next_mapping;
+	}
+
+	cluster = c1;
+    }
+
+    return 0;
+}
+
+/*
+ * Re-read the directory whose entry is at dir_index of s->directory from
+ * the guest-visible disk into s->directory, resize its slot range if its
+ * cluster chain grew or shrank, update its mappings and recurse into its
+ * subdirectories.
+ *
+ * dir_index 0 denotes the root directory (first cluster 0);
+ * parent_mapping_index is stored in the directory's mapping.
+ *
+ * Returns 0 on success, -1 if room for new entries could not be made, or
+ * the non-zero result of vvfat_read() / commit_mappings() / a recursive
+ * call.
+ */
+static int commit_direntries(BDRVVVFATState* s,
+	int dir_index, int parent_mapping_index)
+{
+    direntry_t* direntry = array_get(&(s->directory), dir_index);
+    uint32_t first_cluster = dir_index == 0 ? 0 : begin_of_direntry(direntry);
+    mapping_t* mapping = find_mapping_for_cluster(s, first_cluster);
+
+    int factor = 0x10 * s->sectors_per_cluster;
+    int old_cluster_count, new_cluster_count;
+    int current_dir_index;
+    int first_dir_index;
+    int ret, i;
+    uint32_t c;
+
+    /* validate the lookups before anything is read through them */
+    assert(direntry);
+    assert(mapping);
+    assert(mapping->begin == first_cluster);
+    assert(mapping->info.dir.first_dir_index < s->directory.next);
+    assert(mapping->mode & MODE_DIRECTORY);
+    assert(dir_index == 0 || is_directory(direntry));
+
+DLOG(fprintf(stderr, "commit_direntries for %s, parent_mapping_index %d\n", mapping->path, parent_mapping_index));
+
+    current_dir_index = mapping->info.dir.first_dir_index;
+    first_dir_index = current_dir_index;
+
+    mapping->info.dir.parent_mapping_index = parent_mapping_index;
+
+    if (first_cluster == 0) {
+	old_cluster_count = new_cluster_count =
+	    s->last_cluster_of_root_directory;
+    } else {
+	/* chain length according to the old FAT ... */
+	for (old_cluster_count = 0, c = first_cluster; !fat_eof(s, c);
+		c = fat_get(s, c))
+	    old_cluster_count++;
+
+	/* ... and according to the modified FAT */
+	for (new_cluster_count = 0, c = first_cluster; !fat_eof(s, c);
+		c = modified_fat_get(s, c))
+	    new_cluster_count++;
+    }
+
+    if (new_cluster_count > old_cluster_count) {
+	if (insert_direntries(s,
+		current_dir_index + factor * old_cluster_count,
+		factor * (new_cluster_count - old_cluster_count)) == NULL)
+	    return -1;
+    } else if (new_cluster_count < old_cluster_count)
+	remove_direntries(s,
+		current_dir_index + factor * new_cluster_count,
+		factor * (old_cluster_count - new_cluster_count));
+
+    /* read every cluster of the directory into its slot range */
+    for (c = first_cluster; !fat_eof(s, c); c = modified_fat_get(s, c)) {
+	void* dest = array_get(&(s->directory), current_dir_index);
+
+	ret = vvfat_read(s->bs, cluster2sector(s, c), dest,
+		s->sectors_per_cluster);
+	if (ret)
+	    return ret;
+	assert(!strncmp(s->directory.pointer, "QEMU", 4));
+	current_dir_index += factor;
+    }
+
+    ret = commit_mappings(s, first_cluster, dir_index);
+    if (ret)
+	return ret;
+
+    /* recurse */
+    for (i = 0; i < factor * new_cluster_count; i++) {
+	direntry = array_get(&(s->directory), first_dir_index + i);
+	if (is_directory(direntry) && !is_dot(direntry)) {
+	    /* looked up again on every pass; the recursion may change s->mapping */
+	    mapping = find_mapping_for_cluster(s, first_cluster);
+	    assert(mapping->mode & MODE_DIRECTORY);
+	    ret = commit_direntries(s, first_dir_index + i,
+		array_index(&(s->mapping), mapping));
+	    if (ret)
+		return ret;
+	}
+    }
+
+    return 0;
+}
+
+/*
+ * Commit one file: copy its guest-visible contents, starting at byte
+ * `offset`, into the host file named by its mapping, truncate the host
+ * file to the size stored in the directory entry and adjust the mappings.
+ *
+ * dir_index is the index of the file's entry in s->directory; offset must
+ * be a multiple of s->cluster_size and smaller than the file size (both
+ * asserted).
+ *
+ * Returns the result of commit_mappings() on success.  On failure returns
+ * the negative result of open(), -3 if seeking failed, the negative result
+ * of vvfat_read(), -2 if writing failed or was short, or -4 if truncating
+ * failed.  The file descriptor and the cluster buffer are released on
+ * every path.
+ */
+static int commit_one_file(BDRVVVFATState* s,
+	int dir_index, uint32_t offset)
+{
+    direntry_t* direntry = array_get(&(s->directory), dir_index);
+    uint32_t c = begin_of_direntry(direntry);
+    uint32_t first_cluster = c;
+    mapping_t* mapping = find_mapping_for_cluster(s, c);
+    uint32_t size = filesize_of_direntry(direntry);
+    char* cluster;
+    uint32_t i;
+    int fd;
+    int ret;
+
+    assert(offset < size);
+    assert((offset % s->cluster_size) == 0);
+
+    /*
+     * Walk offset / cluster_size links so that c is the cluster holding
+     * byte `offset`.  (Starting the count at cluster_size would stop one
+     * cluster short and copy the wrong data.)
+     */
+    for (i = 0; i < offset; i += s->cluster_size)
+	c = modified_fat_get(s, c);
+
+    fd = open(mapping->path, O_RDWR | O_CREAT | O_BINARY, 0666);
+    if (fd < 0) {
+	fprintf(stderr, "Could not open %s... (%s, %d)\n", mapping->path,
+		strerror(errno), errno);
+	return fd;
+    }
+
+    cluster = qemu_malloc(s->cluster_size);
+
+    if (offset > 0 && lseek(fd, offset, SEEK_SET) != offset) {
+	ret = -3;
+	goto out;
+    }
+
+    while (offset < size) {
+	uint32_t c1;
+	int rest_size = (size - offset > s->cluster_size ?
+		s->cluster_size : size - offset);
+
+	c1 = modified_fat_get(s, c);
+
+	assert((size - offset == 0 && fat_eof(s, c)) ||
+		(size > offset && c >=2 && !fat_eof(s, c)));
+
+	/* read whole sectors, but write only the bytes that belong to the file */
+	ret = vvfat_read(s->bs, cluster2sector(s, c),
+	    (uint8_t*)cluster, (rest_size + 0x1ff) / 0x200);
+	if (ret < 0)
+	    goto out;
+
+	if (write(fd, cluster, rest_size) != rest_size) {
+	    ret = -2;
+	    goto out;
+	}
+
+	offset += rest_size;
+	c = c1;
+    }
+
+    if (ftruncate(fd, size) < 0) {
+	ret = -4;
+	goto out;
+    }
+
+    ret = 0;
+
+out:
+    close(fd);
+    free(cluster);
+    if (ret)
+	return ret;
+
+    return commit_mappings(s, first_cluster, dir_index);
+}
+
+#ifdef DEBUG
+/*
+ * test, if all mappings point to valid direntries
+ *
+ * Debug helper (only compiled with DEBUG): asserts for every mapping not
+ * marked MODE_DELETED that its dir_index is inside s->directory, that it
+ * either starts at the first cluster of that entry or is a continuation
+ * (first_mapping_index >= 0), and, for directories, that its entry range
+ * lies inside s->directory and starts on a cluster boundary.
+ */
+static void check1(BDRVVVFATState* s)
+{
+    int i;
+    for (i = 0; i < s->mapping.next; i++) {
+	mapping_t* mapping = array_get(&(s->mapping), i);
+	if (mapping->mode & MODE_DELETED) {
+	    fprintf(stderr, "deleted\n");
+	    continue;
+	}
+	assert(mapping->dir_index >= 0);
+	assert(mapping->dir_index < s->directory.next);
+	direntry_t* direntry = array_get(&(s->directory), mapping->dir_index);
+	assert(mapping->begin == begin_of_direntry(direntry) || mapping->first_mapping_index >= 0);
+	if (mapping->mode & MODE_DIRECTORY) {
+	    assert(mapping->info.dir.first_dir_index + 0x10 * s->sectors_per_cluster * (mapping->end - mapping->begin) <= s->directory.next);
+	    assert((mapping->info.dir.first_dir_index % (0x10 * s->sectors_per_cluster)) == 0);
+	}
+    }
+}
+
+/*
+ * test, if all direntries have mappings
+ *
+ * Debug helper (only compiled with DEBUG): asserts that every short-name
+ * entry with a non-zero first cluster has a mapping, and that at each
+ * cluster boundary of s->directory at most one non-deleted directory
+ * mapping covers that position, with consistent first/parent mapping
+ * indices.
+ */
+static void check2(BDRVVVFATState* s)
+{
+    int i;
+    int first_mapping = -1;
+
+    for (i = 0; i < s->directory.next; i++) {
+	direntry_t* direntry = array_get(&(s->directory), i);
+
+	if (is_short_name(direntry) && begin_of_direntry(direntry)) {
+	    mapping_t* mapping = find_mapping_for_cluster(s, begin_of_direntry(direntry));
+	    assert(mapping);
+	    assert(mapping->dir_index == i || is_dot(direntry));
+	    assert(mapping->begin == begin_of_direntry(direntry) || is_dot(direntry));
+	}
+
+	if ((i % (0x10 * s->sectors_per_cluster)) == 0) {
+	    /* cluster start */
+	    int j, count = 0;
+
+	    for (j = 0; j < s->mapping.next; j++) {
+		mapping_t* mapping = array_get(&(s->mapping), j);
+		if (mapping->mode & MODE_DELETED)
+		    continue;
+		if (mapping->mode & MODE_DIRECTORY) {
+		    if (mapping->info.dir.first_dir_index <= i && mapping->info.dir.first_dir_index + 0x10 * s->sectors_per_cluster > i) {
+			/* note: count is incremented inside the assert expression */
+			assert(++count == 1);
+			if (mapping->first_mapping_index == -1)
+			    first_mapping = array_index(&(s->mapping), mapping);
+			else
+			    assert(first_mapping == mapping->first_mapping_index);
+			if (mapping->info.dir.parent_mapping_index < 0)
+			    assert(j == 0);
+			else {
+			    mapping_t* parent = array_get(&(s->mapping), mapping->info.dir.parent_mapping_index);
+			    assert(parent->mode & MODE_DIRECTORY);
+			    assert(parent->info.dir.first_dir_index < mapping->info.dir.first_dir_index);
+			}
+		    }
+		}
+	    }
+	    if (count == 0)
+		first_mapping = -1;
+	}
+    }
+}
+#endif
+
+/*
+ * Carry out all ACTION_RENAME and ACTION_MKDIR entries of s->commits on
+ * the host file system and remove them from the list; other actions are
+ * left in place.
+ *
+ * Renaming a directory schedules further renames for the files and
+ * subdirectories found in it, which are appended to s->commits and
+ * processed by the same loop.
+ *
+ * Returns 0 on success, -2 if rename() failed, -5 if mkdir() failed and
+ * -6 if no mapping could be inserted for a new directory.
+ */
+static int handle_renames_and_mkdirs(BDRVVVFATState* s)
+{
+    int i;
+
+#ifdef DEBUG
+    fprintf(stderr, "handle_renames\n");
+    for (i = 0; i < s->commits.next; i++) {
+	commit_t* commit = array_get(&(s->commits), i);
+	fprintf(stderr, "%d, %s (%d, %d)\n", i, commit->path ? commit->path : "(null)", commit->param.rename.cluster, commit->action);
+    }
+#endif
+
+    /* i only advances past entries that are kept; handled ones are removed */
+    for (i = 0; i < s->commits.next;) {
+	commit_t* commit = array_get(&(s->commits), i);
+	if (commit->action == ACTION_RENAME) {
+	    mapping_t* mapping = find_mapping_for_cluster(s,
+		    commit->param.rename.cluster);
+	    char* old_path = mapping->path;
+
+	    assert(commit->path);
+	    /*
+	     * NOTE(review): the mapping takes over the new path before
+	     * rename() is attempted; on failure (-2) it keeps the new path
+	     * and old_path is not freed -- confirm this is acceptable.
+	     */
+	    mapping->path = commit->path;
+	    if (rename(old_path, mapping->path))
+		return -2;
+
+	    if (mapping->mode & MODE_DIRECTORY) {
+		int l1 = strlen(mapping->path);
+		int l2 = strlen(old_path);
+		int diff = l1 - l2;
+		direntry_t* direntry = array_get(&(s->directory),
+			mapping->info.dir.first_dir_index);
+		uint32_t c = mapping->begin;
+		/* index into the directory's entries; shadows the outer i */
+		int i = 0;
+
+		/* recurse */
+		while (!fat_eof(s, c)) {
+		    do {
+			direntry_t* d = direntry + i;
+
+			if (is_file(d) || (is_directory(d) && !is_dot(d))) {
+			    mapping_t* m = find_mapping_for_cluster(s,
+				    begin_of_direntry(d));
+			    int l = strlen(m->path);
+			    char* new_path = qemu_malloc(l + diff + 1);
+
+			    /*
+			     * NOTE(review): mapping->path already holds the
+			     * new name here, while m->path presumably still
+			     * starts with the old one -- confirm whether
+			     * old_path was meant in this assertion.
+			     */
+			    assert(!strncmp(m->path, mapping->path, l2));
+
+			    /* new_path = new directory path + old tail of m->path */
+                            pstrcpy(new_path, l + diff + 1, mapping->path);
+                            pstrcpy(new_path + l1, l + diff + 1 - l1,
+                                    m->path + l2);
+
+			    schedule_rename(s, m->begin, new_path);
+			}
+			i++;
+		    } while((i % (0x10 * s->sectors_per_cluster)) != 0);
+		    c = fat_get(s, c);
+		}
+	    }
+
+	    free(old_path);
+	    array_remove(&(s->commits), i);
+	    continue;
+	} else if (commit->action == ACTION_MKDIR) {
+	    mapping_t* mapping;
+	    int j, parent_path_len;
+
+#ifdef __MINGW32__
+            if (mkdir(commit->path))
+                return -5;
+#else
+            if (mkdir(commit->path, 0755))
+                return -5;
+#endif
+
+	    mapping = insert_mapping(s, commit->param.mkdir.cluster,
+		    commit->param.mkdir.cluster + 1);
+	    if (mapping == NULL)
+		return -6;
+
+	    mapping->mode = MODE_DIRECTORY;
+	    mapping->read_only = 0;
+	    mapping->path = commit->path;
+	    /* the new directory's entries are appended at the end of s->directory */
+	    j = s->directory.next;
+	    assert(j);
+	    /* NOTE(review): the result of insert_direntries() is not checked */
+	    insert_direntries(s, s->directory.next,
+		    0x10 * s->sectors_per_cluster);
+	    mapping->info.dir.first_dir_index = j;
+
+	    /* find the mapping whose path equals the parent directory's path */
+	    parent_path_len = strlen(commit->path)
+		- strlen(get_basename(commit->path)) - 1;
+	    for (j = 0; j < s->mapping.next; j++) {
+		mapping_t* m = array_get(&(s->mapping), j);
+		if (m->first_mapping_index < 0 && m != mapping &&
+			!strncmp(m->path, mapping->path, parent_path_len) &&
+			strlen(m->path) == parent_path_len)
+		    break;
+	    }
+	    assert(j < s->mapping.next);
+	    mapping->info.dir.parent_mapping_index = j;
+
+	    array_remove(&(s->commits), i);
+	    continue;
+	}
+
+	i++;
+    }
+    return 0;
+}
+
+/*
+ * Process the entries left in s->commits (ACTION_WRITEOUT and
+ * ACTION_NEW_FILE) by writing the affected files out with
+ * commit_one_file().  Renames and mkdirs must already have been handled.
+ *
+ * Processing stops at the first failure; every entry visited so far,
+ * including the failed one, is removed from the list.
+ *
+ * Returns 0 on success, -3 if a writeout failed, -6 if no directory entry
+ * was found for a new file, -7 if writing a new file failed, -2 for an
+ * unexpected rename/mkdir entry and -1 if the list could not be shortened.
+ *
+ * TODO: make sure that the short name is not matching *another* file
+ */
+static int handle_commits(BDRVVVFATState* s)
+{
+    int i, fail = 0;
+
+    vvfat_close_current_file(s);
+
+    for (i = 0; !fail && i < s->commits.next; i++) {
+	commit_t* commit = array_get(&(s->commits), i);
+	switch(commit->action) {
+	case ACTION_RENAME: case ACTION_MKDIR:
+	    assert(0);
+	    fail = -2;
+	    break;
+	case ACTION_WRITEOUT: {
+	    direntry_t* entry = array_get(&(s->directory),
+		    commit->param.writeout.dir_index);
+	    uint32_t begin = begin_of_direntry(entry);
+	    mapping_t* mapping = find_mapping_for_cluster(s, begin);
+
+	    assert(mapping);
+	    assert(mapping->begin == begin);
+	    assert(commit->path == NULL);
+
+	    if (commit_one_file(s, commit->param.writeout.dir_index,
+			commit->param.writeout.modified_offset))
+		fail = -3;
+
+	    break;
+	}
+	case ACTION_NEW_FILE: {
+	    int begin = commit->param.new_file.first_cluster;
+	    mapping_t* mapping = find_mapping_for_cluster(s, begin);
+	    direntry_t* entry;
+	    /* directory index of the new file; shadows the loop counter i */
+	    int i;
+
+	    /* find direntry */
+	    for (i = 0; i < s->directory.next; i++) {
+		entry = array_get(&(s->directory), i);
+		if (is_file(entry) && begin_of_direntry(entry) == begin)
+		    break;
+	    }
+
+	    if (i >= s->directory.next) {
+		fail = -6;
+		/* continues the outer for loop, which then stops on fail */
+		continue;
+	    }
+
+	    /* make sure there exists an initial mapping */
+	    if (mapping && mapping->begin != begin) {
+		mapping->end = begin;
+		mapping = NULL;
+	    }
+	    if (mapping == NULL) {
+		mapping = insert_mapping(s, begin, begin+1);
+	    }
+	    /* most members will be fixed in commit_mappings() */
+	    assert(commit->path);
+	    mapping->path = commit->path;
+	    mapping->read_only = 0;
+	    mapping->mode = MODE_NORMAL;
+	    mapping->info.file.offset = 0;
+
+	    if (commit_one_file(s, i, 0))
+		fail = -7;
+
+	    break;
+	}
+	default:
+	    assert(0);
+	}
+    }
+    /* drop the entries that were visited */
+    if (i > 0 && array_remove_slice(&(s->commits), 0, i))
+	return -1;
+    return fail;
+}
+
+/*
+ * Remove the host files and directories that belong to mappings still
+ * marked MODE_DELETED, together with those mappings.
+ *
+ * A directory that cannot be removed because it is not empty is deferred;
+ * the scan is repeated as long as a pass both deferred and deleted
+ * something.  Mapping 0 is never considered.
+ *
+ * Returns 0 on success, -5 if rmdir() failed for another reason than
+ * ENOTEMPTY and -4 if unlink() failed.
+ */
+static int handle_deletes(BDRVVVFATState* s)
+{
+    int i, deferred = 1, deleted = 1;
+
+    /* delete files corresponding to mappings marked as deleted */
+    /* handle DELETEs and unused mappings (modified_fat_get(s, mapping->begin) == 0) */
+    while (deferred && deleted) {
+	deferred = 0;
+	deleted = 0;
+
+	for (i = 1; i < s->mapping.next; i++) {
+	    mapping_t* mapping = array_get(&(s->mapping), i);
+	    if (mapping->mode & MODE_DELETED) {
+		direntry_t* entry = array_get(&(s->directory),
+			mapping->dir_index);
+
+		/*
+		 * NOTE(review): as written, unlink() runs only when the
+		 * entry is NOT free, and a non-directory mapping with a free
+		 * entry is dropped without touching the host file -- confirm
+		 * that the else below is attached to the intended if.
+		 */
+		if (is_free(entry)) {
+		    /* remove file/directory */
+		    if (mapping->mode & MODE_DIRECTORY) {
+			int j, next_dir_index = s->directory.next,
+			first_dir_index = mapping->info.dir.first_dir_index;
+
+			if (rmdir(mapping->path) < 0) {
+			    if (errno == ENOTEMPTY) {
+				deferred++;
+				continue;
+			    } else
+				return -5;
+			}
+
+			/* find where the next directory's entries start */
+			for (j = 1; j < s->mapping.next; j++) {
+			    mapping_t* m = array_get(&(s->mapping), j);
+			    if (m->mode & MODE_DIRECTORY &&
+				    m->info.dir.first_dir_index >
+				    first_dir_index &&
+				    m->info.dir.first_dir_index <
+				    next_dir_index)
+				next_dir_index =
+				    m->info.dir.first_dir_index;
+			}
+			remove_direntries(s, first_dir_index,
+				next_dir_index - first_dir_index);
+
+			deleted++;
+		    }
+		} else {
+		    if (unlink(mapping->path))
+			return -4;
+		    deleted++;
+		}
+		DLOG(fprintf(stderr, "DELETE (%d)\n", i); print_mapping(mapping); print_direntry(entry));
+		/*
+		 * NOTE(review): after the removal the following mapping sits
+		 * at index i, and the loop's i++ steps over it -- confirm
+		 * that skipping it in this pass is intended.
+		 */
+		remove_mapping(s, i);
+	    }
+	}
+    }
+
+    return 0;
+}
+
+/*
+ * Bring the host directory in sync with what the guest wrote:
+ *
+ * - perform the scheduled renames and mkdirs
+ * - take over the modified FAT (s->fat2) as the current FAT
+ * - re-read the directory tree from the root and update the mappings
+ * - write out new and modified files
+ * - delete files corresponding to mappings marked as deleted
+ * - empty the qcow overlay and reset s->used_clusters
+ *
+ * Returns 0 on success (also when there is nothing to commit), otherwise
+ * the non-zero result of the step that failed.
+ */
+static int do_commit(BDRVVVFATState* s)
+{
+    int rc;
+
+    /* without scheduled commits there is nothing to do */
+    if (s->commits.next == 0)
+	return 0;
+
+    vvfat_close_current_file(s);
+
+    rc = handle_renames_and_mkdirs(s);
+    if (rc != 0) {
+	fprintf(stderr, "Error handling renames (%d)\n", rc);
+	assert(0);
+	return rc;
+    }
+
+    /* the modified FAT becomes the current one */
+    memcpy(s->fat.pointer, s->fat2, 0x200 * s->sectors_per_fat);
+
+    /* walk the directory tree, starting with the root (no parent mapping) */
+    rc = commit_direntries(s, 0, -1);
+    if (rc != 0) {
+	fprintf(stderr, "Fatal: error while committing (%d)\n", rc);
+	assert(0);
+	return rc;
+    }
+
+    rc = handle_commits(s);
+    if (rc != 0) {
+	fprintf(stderr, "Error handling commits (%d)\n", rc);
+	assert(0);
+	return rc;
+    }
+
+    rc = handle_deletes(s);
+    if (rc != 0) {
+	fprintf(stderr, "Error deleting\n");
+	assert(0);
+	return rc;
+    }
+
+    /* everything is on the host now: start over with an empty overlay */
+    s->qcow->drv->bdrv_make_empty(s->qcow);
+
+    memset(s->used_clusters, 0, sector2cluster(s, s->sector_count));
+
+DLOG(checkpoint());
+    return 0;
+}
+
+/* Commit the pending guest writes, but only while is_consistent() accepts the current state. */
+static int try_commit(BDRVVVFATState* s)
+{
+    vvfat_close_current_file(s);
+DLOG(checkpoint());
+    /* -1: state not consistent, nothing committed; otherwise the result of do_commit() */
+    return is_consistent(s) ? do_commit(s) : -1;
+}
+/* Write hook: vet the request, store it in the qcow overlay, then try to commit. 0 on success, <0 if refused or the overlay write fails. */
+static int vvfat_write(BlockDriverState *bs, int64_t sector_num,
+                    const uint8_t *buf, int nb_sectors)
+{
+    BDRVVVFATState *s = bs->opaque;
+    int i, ret;
+
+DLOG(checkpoint());
+
+    vvfat_close_current_file(s);
+
+    /*
+     * Sanity checks, each rejecting the whole request with -1:
+     * - no writes below first_sectors_number (the boot sector area)
+     * - no writes to read-only mappings, non-ASCII names or changed read-only direntries
+     */
+
+    if (sector_num < s->first_sectors_number)
+	return -1;
+    /* walk the clusters touched by the request; a mapped range is vetted once, then skipped */
+    for (i = sector2cluster(s, sector_num);
+	    i <= sector2cluster(s, sector_num + nb_sectors - 1);) {
+	mapping_t* mapping = find_mapping_for_cluster(s, i);
+	if (mapping) {
+	    if (mapping->read_only) {
+		fprintf(stderr, "Tried to write to write-protected file %s\n",
+			mapping->path);
+		return -1;
+	    }
+
+	    if (mapping->mode & MODE_DIRECTORY) {
+		int begin = cluster2sector(s, i);
+		int end = begin + s->sectors_per_cluster, k;
+		int dir_index;
+		const direntry_t* direntries;
+		long_file_name lfn;
+
+		lfn_init(&lfn);
+		/* clip [begin, end) to the sectors of this cluster that the request covers */
+		if (begin < sector_num)
+		    begin = sector_num;
+		if (end > sector_num + nb_sectors)
+		    end = sector_num + nb_sectors;
+		dir_index  = mapping->dir_index +
+		    0x10 * (begin - mapping->begin * s->sectors_per_cluster);
+		direntries = (direntry_t*)(buf + 0x200 * (begin - sector_num));
+		/* 0x10 direntries per 0x200-byte sector (assumes a 32-byte direntry_t -- TODO confirm) */
+		for (k = 0; k < (end - begin) * 0x10; k++) {
+		    /* do not allow non-ASCII filenames */
+		    if (parse_long_name(&lfn, direntries + k) < 0) {
+			fprintf(stderr, "Warning: non-ASCII filename\n");
+			return -1;
+		    }
+		    /* no access to the direntry of a read-only file */
+		    else if (is_short_name(direntries+k) &&
+			    (direntries[k].attributes & 1)) {
+			if (memcmp(direntries + k,
+				    array_get(&(s->directory), dir_index + k),
+				    sizeof(direntry_t))) {
+			    fprintf(stderr, "Warning: tried to write to write-protected file\n");
+			    return -1;
+			}
+		    }
+		}
+	    }
+	    i = mapping->end; /* continue after this mapping */
+	} else
+	    i++;
+    }
+
+    /*
+     * Use qcow backend. Commit later.
+     */
+DLOG(fprintf(stderr, "Write to qcow backend: %d + %d\n", (int)sector_num, nb_sectors));
+    ret = s->qcow->drv->bdrv_write(s->qcow, sector_num, buf, nb_sectors);
+    if (ret < 0) {
+	fprintf(stderr, "Error writing to qcow backend\n");
+	return ret;
+    }
+    /* flag every touched cluster; negative cluster numbers are skipped */
+    for (i = sector2cluster(s, sector_num);
+	    i <= sector2cluster(s, sector_num + nb_sectors - 1); i++)
+	if (i >= 0)
+	    s->used_clusters[i] |= USED_ALLOCATED;
+
+DLOG(checkpoint());
+    /* TODO: add timeout */
+    try_commit(s); /* return value ignored: the write itself already succeeded */
+
+DLOG(checkpoint());
+    return 0;
+}
+
+/* Stores in *n the sectors left before s->sector_count, capped at nb_sectors.
+ * Returns 0 when that remainder is negative and was not capped, 1 in every other case. */
+static int vvfat_is_allocated(BlockDriverState *bs,
+	int64_t sector_num, int nb_sectors, int* n)
+{
+    BDRVVVFATState* state = bs->opaque;
+    int left = state->sector_count - sector_num;
+
+    *n = (left > nb_sectors) ? nb_sectors : left;
+    return left > nb_sectors || left >= 0;
+}
+
+/* Callback listed in vvfat_write_target: ignores sector_num, buffer and nb_sectors and attempts a commit. */
+static int write_target_commit(BlockDriverState *bs, int64_t sector_num,
+	const uint8_t* buffer, int nb_sectors) {
+    return try_commit(bs->opaque);
+}
+/* Callback listed in vvfat_write_target: deletes the qcow overlay and frees its filename buffer. */
+static void write_target_close(BlockDriverState *bs) {
+    BDRVVVFATState *state = bs->opaque;
+    bdrv_delete(state->qcow);
+    free(state->qcow_filename);
+}
+/* Minimal driver that enable_write_target() installs as bs->backing_hd->drv; only two callbacks are set. */
+static BlockDriver vvfat_write_target = {
+    "vvfat_write_target", 0, NULL, NULL, NULL, /* positional: slot meanings follow the BlockDriver layout -- TODO confirm */
+    write_target_commit, /* presumably the bdrv_write slot (same signature as vvfat_write) -- confirm */
+    write_target_close, /* presumably the bdrv_close slot (same signature as vvfat_close) -- confirm */
+    NULL, NULL, NULL
+};
+
+/* Set up write support: per-cluster flags, commit list, temporary qcow overlay, backing_hd. 0 on success, -1 on failure (nothing is released). */
+static int enable_write_target(BDRVVVFATState *s)
+{
+    int size = sector2cluster(s, s->sector_count);
+    s->used_clusters = calloc(size, 1); /* one flag byte per cluster */
+    if (s->used_clusters == NULL) /* vvfat_write() and do_commit() write through this pointer */
+	return -1;
+    array_init(&(s->commits), sizeof(commit_t));
+    s->qcow_filename = qemu_malloc(1024);
+    get_tmp_filename(s->qcow_filename, 1024);
+    if (bdrv_create(bdrv_find_format("qcow"),
+		s->qcow_filename, s->sector_count, "fat:", 0) < 0)
+	return -1;
+    s->qcow = bdrv_new("");
+    if (s->qcow == NULL || bdrv_open(s->qcow, s->qcow_filename, 0) < 0)
+	return -1;
+#ifndef _WIN32
+    unlink(s->qcow_filename); /* the overlay stays reachable through the handle opened above */
+#endif
+    s->bs->backing_hd = calloc(sizeof(BlockDriverState), 1);
+    if (s->bs->backing_hd == NULL) /* must not be dereferenced below when the allocation failed */
+	return -1;
+    s->bs->backing_hd->drv = &vvfat_write_target;
+    s->bs->backing_hd->opaque = s;
+    return 0;
+}
+
+/* Close callback of bdrv_vvfat: calls vvfat_close_current_file(), then frees the fat, directory and mapping arrays and cluster_buffer. */
+static void vvfat_close(BlockDriverState *bs)
+{
+    BDRVVVFATState *state = bs->opaque;
+
+    vvfat_close_current_file(state);
+    array_free(&state->fat);
+    array_free(&state->directory);
+    array_free(&state->mapping);
+    free(state->cluster_buffer); /* free(NULL) is a no-op, so no NULL guard is needed */
+}
+/* Driver table for the "vvfat" format; handed to bdrv_register() by bdrv_vvfat_init(). */
+static BlockDriver bdrv_vvfat = {
+    .format_name	= "vvfat",
+    .instance_size	= sizeof(BDRVVVFATState), /* the callbacks read this state from bs->opaque */
+    .bdrv_open		= vvfat_open,
+    .bdrv_read		= vvfat_read,
+    .bdrv_write		= vvfat_write,
+    .bdrv_close		= vvfat_close,
+    .bdrv_is_allocated	= vvfat_is_allocated,
+    .protocol_name	= "fat", /* presumably selects this driver for "fat:" filenames -- TODO confirm in the block layer */
+};
+/* Hands the bdrv_vvfat driver table to bdrv_register(). */
+static void bdrv_vvfat_init(void)
+{
+    bdrv_register(&bdrv_vvfat);
+}
+/* presumably arranges for bdrv_vvfat_init() to run during block-layer start-up -- TODO confirm what block_init expands to */
+block_init(bdrv_vvfat_init);
+/* Debug-only self-check reached through the DLOG(checkpoint()) calls; it inspects the file-level state pointer vvv. */
+#ifdef DEBUG
+static void checkpoint(void) {
+    assert(((mapping_t*)array_get(&(vvv->mapping), 0))->end == 2); /* mapping 0 has to end at cluster 2 */
+    check1(vvv);
+    check2(vvv);
+    assert(!vvv->current_mapping || vvv->current_fd || (vvv->current_mapping->mode & MODE_DIRECTORY)); /* a current non-directory mapping needs a non-zero current_fd */
+#if 0 /* disabled ad-hoc checks; never compiled */
+    if (((direntry_t*)vvv->directory.pointer)[1].attributes != 0xf)
+	fprintf(stderr, "Nonono!\n");
+    mapping_t* mapping;
+    direntry_t* direntry;
+    assert(vvv->mapping.size >= vvv->mapping.item_size * vvv->mapping.next);
+    assert(vvv->directory.size >= vvv->directory.item_size * vvv->directory.next);
+    if (vvv->mapping.next<47)
+	return;
+    assert((mapping = array_get(&(vvv->mapping), 47)));
+    assert(mapping->dir_index < vvv->directory.next);
+    direntry = array_get(&(vvv->directory), mapping->dir_index);
+    assert(!memcmp(direntry->name, "USB     H  ", 11) || direntry->name[0]==0);
+#endif
+    return;
+    /* never reached: the calls below only reference the debug helpers to avoid compiler warnings */
+    hexdump(NULL, 100);
+    remove_mapping(vvv, NULL);
+    print_mapping(NULL);
+    print_direntry(NULL);
+}
+#endif
-- 
cgit v1.1