diff options
Diffstat (limited to 'block/sheepdog.c')
-rw-r--r-- | block/sheepdog.c | 171 |
1 files changed, 146 insertions, 25 deletions
diff --git a/block/sheepdog.c b/block/sheepdog.c index d17ee36..c14172c 100644 --- a/block/sheepdog.c +++ b/block/sheepdog.c @@ -37,6 +37,7 @@ #define SD_OP_READ_VDIS 0x15 #define SD_OP_FLUSH_VDI 0x16 #define SD_OP_DEL_VDI 0x17 +#define SD_OP_GET_CLUSTER_DEFAULT 0x18 #define SD_FLAG_CMD_WRITE 0x01 #define SD_FLAG_CMD_COW 0x02 @@ -91,6 +92,7 @@ #define SD_NR_VDIS (1U << 24) #define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22) #define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS) +#define SD_DEFAULT_BLOCK_SIZE_SHIFT 22 /* * For erasure coding, we use at most SD_EC_MAX_STRIP for data strips and * (SD_EC_MAX_STRIP - 1) for parity strips @@ -167,7 +169,8 @@ typedef struct SheepdogVdiReq { uint32_t base_vdi_id; uint8_t copies; uint8_t copy_policy; - uint8_t reserved[2]; + uint8_t store_policy; + uint8_t block_size_shift; uint32_t snapid; uint32_t type; uint32_t pad[2]; @@ -186,6 +189,21 @@ typedef struct SheepdogVdiRsp { uint32_t pad[5]; } SheepdogVdiRsp; +typedef struct SheepdogClusterRsp { + uint8_t proto_ver; + uint8_t opcode; + uint16_t flags; + uint32_t epoch; + uint32_t id; + uint32_t data_length; + uint32_t result; + uint8_t nr_copies; + uint8_t copy_policy; + uint8_t block_size_shift; + uint8_t __pad1; + uint32_t __pad2[6]; +} SheepdogClusterRsp; + typedef struct SheepdogInode { char name[SD_MAX_VDI_LEN]; char tag[SD_MAX_VDI_TAG_LEN]; @@ -527,6 +545,7 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov, return acb; } +/* Return -EIO in case of error, file descriptor on success */ static int connect_to_sdog(BDRVSheepdogState *s, Error **errp) { int fd; @@ -546,11 +565,14 @@ static int connect_to_sdog(BDRVSheepdogState *s, Error **errp) if (fd >= 0) { qemu_set_nonblock(fd); + } else { + fd = -EIO; } return fd; } +/* Return 0 on success and -errno in case of error */ static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data, unsigned int *wlen) { @@ -559,11 +581,13 @@ static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data, ret = qemu_co_send(sockfd, hdr, sizeof(*hdr)); if (ret != sizeof(*hdr)) { error_report("failed to send a req, %s", strerror(errno)); + ret = -socket_error(); return ret; } ret = qemu_co_send(sockfd, data, *wlen); if (ret != *wlen) { + ret = -socket_error(); error_report("failed to send a req, %s", strerror(errno)); } @@ -638,6 +662,11 @@ out: srco->finished = true; } +/* + * Send the request to the sheep in a synchronous manner. + * + * Return 0 on success, -errno in case of error. + */ static int do_req(int sockfd, AioContext *aio_context, SheepdogReq *hdr, void *data, unsigned int *wlen, unsigned int *rlen) { @@ -1541,6 +1570,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot, hdr.vdi_size = s->inode.vdi_size; hdr.copy_policy = s->inode.copy_policy; hdr.copies = s->inode.nr_copies; + hdr.block_size_shift = s->inode.block_size_shift; ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen); @@ -1566,9 +1596,12 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot, static int sd_prealloc(const char *filename, Error **errp) { BlockDriverState *bs = NULL; + BDRVSheepdogState *base = NULL; + unsigned long buf_size; uint32_t idx, max_idx; + uint32_t object_size; int64_t vdi_size; - void *buf = g_malloc0(SD_DATA_OBJ_SIZE); + void *buf = NULL; int ret; ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, @@ -1582,18 +1615,24 @@ static int sd_prealloc(const char *filename, Error **errp) ret = vdi_size; goto out; } - max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE); + + base = bs->opaque; + object_size = (UINT32_C(1) << base->inode.block_size_shift); + buf_size = MIN(object_size, SD_DATA_OBJ_SIZE); + buf = g_malloc0(buf_size); + + max_idx = DIV_ROUND_UP(vdi_size, buf_size); for (idx = 0; idx < max_idx; idx++) { /* * The created image can be a cloned image, so we need to read * a data from the source image. */ - ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE); + ret = bdrv_pread(bs, idx * buf_size, buf, buf_size); if (ret < 0) { goto out; } - ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE); + ret = bdrv_pwrite(bs, idx * buf_size, buf, buf_size); if (ret < 0) { goto out; } @@ -1666,6 +1705,27 @@ static int parse_redundancy(BDRVSheepdogState *s, const char *opt) return 0; } +static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt) +{ + struct SheepdogInode *inode = &s->inode; + uint64_t object_size; + int obj_order; + + object_size = qemu_opt_get_size_del(opt, BLOCK_OPT_OBJECT_SIZE, 0); + if (object_size) { + if ((object_size - 1) & object_size) { /* not a power of 2? */ + return -EINVAL; + } + obj_order = ffs(object_size) - 1; + if (obj_order < 20 || obj_order > 31) { + return -EINVAL; + } + inode->block_size_shift = (uint8_t)obj_order; + } + + return 0; +} + static int sd_create(const char *filename, QemuOpts *opts, Error **errp) { @@ -1676,6 +1736,7 @@ static int sd_create(const char *filename, QemuOpts *opts, BDRVSheepdogState *s; char tag[SD_MAX_VDI_TAG_LEN]; uint32_t snapid; + uint64_t max_vdi_size; bool prealloc = false; s = g_new0(BDRVSheepdogState, 1); @@ -1714,10 +1775,11 @@ static int sd_create(const char *filename, QemuOpts *opts, goto out; } } - - if (s->inode.vdi_size > SD_MAX_VDI_SIZE) { - error_setg(errp, "too big image size"); - ret = -EINVAL; + ret = parse_block_size_shift(s, opts); + if (ret < 0) { + error_setg(errp, "Invalid object_size." + " obect_size needs to be power of 2" + " and be limited from 2^20 to 2^31"); goto out; } @@ -1754,6 +1816,51 @@ static int sd_create(const char *filename, QemuOpts *opts, } s->aio_context = qemu_get_aio_context(); + + /* if block_size_shift is not specified, get cluster default value */ + if (s->inode.block_size_shift == 0) { + SheepdogVdiReq hdr; + SheepdogClusterRsp *rsp = (SheepdogClusterRsp *)&hdr; + Error *local_err = NULL; + int fd; + unsigned int wlen = 0, rlen = 0; + + fd = connect_to_sdog(s, &local_err); + if (fd < 0) { + error_report("%s", error_get_pretty(local_err)); + error_free(local_err); + ret = -EIO; + goto out; + } + + memset(&hdr, 0, sizeof(hdr)); + hdr.opcode = SD_OP_GET_CLUSTER_DEFAULT; + hdr.proto_ver = SD_PROTO_VER; + + ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, + NULL, &wlen, &rlen); + closesocket(fd); + if (ret) { + error_setg_errno(errp, -ret, "failed to get cluster default"); + goto out; + } + if (rsp->result == SD_RES_SUCCESS) { + s->inode.block_size_shift = rsp->block_size_shift; + } else { + s->inode.block_size_shift = SD_DEFAULT_BLOCK_SIZE_SHIFT; + } + } + + max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS; + + if (s->inode.vdi_size > max_vdi_size) { + error_setg(errp, "An image is too large." + " The maximum image size is %"PRIu64 "GB", + max_vdi_size / 1024 / 1024 / 1024); + ret = -EINVAL; + goto out; + } + ret = do_sd_create(s, &vid, 0, errp); if (ret) { goto out; @@ -1823,11 +1930,13 @@ static int sd_truncate(BlockDriverState *bs, int64_t offset) BDRVSheepdogState *s = bs->opaque; int ret, fd; unsigned int datalen; + uint64_t max_vdi_size; + max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS; if (offset < s->inode.vdi_size) { error_report("shrinking is not supported"); return -EINVAL; - } else if (offset > SD_MAX_VDI_SIZE) { + } else if (offset > max_vdi_size) { error_report("too big image size"); return -EINVAL; } @@ -2005,9 +2114,10 @@ static int coroutine_fn sd_co_rw_vector(void *p) SheepdogAIOCB *acb = p; int ret = 0; unsigned long len, done = 0, total = acb->nb_sectors * BDRV_SECTOR_SIZE; - unsigned long idx = acb->sector_num * BDRV_SECTOR_SIZE / SD_DATA_OBJ_SIZE; + unsigned long idx; + uint32_t object_size; uint64_t oid; - uint64_t offset = (acb->sector_num * BDRV_SECTOR_SIZE) % SD_DATA_OBJ_SIZE; + uint64_t offset; BDRVSheepdogState *s = acb->common.bs->opaque; SheepdogInode *inode = &s->inode; AIOReq *aio_req; @@ -2024,6 +2134,10 @@ static int coroutine_fn sd_co_rw_vector(void *p) } } + object_size = (UINT32_C(1) << inode->block_size_shift); + idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size; + offset = (acb->sector_num * BDRV_SECTOR_SIZE) % object_size; + /* * Make sure we don't free the aiocb before we are done with all requests. * This additional reference is dropped at the end of this function. @@ -2037,7 +2151,7 @@ static int coroutine_fn sd_co_rw_vector(void *p) oid = vid_to_data_oid(inode->data_vdi_id[idx], idx); - len = MIN(total - done, SD_DATA_OBJ_SIZE - offset); + len = MIN(total - done, object_size - offset); switch (acb->aiocb_type) { case AIOCB_READ_UDATA: @@ -2061,7 +2175,7 @@ static int coroutine_fn sd_co_rw_vector(void *p) * We discard the object only when the whole object is * 1) allocated 2) trimmed. Otherwise, simply skip it. */ - if (len != SD_DATA_OBJ_SIZE || inode->data_vdi_id[idx] == 0) { + if (len != object_size || inode->data_vdi_id[idx] == 0) { goto done; } break; @@ -2225,9 +2339,8 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) ret = do_sd_create(s, &new_vid, 1, &local_err); if (ret < 0) { - error_report_err(local_err); - error_report("failed to create inode for snapshot. %s", - strerror(errno)); + error_report("failed to create inode for snapshot: %s", + error_get_pretty(local_err)); goto cleanup; } @@ -2414,6 +2527,7 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data, uint64_t offset; uint32_t vdi_index; uint32_t vdi_id = load ? s->inode.parent_vdi_id : s->inode.vdi_id; + uint32_t object_size = (UINT32_C(1) << s->inode.block_size_shift); fd = connect_to_sdog(s, &local_err); if (fd < 0) { @@ -2422,10 +2536,10 @@ static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data, } while (remaining) { - vdi_index = pos / SD_DATA_OBJ_SIZE; - offset = pos % SD_DATA_OBJ_SIZE; + vdi_index = pos / object_size; + offset = pos % object_size; - data_len = MIN(remaining, SD_DATA_OBJ_SIZE - offset); + data_len = MIN(remaining, object_size - offset); vmstate_oid = vid_to_vmstate_oid(vdi_id, vdi_index); @@ -2512,10 +2626,11 @@ sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, { BDRVSheepdogState *s = bs->opaque; SheepdogInode *inode = &s->inode; + uint32_t object_size = (UINT32_C(1) << inode->block_size_shift); uint64_t offset = sector_num * BDRV_SECTOR_SIZE; - unsigned long start = offset / SD_DATA_OBJ_SIZE, + unsigned long start = offset / object_size, end = DIV_ROUND_UP((sector_num + nb_sectors) * - BDRV_SECTOR_SIZE, SD_DATA_OBJ_SIZE); + BDRV_SECTOR_SIZE, object_size); unsigned long idx; int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset; @@ -2534,7 +2649,7 @@ sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, } } - *pnum = (idx - start) * SD_DATA_OBJ_SIZE / BDRV_SECTOR_SIZE; + *pnum = (idx - start) * object_size / BDRV_SECTOR_SIZE; if (*pnum > nb_sectors) { *pnum = nb_sectors; } @@ -2545,14 +2660,15 @@ static int64_t sd_get_allocated_file_size(BlockDriverState *bs) { BDRVSheepdogState *s = bs->opaque; SheepdogInode *inode = &s->inode; - unsigned long i, last = DIV_ROUND_UP(inode->vdi_size, SD_DATA_OBJ_SIZE); + uint32_t object_size = (UINT32_C(1) << inode->block_size_shift); + unsigned long i, last = DIV_ROUND_UP(inode->vdi_size, object_size); uint64_t size = 0; for (i = 0; i < last; i++) { if (inode->data_vdi_id[i] == 0) { continue; } - size += SD_DATA_OBJ_SIZE; + size += object_size; } return size; } @@ -2581,6 +2697,11 @@ static QemuOptsList sd_create_opts = { .type = QEMU_OPT_STRING, .help = "Redundancy of the image" }, + { + .name = BLOCK_OPT_OBJECT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Object size of the image" + }, { /* end of list */ } } }; |