From a597e79ce14ea62266924acc7b8a7030a42ed29b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 25 Aug 2011 08:26:01 +0200 Subject: block: explicit I/O accounting Decouple the I/O accounting from bdrv_aio_readv/writev/flush and make the hardware models call directly into the accounting helpers. This means: - we do not count internal requests from image formats in addition to guest originating I/O - we do not double count I/O ops if the device model handles it chunk wise - we only account I/O once it actuall is done - can extent I/O accounting to synchronous or coroutine I/O easily - implement I/O latency tracking easily (see the next patch) I've conveted the existing device model callers to the new model, device models that are using synchronous I/O and weren't accounted before haven't been updated yet. Also scsi hasn't been converted to the end-to-end accounting as I want to defer that after the pending scsi layer overhaul. Signed-off-by: Christoph Hellwig Signed-off-by: Kevin Wolf --- hw/ide/ahci.c | 9 +++++++++ hw/ide/ahci.h | 1 + hw/ide/atapi.c | 29 ++++++++++++++++++++--------- hw/ide/core.c | 27 ++++++++++++++++++++++++++- hw/ide/internal.h | 1 + hw/ide/macio.c | 40 +++++++++++++++++++++++++++++----------- hw/scsi-disk.c | 17 +++++++++++++++++ hw/virtio-blk.c | 20 ++++++++++++++++++-- hw/xen_disk.c | 5 +++++ 9 files changed, 126 insertions(+), 23 deletions(-) (limited to 'hw') diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c index 29521ba..f4fa154 100644 --- a/hw/ide/ahci.c +++ b/hw/ide/ahci.c @@ -710,6 +710,7 @@ static void ncq_cb(void *opaque, int ret) DPRINTF(ncq_tfs->drive->port_no, "NCQ transfer tag %d finished\n", ncq_tfs->tag); + bdrv_acct_done(ncq_tfs->drive->port.ifs[0].bs, &ncq_tfs->acct); qemu_sglist_destroy(&ncq_tfs->sglist); ncq_tfs->used = 0; } @@ -756,6 +757,10 @@ static void process_ncq_command(AHCIState *s, int port, uint8_t *cmd_fis, ncq_tfs->is_read = 1; DPRINTF(port, "tag %d aio read %ld\n", ncq_tfs->tag, ncq_tfs->lba); + + bdrv_acct_start(ncq_tfs->drive->port.ifs[0].bs, &ncq_tfs->acct, + (ncq_tfs->sector_count-1) * BDRV_SECTOR_SIZE, + BDRV_ACCT_READ); ncq_tfs->aiocb = dma_bdrv_read(ncq_tfs->drive->port.ifs[0].bs, &ncq_tfs->sglist, ncq_tfs->lba, ncq_cb, ncq_tfs); @@ -766,6 +771,10 @@ static void process_ncq_command(AHCIState *s, int port, uint8_t *cmd_fis, ncq_tfs->is_read = 0; DPRINTF(port, "tag %d aio write %ld\n", ncq_tfs->tag, ncq_tfs->lba); + + bdrv_acct_start(ncq_tfs->drive->port.ifs[0].bs, &ncq_tfs->acct, + (ncq_tfs->sector_count-1) * BDRV_SECTOR_SIZE, + BDRV_ACCT_WRITE); ncq_tfs->aiocb = dma_bdrv_write(ncq_tfs->drive->port.ifs[0].bs, &ncq_tfs->sglist, ncq_tfs->lba, ncq_cb, ncq_tfs); diff --git a/hw/ide/ahci.h b/hw/ide/ahci.h index e456193..832539c 100644 --- a/hw/ide/ahci.h +++ b/hw/ide/ahci.h @@ -258,6 +258,7 @@ typedef struct NCQTransferState { AHCIDevice *drive; BlockDriverAIOCB *aiocb; QEMUSGList sglist; + BlockAcctCookie acct; int is_read; uint16_t sector_count; uint64_t lba; diff --git a/hw/ide/atapi.c b/hw/ide/atapi.c index fe2fb0b..c552320 100644 --- a/hw/ide/atapi.c +++ b/hw/ide/atapi.c @@ -104,17 +104,20 @@ static void cd_data_to_raw(uint8_t *buf, int lba) memset(buf, 0, 288); } -static int cd_read_sector(BlockDriverState *bs, int lba, uint8_t *buf, - int sector_size) +static int cd_read_sector(IDEState *s, int lba, uint8_t *buf, int sector_size) { int ret; switch(sector_size) { case 2048: - ret = bdrv_read(bs, (int64_t)lba << 2, buf, 4); + bdrv_acct_start(s->bs, &s->acct, 4 * BDRV_SECTOR_SIZE, BDRV_ACCT_READ); + ret = bdrv_read(s->bs, (int64_t)lba << 2, buf, 4); + bdrv_acct_done(s->bs, &s->acct); break; case 2352: - ret = bdrv_read(bs, (int64_t)lba << 2, buf + 16, 4); + bdrv_acct_start(s->bs, &s->acct, 4 * BDRV_SECTOR_SIZE, BDRV_ACCT_READ); + ret = bdrv_read(s->bs, (int64_t)lba << 2, buf + 16, 4); + bdrv_acct_done(s->bs, &s->acct); if (ret < 0) return ret; cd_data_to_raw(buf, lba); @@ -181,7 +184,7 @@ void ide_atapi_cmd_reply_end(IDEState *s) } else { /* see if a new sector must be read */ if (s->lba != -1 && s->io_buffer_index >= s->cd_sector_size) { - ret = cd_read_sector(s->bs, s->lba, s->io_buffer, s->cd_sector_size); + ret = cd_read_sector(s, s->lba, s->io_buffer, s->cd_sector_size); if (ret < 0) { ide_transfer_stop(s); ide_atapi_io_error(s, ret); @@ -250,6 +253,7 @@ static void ide_atapi_cmd_reply(IDEState *s, int size, int max_size) s->io_buffer_index = 0; if (s->atapi_dma) { + bdrv_acct_start(s->bs, &s->acct, size, BDRV_ACCT_READ); s->status = READY_STAT | SEEK_STAT | DRQ_STAT; s->bus->dma->ops->start_dma(s->bus->dma, s, ide_atapi_cmd_read_dma_cb); @@ -322,10 +326,7 @@ static void ide_atapi_cmd_read_dma_cb(void *opaque, int ret) s->status = READY_STAT | SEEK_STAT; s->nsector = (s->nsector & ~7) | ATAPI_INT_REASON_IO | ATAPI_INT_REASON_CD; ide_set_irq(s->bus); - eot: - s->bus->dma->ops->add_status(s->bus->dma, BM_STATUS_INT); - ide_set_inactive(s); - return; + goto eot; } s->io_buffer_index = 0; @@ -343,9 +344,11 @@ static void ide_atapi_cmd_read_dma_cb(void *opaque, int ret) #ifdef DEBUG_AIO printf("aio_read_cd: lba=%u n=%d\n", s->lba, n); #endif + s->bus->dma->iov.iov_base = (void *)(s->io_buffer + data_offset); s->bus->dma->iov.iov_len = n * 4 * 512; qemu_iovec_init_external(&s->bus->dma->qiov, &s->bus->dma->iov, 1); + s->bus->dma->aiocb = bdrv_aio_readv(s->bs, (int64_t)s->lba << 2, &s->bus->dma->qiov, n * 4, ide_atapi_cmd_read_dma_cb, s); @@ -355,6 +358,12 @@ static void ide_atapi_cmd_read_dma_cb(void *opaque, int ret) ASC_MEDIUM_NOT_PRESENT); goto eot; } + + return; +eot: + bdrv_acct_done(s->bs, &s->acct); + s->bus->dma->ops->add_status(s->bus->dma, BM_STATUS_INT); + ide_set_inactive(s); } /* start a CD-CDROM read command with DMA */ @@ -368,6 +377,8 @@ static void ide_atapi_cmd_read_dma(IDEState *s, int lba, int nb_sectors, s->io_buffer_size = 0; s->cd_sector_size = sector_size; + bdrv_acct_start(s->bs, &s->acct, s->packet_transfer_size, BDRV_ACCT_READ); + /* XXX: check if BUSY_STAT should be set */ s->status = READY_STAT | SEEK_STAT | DRQ_STAT | BUSY_STAT; s->bus->dma->ops->start_dma(s->bus->dma, s, diff --git a/hw/ide/core.c b/hw/ide/core.c index d145b19..40abc1e 100644 --- a/hw/ide/core.c +++ b/hw/ide/core.c @@ -473,7 +473,10 @@ void ide_sector_read(IDEState *s) #endif if (n > s->req_nb_sectors) n = s->req_nb_sectors; + + bdrv_acct_start(s->bs, &s->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_READ); ret = bdrv_read(s->bs, sector_num, s->io_buffer, n); + bdrv_acct_done(s->bs, &s->acct); if (ret != 0) { if (ide_handle_rw_error(s, -ret, BM_STATUS_PIO_RETRY | BM_STATUS_RETRY_READ)) @@ -610,7 +613,10 @@ handle_rw_error: return; eot: - ide_set_inactive(s); + if (s->dma_cmd == IDE_DMA_READ || s->dma_cmd == IDE_DMA_WRITE) { + bdrv_acct_done(s->bs, &s->acct); + } + ide_set_inactive(s); } static void ide_sector_start_dma(IDEState *s, enum ide_dma_cmd dma_cmd) @@ -619,6 +625,20 @@ static void ide_sector_start_dma(IDEState *s, enum ide_dma_cmd dma_cmd) s->io_buffer_index = 0; s->io_buffer_size = 0; s->dma_cmd = dma_cmd; + + switch (dma_cmd) { + case IDE_DMA_READ: + bdrv_acct_start(s->bs, &s->acct, s->nsector * BDRV_SECTOR_SIZE, + BDRV_ACCT_READ); + break; + case IDE_DMA_WRITE: + bdrv_acct_start(s->bs, &s->acct, s->nsector * BDRV_SECTOR_SIZE, + BDRV_ACCT_WRITE); + break; + default: + break; + } + s->bus->dma->ops->start_dma(s->bus->dma, s, ide_dma_cb); } @@ -641,7 +661,10 @@ void ide_sector_write(IDEState *s) n = s->nsector; if (n > s->req_nb_sectors) n = s->req_nb_sectors; + + bdrv_acct_start(s->bs, &s->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_READ); ret = bdrv_write(s->bs, sector_num, s->io_buffer, n); + bdrv_acct_done(s->bs, &s->acct); if (ret != 0) { if (ide_handle_rw_error(s, -ret, BM_STATUS_PIO_RETRY)) @@ -685,6 +708,7 @@ static void ide_flush_cb(void *opaque, int ret) } } + bdrv_acct_done(s->bs, &s->acct); s->status = READY_STAT | SEEK_STAT; ide_set_irq(s->bus); } @@ -698,6 +722,7 @@ void ide_flush_cache(IDEState *s) return; } + bdrv_acct_start(s->bs, &s->acct, 0, BDRV_ACCT_FLUSH); acb = bdrv_aio_flush(s->bs, ide_flush_cb, s); if (acb == NULL) { ide_flush_cb(s, -EIO); diff --git a/hw/ide/internal.h b/hw/ide/internal.h index 02e805f..7f5ef8d 100644 --- a/hw/ide/internal.h +++ b/hw/ide/internal.h @@ -440,6 +440,7 @@ struct IDEState { int lba; int cd_sector_size; int atapi_dma; /* true if dma is requested for the packet cmd */ + BlockAcctCookie acct; /* ATA DMA state */ int io_buffer_size; QEMUSGList sg; diff --git a/hw/ide/macio.c b/hw/ide/macio.c index 44fb3fe..fdf5d75 100644 --- a/hw/ide/macio.c +++ b/hw/ide/macio.c @@ -52,8 +52,7 @@ static void pmac_ide_atapi_transfer_cb(void *opaque, int ret) m->aiocb = NULL; qemu_sglist_destroy(&s->sg); ide_atapi_io_error(s, ret); - io->dma_end(opaque); - return; + goto done; } if (s->io_buffer_size > 0) { @@ -71,8 +70,7 @@ static void pmac_ide_atapi_transfer_cb(void *opaque, int ret) ide_atapi_cmd_ok(s); if (io->len == 0) { - io->dma_end(opaque); - return; + goto done; } /* launch next transfer */ @@ -92,9 +90,14 @@ static void pmac_ide_atapi_transfer_cb(void *opaque, int ret) /* Note: media not present is the most likely case */ ide_atapi_cmd_error(s, SENSE_NOT_READY, ASC_MEDIUM_NOT_PRESENT); - io->dma_end(opaque); - return; + goto done; } + return; + +done: + bdrv_acct_done(s->bs, &s->acct); + io->dma_end(opaque); + return; } static void pmac_ide_transfer_cb(void *opaque, int ret) @@ -109,8 +112,7 @@ static void pmac_ide_transfer_cb(void *opaque, int ret) m->aiocb = NULL; qemu_sglist_destroy(&s->sg); ide_dma_error(s); - io->dma_end(io); - return; + goto done; } sector_num = ide_get_sector(s); @@ -130,10 +132,8 @@ static void pmac_ide_transfer_cb(void *opaque, int ret) } /* end of DMA ? */ - if (io->len == 0) { - io->dma_end(io); - return; + goto done; } /* launch next transfer */ @@ -163,6 +163,12 @@ static void pmac_ide_transfer_cb(void *opaque, int ret) if (!m->aiocb) pmac_ide_transfer_cb(io, -1); + return; +done: + if (s->dma_cmd == IDE_DMA_READ || s->dma_cmd == IDE_DMA_WRITE) { + bdrv_acct_done(s->bs, &s->acct); + } + io->dma_end(io); } static void pmac_ide_transfer(DBDMA_io *io) @@ -172,10 +178,22 @@ static void pmac_ide_transfer(DBDMA_io *io) s->io_buffer_size = 0; if (s->drive_kind == IDE_CD) { + bdrv_acct_start(s->bs, &s->acct, io->len, BDRV_ACCT_READ); pmac_ide_atapi_transfer_cb(io, 0); return; } + switch (s->dma_cmd) { + case IDE_DMA_READ: + bdrv_acct_start(s->bs, &s->acct, io->len, BDRV_ACCT_READ); + break; + case IDE_DMA_WRITE: + bdrv_acct_start(s->bs, &s->acct, io->len, BDRV_ACCT_WRITE); + break; + default: + break; + } + pmac_ide_transfer_cb(io, 0); } diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c index d94b1eb..3cc830f 100644 --- a/hw/scsi-disk.c +++ b/hw/scsi-disk.c @@ -57,6 +57,7 @@ typedef struct SCSIDiskReq { struct iovec iov; QEMUIOVector qiov; uint32_t status; + BlockAcctCookie acct; } SCSIDiskReq; struct SCSIDiskState @@ -107,10 +108,13 @@ static void scsi_cancel_io(SCSIRequest *req) static void scsi_read_complete(void * opaque, int ret) { SCSIDiskReq *r = (SCSIDiskReq *)opaque; + SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev); int n; r->req.aiocb = NULL; + bdrv_acct_done(s->bs, &r->acct); + if (ret) { if (scsi_handle_rw_error(r, -ret, SCSI_REQ_STATUS_RETRY_READ)) { return; @@ -161,6 +165,8 @@ static void scsi_read_data(SCSIRequest *req) r->iov.iov_len = n * 512; qemu_iovec_init_external(&r->qiov, &r->iov, 1); + + bdrv_acct_start(s->bs, &r->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_READ); r->req.aiocb = bdrv_aio_readv(s->bs, r->sector, &r->qiov, n, scsi_read_complete, r); if (r->req.aiocb == NULL) { @@ -207,11 +213,14 @@ static int scsi_handle_rw_error(SCSIDiskReq *r, int error, int type) static void scsi_write_complete(void * opaque, int ret) { SCSIDiskReq *r = (SCSIDiskReq *)opaque; + SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev); uint32_t len; uint32_t n; r->req.aiocb = NULL; + bdrv_acct_done(s->bs, &r->acct); + if (ret) { if (scsi_handle_rw_error(r, -ret, SCSI_REQ_STATUS_RETRY_WRITE)) { return; @@ -252,6 +261,8 @@ static void scsi_write_data(SCSIRequest *req) n = r->iov.iov_len / 512; if (n) { qemu_iovec_init_external(&r->qiov, &r->iov, 1); + + bdrv_acct_start(s->bs, &r->acct, n * BDRV_SECTOR_SIZE, BDRV_ACCT_WRITE); r->req.aiocb = bdrv_aio_writev(s->bs, r->sector, &r->qiov, n, scsi_write_complete, r); if (r->req.aiocb == NULL) { @@ -854,13 +865,19 @@ static int scsi_disk_emulate_command(SCSIDiskReq *r, uint8_t *outbuf) buflen = 8; break; case SYNCHRONIZE_CACHE: + { + BlockAcctCookie acct; + + bdrv_acct_start(s->bs, &acct, 0, BDRV_ACCT_FLUSH); ret = bdrv_flush(s->bs); + bdrv_acct_done(s->bs, &acct); if (ret < 0) { if (scsi_handle_rw_error(r, -ret, SCSI_REQ_STATUS_RETRY_FLUSH)) { return -1; } } break; + } case GET_CONFIGURATION: memset(outbuf, 0, 8); /* ??? This should probably return much more information. For now diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c index dad8c0a..2a8ccd0 100644 --- a/hw/virtio-blk.c +++ b/hw/virtio-blk.c @@ -47,6 +47,7 @@ typedef struct VirtIOBlockReq struct virtio_scsi_inhdr *scsi; QEMUIOVector qiov; struct VirtIOBlockReq *next; + BlockAcctCookie acct; } VirtIOBlockReq; static void virtio_blk_req_complete(VirtIOBlockReq *req, int status) @@ -58,8 +59,6 @@ static void virtio_blk_req_complete(VirtIOBlockReq *req, int status) stb_p(&req->in->status, status); virtqueue_push(s->vq, &req->elem, req->qiov.size + sizeof(*req->in)); virtio_notify(&s->vdev, s->vq); - - g_free(req); } static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error, @@ -81,6 +80,8 @@ static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error, vm_stop(VMSTOP_DISKFULL); } else { virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR); + bdrv_acct_done(s->bs, &req->acct); + g_free(req); bdrv_mon_event(s->bs, BDRV_ACTION_REPORT, is_read); } @@ -100,6 +101,8 @@ static void virtio_blk_rw_complete(void *opaque, int ret) } virtio_blk_req_complete(req, VIRTIO_BLK_S_OK); + bdrv_acct_done(req->dev->bs, &req->acct); + g_free(req); } static void virtio_blk_flush_complete(void *opaque, int ret) @@ -113,6 +116,8 @@ static void virtio_blk_flush_complete(void *opaque, int ret) } virtio_blk_req_complete(req, VIRTIO_BLK_S_OK); + bdrv_acct_done(req->dev->bs, &req->acct); + g_free(req); } static VirtIOBlockReq *virtio_blk_alloc_request(VirtIOBlock *s) @@ -155,6 +160,7 @@ static void virtio_blk_handle_scsi(VirtIOBlockReq *req) */ if (req->elem.out_num < 2 || req->elem.in_num < 3) { virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR); + g_free(req); return; } @@ -163,6 +169,7 @@ static void virtio_blk_handle_scsi(VirtIOBlockReq *req) */ if (req->elem.out_num > 2 && req->elem.in_num > 3) { virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP); + g_free(req); return; } @@ -229,11 +236,13 @@ static void virtio_blk_handle_scsi(VirtIOBlockReq *req) stl_p(&req->scsi->data_len, hdr.dxfer_len); virtio_blk_req_complete(req, status); + g_free(req); } #else static void virtio_blk_handle_scsi(VirtIOBlockReq *req) { virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP); + g_free(req); } #endif /* __linux__ */ @@ -266,6 +275,8 @@ static void virtio_blk_handle_flush(VirtIOBlockReq *req, MultiReqBuffer *mrb) { BlockDriverAIOCB *acb; + bdrv_acct_start(req->dev->bs, &req->acct, 0, BDRV_ACCT_FLUSH); + /* * Make sure all outstanding writes are posted to the backing device. */ @@ -284,6 +295,8 @@ static void virtio_blk_handle_write(VirtIOBlockReq *req, MultiReqBuffer *mrb) sector = ldq_p(&req->out->sector); + bdrv_acct_start(req->dev->bs, &req->acct, req->qiov.size, BDRV_ACCT_WRITE); + trace_virtio_blk_handle_write(req, sector, req->qiov.size / 512); if (sector & req->dev->sector_mask) { @@ -317,6 +330,8 @@ static void virtio_blk_handle_read(VirtIOBlockReq *req) sector = ldq_p(&req->out->sector); + bdrv_acct_start(req->dev->bs, &req->acct, req->qiov.size, BDRV_ACCT_READ); + if (sector & req->dev->sector_mask) { virtio_blk_rw_complete(req, -EIO); return; @@ -370,6 +385,7 @@ static void virtio_blk_handle_request(VirtIOBlockReq *req, s->serial ? s->serial : "", MIN(req->elem.in_sg[0].iov_len, VIRTIO_BLK_ID_BYTES)); virtio_blk_req_complete(req, VIRTIO_BLK_S_OK); + g_free(req); } else if (type & VIRTIO_BLK_T_OUT) { qemu_iovec_init_external(&req->qiov, &req->elem.out_sg[1], req->elem.out_num - 1); diff --git a/hw/xen_disk.c b/hw/xen_disk.c index 31f9151..bd5c669 100644 --- a/hw/xen_disk.c +++ b/hw/xen_disk.c @@ -79,6 +79,7 @@ struct ioreq { struct XenBlkDev *blkdev; QLIST_ENTRY(ioreq) list; + BlockAcctCookie acct; }; struct XenBlkDev { @@ -401,6 +402,7 @@ static void qemu_aio_complete(void *opaque, int ret) ioreq->status = ioreq->aio_errors ? BLKIF_RSP_ERROR : BLKIF_RSP_OKAY; ioreq_unmap(ioreq); ioreq_finish(ioreq); + bdrv_acct_done(ioreq->blkdev->bs, &ioreq->acct); qemu_bh_schedule(ioreq->blkdev->bh); } @@ -419,6 +421,7 @@ static int ioreq_runio_qemu_aio(struct ioreq *ioreq) switch (ioreq->req.operation) { case BLKIF_OP_READ: + bdrv_acct_start(blkdev->bs, &ioreq->acct, ioreq->v.size, BDRV_ACCT_READ); ioreq->aio_inflight++; bdrv_aio_readv(blkdev->bs, ioreq->start / BLOCK_SIZE, &ioreq->v, ioreq->v.size / BLOCK_SIZE, @@ -429,6 +432,8 @@ static int ioreq_runio_qemu_aio(struct ioreq *ioreq) if (!ioreq->req.nr_segments) { break; } + + bdrv_acct_start(blkdev->bs, &ioreq->acct, ioreq->v.size, BDRV_ACCT_WRITE); ioreq->aio_inflight++; bdrv_aio_writev(blkdev->bs, ioreq->start / BLOCK_SIZE, &ioreq->v, ioreq->v.size / BLOCK_SIZE, -- cgit v1.1