diff options
-rw-r--r-- | block/replication.c | 35 | ||||
-rw-r--r-- | docs/COLO-FT.txt | 224 | ||||
-rw-r--r-- | docs/block-replication.txt | 28 | ||||
-rw-r--r-- | hw/net/cadence_gem.c | 11 | ||||
-rw-r--r-- | hw/net/dp8393x.c | 200 | ||||
-rw-r--r-- | hw/net/e1000e_core.c | 15 | ||||
-rw-r--r-- | hw/net/net_rx_pkt.c | 44 | ||||
-rw-r--r-- | hw/net/net_rx_pkt.h | 6 | ||||
-rw-r--r-- | hw/net/trace-events | 4 | ||||
-rw-r--r-- | include/net/filter.h | 2 | ||||
-rw-r--r-- | net/filter.c | 92 | ||||
-rw-r--r-- | qemu-options.hx | 35 | ||||
-rw-r--r-- | tests/test-replication.c | 52 |
13 files changed, 591 insertions, 157 deletions
diff --git a/block/replication.c b/block/replication.c index d6681b6..413d954 100644 --- a/block/replication.c +++ b/block/replication.c @@ -450,6 +450,17 @@ static void replication_start(ReplicationState *rs, ReplicationMode mode, aio_context_acquire(aio_context); s = bs->opaque; + if (s->stage == BLOCK_REPLICATION_DONE || + s->stage == BLOCK_REPLICATION_FAILOVER) { + /* + * This case happens when a secondary is promoted to primary. + * Ignore the request because the secondary side of replication + * doesn't have to do anything anymore. + */ + aio_context_release(aio_context); + return; + } + if (s->stage != BLOCK_REPLICATION_NONE) { error_setg(errp, "Block replication is running or done"); aio_context_release(aio_context); @@ -574,6 +585,17 @@ static void replication_do_checkpoint(ReplicationState *rs, Error **errp) aio_context_acquire(aio_context); s = bs->opaque; + if (s->stage == BLOCK_REPLICATION_DONE || + s->stage == BLOCK_REPLICATION_FAILOVER) { + /* + * This case happens when a secondary was promoted to primary. + * Ignore the request because the secondary side of replication + * doesn't have to do anything anymore. + */ + aio_context_release(aio_context); + return; + } + if (s->mode == REPLICATION_MODE_SECONDARY) { secondary_do_checkpoint(s, errp); } @@ -590,7 +612,7 @@ static void replication_get_error(ReplicationState *rs, Error **errp) aio_context_acquire(aio_context); s = bs->opaque; - if (s->stage != BLOCK_REPLICATION_RUNNING) { + if (s->stage == BLOCK_REPLICATION_NONE) { error_setg(errp, "Block replication is not running"); aio_context_release(aio_context); return; @@ -632,6 +654,17 @@ static void replication_stop(ReplicationState *rs, bool failover, Error **errp) aio_context_acquire(aio_context); s = bs->opaque; + if (s->stage == BLOCK_REPLICATION_DONE || + s->stage == BLOCK_REPLICATION_FAILOVER) { + /* + * This case happens when a secondary was promoted to primary. + * Ignore the request because the secondary side of replication + * doesn't have to do anything anymore. + */ + aio_context_release(aio_context); + return; + } + if (s->stage != BLOCK_REPLICATION_RUNNING) { error_setg(errp, "Block replication is not running"); aio_context_release(aio_context); diff --git a/docs/COLO-FT.txt b/docs/COLO-FT.txt index ad24680..c8e1740 100644 --- a/docs/COLO-FT.txt +++ b/docs/COLO-FT.txt @@ -145,81 +145,189 @@ The diagram just shows the main qmp command, you can get the detail in test procedure. == Test procedure == -1. Startup qemu -Primary: -# qemu-system-x86_64 -accel kvm -m 2048 -smp 2 -qmp stdio -name primary \ - -device piix3-usb-uhci -vnc :7 \ - -device usb-tablet -netdev tap,id=hn0,vhost=off \ - -device virtio-net-pci,id=net-pci0,netdev=hn0 \ - -drive if=virtio,id=primary-disk0,driver=quorum,read-pattern=fifo,vote-threshold=1,\ - children.0.file.filename=1.raw,\ - children.0.driver=raw -S -Secondary: -# qemu-system-x86_64 -accel kvm -m 2048 -smp 2 -qmp stdio -name secondary \ - -device piix3-usb-uhci -vnc :7 \ - -device usb-tablet -netdev tap,id=hn0,vhost=off \ - -device virtio-net-pci,id=net-pci0,netdev=hn0 \ - -drive if=none,id=secondary-disk0,file.filename=1.raw,driver=raw,node-name=node0 \ - -drive if=virtio,id=active-disk0,driver=replication,mode=secondary,\ - file.driver=qcow2,top-id=active-disk0,\ - file.file.filename=/mnt/ramfs/active_disk.img,\ - file.backing.driver=qcow2,\ - file.backing.file.filename=/mnt/ramfs/hidden_disk.img,\ - file.backing.backing=secondary-disk0 \ - -incoming tcp:0:8888 - -2. On Secondary VM's QEMU monitor, issue command +Note: Here we are running both instances on the same host for testing, +change the IP Addresses if you want to run it on two hosts. Initally +127.0.0.1 is the Primary Host and 127.0.0.2 is the Secondary Host. + +== Startup qemu == +1. Primary: +Note: Initally, $imagefolder/primary.qcow2 needs to be copied to all hosts. +You don't need to change any IP's here, because 0.0.0.0 listens on any +interface. The chardev's with 127.0.0.1 IP's loopback to the local qemu +instance. + +# imagefolder="/mnt/vms/colo-test-primary" + +# qemu-system-x86_64 -enable-kvm -cpu qemu64,+kvmclock -m 512 -smp 1 -qmp stdio \ + -device piix3-usb-uhci -device usb-tablet -name primary \ + -netdev tap,id=hn0,vhost=off,helper=/usr/lib/qemu/qemu-bridge-helper \ + -device rtl8139,id=e0,netdev=hn0 \ + -chardev socket,id=mirror0,host=0.0.0.0,port=9003,server,nowait \ + -chardev socket,id=compare1,host=0.0.0.0,port=9004,server,wait \ + -chardev socket,id=compare0,host=127.0.0.1,port=9001,server,nowait \ + -chardev socket,id=compare0-0,host=127.0.0.1,port=9001 \ + -chardev socket,id=compare_out,host=127.0.0.1,port=9005,server,nowait \ + -chardev socket,id=compare_out0,host=127.0.0.1,port=9005 \ + -object filter-mirror,id=m0,netdev=hn0,queue=tx,outdev=mirror0 \ + -object filter-redirector,netdev=hn0,id=redire0,queue=rx,indev=compare_out \ + -object filter-redirector,netdev=hn0,id=redire1,queue=rx,outdev=compare0 \ + -object iothread,id=iothread1 \ + -object colo-compare,id=comp0,primary_in=compare0-0,secondary_in=compare1,\ +outdev=compare_out0,iothread=iothread1 \ + -drive if=ide,id=colo-disk0,driver=quorum,read-pattern=fifo,vote-threshold=1,\ +children.0.file.filename=$imagefolder/primary.qcow2,children.0.driver=qcow2 -S + +2. Secondary: +Note: Active and hidden images need to be created only once and the +size should be the same as primary.qcow2. Again, you don't need to change +any IP's here, except for the $primary_ip variable. + +# imagefolder="/mnt/vms/colo-test-secondary" +# primary_ip=127.0.0.1 + +# qemu-img create -f qcow2 $imagefolder/secondary-active.qcow2 10G + +# qemu-img create -f qcow2 $imagefolder/secondary-hidden.qcow2 10G + +# qemu-system-x86_64 -enable-kvm -cpu qemu64,+kvmclock -m 512 -smp 1 -qmp stdio \ + -device piix3-usb-uhci -device usb-tablet -name secondary \ + -netdev tap,id=hn0,vhost=off,helper=/usr/lib/qemu/qemu-bridge-helper \ + -device rtl8139,id=e0,netdev=hn0 \ + -chardev socket,id=red0,host=$primary_ip,port=9003,reconnect=1 \ + -chardev socket,id=red1,host=$primary_ip,port=9004,reconnect=1 \ + -object filter-redirector,id=f1,netdev=hn0,queue=tx,indev=red0 \ + -object filter-redirector,id=f2,netdev=hn0,queue=rx,outdev=red1 \ + -object filter-rewriter,id=rew0,netdev=hn0,queue=all \ + -drive if=none,id=parent0,file.filename=$imagefolder/primary.qcow2,driver=qcow2 \ + -drive if=none,id=childs0,driver=replication,mode=secondary,file.driver=qcow2,\ +top-id=colo-disk0,file.file.filename=$imagefolder/secondary-active.qcow2,\ +file.backing.driver=qcow2,file.backing.file.filename=$imagefolder/secondary-hidden.qcow2,\ +file.backing.backing=parent0 \ + -drive if=ide,id=colo-disk0,driver=quorum,read-pattern=fifo,vote-threshold=1,\ +children.0=childs0 \ + -incoming tcp:0.0.0.0:9998 + + +3. On Secondary VM's QEMU monitor, issue command {'execute':'qmp_capabilities'} -{ 'execute': 'nbd-server-start', - 'arguments': {'addr': {'type': 'inet', 'data': {'host': 'xx.xx.xx.xx', 'port': '8889'} } } -} -{'execute': 'nbd-server-add', 'arguments': {'device': 'secondary-disk0', 'writable': true } } +{'execute': 'nbd-server-start', 'arguments': {'addr': {'type': 'inet', 'data': {'host': '0.0.0.0', 'port': '9999'} } } } +{'execute': 'nbd-server-add', 'arguments': {'device': 'parent0', 'writable': true } } Note: a. The qmp command nbd-server-start and nbd-server-add must be run before running the qmp command migrate on primary QEMU b. Active disk, hidden disk and nbd target's length should be the same. - c. It is better to put active disk and hidden disk in ramdisk. + c. It is better to put active disk and hidden disk in ramdisk. They + will be merged into the parent disk on failover. -3. On Primary VM's QEMU monitor, issue command: +4. On Primary VM's QEMU monitor, issue command: {'execute':'qmp_capabilities'} -{ 'execute': 'human-monitor-command', - 'arguments': {'command-line': 'drive_add -n buddy driver=replication,mode=primary,file.driver=nbd,file.host=xx.xx.xx.xx,file.port=8889,file.export=secondary-disk0,node-name=nbd_client0'}} -{ 'execute':'x-blockdev-change', 'arguments':{'parent': 'primary-disk0', 'node': 'nbd_client0' } } -{ 'execute': 'migrate-set-capabilities', - 'arguments': {'capabilities': [ {'capability': 'x-colo', 'state': true } ] } } -{ 'execute': 'migrate', 'arguments': {'uri': 'tcp:xx.xx.xx.xx:8888' } } +{'execute': 'human-monitor-command', 'arguments': {'command-line': 'drive_add -n buddy driver=replication,mode=primary,file.driver=nbd,file.host=127.0.0.2,file.port=9999,file.export=parent0,node-name=replication0'}} +{'execute': 'x-blockdev-change', 'arguments':{'parent': 'colo-disk0', 'node': 'replication0' } } +{'execute': 'migrate-set-capabilities', 'arguments': {'capabilities': [ {'capability': 'x-colo', 'state': true } ] } } +{'execute': 'migrate', 'arguments': {'uri': 'tcp:127.0.0.2:9998' } } Note: a. There should be only one NBD Client for each primary disk. - b. xx.xx.xx.xx is the secondary physical machine's hostname or IP - c. The qmp command line must be run after running qmp command line in + b. The qmp command line must be run after running qmp command line in secondary qemu. -4. After the above steps, you will see, whenever you make changes to PVM, SVM will be synced. +5. After the above steps, you will see, whenever you make changes to PVM, SVM will be synced. You can issue command '{ "execute": "migrate-set-parameters" , "arguments":{ "x-checkpoint-delay": 2000 } }' -to change the checkpoint period time +to change the idle checkpoint period time + +6. Failover test +You can kill one of the VMs and Failover on the surviving VM: + +If you killed the Secondary, then follow "Primary Failover". After that, +if you want to resume the replication, follow "Primary resume replication" + +If you killed the Primary, then follow "Secondary Failover". After that, +if you want to resume the replication, follow "Secondary resume replication" + +== Primary Failover == +The Secondary died, resume on the Primary + +{'execute': 'x-blockdev-change', 'arguments':{ 'parent': 'colo-disk0', 'child': 'children.1'} } +{'execute': 'human-monitor-command', 'arguments':{ 'command-line': 'drive_del replication0' } } +{'execute': 'object-del', 'arguments':{ 'id': 'comp0' } } +{'execute': 'object-del', 'arguments':{ 'id': 'iothread1' } } +{'execute': 'object-del', 'arguments':{ 'id': 'm0' } } +{'execute': 'object-del', 'arguments':{ 'id': 'redire0' } } +{'execute': 'object-del', 'arguments':{ 'id': 'redire1' } } +{'execute': 'x-colo-lost-heartbeat' } + +== Secondary Failover == +The Primary died, resume on the Secondary and prepare to become the new Primary + +{'execute': 'nbd-server-stop'} +{'execute': 'x-colo-lost-heartbeat'} + +{'execute': 'object-del', 'arguments':{ 'id': 'f2' } } +{'execute': 'object-del', 'arguments':{ 'id': 'f1' } } +{'execute': 'chardev-remove', 'arguments':{ 'id': 'red1' } } +{'execute': 'chardev-remove', 'arguments':{ 'id': 'red0' } } + +{'execute': 'chardev-add', 'arguments':{ 'id': 'mirror0', 'backend': {'type': 'socket', 'data': {'addr': { 'type': 'inet', 'data': { 'host': '0.0.0.0', 'port': '9003' } }, 'server': true } } } } +{'execute': 'chardev-add', 'arguments':{ 'id': 'compare1', 'backend': {'type': 'socket', 'data': {'addr': { 'type': 'inet', 'data': { 'host': '0.0.0.0', 'port': '9004' } }, 'server': true } } } } +{'execute': 'chardev-add', 'arguments':{ 'id': 'compare0', 'backend': {'type': 'socket', 'data': {'addr': { 'type': 'inet', 'data': { 'host': '127.0.0.1', 'port': '9001' } }, 'server': true } } } } +{'execute': 'chardev-add', 'arguments':{ 'id': 'compare0-0', 'backend': {'type': 'socket', 'data': {'addr': { 'type': 'inet', 'data': { 'host': '127.0.0.1', 'port': '9001' } }, 'server': false } } } } +{'execute': 'chardev-add', 'arguments':{ 'id': 'compare_out', 'backend': {'type': 'socket', 'data': {'addr': { 'type': 'inet', 'data': { 'host': '127.0.0.1', 'port': '9005' } }, 'server': true } } } } +{'execute': 'chardev-add', 'arguments':{ 'id': 'compare_out0', 'backend': {'type': 'socket', 'data': {'addr': { 'type': 'inet', 'data': { 'host': '127.0.0.1', 'port': '9005' } }, 'server': false } } } } + +== Primary resume replication == +Resume replication after new Secondary is up. + +Start the new Secondary (Steps 2 and 3 above), then on the Primary: +{'execute': 'drive-mirror', 'arguments':{ 'device': 'colo-disk0', 'job-id': 'resync', 'target': 'nbd://127.0.0.2:9999/parent0', 'mode': 'existing', 'format': 'raw', 'sync': 'full'} } + +Wait until disk is synced, then: +{'execute': 'stop'} +{'execute': 'block-job-cancel', 'arguments':{ 'device': 'resync'} } + +{'execute': 'human-monitor-command', 'arguments':{ 'command-line': 'drive_add -n buddy driver=replication,mode=primary,file.driver=nbd,file.host=127.0.0.2,file.port=9999,file.export=parent0,node-name=replication0'}} +{'execute': 'x-blockdev-change', 'arguments':{ 'parent': 'colo-disk0', 'node': 'replication0' } } + +{'execute': 'object-add', 'arguments':{ 'qom-type': 'filter-mirror', 'id': 'm0', 'props': { 'netdev': 'hn0', 'queue': 'tx', 'outdev': 'mirror0' } } } +{'execute': 'object-add', 'arguments':{ 'qom-type': 'filter-redirector', 'id': 'redire0', 'props': { 'netdev': 'hn0', 'queue': 'rx', 'indev': 'compare_out' } } } +{'execute': 'object-add', 'arguments':{ 'qom-type': 'filter-redirector', 'id': 'redire1', 'props': { 'netdev': 'hn0', 'queue': 'rx', 'outdev': 'compare0' } } } +{'execute': 'object-add', 'arguments':{ 'qom-type': 'iothread', 'id': 'iothread1' } } +{'execute': 'object-add', 'arguments':{ 'qom-type': 'colo-compare', 'id': 'comp0', 'props': { 'primary_in': 'compare0-0', 'secondary_in': 'compare1', 'outdev': 'compare_out0', 'iothread': 'iothread1' } } } + +{'execute': 'migrate-set-capabilities', 'arguments':{ 'capabilities': [ {'capability': 'x-colo', 'state': true } ] } } +{'execute': 'migrate', 'arguments':{ 'uri': 'tcp:127.0.0.2:9998' } } + +Note: +If this Primary previously was a Secondary, then we need to insert the +filters before the filter-rewriter by using the +"'insert': 'before', 'position': 'id=rew0'" Options. See below. + +== Secondary resume replication == +Become Primary and resume replication after new Secondary is up. Note +that now 127.0.0.1 is the Secondary and 127.0.0.2 is the Primary. + +Start the new Secondary (Steps 2 and 3 above, but with primary_ip=127.0.0.2), +then on the old Secondary: +{'execute': 'drive-mirror', 'arguments':{ 'device': 'colo-disk0', 'job-id': 'resync', 'target': 'nbd://127.0.0.1:9999/parent0', 'mode': 'existing', 'format': 'raw', 'sync': 'full'} } + +Wait until disk is synced, then: +{'execute': 'stop'} +{'execute': 'block-job-cancel', 'arguments':{ 'device': 'resync' } } -5. Failover test -You can kill Primary VM and run 'x_colo_lost_heartbeat' in Secondary VM's -monitor at the same time, then SVM will failover and client will not detect this -change. +{'execute': 'human-monitor-command', 'arguments':{ 'command-line': 'drive_add -n buddy driver=replication,mode=primary,file.driver=nbd,file.host=127.0.0.1,file.port=9999,file.export=parent0,node-name=replication0'}} +{'execute': 'x-blockdev-change', 'arguments':{ 'parent': 'colo-disk0', 'node': 'replication0' } } -Before issuing '{ "execute": "x-colo-lost-heartbeat" }' command, we have to -issue block related command to stop block replication. -Primary: - Remove the nbd child from the quorum: - { 'execute': 'x-blockdev-change', 'arguments': {'parent': 'colo-disk0', 'child': 'children.1'}} - { 'execute': 'human-monitor-command','arguments': {'command-line': 'drive_del blk-buddy0'}} - Note: there is no qmp command to remove the blockdev now +{'execute': 'object-add', 'arguments':{ 'qom-type': 'filter-mirror', 'id': 'm0', 'props': { 'insert': 'before', 'position': 'id=rew0', 'netdev': 'hn0', 'queue': 'tx', 'outdev': 'mirror0' } } } +{'execute': 'object-add', 'arguments':{ 'qom-type': 'filter-redirector', 'id': 'redire0', 'props': { 'insert': 'before', 'position': 'id=rew0', 'netdev': 'hn0', 'queue': 'rx', 'indev': 'compare_out' } } } +{'execute': 'object-add', 'arguments':{ 'qom-type': 'filter-redirector', 'id': 'redire1', 'props': { 'insert': 'before', 'position': 'id=rew0', 'netdev': 'hn0', 'queue': 'rx', 'outdev': 'compare0' } } } +{'execute': 'object-add', 'arguments':{ 'qom-type': 'iothread', 'id': 'iothread1' } } +{'execute': 'object-add', 'arguments':{ 'qom-type': 'colo-compare', 'id': 'comp0', 'props': { 'primary_in': 'compare0-0', 'secondary_in': 'compare1', 'outdev': 'compare_out0', 'iothread': 'iothread1' } } } -Secondary: - The primary host is down, so we should do the following thing: - { 'execute': 'nbd-server-stop' } +{'execute': 'migrate-set-capabilities', 'arguments':{ 'capabilities': [ {'capability': 'x-colo', 'state': true } ] } } +{'execute': 'migrate', 'arguments':{ 'uri': 'tcp:127.0.0.1:9998' } } == TODO == -1. Support continuous VM replication. -2. Support shared storage. -3. Develop the heartbeat part. -4. Reduce checkpoint VM’s downtime while doing checkpoint. +1. Support shared storage. +2. Develop the heartbeat part. +3. Reduce checkpoint VM’s downtime while doing checkpoint. diff --git a/docs/block-replication.txt b/docs/block-replication.txt index 6bde673..108e916 100644 --- a/docs/block-replication.txt +++ b/docs/block-replication.txt @@ -65,12 +65,12 @@ blocks that are already in QEMU. ^ || .---------- | || | Secondary 1 Quorum || '---------- - / \ || - / \ || - Primary 2 filter - disk ^ virtio-blk - | ^ - 3 NBD -------> 3 NBD | + / \ || virtio-blk + / \ || ^ + Primary 2 filter | + disk ^ 7 Quorum + | / + 3 NBD -------> 3 NBD / client || server 2 filter || ^ ^ --------. || | | @@ -106,6 +106,10 @@ any state that would otherwise be lost by the speculative write-through of the NBD server into the secondary disk. So before block replication, the primary disk and secondary disk should contain the same data. +7) The secondary also has a quorum node, so after secondary failover it +can become the new primary and continue replication. + + == Failure Handling == There are 7 internal errors when block replication is running: 1. I/O error on primary disk @@ -171,16 +175,18 @@ Primary: leading whitespace. 5. The qmp command line must be run after running qmp command line in secondary qemu. - 6. After failover we need remove children.1 (replication driver). + 6. After primary failover we need remove children.1 (replication driver). Secondary: -drive if=none,driver=raw,file.filename=1.raw,id=colo1 \ - -drive if=xxx,id=topxxx,driver=replication,mode=secondary,top-id=topxxx\ + -drive if=none,id=childs1,driver=replication,mode=secondary,top-id=childs1 file.file.filename=active_disk.qcow2,\ file.driver=qcow2,\ file.backing.file.filename=hidden_disk.qcow2,\ file.backing.driver=qcow2,\ file.backing.backing=colo1 + -drive if=xxx,driver=quorum,read-pattern=fifo,id=top-disk1,\ + vote-threshold=1,children.0=childs1 Then run qmp command in secondary qemu: { 'execute': 'nbd-server-start', @@ -234,6 +240,8 @@ Secondary: The primary host is down, so we should do the following thing: { 'execute': 'nbd-server-stop' } +Promote Secondary to Primary: + see COLO-FT.txt + TODO: -1. Continuous block replication -2. Shared disk +1. Shared disk diff --git a/hw/net/cadence_gem.c b/hw/net/cadence_gem.c index ddabdb3..6340c1e 100644 --- a/hw/net/cadence_gem.c +++ b/hw/net/cadence_gem.c @@ -987,8 +987,9 @@ static ssize_t gem_receive(NetClientState *nc, const uint8_t *buf, size_t size) return -1; } - DB_PRINT("copy %d bytes to 0x%x\n", MIN(bytes_to_copy, rxbufsize), - rx_desc_get_buffer(s->rx_desc[q])); + DB_PRINT("copy %u bytes to 0x%" PRIx64 "\n", + MIN(bytes_to_copy, rxbufsize), + rx_desc_get_buffer(s, s->rx_desc[q])); /* Copy packet data to emulated DMA buffer */ address_space_write(&s->dma_as, rx_desc_get_buffer(s, s->rx_desc[q]) + @@ -1159,9 +1160,9 @@ static void gem_transmit(CadenceGEMState *s) if (tx_desc_get_length(desc) > sizeof(tx_packet) - (p - tx_packet)) { - DB_PRINT("TX descriptor @ 0x%x too large: size 0x%x space " \ - "0x%x\n", (unsigned)packet_desc_addr, - (unsigned)tx_desc_get_length(desc), + DB_PRINT("TX descriptor @ 0x%" HWADDR_PRIx \ + " too large: size 0x%x space 0x%zx\n", + packet_desc_addr, tx_desc_get_length(desc), sizeof(tx_packet) - (p - tx_packet)); break; } diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c index 7045193..8a3504d 100644 --- a/hw/net/dp8393x.c +++ b/hw/net/dp8393x.c @@ -137,6 +137,7 @@ do { printf("sonic ERROR: %s: " fmt, __func__ , ## __VA_ARGS__); } while (0) #define SONIC_TCR_CRCI 0x2000 #define SONIC_TCR_PINT 0x8000 +#define SONIC_ISR_RBAE 0x0010 #define SONIC_ISR_RBE 0x0020 #define SONIC_ISR_RDE 0x0040 #define SONIC_ISR_TC 0x0080 @@ -145,6 +146,9 @@ do { printf("sonic ERROR: %s: " fmt, __func__ , ## __VA_ARGS__); } while (0) #define SONIC_ISR_PINT 0x0800 #define SONIC_ISR_LCD 0x1000 +#define SONIC_DESC_EOL 0x0001 +#define SONIC_DESC_ADDR 0xFFFE + #define TYPE_DP8393X "dp8393x" #define DP8393X(obj) OBJECT_CHECK(dp8393xState, (obj), TYPE_DP8393X) @@ -154,6 +158,7 @@ typedef struct dp8393xState { /* Hardware */ uint8_t it_shift; bool big_endian; + bool last_rba_is_full; qemu_irq irq; #ifdef DEBUG_SONIC int irq_level; @@ -197,7 +202,8 @@ static uint32_t dp8393x_crba(dp8393xState *s) static uint32_t dp8393x_crda(dp8393xState *s) { - return (s->regs[SONIC_URDA] << 16) | s->regs[SONIC_CRDA]; + return (s->regs[SONIC_URDA] << 16) | + (s->regs[SONIC_CRDA] & SONIC_DESC_ADDR); } static uint32_t dp8393x_rbwc(dp8393xState *s) @@ -217,7 +223,8 @@ static uint32_t dp8393x_tsa(dp8393xState *s) static uint32_t dp8393x_ttda(dp8393xState *s) { - return (s->regs[SONIC_UTDA] << 16) | s->regs[SONIC_TTDA]; + return (s->regs[SONIC_UTDA] << 16) | + (s->regs[SONIC_TTDA] & SONIC_DESC_ADDR); } static uint32_t dp8393x_wt(dp8393xState *s) @@ -241,9 +248,19 @@ static void dp8393x_put(dp8393xState *s, int width, int offset, uint16_t val) { if (s->big_endian) { - s->data[offset * width + width - 1] = cpu_to_be16(val); + if (width == 2) { + s->data[offset * 2] = 0; + s->data[offset * 2 + 1] = cpu_to_be16(val); + } else { + s->data[offset] = cpu_to_be16(val); + } } else { - s->data[offset * width] = cpu_to_le16(val); + if (width == 2) { + s->data[offset * 2] = cpu_to_le16(val); + s->data[offset * 2 + 1] = 0; + } else { + s->data[offset] = cpu_to_le16(val); + } } } @@ -331,15 +348,15 @@ static void dp8393x_do_read_rra(dp8393xState *s) s->regs[SONIC_RRP] = s->regs[SONIC_RSA]; } - /* Check resource exhaustion */ + /* Warn the host if CRBA now has the last available resource */ if (s->regs[SONIC_RRP] == s->regs[SONIC_RWP]) { s->regs[SONIC_ISR] |= SONIC_ISR_RBE; dp8393x_update_irq(s); } - /* Done */ - s->regs[SONIC_CR] &= ~SONIC_CR_RRRA; + /* Allow packet reception */ + s->last_rba_is_full = false; } static void dp8393x_do_software_reset(dp8393xState *s) @@ -509,7 +526,7 @@ static void dp8393x_do_transmit_packets(dp8393xState *s) MEMTXATTRS_UNSPECIFIED, s->data, size); s->regs[SONIC_CTDA] = dp8393x_get(s, width, 0) & ~0x1; - if (dp8393x_get(s, width, 0) & 0x1) { + if (dp8393x_get(s, width, 0) & SONIC_DESC_EOL) { /* EOL detected */ break; } @@ -550,8 +567,10 @@ static void dp8393x_do_command(dp8393xState *s, uint16_t command) dp8393x_do_start_timer(s); if (command & SONIC_CR_RST) dp8393x_do_software_reset(s); - if (command & SONIC_CR_RRRA) + if (command & SONIC_CR_RRRA) { dp8393x_do_read_rra(s); + s->regs[SONIC_CR] &= ~SONIC_CR_RRRA; + } if (command & SONIC_CR_LCAM) dp8393x_do_load_cam(s); } @@ -585,7 +604,7 @@ static uint64_t dp8393x_read(void *opaque, hwaddr addr, unsigned int size) DPRINTF("read 0x%04x from reg %s\n", val, reg_names[reg]); - return val; + return s->big_endian ? val << 16 : val; } static void dp8393x_write(void *opaque, hwaddr addr, uint64_t data, @@ -593,13 +612,14 @@ static void dp8393x_write(void *opaque, hwaddr addr, uint64_t data, { dp8393xState *s = opaque; int reg = addr >> s->it_shift; + uint32_t val = s->big_endian ? data >> 16 : data; - DPRINTF("write 0x%04x to reg %s\n", (uint16_t)data, reg_names[reg]); + DPRINTF("write 0x%04x to reg %s\n", (uint16_t)val, reg_names[reg]); switch (reg) { /* Command register */ case SONIC_CR: - dp8393x_do_command(s, data); + dp8393x_do_command(s, val); break; /* Prevent write to read-only registers */ case SONIC_CAP2: @@ -612,59 +632,60 @@ static void dp8393x_write(void *opaque, hwaddr addr, uint64_t data, /* Accept write to some registers only when in reset mode */ case SONIC_DCR: if (s->regs[SONIC_CR] & SONIC_CR_RST) { - s->regs[reg] = data & 0xbfff; + s->regs[reg] = val & 0xbfff; } else { DPRINTF("writing to DCR invalid\n"); } break; case SONIC_DCR2: if (s->regs[SONIC_CR] & SONIC_CR_RST) { - s->regs[reg] = data & 0xf017; + s->regs[reg] = val & 0xf017; } else { DPRINTF("writing to DCR2 invalid\n"); } break; /* 12 lower bytes are Read Only */ case SONIC_TCR: - s->regs[reg] = data & 0xf000; + s->regs[reg] = val & 0xf000; break; /* 9 lower bytes are Read Only */ case SONIC_RCR: - s->regs[reg] = data & 0xffe0; + s->regs[reg] = val & 0xffe0; break; /* Ignore most significant bit */ case SONIC_IMR: - s->regs[reg] = data & 0x7fff; + s->regs[reg] = val & 0x7fff; dp8393x_update_irq(s); break; /* Clear bits by writing 1 to them */ case SONIC_ISR: - data &= s->regs[reg]; - s->regs[reg] &= ~data; - if (data & SONIC_ISR_RBE) { + val &= s->regs[reg]; + s->regs[reg] &= ~val; + if (val & SONIC_ISR_RBE) { dp8393x_do_read_rra(s); } dp8393x_update_irq(s); - if (dp8393x_can_receive(s->nic->ncs)) { - qemu_flush_queued_packets(qemu_get_queue(s->nic)); - } break; - /* Ignore least significant bit */ + /* The guest is required to store aligned pointers here */ case SONIC_RSA: case SONIC_REA: case SONIC_RRP: case SONIC_RWP: - s->regs[reg] = data & 0xfffe; + if (s->regs[SONIC_DCR] & SONIC_DCR_DW) { + s->regs[reg] = val & 0xfffc; + } else { + s->regs[reg] = val & 0xfffe; + } break; /* Invert written value for some registers */ case SONIC_CRCT: case SONIC_FAET: case SONIC_MPT: - s->regs[reg] = data ^ 0xffff; + s->regs[reg] = val ^ 0xffff; break; /* All other registers have no special contrainst */ default: - s->regs[reg] = data; + s->regs[reg] = val; } if (reg == SONIC_WT0 || reg == SONIC_WT1) { @@ -675,8 +696,8 @@ static void dp8393x_write(void *opaque, hwaddr addr, uint64_t data, static const MemoryRegionOps dp8393x_ops = { .read = dp8393x_read, .write = dp8393x_write, - .impl.min_access_size = 2, - .impl.max_access_size = 2, + .impl.min_access_size = 4, + .impl.max_access_size = 4, .endianness = DEVICE_NATIVE_ENDIAN, }; @@ -703,8 +724,6 @@ static int dp8393x_can_receive(NetClientState *nc) if (!(s->regs[SONIC_CR] & SONIC_CR_RXEN)) return 0; - if (s->regs[SONIC_ISR] & SONIC_ISR_RBE) - return 0; return 1; } @@ -743,40 +762,69 @@ static int dp8393x_receive_filter(dp8393xState *s, const uint8_t * buf, } static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf, - size_t size) + size_t pkt_size) { dp8393xState *s = qemu_get_nic_opaque(nc); int packet_type; uint32_t available, address; - int width, rx_len = size; + int width, rx_len, padded_len; uint32_t checksum; - - width = (s->regs[SONIC_DCR] & SONIC_DCR_DW) ? 2 : 1; + int size; s->regs[SONIC_RCR] &= ~(SONIC_RCR_PRX | SONIC_RCR_LBK | SONIC_RCR_FAER | SONIC_RCR_CRCR | SONIC_RCR_LPKT | SONIC_RCR_BC | SONIC_RCR_MC); - packet_type = dp8393x_receive_filter(s, buf, size); + if (s->last_rba_is_full) { + return pkt_size; + } + + rx_len = pkt_size + sizeof(checksum); + if (s->regs[SONIC_DCR] & SONIC_DCR_DW) { + width = 2; + padded_len = ((rx_len - 1) | 3) + 1; + } else { + width = 1; + padded_len = ((rx_len - 1) | 1) + 1; + } + + if (padded_len > dp8393x_rbwc(s) * 2) { + DPRINTF("oversize packet, pkt_size is %d\n", pkt_size); + s->regs[SONIC_ISR] |= SONIC_ISR_RBAE; + dp8393x_update_irq(s); + s->regs[SONIC_RCR] |= SONIC_RCR_LPKT; + goto done; + } + + packet_type = dp8393x_receive_filter(s, buf, pkt_size); if (packet_type < 0) { DPRINTF("packet not for netcard\n"); return -1; } - /* XXX: Check byte ordering */ - /* Check for EOL */ - if (s->regs[SONIC_LLFA] & 0x1) { + if (s->regs[SONIC_LLFA] & SONIC_DESC_EOL) { /* Are we still in resource exhaustion? */ size = sizeof(uint16_t) * 1 * width; address = dp8393x_crda(s) + sizeof(uint16_t) * 5 * width; address_space_read(&s->as, address, MEMTXATTRS_UNSPECIFIED, s->data, size); - if (dp8393x_get(s, width, 0) & 0x1) { + s->regs[SONIC_LLFA] = dp8393x_get(s, width, 0); + if (s->regs[SONIC_LLFA] & SONIC_DESC_EOL) { /* Still EOL ; stop reception */ return -1; - } else { - s->regs[SONIC_CRDA] = s->regs[SONIC_LLFA]; } + /* Link has been updated by host */ + + /* Clear in_use */ + size = sizeof(uint16_t) * width; + address = dp8393x_crda(s) + sizeof(uint16_t) * 6 * width; + dp8393x_put(s, width, 0, 0); + address_space_rw(&s->as, address, MEMTXATTRS_UNSPECIFIED, + (uint8_t *)s->data, size, 1); + + /* Move to next descriptor */ + s->regs[SONIC_CRDA] = s->regs[SONIC_LLFA]; + s->regs[SONIC_ISR] |= SONIC_ISR_PKTRX; } /* Save current position */ @@ -784,21 +832,32 @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf, s->regs[SONIC_TRBA0] = s->regs[SONIC_CRBA0]; /* Calculate the ethernet checksum */ - checksum = cpu_to_le32(crc32(0, buf, rx_len)); + checksum = cpu_to_le32(crc32(0, buf, pkt_size)); /* Put packet into RBA */ DPRINTF("Receive packet at %08x\n", dp8393x_crba(s)); address = dp8393x_crba(s); address_space_write(&s->as, address, MEMTXATTRS_UNSPECIFIED, - buf, rx_len); - address += rx_len; + buf, pkt_size); + address += pkt_size; + + /* Put frame checksum into RBA */ address_space_write(&s->as, address, MEMTXATTRS_UNSPECIFIED, - &checksum, 4); - rx_len += 4; + &checksum, sizeof(checksum)); + address += sizeof(checksum); + + /* Pad short packets to keep pointers aligned */ + if (rx_len < padded_len) { + size = padded_len - rx_len; + address_space_rw(&s->as, address, MEMTXATTRS_UNSPECIFIED, + (uint8_t *)"\xFF\xFF\xFF", size, 1); + address += size; + } + s->regs[SONIC_CRBA1] = address >> 16; s->regs[SONIC_CRBA0] = address & 0xffff; available = dp8393x_rbwc(s); - available -= rx_len / 2; + available -= padded_len >> 1; s->regs[SONIC_RBWC1] = available >> 16; s->regs[SONIC_RBWC0] = available & 0xffff; @@ -825,39 +884,46 @@ static ssize_t dp8393x_receive(NetClientState *nc, const uint8_t * buf, MEMTXATTRS_UNSPECIFIED, s->data, size); - /* Move to next descriptor */ + /* Check link field */ size = sizeof(uint16_t) * width; address_space_read(&s->as, dp8393x_crda(s) + sizeof(uint16_t) * 5 * width, MEMTXATTRS_UNSPECIFIED, s->data, size); s->regs[SONIC_LLFA] = dp8393x_get(s, width, 0); - if (s->regs[SONIC_LLFA] & 0x1) { + if (s->regs[SONIC_LLFA] & SONIC_DESC_EOL) { /* EOL detected */ s->regs[SONIC_ISR] |= SONIC_ISR_RDE; } else { - /* Clear in_use, but it is always 16bit wide */ - int offset = dp8393x_crda(s) + sizeof(uint16_t) * 6 * width; - if (s->big_endian && width == 2) { - /* we need to adjust the offset of the 16bit field */ - offset += sizeof(uint16_t); - } - s->data[0] = 0; - address_space_write(&s->as, offset, MEMTXATTRS_UNSPECIFIED, - s->data, sizeof(uint16_t)); + /* Clear in_use */ + size = sizeof(uint16_t) * width; + address = dp8393x_crda(s) + sizeof(uint16_t) * 6 * width; + dp8393x_put(s, width, 0, 0); + address_space_write(&s->as, address, MEMTXATTRS_UNSPECIFIED, + s->data, size); + + /* Move to next descriptor */ s->regs[SONIC_CRDA] = s->regs[SONIC_LLFA]; s->regs[SONIC_ISR] |= SONIC_ISR_PKTRX; - s->regs[SONIC_RSC] = (s->regs[SONIC_RSC] & 0xff00) | (((s->regs[SONIC_RSC] & 0x00ff) + 1) & 0x00ff); + } + + dp8393x_update_irq(s); + + s->regs[SONIC_RSC] = (s->regs[SONIC_RSC] & 0xff00) | + ((s->regs[SONIC_RSC] + 1) & 0x00ff); - if (s->regs[SONIC_RCR] & SONIC_RCR_LPKT) { - /* Read next RRA */ +done: + + if (s->regs[SONIC_RCR] & SONIC_RCR_LPKT) { + if (s->regs[SONIC_RRP] == s->regs[SONIC_RWP]) { + /* Stop packet reception */ + s->last_rba_is_full = true; + } else { + /* Read next resource */ dp8393x_do_read_rra(s); } } - /* Done */ - dp8393x_update_irq(s); - - return size; + return pkt_size; } static void dp8393x_reset(DeviceState *dev) @@ -866,6 +932,7 @@ static void dp8393x_reset(DeviceState *dev) timer_del(s->watchdog); memset(s->regs, 0, sizeof(s->regs)); + s->regs[SONIC_SR] = 0x0004; /* only revision recognized by Linux/mips */ s->regs[SONIC_CR] = SONIC_CR_RST | SONIC_CR_STP | SONIC_CR_RXDIS; s->regs[SONIC_DCR] &= ~(SONIC_DCR_EXBUS | SONIC_DCR_LBR); s->regs[SONIC_RCR] &= ~(SONIC_RCR_LB0 | SONIC_RCR_LB1 | SONIC_RCR_BRD | SONIC_RCR_RNT); @@ -918,7 +985,6 @@ static void dp8393x_realize(DeviceState *dev, Error **errp) qemu_format_nic_info_str(qemu_get_queue(s->nic), s->conf.macaddr.a); s->watchdog = timer_new_ns(QEMU_CLOCK_VIRTUAL, dp8393x_watchdog, s); - s->regs[SONIC_SR] = 0x0004; /* only revision recognized by Linux */ memory_region_init_ram(&s->prom, OBJECT(dev), "dp8393x-prom", SONIC_PROM_SIZE, &local_err); diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c index 9b76f82..94ea34d 100644 --- a/hw/net/e1000e_core.c +++ b/hw/net/e1000e_core.c @@ -582,7 +582,7 @@ e1000e_rss_calc_hash(E1000ECore *core, type = NetPktRssIpV4Tcp; break; case E1000_MRQ_RSS_TYPE_IPV6TCP: - type = NetPktRssIpV6Tcp; + type = NetPktRssIpV6TcpEx; break; case E1000_MRQ_RSS_TYPE_IPV6: type = NetPktRssIpV6; @@ -2813,12 +2813,15 @@ e1000e_set_eitr(E1000ECore *core, int index, uint32_t val) static void e1000e_set_psrctl(E1000ECore *core, int index, uint32_t val) { - if ((val & E1000_PSRCTL_BSIZE0_MASK) == 0) { - hw_error("e1000e: PSRCTL.BSIZE0 cannot be zero"); - } + if (core->mac[RCTL] & E1000_RCTL_DTYP_MASK) { + + if ((val & E1000_PSRCTL_BSIZE0_MASK) == 0) { + hw_error("e1000e: PSRCTL.BSIZE0 cannot be zero"); + } - if ((val & E1000_PSRCTL_BSIZE1_MASK) == 0) { - hw_error("e1000e: PSRCTL.BSIZE1 cannot be zero"); + if ((val & E1000_PSRCTL_BSIZE1_MASK) == 0) { + hw_error("e1000e: PSRCTL.BSIZE1 cannot be zero"); + } } core->mac[PSRCTL] = val; diff --git a/hw/net/net_rx_pkt.c b/hw/net/net_rx_pkt.c index 98a5030..1e1c504 100644 --- a/hw/net/net_rx_pkt.c +++ b/hw/net/net_rx_pkt.c @@ -307,6 +307,20 @@ _net_rx_rss_prepare_tcp(uint8_t *rss_input, &tcphdr->th_dport, sizeof(uint16_t)); } +static inline void +_net_rx_rss_prepare_udp(uint8_t *rss_input, + struct NetRxPkt *pkt, + size_t *bytes_written) +{ + struct udp_header *udphdr = &pkt->l4hdr_info.hdr.udp; + + _net_rx_rss_add_chunk(rss_input, bytes_written, + &udphdr->uh_sport, sizeof(uint16_t)); + + _net_rx_rss_add_chunk(rss_input, bytes_written, + &udphdr->uh_dport, sizeof(uint16_t)); +} + uint32_t net_rx_pkt_calc_rss_hash(struct NetRxPkt *pkt, NetRxPktRssType type, @@ -334,7 +348,7 @@ net_rx_pkt_calc_rss_hash(struct NetRxPkt *pkt, assert(pkt->isip6); assert(pkt->istcp); trace_net_rx_pkt_rss_ip6_tcp(); - _net_rx_rss_prepare_ip6(&rss_input[0], pkt, true, &rss_length); + _net_rx_rss_prepare_ip6(&rss_input[0], pkt, false, &rss_length); _net_rx_rss_prepare_tcp(&rss_input[0], pkt, &rss_length); break; case NetPktRssIpV6: @@ -347,6 +361,34 @@ net_rx_pkt_calc_rss_hash(struct NetRxPkt *pkt, trace_net_rx_pkt_rss_ip6_ex(); _net_rx_rss_prepare_ip6(&rss_input[0], pkt, true, &rss_length); break; + case NetPktRssIpV6TcpEx: + assert(pkt->isip6); + assert(pkt->istcp); + trace_net_rx_pkt_rss_ip6_ex_tcp(); + _net_rx_rss_prepare_ip6(&rss_input[0], pkt, true, &rss_length); + _net_rx_rss_prepare_tcp(&rss_input[0], pkt, &rss_length); + break; + case NetPktRssIpV4Udp: + assert(pkt->isip4); + assert(pkt->isudp); + trace_net_rx_pkt_rss_ip4_udp(); + _net_rx_rss_prepare_ip4(&rss_input[0], pkt, &rss_length); + _net_rx_rss_prepare_udp(&rss_input[0], pkt, &rss_length); + break; + case NetPktRssIpV6Udp: + assert(pkt->isip6); + assert(pkt->isudp); + trace_net_rx_pkt_rss_ip6_udp(); + _net_rx_rss_prepare_ip6(&rss_input[0], pkt, false, &rss_length); + _net_rx_rss_prepare_udp(&rss_input[0], pkt, &rss_length); + break; + case NetPktRssIpV6UdpEx: + assert(pkt->isip6); + assert(pkt->isudp); + trace_net_rx_pkt_rss_ip6_ex_udp(); + _net_rx_rss_prepare_ip6(&rss_input[0], pkt, true, &rss_length); + _net_rx_rss_prepare_udp(&rss_input[0], pkt, &rss_length); + break; default: assert(false); break; diff --git a/hw/net/net_rx_pkt.h b/hw/net/net_rx_pkt.h index 7adf0fa..048e346 100644 --- a/hw/net/net_rx_pkt.h +++ b/hw/net/net_rx_pkt.h @@ -133,7 +133,11 @@ typedef enum { NetPktRssIpV4Tcp, NetPktRssIpV6Tcp, NetPktRssIpV6, - NetPktRssIpV6Ex + NetPktRssIpV6Ex, + NetPktRssIpV6TcpEx, + NetPktRssIpV4Udp, + NetPktRssIpV6Udp, + NetPktRssIpV6UdpEx, } NetRxPktRssType; /** diff --git a/hw/net/trace-events b/hw/net/trace-events index 42066fc..a1da98a 100644 --- a/hw/net/trace-events +++ b/hw/net/trace-events @@ -92,9 +92,13 @@ net_rx_pkt_l3_csum_validate_csum(size_t l3hdr_off, uint32_t csl, uint32_t cntr, net_rx_pkt_rss_ip4(void) "Calculating IPv4 RSS hash" net_rx_pkt_rss_ip4_tcp(void) "Calculating IPv4/TCP RSS hash" +net_rx_pkt_rss_ip4_udp(void) "Calculating IPv4/UDP RSS hash" net_rx_pkt_rss_ip6_tcp(void) "Calculating IPv6/TCP RSS hash" +net_rx_pkt_rss_ip6_udp(void) "Calculating IPv6/UDP RSS hash" net_rx_pkt_rss_ip6(void) "Calculating IPv6 RSS hash" net_rx_pkt_rss_ip6_ex(void) "Calculating IPv6/EX RSS hash" +net_rx_pkt_rss_ip6_ex_tcp(void) "Calculating IPv6/EX/TCP RSS hash" +net_rx_pkt_rss_ip6_ex_udp(void) "Calculating IPv6/EX/UDP RSS hash" net_rx_pkt_rss_hash(size_t rss_length, uint32_t rss_hash) "RSS hash for %zu bytes: 0x%X" net_rx_pkt_rss_add_chunk(void* ptr, size_t size, size_t input_offset) "Add RSS chunk %p, %zu bytes, RSS input offset %zu bytes" diff --git a/include/net/filter.h b/include/net/filter.h index e8fb625..9393c59 100644 --- a/include/net/filter.h +++ b/include/net/filter.h @@ -62,6 +62,8 @@ struct NetFilterState { NetClientState *netdev; NetFilterDirection direction; bool on; + char *position; + bool insert_before_flag; QTAILQ_ENTRY(NetFilterState) next; }; diff --git a/net/filter.c b/net/filter.c index 4b932e7..8221666 100644 --- a/net/filter.c +++ b/net/filter.c @@ -171,11 +171,47 @@ static void netfilter_set_status(Object *obj, const char *str, Error **errp) } } +static char *netfilter_get_position(Object *obj, Error **errp) +{ + NetFilterState *nf = NETFILTER(obj); + + return g_strdup(nf->position); +} + +static void netfilter_set_position(Object *obj, const char *str, Error **errp) +{ + NetFilterState *nf = NETFILTER(obj); + + nf->position = g_strdup(str); +} + +static char *netfilter_get_insert(Object *obj, Error **errp) +{ + NetFilterState *nf = NETFILTER(obj); + + return nf->insert_before_flag ? g_strdup("before") : g_strdup("behind"); +} + +static void netfilter_set_insert(Object *obj, const char *str, Error **errp) +{ + NetFilterState *nf = NETFILTER(obj); + + if (strcmp(str, "before") && strcmp(str, "behind")) { + error_setg(errp, "Invalid value for netfilter insert, " + "should be 'before' or 'behind'"); + return; + } + + nf->insert_before_flag = !strcmp(str, "before"); +} + static void netfilter_init(Object *obj) { NetFilterState *nf = NETFILTER(obj); nf->on = true; + nf->insert_before_flag = false; + nf->position = g_strdup("tail"); object_property_add_str(obj, "netdev", netfilter_get_netdev_id, netfilter_set_netdev_id, @@ -187,11 +223,18 @@ static void netfilter_init(Object *obj) object_property_add_str(obj, "status", netfilter_get_status, netfilter_set_status, NULL); + object_property_add_str(obj, "position", + netfilter_get_position, netfilter_set_position, + NULL); + object_property_add_str(obj, "insert", + netfilter_get_insert, netfilter_set_insert, + NULL); } static void netfilter_complete(UserCreatable *uc, Error **errp) { NetFilterState *nf = NETFILTER(uc); + NetFilterState *position = NULL; NetClientState *ncs[MAX_QUEUE_NUM]; NetFilterClass *nfc = NETFILTER_GET_CLASS(uc); int queues; @@ -219,6 +262,41 @@ static void netfilter_complete(UserCreatable *uc, Error **errp) return; } + if (strcmp(nf->position, "head") && strcmp(nf->position, "tail")) { + Object *container; + Object *obj; + char *position_id; + + if (!g_str_has_prefix(nf->position, "id=")) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "position", + "'head', 'tail' or 'id=<id>'"); + return; + } + + /* get the id from the string */ + position_id = g_strndup(nf->position + 3, strlen(nf->position) - 3); + + /* Search for the position to insert before/behind */ + container = object_get_objects_root(); + obj = object_resolve_path_component(container, position_id); + if (!obj) { + error_setg(errp, "filter '%s' not found", position_id); + g_free(position_id); + return; + } + + position = NETFILTER(obj); + + if (position->netdev != ncs[0]) { + error_setg(errp, "filter '%s' belongs to a different netdev", + position_id); + g_free(position_id); + return; + } + + g_free(position_id); + } + nf->netdev = ncs[0]; if (nfc->setup) { @@ -228,7 +306,18 @@ static void netfilter_complete(UserCreatable *uc, Error **errp) return; } } - QTAILQ_INSERT_TAIL(&nf->netdev->filters, nf, next); + + if (position) { + if (nf->insert_before_flag) { + QTAILQ_INSERT_BEFORE(position, nf, next); + } else { + QTAILQ_INSERT_AFTER(&nf->netdev->filters, position, nf, next); + } + } else if (!strcmp(nf->position, "head")) { + QTAILQ_INSERT_HEAD(&nf->netdev->filters, nf, next); + } else if (!strcmp(nf->position, "tail")) { + QTAILQ_INSERT_TAIL(&nf->netdev->filters, nf, next); + } } static void netfilter_finalize(Object *obj) @@ -245,6 +334,7 @@ static void netfilter_finalize(Object *obj) QTAILQ_REMOVE(&nf->netdev->filters, nf, next); } g_free(nf->netdev_id); + g_free(nf->position); } static void default_handle_event(NetFilterState *nf, int event, Error **errp) diff --git a/qemu-options.hx b/qemu-options.hx index ac315c1..084a1c1 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -2330,7 +2330,7 @@ DEF("netdev", HAS_ARG, QEMU_OPTION_netdev, " Linux kernel 3.3+ as well as most routers can talk\n" " L2TPv3. This transport allows connecting a VM to a VM,\n" " VM to a router and even VM to Host. It is a nearly-universal\n" - " standard (RFC3391). Note - this implementation uses static\n" + " standard (RFC3931). Note - this implementation uses static\n" " pre-configured tunnels (same as the Linux kernel).\n" " use 'src=' to specify source address\n" " use 'dst=' to specify destination address\n" @@ -2737,7 +2737,7 @@ Example (send packets from host's 1.2.3.4): @end example @item -netdev l2tpv3,id=@var{id},src=@var{srcaddr},dst=@var{dstaddr}[,srcport=@var{srcport}][,dstport=@var{dstport}],txsession=@var{txsession}[,rxsession=@var{rxsession}][,ipv6][,udp][,cookie64][,counter][,pincounter][,txcookie=@var{txcookie}][,rxcookie=@var{rxcookie}][,offset=@var{offset}] -Configure a L2TPv3 pseudowire host network backend. L2TPv3 (RFC3391) is a +Configure a L2TPv3 pseudowire host network backend. L2TPv3 (RFC3931) is a popular protocol to transport Ethernet (and other Layer 2) data frames between two systems. It is present in routers, firewalls and the Linux kernel (from version 3.3 onwards). @@ -4546,7 +4546,7 @@ applications, they can do this through this parameter. Its format is a gnutls priority string as described at @url{https://gnutls.org/manual/html_node/Priority-Strings.html}. -@item -object filter-buffer,id=@var{id},netdev=@var{netdevid},interval=@var{t}[,queue=@var{all|rx|tx}][,status=@var{on|off}] +@item -object filter-buffer,id=@var{id},netdev=@var{netdevid},interval=@var{t}[,queue=@var{all|rx|tx}][,status=@var{on|off}][,position=@var{head|tail|id=<id>}][,insert=@var{behind|before}] Interval @var{t} can't be 0, this filter batches the packet delivery: all packets arriving in a given interval on netdev @var{netdevid} are delayed @@ -4565,11 +4565,32 @@ queue @var{all|rx|tx} is an option that can be applied to any netfilter. @option{tx}: the filter is attached to the transmit queue of the netdev, where it will receive packets sent by the netdev. -@item -object filter-mirror,id=@var{id},netdev=@var{netdevid},outdev=@var{chardevid},queue=@var{all|rx|tx}[,vnet_hdr_support] +position @var{head|tail|id=<id>} is an option to specify where the +filter should be inserted in the filter list. It can be applied to any +netfilter. + +@option{head}: the filter is inserted at the head of the filter + list, before any existing filters. + +@option{tail}: the filter is inserted at the tail of the filter + list, behind any existing filters (default). + +@option{id=<id>}: the filter is inserted before or behind the filter + specified by <id>, see the insert option below. + +insert @var{behind|before} is an option to specify where to insert the +new filter relative to the one specified with position=id=<id>. It can +be applied to any netfilter. + +@option{before}: insert before the specified filter. + +@option{behind}: insert behind the specified filter (default). + +@item -object filter-mirror,id=@var{id},netdev=@var{netdevid},outdev=@var{chardevid},queue=@var{all|rx|tx}[,vnet_hdr_support][,position=@var{head|tail|id=<id>}][,insert=@var{behind|before}] filter-mirror on netdev @var{netdevid},mirror net packet to chardev@var{chardevid}, if it has the vnet_hdr_support flag, filter-mirror will mirror packet with vnet_hdr_len. -@item -object filter-redirector,id=@var{id},netdev=@var{netdevid},indev=@var{chardevid},outdev=@var{chardevid},queue=@var{all|rx|tx}[,vnet_hdr_support] +@item -object filter-redirector,id=@var{id},netdev=@var{netdevid},indev=@var{chardevid},outdev=@var{chardevid},queue=@var{all|rx|tx}[,vnet_hdr_support][,position=@var{head|tail|id=<id>}][,insert=@var{behind|before}] filter-redirector on netdev @var{netdevid},redirect filter's net packet to chardev @var{chardevid},and redirect indev's packet to filter.if it has the vnet_hdr_support flag, @@ -4578,7 +4599,7 @@ Create a filter-redirector we need to differ outdev id from indev id, id can not be the same. we can just use indev or outdev, but at least one of indev or outdev need to be specified. -@item -object filter-rewriter,id=@var{id},netdev=@var{netdevid},queue=@var{all|rx|tx},[vnet_hdr_support] +@item -object filter-rewriter,id=@var{id},netdev=@var{netdevid},queue=@var{all|rx|tx},[vnet_hdr_support][,position=@var{head|tail|id=<id>}][,insert=@var{behind|before}] Filter-rewriter is a part of COLO project.It will rewrite tcp packet to secondary from primary to keep secondary tcp connection,and rewrite @@ -4591,7 +4612,7 @@ colo secondary: -object filter-redirector,id=f2,netdev=hn0,queue=rx,outdev=red1 -object filter-rewriter,id=rew0,netdev=hn0,queue=all -@item -object filter-dump,id=@var{id},netdev=@var{dev}[,file=@var{filename}][,maxlen=@var{len}] +@item -object filter-dump,id=@var{id},netdev=@var{dev}[,file=@var{filename}][,maxlen=@var{len}][,position=@var{head|tail|id=<id>}][,insert=@var{behind|before}] Dump the network traffic on netdev @var{dev} to the file specified by @var{filename}. At most @var{len} bytes (64k by default) per packet are stored. diff --git a/tests/test-replication.c b/tests/test-replication.c index 4747d8a..cbc37db 100644 --- a/tests/test-replication.c +++ b/tests/test-replication.c @@ -490,6 +490,56 @@ static void test_secondary_stop(void) teardown_secondary(); } +static void test_secondary_continuous_replication(void) +{ + BlockBackend *top_blk, *local_blk; + Error *local_err = NULL; + + top_blk = start_secondary(); + replication_start_all(REPLICATION_MODE_SECONDARY, &local_err); + g_assert(!local_err); + + /* write 0x22 to s_local_disk (IMG_SIZE / 2, IMG_SIZE) */ + local_blk = blk_by_name(S_LOCAL_DISK_ID); + test_blk_write(local_blk, 0x22, IMG_SIZE / 2, IMG_SIZE / 2, false); + + /* replication will backup s_local_disk to s_hidden_disk */ + test_blk_read(top_blk, 0x11, IMG_SIZE / 2, + IMG_SIZE / 2, 0, IMG_SIZE, false); + + /* write 0x33 to s_active_disk (0, IMG_SIZE / 2) */ + test_blk_write(top_blk, 0x33, 0, IMG_SIZE / 2, false); + + /* do failover (active commit) */ + replication_stop_all(true, &local_err); + g_assert(!local_err); + + /* it should ignore all requests from now on */ + + /* start after failover */ + replication_start_all(REPLICATION_MODE_PRIMARY, &local_err); + g_assert(!local_err); + + /* checkpoint */ + replication_do_checkpoint_all(&local_err); + g_assert(!local_err); + + /* stop */ + replication_stop_all(true, &local_err); + g_assert(!local_err); + + /* read from s_local_disk (0, IMG_SIZE / 2) */ + test_blk_read(top_blk, 0x33, 0, IMG_SIZE / 2, + 0, IMG_SIZE / 2, false); + + + /* read from s_local_disk (IMG_SIZE / 2, IMG_SIZE) */ + test_blk_read(top_blk, 0x22, IMG_SIZE / 2, + IMG_SIZE / 2, 0, IMG_SIZE, false); + + teardown_secondary(); +} + static void test_secondary_do_checkpoint(void) { BlockBackend *top_blk, *local_blk; @@ -585,6 +635,8 @@ int main(int argc, char **argv) g_test_add_func("/replication/secondary/write", test_secondary_write); g_test_add_func("/replication/secondary/start", test_secondary_start); g_test_add_func("/replication/secondary/stop", test_secondary_stop); + g_test_add_func("/replication/secondary/continuous_replication", + test_secondary_continuous_replication); g_test_add_func("/replication/secondary/do_checkpoint", test_secondary_do_checkpoint); g_test_add_func("/replication/secondary/get_error_all", |