Diffstat (limited to 'src')
-rw-r--r--  src/config/fault.h          |    3
-rw-r--r--  src/drivers/net/gve.c       | 1607
-rw-r--r--  src/drivers/net/gve.h       |  702
-rw-r--r--  src/include/ipxe/errfile.h  |    1
4 files changed, 2313 insertions, 0 deletions
diff --git a/src/config/fault.h b/src/config/fault.h
index 5024a8f..b6ee3c9 100644
--- a/src/config/fault.h
+++ b/src/config/fault.h
@@ -29,6 +29,9 @@ FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
/* Corrupt every N received PeerDist packets */
#define PEERBLK_CORRUPT_RATE 0
+/* Experience virtual machine migration on every N watchdog checks */
+#define VM_MIGRATED_RATE 0
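+
+/* For example, building with VM_MIGRATED_RATE set to 10 would
+ * synthesise a host reset on average once in every ten watchdog
+ * checks (see gve_watchdog() in drivers/net/gve.c)
+ */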
+
#include <config/local/fault.h>
#endif /* CONFIG_FAULT_H */
diff --git a/src/drivers/net/gve.c b/src/drivers/net/gve.c
new file mode 100644
index 0000000..03edc08
--- /dev/null
+++ b/src/drivers/net/gve.c
@@ -0,0 +1,1607 @@
+/*
+ * Copyright (C) 2024 Michael Brown <mbrown@fensystems.co.uk>.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * You can also choose to distribute this program under the terms of
+ * the Unmodified Binary Distribution Licence (as given in the file
+ * COPYING.UBDL), provided that you have satisfied its requirements.
+ */
+
+FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
+
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <assert.h>
+#include <byteswap.h>
+#include <ipxe/netdevice.h>
+#include <ipxe/ethernet.h>
+#include <ipxe/if_ether.h>
+#include <ipxe/iobuf.h>
+#include <ipxe/dma.h>
+#include <ipxe/pci.h>
+#include <ipxe/fault.h>
+#include "gve.h"
+
+/** @file
+ *
+ * Google Virtual Ethernet network driver
+ *
+ */
+
+/* Disambiguate the various error causes */
+#define EINFO_EIO_ADMIN_UNSET \
+ __einfo_uniqify ( EINFO_EIO, 0x00, "Uncompleted" )
+#define EIO_ADMIN_UNSET \
+ __einfo_error ( EINFO_EIO_ADMIN_UNSET )
+#define EINFO_EIO_ADMIN_ABORTED \
+ __einfo_uniqify ( EINFO_EIO, 0x10, "Aborted" )
+#define EIO_ADMIN_ABORTED \
+ __einfo_error ( EINFO_EIO_ADMIN_ABORTED )
+#define EINFO_EIO_ADMIN_EXISTS \
+ __einfo_uniqify ( EINFO_EIO, 0x11, "Already exists" )
+#define EIO_ADMIN_EXISTS \
+ __einfo_error ( EINFO_EIO_ADMIN_EXISTS )
+#define EINFO_EIO_ADMIN_CANCELLED \
+ __einfo_uniqify ( EINFO_EIO, 0x12, "Cancelled" )
+#define EIO_ADMIN_CANCELLED \
+ __einfo_error ( EINFO_EIO_ADMIN_CANCELLED )
+#define EINFO_EIO_ADMIN_DATALOSS \
+ __einfo_uniqify ( EINFO_EIO, 0x13, "Data loss" )
+#define EIO_ADMIN_DATALOSS \
+ __einfo_error ( EINFO_EIO_ADMIN_DATALOSS )
+#define EINFO_EIO_ADMIN_DEADLINE \
+ __einfo_uniqify ( EINFO_EIO, 0x14, "Deadline exceeded" )
+#define EIO_ADMIN_DEADLINE \
+ __einfo_error ( EINFO_EIO_ADMIN_DEADLINE )
+#define EINFO_EIO_ADMIN_PRECONDITION \
+ __einfo_uniqify ( EINFO_EIO, 0x15, "Failed precondition" )
+#define EIO_ADMIN_PRECONDITION \
+ __einfo_error ( EINFO_EIO_ADMIN_PRECONDITION )
+#define EINFO_EIO_ADMIN_INTERNAL \
+ __einfo_uniqify ( EINFO_EIO, 0x16, "Internal error" )
+#define EIO_ADMIN_INTERNAL \
+ __einfo_error ( EINFO_EIO_ADMIN_INTERNAL )
+#define EINFO_EIO_ADMIN_INVAL \
+ __einfo_uniqify ( EINFO_EIO, 0x17, "Invalid argument" )
+#define EIO_ADMIN_INVAL \
+ __einfo_error ( EINFO_EIO_ADMIN_INVAL )
+#define EINFO_EIO_ADMIN_NOT_FOUND \
+ __einfo_uniqify ( EINFO_EIO, 0x18, "Not found" )
+#define EIO_ADMIN_NOT_FOUND \
+ __einfo_error ( EINFO_EIO_ADMIN_NOT_FOUND )
+#define EINFO_EIO_ADMIN_RANGE \
+ __einfo_uniqify ( EINFO_EIO, 0x19, "Out of range" )
+#define EIO_ADMIN_RANGE \
+ __einfo_error ( EINFO_EIO_ADMIN_RANGE )
+#define EINFO_EIO_ADMIN_PERM \
+ __einfo_uniqify ( EINFO_EIO, 0x1a, "Permission denied" )
+#define EIO_ADMIN_PERM \
+ __einfo_error ( EINFO_EIO_ADMIN_PERM )
+#define EINFO_EIO_ADMIN_UNAUTH \
+ __einfo_uniqify ( EINFO_EIO, 0x1b, "Unauthenticated" )
+#define EIO_ADMIN_UNAUTH \
+ __einfo_error ( EINFO_EIO_ADMIN_UNAUTH )
+#define EINFO_EIO_ADMIN_RESOURCE \
+ __einfo_uniqify ( EINFO_EIO, 0x1c, "Resource exhausted" )
+#define EIO_ADMIN_RESOURCE \
+ __einfo_error ( EINFO_EIO_ADMIN_RESOURCE )
+#define EINFO_EIO_ADMIN_UNAVAIL \
+ __einfo_uniqify ( EINFO_EIO, 0x1d, "Unavailable" )
+#define EIO_ADMIN_UNAVAIL \
+ __einfo_error ( EINFO_EIO_ADMIN_UNAVAIL )
+#define EINFO_EIO_ADMIN_NOTSUP \
+ __einfo_uniqify ( EINFO_EIO, 0x1e, "Unimplemented" )
+#define EIO_ADMIN_NOTSUP \
+ __einfo_error ( EINFO_EIO_ADMIN_NOTSUP )
+#define EINFO_EIO_ADMIN_UNKNOWN \
+ __einfo_uniqify ( EINFO_EIO, 0x1f, "Unknown error" )
+#define EIO_ADMIN_UNKNOWN \
+ __einfo_error ( EINFO_EIO_ADMIN_UNKNOWN )
+#define EIO_ADMIN( status ) \
+ EUNIQ ( EINFO_EIO, ( (status) & 0x1f ), \
+ EIO_ADMIN_UNSET, EIO_ADMIN_ABORTED, EIO_ADMIN_EXISTS, \
+ EIO_ADMIN_CANCELLED, EIO_ADMIN_DATALOSS, \
+ EIO_ADMIN_DEADLINE, EIO_ADMIN_PRECONDITION, \
+ EIO_ADMIN_INTERNAL, EIO_ADMIN_INVAL, EIO_ADMIN_NOT_FOUND, \
+ EIO_ADMIN_RANGE, EIO_ADMIN_PERM, EIO_ADMIN_UNAUTH, \
+ EIO_ADMIN_RESOURCE, EIO_ADMIN_UNAVAIL, \
+ EIO_ADMIN_NOTSUP, EIO_ADMIN_UNKNOWN )
+
+/******************************************************************************
+ *
+ * Device reset
+ *
+ ******************************************************************************
+ */
+
+/**
+ * Reset hardware
+ *
+ * @v gve GVE device
+ * @ret rc Return status code
+ */
+static int gve_reset ( struct gve_nic *gve ) {
+ uint32_t pfn;
+ unsigned int i;
+
+ /* Skip reset if admin queue page frame number is already
+ * clear. Triggering a reset on an already-reset device seems
+ * to cause a delayed reset to be scheduled. This can cause
+ * the device to end up in a reset loop, where each attempt to
+ * recover from reset triggers another reset a few seconds
+ * later.
+ */
+ pfn = readl ( gve->cfg + GVE_CFG_ADMIN_PFN );
+ if ( ! pfn ) {
+ DBGC ( gve, "GVE %p skipping reset\n", gve );
+ return 0;
+ }
+
+ /* Clear admin queue page frame number */
+ writel ( 0, gve->cfg + GVE_CFG_ADMIN_PFN );
+ wmb();
+
+ /* Wait for device to reset */
+ for ( i = 0 ; i < GVE_RESET_MAX_WAIT_MS ; i++ ) {
+
+ /* Delay */
+ mdelay ( 1 );
+
+ /* Check for reset completion */
+ pfn = readl ( gve->cfg + GVE_CFG_ADMIN_PFN );
+ if ( ! pfn )
+ return 0;
+ }
+
+ DBGC ( gve, "GVE %p reset timed out (PFN %#08x devstat %#08x)\n",
+ gve, bswap_32 ( pfn ),
+ bswap_32 ( readl ( gve->cfg + GVE_CFG_DEVSTAT ) ) );
+ return -ETIMEDOUT;
+}
+
+/******************************************************************************
+ *
+ * Admin queue
+ *
+ ******************************************************************************
+ */
+
+/**
+ * Allocate admin queue
+ *
+ * @v gve GVE device
+ * @ret rc Return status code
+ */
+static int gve_admin_alloc ( struct gve_nic *gve ) {
+ struct dma_device *dma = gve->dma;
+ struct gve_admin *admin = &gve->admin;
+ struct gve_irqs *irqs = &gve->irqs;
+ struct gve_events *events = &gve->events;
+ struct gve_scratch *scratch = &gve->scratch;
+ size_t admin_len = ( GVE_ADMIN_COUNT * sizeof ( admin->cmd[0] ) );
+ size_t irqs_len = ( GVE_IRQ_COUNT * sizeof ( irqs->irq[0] ) );
+ size_t events_len = ( GVE_EVENT_MAX * sizeof ( events->event[0] ) );
+ size_t scratch_len = sizeof ( *scratch->buf );
+ int rc;
+
+ /* Allocate admin queue */
+ admin->cmd = dma_alloc ( dma, &admin->map, admin_len, GVE_ALIGN );
+ if ( ! admin->cmd ) {
+ rc = -ENOMEM;
+ goto err_admin;
+ }
+
+ /* Allocate interrupt channels */
+ irqs->irq = dma_alloc ( dma, &irqs->map, irqs_len, GVE_ALIGN );
+ if ( ! irqs->irq ) {
+ rc = -ENOMEM;
+ goto err_irqs;
+ }
+
+ /* Allocate event counters */
+ events->event = dma_alloc ( dma, &events->map, events_len, GVE_ALIGN );
+ if ( ! events->event ) {
+ rc = -ENOMEM;
+ goto err_events;
+ }
+
+ /* Allocate scratch buffer */
+ scratch->buf = dma_alloc ( dma, &scratch->map, scratch_len, GVE_ALIGN );
+ if ( ! scratch->buf ) {
+ rc = -ENOMEM;
+ goto err_scratch;
+ }
+
+ DBGC ( gve, "GVE %p AQ at [%08lx,%08lx)\n",
+ gve, virt_to_phys ( admin->cmd ),
+ ( virt_to_phys ( admin->cmd ) + admin_len ) );
+ return 0;
+
+ dma_free ( &scratch->map, scratch->buf, scratch_len );
+ err_scratch:
+ dma_free ( &events->map, events->event, events_len );
+ err_events:
+ dma_free ( &irqs->map, irqs->irq, irqs_len );
+ err_irqs:
+ dma_free ( &admin->map, admin->cmd, admin_len );
+ err_admin:
+ return rc;
+}
+
+/**
+ * Free admin queue
+ *
+ * @v gve GVE device
+ */
+static void gve_admin_free ( struct gve_nic *gve ) {
+ struct gve_admin *admin = &gve->admin;
+ struct gve_irqs *irqs = &gve->irqs;
+ struct gve_events *events = &gve->events;
+ struct gve_scratch *scratch = &gve->scratch;
+ size_t admin_len = ( GVE_ADMIN_COUNT * sizeof ( admin->cmd[0] ) );
+ size_t irqs_len = ( GVE_IRQ_COUNT * sizeof ( irqs->irq[0] ) );
+ size_t events_len = ( GVE_EVENT_MAX * sizeof ( events->event[0] ) );
+ size_t scratch_len = sizeof ( *scratch->buf );
+
+ /* Free scratch buffer */
+ dma_free ( &scratch->map, scratch->buf, scratch_len );
+
+ /* Free event counters */
+ dma_free ( &events->map, events->event, events_len );
+
+ /* Free interrupt channels */
+ dma_free ( &irqs->map, irqs->irq, irqs_len );
+
+ /* Free admin queue */
+ dma_free ( &admin->map, admin->cmd, admin_len );
+}
+
+/**
+ * Enable admin queue
+ *
+ * @v gve GVE device
+ */
+static void gve_admin_enable ( struct gve_nic *gve ) {
+ struct gve_admin *admin = &gve->admin;
+ size_t admin_len = ( GVE_ADMIN_COUNT * sizeof ( admin->cmd[0] ) );
+ physaddr_t base;
+
+ /* Reset queue */
+ admin->prod = 0;
+
+ /* Program queue addresses and capabilities */
+ base = dma ( &admin->map, admin->cmd );
+ writel ( bswap_32 ( base / GVE_PAGE_SIZE ),
+ gve->cfg + GVE_CFG_ADMIN_PFN );
+ writel ( bswap_32 ( base & 0xffffffffUL ),
+ gve->cfg + GVE_CFG_ADMIN_BASE_LO );
+ if ( sizeof ( base ) > sizeof ( uint32_t ) ) {
+ writel ( bswap_32 ( ( ( uint64_t ) base ) >> 32 ),
+ gve->cfg + GVE_CFG_ADMIN_BASE_HI );
+ } else {
+ writel ( 0, gve->cfg + GVE_CFG_ADMIN_BASE_HI );
+ }
+ writel ( bswap_16 ( admin_len ), gve->cfg + GVE_CFG_ADMIN_LEN );
+ writel ( bswap_32 ( GVE_CFG_DRVSTAT_RUN ), gve->cfg + GVE_CFG_DRVSTAT );
+}
+
+/**
+ * Get next available admin queue command slot
+ *
+ * @v gve GVE device
+ * @ret cmd Admin queue command
+ */
+static union gve_admin_command * gve_admin_command ( struct gve_nic *gve ) {
+ struct gve_admin *admin = &gve->admin;
+ union gve_admin_command *cmd;
+ unsigned int index;
+
+ /* Get next command slot */
+ index = admin->prod;
+ cmd = &admin->cmd[ index % GVE_ADMIN_COUNT ];
+
+ /* Initialise request */
+ memset ( cmd, 0, sizeof ( *cmd ) );
+
+ return cmd;
+}
+
+/**
+ * Wait for admin queue command to complete
+ *
+ * @v gve GVE device
+ * @ret rc Return status code
+ */
+static int gve_admin_wait ( struct gve_nic *gve ) {
+ struct gve_admin *admin = &gve->admin;
+ uint32_t evt;
+ uint32_t pfn;
+ unsigned int i;
+
+ /* Wait for any outstanding commands to complete */
+ for ( i = 0 ; i < GVE_ADMIN_MAX_WAIT_MS ; i++ ) {
+
+ /* Check event counter */
+ rmb();
+ evt = bswap_32 ( readl ( gve->cfg + GVE_CFG_ADMIN_EVT ) );
+ if ( evt == admin->prod )
+ return 0;
+
+ /* Check for device reset */
+ pfn = readl ( gve->cfg + GVE_CFG_ADMIN_PFN );
+ if ( ! pfn )
+ break;
+
+ /* Delay */
+ mdelay ( 1 );
+ }
+
+ DBGC ( gve, "GVE %p AQ %#02x %s (completed %#02x, status %#08x)\n",
+ gve, admin->prod, ( pfn ? "timed out" : "saw reset" ), evt,
+ bswap_32 ( readl ( gve->cfg + GVE_CFG_DEVSTAT ) ) );
+ return ( pfn ? -ETIMEDOUT : -ECONNRESET );
+}
+
+/**
+ * Issue admin queue command
+ *
+ * @v gve GVE device
+ * @ret rc Return status code
+ */
+static int gve_admin ( struct gve_nic *gve ) {
+ struct gve_admin *admin = &gve->admin;
+ union gve_admin_command *cmd;
+ unsigned int index;
+ uint32_t opcode;
+ uint32_t status;
+ int rc;
+
+ /* Ensure admin queue is idle */
+ if ( ( rc = gve_admin_wait ( gve ) ) != 0 )
+ return rc;
+
+ /* Get next command slot */
+ index = admin->prod;
+ cmd = &admin->cmd[ index % GVE_ADMIN_COUNT ];
+ opcode = cmd->hdr.opcode;
+ DBGC2 ( gve, "GVE %p AQ %#02x command %#04x request:\n",
+ gve, index, opcode );
+ DBGC2_HDA ( gve, 0, cmd, sizeof ( *cmd ) );
+
+ /* Increment producer counter */
+ admin->prod++;
+
+ /* Ring doorbell */
+ wmb();
+ writel ( bswap_32 ( admin->prod ), gve->cfg + GVE_CFG_ADMIN_DB );
+
+ /* Wait for command to complete */
+ if ( ( rc = gve_admin_wait ( gve ) ) != 0 )
+ return rc;
+
+ /* Check command status */
+ status = be32_to_cpu ( cmd->hdr.status );
+ if ( status != GVE_ADMIN_STATUS_OK ) {
+ rc = -EIO_ADMIN ( status );
+ DBGC ( gve, "GVE %p AQ %#02x command %#04x failed: %#08x\n",
+ gve, index, opcode, status );
+ DBGC_HDA ( gve, 0, cmd, sizeof ( *cmd ) );
+ DBGC ( gve, "GVE %p AQ error: %s\n", gve, strerror ( rc ) );
+ return rc;
+ }
+
+ DBGC2 ( gve, "GVE %p AQ %#02x command %#04x result:\n",
+ gve, index, opcode );
+ DBGC2_HDA ( gve, 0, cmd, sizeof ( *cmd ) );
+ return 0;
+}
+
+/**
+ * Issue simple admin queue command
+ *
+ * @v gve GVE device
+ * @v opcode Operation code
+ * @v id ID parameter (or zero if not applicable)
+ * @ret rc Return status code
+ *
+ * Several admin queue commands take either an empty parameter list or
+ * a single 32-bit ID parameter.
+ */
+static int gve_admin_simple ( struct gve_nic *gve, unsigned int opcode,
+ unsigned int id ) {
+ union gve_admin_command *cmd;
+ int rc;
+
+ /* Construct request */
+ cmd = gve_admin_command ( gve );
+ cmd->hdr.opcode = opcode;
+ cmd->simple.id = cpu_to_be32 ( id );
+
+ /* Issue command */
+ if ( ( rc = gve_admin ( gve ) ) != 0 )
+ return rc;
+
+ return 0;
+}
+
+/**
+ * Get device descriptor
+ *
+ * @v gve GVE device
+ * @ret rc Return status code
+ */
+static int gve_describe ( struct gve_nic *gve ) {
+ struct net_device *netdev = gve->netdev;
+ struct gve_device_descriptor *desc = &gve->scratch.buf->desc;
+ union gve_admin_command *cmd;
+ int rc;
+
+ /* Construct request */
+ cmd = gve_admin_command ( gve );
+ cmd->hdr.opcode = GVE_ADMIN_DESCRIBE;
+ cmd->desc.addr = cpu_to_be64 ( dma ( &gve->scratch.map, desc ) );
+ cmd->desc.ver = cpu_to_be32 ( GVE_ADMIN_DESCRIBE_VER );
+ cmd->desc.len = cpu_to_be32 ( sizeof ( *desc ) );
+
+ /* Issue command */
+ if ( ( rc = gve_admin ( gve ) ) != 0 )
+ return rc;
+ DBGC2 ( gve, "GVE %p device descriptor:\n", gve );
+ DBGC2_HDA ( gve, 0, desc, sizeof ( *desc ) );
+
+ /* Extract queue parameters */
+ gve->events.count = be16_to_cpu ( desc->counters );
+ if ( gve->events.count > GVE_EVENT_MAX )
+ gve->events.count = GVE_EVENT_MAX;
+ gve->tx.count = be16_to_cpu ( desc->tx_count );
+ gve->rx.count = be16_to_cpu ( desc->rx_count );
+ DBGC ( gve, "GVE %p using %d TX, %d RX, %d/%d events\n",
+ gve, gve->tx.count, gve->rx.count, gve->events.count,
+ be16_to_cpu ( desc->counters ) );
+
+ /* Extract network parameters */
+ build_assert ( sizeof ( desc->mac ) == ETH_ALEN );
+ memcpy ( netdev->hw_addr, &desc->mac, sizeof ( desc->mac ) );
+ netdev->mtu = be16_to_cpu ( desc->mtu );
+ netdev->max_pkt_len = ( netdev->mtu + ETH_HLEN );
+ DBGC ( gve, "GVE %p MAC %s (\"%s\") MTU %zd\n",
+ gve, eth_ntoa ( netdev->hw_addr ),
+ inet_ntoa ( desc->mac.in ), netdev->mtu );
+
+ return 0;
+}
+
+/**
+ * Configure device resources
+ *
+ * @v gve GVE device
+ * @ret rc Return status code
+ */
+static int gve_configure ( struct gve_nic *gve ) {
+ struct gve_events *events = &gve->events;
+ struct gve_irqs *irqs = &gve->irqs;
+ union gve_admin_command *cmd;
+ unsigned int db_off;
+ unsigned int i;
+ int rc;
+
+ /* Construct request */
+ cmd = gve_admin_command ( gve );
+ cmd->hdr.opcode = GVE_ADMIN_CONFIGURE;
+ cmd->conf.events =
+ cpu_to_be64 ( dma ( &events->map, events->event ) );
+ cmd->conf.irqs =
+ cpu_to_be64 ( dma ( &irqs->map, irqs->irq ) );
+ cmd->conf.num_events = cpu_to_be32 ( events->count );
+ cmd->conf.num_irqs = cpu_to_be32 ( GVE_IRQ_COUNT );
+ cmd->conf.irq_stride = cpu_to_be32 ( sizeof ( irqs->irq[0] ) );
+
+ /* Issue command */
+ if ( ( rc = gve_admin ( gve ) ) != 0 )
+ return rc;
+
+ /* Disable all interrupts */
+ for ( i = 0 ; i < GVE_IRQ_COUNT ; i++ ) {
+ db_off = ( be32_to_cpu ( irqs->irq[i].db_idx ) *
+ sizeof ( uint32_t ) );
+ DBGC ( gve, "GVE %p IRQ %d doorbell +%#04x\n", gve, i, db_off );
+ irqs->db[i] = ( gve->db + db_off );
+ writel ( bswap_32 ( GVE_IRQ_DISABLE ), irqs->db[i] );
+ }
+
+ return 0;
+}
+
+/**
+ * Deconfigure device resources
+ *
+ * @v gve GVE device
+ * @ret rc Return status code
+ */
+static int gve_deconfigure ( struct gve_nic *gve ) {
+ int rc;
+
+ /* Issue command (with meaningless ID) */
+ if ( ( rc = gve_admin_simple ( gve, GVE_ADMIN_DECONFIGURE, 0 ) ) != 0 )
+ return rc;
+
+ return 0;
+}
+
+/**
+ * Register queue page list
+ *
+ * @v gve GVE device
+ * @v qpl Queue page list
+ * @ret rc Return status code
+ */
+static int gve_register ( struct gve_nic *gve, struct gve_qpl *qpl ) {
+ struct gve_pages *pages = &gve->scratch.buf->pages;
+ union gve_admin_command *cmd;
+ physaddr_t addr;
+ unsigned int i;
+ int rc;
+
+ /* Build page address list */
+ for ( i = 0 ; i < qpl->count ; i++ ) {
+ addr = user_to_phys ( qpl->data, ( i * GVE_PAGE_SIZE ) );
+ pages->addr[i] = cpu_to_be64 ( dma_phys ( &qpl->map, addr ) );
+ }
+
+ /* Construct request */
+ cmd = gve_admin_command ( gve );
+ cmd->hdr.opcode = GVE_ADMIN_REGISTER;
+ cmd->reg.id = cpu_to_be32 ( qpl->id );
+ cmd->reg.count = cpu_to_be32 ( qpl->count );
+ cmd->reg.addr = cpu_to_be64 ( dma ( &gve->scratch.map, pages ) );
+ cmd->reg.size = cpu_to_be64 ( GVE_PAGE_SIZE );
+
+ /* Issue command */
+ if ( ( rc = gve_admin ( gve ) ) != 0 )
+ return rc;
+
+ return 0;
+}
+
+/**
+ * Unregister page list
+ *
+ * @v gve GVE device
+ * @v qpl Queue page list
+ * @ret rc Return status code
+ */
+static int gve_unregister ( struct gve_nic *gve, struct gve_qpl *qpl ) {
+ int rc;
+
+ /* Issue command */
+ if ( ( rc = gve_admin_simple ( gve, GVE_ADMIN_UNREGISTER,
+ qpl->id ) ) != 0 ) {
+ return rc;
+ }
+
+ return 0;
+}
+
+/**
+ * Construct command to create transmit queue
+ *
+ * @v queue Transmit queue
+ * @v cmd Admin queue command
+ */
+static void gve_create_tx_param ( struct gve_queue *queue,
+ union gve_admin_command *cmd ) {
+ struct gve_admin_create_tx *create = &cmd->create_tx;
+ const struct gve_queue_type *type = queue->type;
+ physaddr_t desc = user_to_phys ( queue->desc, 0 );
+
+ /* Construct request parameters */
+ create->res = cpu_to_be64 ( dma ( &queue->res_map, queue->res ) );
+ create->desc = cpu_to_be64 ( dma_phys ( &queue->desc_map, desc ) );
+ create->qpl_id = cpu_to_be32 ( type->qpl );
+ create->notify_id = cpu_to_be32 ( type->irq );
+}
+
+/**
+ * Construct command to create receive queue
+ *
+ * @v queue Receive queue
+ * @v cmd Admin queue command
+ */
+static void gve_create_rx_param ( struct gve_queue *queue,
+ union gve_admin_command *cmd ) {
+ struct gve_admin_create_rx *create = &cmd->create_rx;
+ const struct gve_queue_type *type = queue->type;
+ physaddr_t desc = user_to_phys ( queue->desc, 0 );
+ physaddr_t cmplt = user_to_phys ( queue->cmplt, 0 );
+
+ /* Construct request parameters */
+ create->notify_id = cpu_to_be32 ( type->irq );
+ create->res = cpu_to_be64 ( dma ( &queue->res_map, queue->res ) );
+ create->desc = cpu_to_be64 ( dma_phys ( &queue->desc_map, desc ) );
+ create->cmplt = cpu_to_be64 ( dma_phys ( &queue->cmplt_map, cmplt ) );
+ create->qpl_id = cpu_to_be32 ( type->qpl );
+ create->bufsz = cpu_to_be16 ( GVE_BUF_SIZE );
+}
+
+/**
+ * Create transmit or receive queue
+ *
+ * @v gve GVE device
+ * @v queue Descriptor queue
+ * @ret rc Return status code
+ */
+static int gve_create_queue ( struct gve_nic *gve, struct gve_queue *queue ) {
+ const struct gve_queue_type *type = queue->type;
+ union gve_admin_command *cmd;
+ unsigned int db_off;
+ unsigned int evt_idx;
+ int rc;
+
+ /* Reset queue */
+ queue->prod = 0;
+ queue->cons = 0;
+
+ /* Construct request */
+ cmd = gve_admin_command ( gve );
+ cmd->hdr.opcode = type->create;
+ type->param ( queue, cmd );
+
+ /* Issue command */
+ if ( ( rc = gve_admin ( gve ) ) != 0 )
+ return rc;
+
+ /* Record indices */
+ db_off = ( be32_to_cpu ( queue->res->db_idx ) * sizeof ( uint32_t ) );
+ evt_idx = be32_to_cpu ( queue->res->evt_idx );
+ DBGC ( gve, "GVE %p %s doorbell +%#04x event counter %d\n",
+ gve, type->name, db_off, evt_idx );
+ queue->db = ( gve->db + db_off );
+ assert ( evt_idx < gve->events.count );
+ queue->event = &gve->events.event[evt_idx];
+ assert ( queue->event->count == 0 );
+
+ return 0;
+}
+
+/**
+ * Destroy transmit or receive queue
+ *
+ * @v gve GVE device
+ * @v queue Descriptor queue
+ * @ret rc Return status code
+ */
+static int gve_destroy_queue ( struct gve_nic *gve, struct gve_queue *queue ) {
+ const struct gve_queue_type *type = queue->type;
+ int rc;
+
+ /* Issue command */
+ if ( ( rc = gve_admin_simple ( gve, type->destroy, 0 ) ) != 0 )
+ return rc;
+
+ return 0;
+}
+
+/******************************************************************************
+ *
+ * Network device interface
+ *
+ ******************************************************************************
+ */
+
+/**
+ * Allocate queue page list
+ *
+ * @v gve GVE device
+ * @v qpl Queue page list
+ * @v id Queue page list ID
+ * @v buffers Number of data buffers
+ * @ret rc Return status code
+ */
+static int gve_alloc_qpl ( struct gve_nic *gve, struct gve_qpl *qpl,
+ uint32_t id, unsigned int buffers ) {
+ size_t len;
+
+ /* Record ID */
+ qpl->id = id;
+
+ /* Calculate number of pages required */
+ build_assert ( GVE_BUF_SIZE <= GVE_PAGE_SIZE );
+ qpl->count = ( ( buffers + GVE_BUF_PER_PAGE - 1 ) / GVE_BUF_PER_PAGE );
+
+ /* Allocate pages (as a single block) */
+ len = ( qpl->count * GVE_PAGE_SIZE );
+ qpl->data = dma_umalloc ( gve->dma, &qpl->map, len, GVE_ALIGN );
+ if ( ! qpl->data )
+ return -ENOMEM;
+
+ DBGC ( gve, "GVE %p QPL %#08x at [%08lx,%08lx)\n",
+ gve, qpl->id, user_to_phys ( qpl->data, 0 ),
+ user_to_phys ( qpl->data, len ) );
+ return 0;
+}
+
+/**
+ * Free queue page list
+ *
+ * @v gve GVE device
+ * @v qpl Queue page list
+ */
+static void gve_free_qpl ( struct gve_nic *nic __unused,
+ struct gve_qpl *qpl ) {
+ size_t len = ( qpl->count * GVE_PAGE_SIZE );
+
+ /* Free pages */
+ dma_ufree ( &qpl->map, qpl->data, len );
+}
+
+/**
+ * Get buffer address (within queue page list address space)
+ *
+ * @v queue Descriptor queue
+ * @v index Buffer index
+ * @ret addr Buffer address within queue page list address space
+ */
+static inline __attribute__ (( always_inline )) size_t
+gve_address ( struct gve_queue *queue, unsigned int index ) {
+
+ /* We allocate sufficient pages for the maximum fill level of
+ * buffers, and reuse the pages in strict rotation as we
+ * progress through the queue.
+ */
+ return ( ( index & ( queue->fill - 1 ) ) * GVE_BUF_SIZE );
+}
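+
+/* For example, with the receive queue's fill level of 16 and 2kB
+ * buffers, buffer index 18 reuses the buffer of index 2, at offset
+ * ( 18 & 15 ) * 2048 = 0x1000 within the queue page list.
+ */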
+
+/**
+ * Get buffer address
+ *
+ * @v queue Descriptor queue
+ * @v index Buffer index
+ * @ret addr Buffer address
+ */
+static inline __attribute__ (( always_inline )) userptr_t
+gve_buffer ( struct gve_queue *queue, unsigned int index ) {
+
+ /* Pages are currently allocated as a single contiguous block */
+ return userptr_add ( queue->qpl.data, gve_address ( queue, index ) );
+}
+
+/**
+ * Calculate next receive sequence number
+ *
+ * @v seq Current sequence number, or zero to start sequence
+ * @ret next Next sequence number
+ */
+static inline __attribute__ (( always_inline )) unsigned int
+gve_next ( unsigned int seq ) {
+
+ /* The receive completion sequence number is a modulo 7
+ * counter that cycles through the non-zero three-bit values 1
+ * to 7 inclusive.
+ *
+ * Since 7 is coprime to 2^n, this ensures that the sequence
+ * number changes each time that a new completion is written
+ * to memory.
+ *
+ * Since the counter takes only non-zero values, this ensures
+ * that the sequence number changes whenever a new completion
+ * is first written to a zero-initialised completion ring.
+ */
+ seq = ( ( seq + 1 ) & GVE_RX_SEQ_MASK );
+ return ( seq ? seq : 1 );
+}
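+
+/* Starting from a zero-initialised completion ring, the sequence
+ * returned by successive calls is therefore 1, 2, 3, 4, 5, 6, 7,
+ * 1, 2, ...
+ */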
+
+/**
+ * Allocate descriptor queue
+ *
+ * @v gve GVE device
+ * @v queue Descriptor queue
+ * @ret rc Return status code
+ */
+static int gve_alloc_queue ( struct gve_nic *gve, struct gve_queue *queue ) {
+ const struct gve_queue_type *type = queue->type;
+ struct dma_device *dma = gve->dma;
+ size_t desc_len = ( queue->count * type->desc_len );
+ size_t cmplt_len = ( queue->count * type->cmplt_len );
+ size_t res_len = sizeof ( *queue->res );
+ struct gve_buffer buf;
+ size_t offset;
+ unsigned int i;
+ int rc;
+
+ /* Sanity checks */
+ if ( ( queue->count == 0 ) ||
+ ( queue->count & ( queue->count - 1 ) ) ) {
+ DBGC ( gve, "GVE %p %s invalid queue size %d\n",
+ gve, type->name, queue->count );
+ rc = -EINVAL;
+ goto err_sanity;
+ }
+
+ /* Calculate maximum fill level */
+ assert ( ( type->fill & ( type->fill - 1 ) ) == 0 );
+ queue->fill = type->fill;
+ if ( queue->fill > queue->count )
+ queue->fill = queue->count;
+ DBGC ( gve, "GVE %p %s using QPL %#08x with %d/%d descriptors\n",
+ gve, type->name, type->qpl, queue->fill, queue->count );
+
+ /* Allocate queue page list */
+ if ( ( rc = gve_alloc_qpl ( gve, &queue->qpl, type->qpl,
+ queue->fill ) ) != 0 )
+ goto err_qpl;
+
+ /* Allocate descriptors */
+ queue->desc = dma_umalloc ( dma, &queue->desc_map, desc_len,
+ GVE_ALIGN );
+ if ( ! queue->desc ) {
+ rc = -ENOMEM;
+ goto err_desc;
+ }
+ DBGC ( gve, "GVE %p %s descriptors at [%08lx,%08lx)\n",
+ gve, type->name, user_to_phys ( queue->desc, 0 ),
+ user_to_phys ( queue->desc, desc_len ) );
+
+ /* Allocate completions */
+ if ( cmplt_len ) {
+ queue->cmplt = dma_umalloc ( dma, &queue->cmplt_map, cmplt_len,
+ GVE_ALIGN );
+ if ( ! queue->cmplt ) {
+ rc = -ENOMEM;
+ goto err_cmplt;
+ }
+ DBGC ( gve, "GVE %p %s completions at [%08lx,%08lx)\n",
+ gve, type->name, user_to_phys ( queue->cmplt, 0 ),
+ user_to_phys ( queue->cmplt, cmplt_len ) );
+ }
+
+ /* Allocate queue resources */
+ queue->res = dma_alloc ( dma, &queue->res_map, res_len, GVE_ALIGN );
+ if ( ! queue->res ) {
+ rc = -ENOMEM;
+ goto err_res;
+ }
+ memset ( queue->res, 0, res_len );
+
+ /* Populate descriptor offsets */
+ offset = ( type->desc_len - sizeof ( buf ) );
+ for ( i = 0 ; i < queue->count ; i++ ) {
+ buf.addr = cpu_to_be64 ( gve_address ( queue, i ) );
+ copy_to_user ( queue->desc, offset, &buf, sizeof ( buf ) );
+ offset += type->desc_len;
+ }
+
+ return 0;
+
+ dma_free ( &queue->res_map, queue->res, res_len );
+ err_res:
+ if ( cmplt_len )
+ dma_ufree ( &queue->cmplt_map, queue->cmplt, cmplt_len );
+ err_cmplt:
+ dma_ufree ( &queue->desc_map, queue->desc, desc_len );
+ err_desc:
+ gve_free_qpl ( gve, &queue->qpl );
+ err_qpl:
+ err_sanity:
+ return rc;
+}
+
+/**
+ * Free descriptor queue
+ *
+ * @v gve GVE device
+ * @v queue Descriptor queue
+ */
+static void gve_free_queue ( struct gve_nic *gve, struct gve_queue *queue ) {
+ const struct gve_queue_type *type = queue->type;
+ size_t desc_len = ( queue->count * type->desc_len );
+ size_t cmplt_len = ( queue->count * type->cmplt_len );
+ size_t res_len = sizeof ( *queue->res );
+
+ /* Free queue resources */
+ dma_free ( &queue->res_map, queue->res, res_len );
+
+ /* Free completions, if applicable */
+ if ( cmplt_len )
+ dma_ufree ( &queue->cmplt_map, queue->cmplt, cmplt_len );
+
+ /* Free descriptors */
+ dma_ufree ( &queue->desc_map, queue->desc, desc_len );
+
+ /* Free queue page list */
+ gve_free_qpl ( gve, &queue->qpl );
+}
+
+/**
+ * Start up device
+ *
+ * @v gve GVE device
+ * @ret rc Return status code
+ */
+static int gve_start ( struct gve_nic *gve ) {
+ struct net_device *netdev = gve->netdev;
+ struct gve_queue *tx = &gve->tx;
+ struct gve_queue *rx = &gve->rx;
+ struct io_buffer *iobuf;
+ unsigned int i;
+ int rc;
+
+ /* Cancel any pending transmissions */
+ for ( i = 0 ; i < ( sizeof ( gve->tx_iobuf ) /
+ sizeof ( gve->tx_iobuf[0] ) ) ; i++ ) {
+ iobuf = gve->tx_iobuf[i];
+ gve->tx_iobuf[i] = NULL;
+ if ( iobuf )
+ netdev_tx_complete_err ( netdev, iobuf, -ECANCELED );
+ }
+
+ /* Invalidate receive completions */
+ memset_user ( rx->cmplt, 0, 0, ( rx->count * rx->type->cmplt_len ) );
+
+ /* Reset receive sequence */
+ gve->seq = gve_next ( 0 );
+
+ /* Configure device resources */
+ if ( ( rc = gve_configure ( gve ) ) != 0 )
+ goto err_configure;
+
+ /* Register transmit queue page list */
+ if ( ( rc = gve_register ( gve, &tx->qpl ) ) != 0 )
+ goto err_register_tx;
+
+ /* Register receive queue page list */
+ if ( ( rc = gve_register ( gve, &rx->qpl ) ) != 0 )
+ goto err_register_rx;
+
+ /* Create transmit queue */
+ if ( ( rc = gve_create_queue ( gve, tx ) ) != 0 )
+ goto err_create_tx;
+
+ /* Create receive queue */
+ if ( ( rc = gve_create_queue ( gve, rx ) ) != 0 )
+ goto err_create_rx;
+
+ return 0;
+
+ gve_destroy_queue ( gve, rx );
+ err_create_rx:
+ gve_destroy_queue ( gve, tx );
+ err_create_tx:
+ gve_unregister ( gve, &rx->qpl );
+ err_register_rx:
+ gve_unregister ( gve, &tx->qpl );
+ err_register_tx:
+ gve_deconfigure ( gve );
+ err_configure:
+ return rc;
+}
+
+/**
+ * Stop device
+ *
+ * @v gve GVE device
+ */
+static void gve_stop ( struct gve_nic *gve ) {
+ struct gve_queue *tx = &gve->tx;
+ struct gve_queue *rx = &gve->rx;
+
+ /* Destroy queues */
+ gve_destroy_queue ( gve, rx );
+ gve_destroy_queue ( gve, tx );
+
+ /* Unregister page lists */
+ gve_unregister ( gve, &rx->qpl );
+ gve_unregister ( gve, &tx->qpl );
+
+ /* Deconfigure device */
+ gve_deconfigure ( gve );
+}
+
+/**
+ * Device startup process
+ *
+ * @v gve GVE device
+ */
+static void gve_startup ( struct gve_nic *gve ) {
+ struct net_device *netdev = gve->netdev;
+ int rc;
+
+ /* Reset device */
+ if ( ( rc = gve_reset ( gve ) ) != 0 )
+ goto err_reset;
+
+ /* Enable admin queue */
+ gve_admin_enable ( gve );
+
+ /* Start device */
+ if ( ( rc = gve_start ( gve ) ) != 0 )
+ goto err_start;
+
+ /* Reset retry count */
+ gve->retries = 0;
+
+ /* (Ab)use link status to report startup status */
+ netdev_link_up ( netdev );
+
+ return;
+
+ gve_stop ( gve );
+ err_start:
+ err_reset:
+ DBGC ( gve, "GVE %p startup failed: %s\n", gve, strerror ( rc ) );
+ netdev_link_err ( netdev, rc );
+ if ( gve->retries++ < GVE_RESET_MAX_RETRY )
+ process_add ( &gve->startup );
+}
+
+/**
+ * Trigger startup process
+ *
+ * @v gve GVE device
+ */
+static void gve_restart ( struct gve_nic *gve ) {
+ struct net_device *netdev = gve->netdev;
+
+ /* Mark link down to inhibit polling and transmit activity */
+ netdev_link_down ( netdev );
+
+ /* Schedule startup process */
+ process_add ( &gve->startup );
+}
+
+/**
+ * Reset recovery watchdog
+ *
+ * @v timer Reset recovery watchdog timer
+ * @v over Failure indicator
+ */
+static void gve_watchdog ( struct retry_timer *timer, int over __unused ) {
+ struct gve_nic *gve = container_of ( timer, struct gve_nic, watchdog );
+ uint32_t activity;
+ uint32_t pfn;
+ int rc;
+
+ /* Reschedule watchdog */
+ start_timer_fixed ( &gve->watchdog, GVE_WATCHDOG_TIMEOUT );
+
+ /* Reset device (for test purposes) if applicable */
+ if ( ( rc = inject_fault ( VM_MIGRATED_RATE ) ) != 0 ) {
+ DBGC ( gve, "GVE %p synthesising host reset\n", gve );
+ writel ( 0, gve->cfg + GVE_CFG_ADMIN_PFN );
+ }
+
+ /* Check for activity since last timer invocation */
+ activity = ( gve->tx.cons + gve->rx.cons );
+ if ( activity != gve->activity ) {
+ gve->activity = activity;
+ return;
+ }
+
+ /* Check for reset */
+ pfn = readl ( gve->cfg + GVE_CFG_ADMIN_PFN );
+ if ( pfn ) {
+ DBGC2 ( gve, "GVE %p idle but not in reset\n", gve );
+ return;
+ }
+
+ /* Schedule restart */
+ DBGC ( gve, "GVE %p watchdog detected reset by host\n", gve );
+ gve_restart ( gve );
+}
+
+/**
+ * Open network device
+ *
+ * @v netdev Network device
+ * @ret rc Return status code
+ */
+static int gve_open ( struct net_device *netdev ) {
+ struct gve_nic *gve = netdev->priv;
+ struct gve_queue *tx = &gve->tx;
+ struct gve_queue *rx = &gve->rx;
+ int rc;
+
+ /* Allocate and prepopulate transmit queue */
+ if ( ( rc = gve_alloc_queue ( gve, tx ) ) != 0 )
+ goto err_alloc_tx;
+
+ /* Allocate and prepopulate receive queue */
+ if ( ( rc = gve_alloc_queue ( gve, rx ) ) != 0 )
+ goto err_alloc_rx;
+
+ /* Trigger startup */
+ gve_restart ( gve );
+
+ /* Start reset recovery watchdog timer */
+ start_timer_fixed ( &gve->watchdog, GVE_WATCHDOG_TIMEOUT );
+
+ return 0;
+
+ gve_free_queue ( gve, rx );
+ err_alloc_rx:
+ gve_free_queue ( gve, tx );
+ err_alloc_tx:
+ return rc;
+}
+
+/**
+ * Close network device
+ *
+ * @v netdev Network device
+ */
+static void gve_close ( struct net_device *netdev ) {
+ struct gve_nic *gve = netdev->priv;
+ struct gve_queue *tx = &gve->tx;
+ struct gve_queue *rx = &gve->rx;
+
+ /* Stop reset recovery timer */
+ stop_timer ( &gve->watchdog );
+
+ /* Terminate startup process */
+ process_del ( &gve->startup );
+
+ /* Stop and reset device */
+ gve_stop ( gve );
+ gve_reset ( gve );
+
+ /* Free queues */
+ gve_free_queue ( gve, rx );
+ gve_free_queue ( gve, tx );
+}
+
+/**
+ * Transmit packet
+ *
+ * @v netdev Network device
+ * @v iobuf I/O buffer
+ * @ret rc Return status code
+ */
+static int gve_transmit ( struct net_device *netdev, struct io_buffer *iobuf ) {
+ struct gve_nic *gve = netdev->priv;
+ struct gve_queue *tx = &gve->tx;
+ struct gve_tx_descriptor desc;
+ unsigned int count;
+ unsigned int index;
+ size_t frag_len;
+ size_t offset;
+ size_t len;
+
+ /* Do nothing if queues are not yet set up */
+ if ( ! netdev_link_ok ( netdev ) )
+ return -ENETDOWN;
+
+ /* Defer packet if there is no space in the transmit ring */
+ len = iob_len ( iobuf );
+ count = ( ( len + GVE_BUF_SIZE - 1 ) / GVE_BUF_SIZE );
+ if ( ( ( tx->prod - tx->cons ) + count ) > tx->fill ) {
+ netdev_tx_defer ( netdev, iobuf );
+ return 0;
+ }
+
+ /* Copy packet to queue pages and populate descriptors */
+ for ( offset = 0 ; offset < len ; offset += frag_len ) {
+
+ /* Sanity check */
+ assert ( gve->tx_iobuf[ tx->prod % GVE_TX_FILL ] == NULL );
+
+ /* Copy packet fragment */
+ frag_len = ( len - offset );
+ if ( frag_len > GVE_BUF_SIZE )
+ frag_len = GVE_BUF_SIZE;
+ copy_to_user ( gve_buffer ( tx, tx->prod ), 0,
+ ( iobuf->data + offset ), frag_len );
+
+ /* Populate descriptor */
+ index = ( tx->prod++ & ( tx->count - 1 ) );
+ memset ( &desc.pkt, 0, sizeof ( desc.pkt ) );
+ if ( offset ) {
+ desc.pkt.type = GVE_TX_TYPE_CONT;
+ } else {
+ desc.pkt.type = GVE_TX_TYPE_START;
+ desc.pkt.count = count;
+ desc.pkt.total = cpu_to_be16 ( len );
+ }
+ desc.pkt.len = cpu_to_be16 ( frag_len );
+ copy_to_user ( tx->desc, ( index * sizeof ( desc ) ), &desc,
+ sizeof ( desc.pkt ) );
+ DBGC2 ( gve, "GVE %p TX %#04x %#02x:%#02x len %#04x/%#04x at "
+ "%#08zx\n", gve, index, desc.pkt.type, desc.pkt.count,
+ be16_to_cpu ( desc.pkt.len ),
+ be16_to_cpu ( desc.pkt.total ),
+ gve_address ( tx, index ) );
+ }
+ assert ( ( tx->prod - tx->cons ) <= tx->fill );
+
+ /* Record I/O buffer against final descriptor */
+ gve->tx_iobuf[ ( tx->prod - 1U ) % GVE_TX_FILL ] = iobuf;
+
+ /* Ring doorbell */
+ wmb();
+ writel ( bswap_32 ( tx->prod ), tx->db );
+
+ return 0;
+}
+
+/**
+ * Poll for completed transmissions
+ *
+ * @v netdev Network device
+ */
+static void gve_poll_tx ( struct net_device *netdev ) {
+ struct gve_nic *gve = netdev->priv;
+ struct gve_queue *tx = &gve->tx;
+ struct io_buffer *iobuf;
+ uint32_t count;
+
+ /* Read event counter */
+ count = be32_to_cpu ( tx->event->count );
+
+ /* Process transmit completions */
+ while ( count != tx->cons ) {
+ DBGC2 ( gve, "GVE %p TX %#04x complete\n", gve, tx->cons );
+ iobuf = gve->tx_iobuf[ tx->cons % GVE_TX_FILL ];
+ gve->tx_iobuf[ tx->cons % GVE_TX_FILL ] = NULL;
+ tx->cons++;
+ if ( iobuf )
+ netdev_tx_complete ( netdev, iobuf );
+ }
+}
+
+/**
+ * Poll for received packets
+ *
+ * @v netdev Network device
+ */
+static void gve_poll_rx ( struct net_device *netdev ) {
+ struct gve_nic *gve = netdev->priv;
+ struct gve_queue *rx = &gve->rx;
+ struct gve_rx_completion cmplt;
+ struct io_buffer *iobuf;
+ unsigned int index;
+ unsigned int seq;
+ uint32_t cons;
+ size_t offset;
+ size_t total;
+ size_t len;
+ int rc;
+
+ /* Process receive completions */
+ cons = rx->cons;
+ seq = gve->seq;
+ total = 0;
+ while ( 1 ) {
+
+ /* Read next possible completion */
+ index = ( cons++ & ( rx->count - 1 ) );
+ offset = ( ( index * sizeof ( cmplt ) ) +
+ offsetof ( typeof ( cmplt ), pkt ) );
+ copy_from_user ( &cmplt.pkt, rx->cmplt, offset,
+ sizeof ( cmplt.pkt ) );
+
+ /* Check sequence number */
+ if ( ( cmplt.pkt.seq & GVE_RX_SEQ_MASK ) != seq )
+ break;
+ seq = gve_next ( seq );
+
+ /* Parse completion */
+ len = be16_to_cpu ( cmplt.pkt.len );
+ DBGC2 ( gve, "GVE %p RX %#04x %#02x:%#02x len %#04zx at "
+ "%#08zx\n", gve, index, cmplt.pkt.seq, cmplt.pkt.flags,
+ len, gve_address ( rx, index ) );
+
+ /* Accumulate a complete packet */
+ if ( cmplt.pkt.flags & GVE_RXF_ERROR ) {
+ total = 0;
+ } else {
+ total += len;
+ if ( cmplt.pkt.flags & GVE_RXF_MORE )
+ continue;
+ }
+ gve->seq = seq;
+
+ /* Allocate and populate I/O buffer */
+ iobuf = ( total ? alloc_iob ( total ) : NULL );
+ for ( ; rx->cons != cons ; rx->cons++ ) {
+
+ /* Re-read completion length */
+ index = ( rx->cons & ( rx->count - 1 ) );
+ offset = ( ( index * sizeof ( cmplt ) ) +
+ offsetof ( typeof ( cmplt ), pkt.len ) );
+ copy_from_user ( &cmplt.pkt, rx->cmplt, offset,
+ sizeof ( cmplt.pkt.len ) );
+
+ /* Copy data */
+ if ( iobuf ) {
+ len = be16_to_cpu ( cmplt.pkt.len );
+ copy_from_user ( iob_put ( iobuf, len ),
+ gve_buffer ( rx, rx->cons ),
+ 0, len );
+ }
+ }
+ assert ( ( iobuf == NULL ) || ( iob_len ( iobuf ) == total ) );
+ total = 0;
+
+ /* Hand off packet to network stack */
+ if ( iobuf ) {
+ iob_pull ( iobuf, GVE_RX_PAD );
+ netdev_rx ( netdev, iobuf );
+ } else {
+ rc = ( ( cmplt.pkt.flags & GVE_RXF_ERROR ) ?
+ -EIO : -ENOMEM );
+ netdev_rx_err ( netdev, NULL, rc );
+ }
+
+ /* Sanity check */
+ assert ( rx->cons == cons );
+ assert ( gve->seq == seq );
+ assert ( total == 0 );
+ }
+}
+
+/**
+ * Refill receive queue
+ *
+ * @v netdev Network device
+ */
+static void gve_refill_rx ( struct net_device *netdev ) {
+ struct gve_nic *gve = netdev->priv;
+ struct gve_queue *rx = &gve->rx;
+ unsigned int prod;
+
+ /* The receive descriptors are prepopulated at the time of
+ * creating the receive queue (pointing to the preallocated
+ * queue pages). Refilling is therefore just a case of
+ * ringing the doorbell if the device is not yet aware of any
+ * available descriptors.
+ */
+ prod = ( rx->cons + rx->fill );
+ if ( prod != rx->prod ) {
+ rx->prod = prod;
+ writel ( bswap_32 ( prod ), rx->db );
+ DBGC2 ( gve, "GVE %p RX %#04x ready\n", gve, rx->prod );
+ }
+}
+
+/**
+ * Poll for completed and received packets
+ *
+ * @v netdev Network device
+ */
+static void gve_poll ( struct net_device *netdev ) {
+
+ /* Do nothing if queues are not yet set up */
+ if ( ! netdev_link_ok ( netdev ) )
+ return;
+
+ /* Poll for transmit completions */
+ gve_poll_tx ( netdev );
+
+ /* Poll for receive completions */
+ gve_poll_rx ( netdev );
+
+ /* Refill receive queue */
+ gve_refill_rx ( netdev );
+}
+
+/** GVE network device operations */
+static struct net_device_operations gve_operations = {
+ .open = gve_open,
+ .close = gve_close,
+ .transmit = gve_transmit,
+ .poll = gve_poll,
+};
+
+/******************************************************************************
+ *
+ * PCI interface
+ *
+ ******************************************************************************
+ */
+
+/** Transmit descriptor queue type */
+static const struct gve_queue_type gve_tx_type = {
+ .name = "TX",
+ .param = gve_create_tx_param,
+ .qpl = GVE_TX_QPL,
+ .irq = GVE_TX_IRQ,
+ .fill = GVE_TX_FILL,
+ .desc_len = sizeof ( struct gve_tx_descriptor ),
+ .create = GVE_ADMIN_CREATE_TX,
+ .destroy = GVE_ADMIN_DESTROY_TX,
+};
+
+/** Receive descriptor queue type */
+static const struct gve_queue_type gve_rx_type = {
+ .name = "RX",
+ .param = gve_create_rx_param,
+ .qpl = GVE_RX_QPL,
+ .irq = GVE_RX_IRQ,
+ .fill = GVE_RX_FILL,
+ .desc_len = sizeof ( struct gve_rx_descriptor ),
+ .cmplt_len = sizeof ( struct gve_rx_completion ),
+ .create = GVE_ADMIN_CREATE_RX,
+ .destroy = GVE_ADMIN_DESTROY_RX,
+};
+
+/**
+ * Set up admin queue and get device description
+ *
+ * @v gve GVE device
+ * @ret rc Return status code
+ */
+static int gve_setup ( struct gve_nic *gve ) {
+ unsigned int i;
+ int rc;
+
+ /* Attempt several times, since the device may decide to add
+ * in a few spurious resets.
+ */
+ for ( i = 0 ; i < GVE_RESET_MAX_RETRY ; i++ ) {
+
+ /* Reset device */
+ if ( ( rc = gve_reset ( gve ) ) != 0 )
+ continue;
+
+ /* Enable admin queue */
+ gve_admin_enable ( gve );
+
+ /* Fetch MAC address */
+ if ( ( rc = gve_describe ( gve ) ) != 0 )
+ continue;
+
+ /* Success */
+ return 0;
+ }
+
+ DBGC ( gve, "GVE %p failed to get device description: %s\n",
+ gve, strerror ( rc ) );
+ return rc;
+}
+
+/** Device startup process descriptor */
+static struct process_descriptor gve_startup_desc =
+ PROC_DESC_ONCE ( struct gve_nic, startup, gve_startup );
+
+/**
+ * Probe PCI device
+ *
+ * @v pci PCI device
+ * @ret rc Return status code
+ */
+static int gve_probe ( struct pci_device *pci ) {
+ struct net_device *netdev;
+ struct gve_nic *gve;
+ unsigned long cfg_start;
+ unsigned long db_start;
+ unsigned long db_size;
+ int rc;
+
+ /* Allocate and initialise net device */
+ netdev = alloc_etherdev ( sizeof ( *gve ) );
+ if ( ! netdev ) {
+ rc = -ENOMEM;
+ goto err_alloc;
+ }
+ netdev_init ( netdev, &gve_operations );
+ gve = netdev->priv;
+ pci_set_drvdata ( pci, netdev );
+ netdev->dev = &pci->dev;
+ memset ( gve, 0, sizeof ( *gve ) );
+ gve->netdev = netdev;
+ gve->tx.type = &gve_tx_type;
+ gve->rx.type = &gve_rx_type;
+ process_init ( &gve->startup, &gve_startup_desc, &netdev->refcnt );
+ timer_init ( &gve->watchdog, gve_watchdog, &netdev->refcnt );
+
+ /* Fix up PCI device */
+ adjust_pci_device ( pci );
+
+ /* Check PCI revision */
+ pci_read_config_byte ( pci, PCI_REVISION, &gve->revision );
+ DBGC ( gve, "GVE %p is revision %#02x\n", gve, gve->revision );
+
+ /* Map configuration registers */
+ cfg_start = pci_bar_start ( pci, GVE_CFG_BAR );
+ gve->cfg = pci_ioremap ( pci, cfg_start, GVE_CFG_SIZE );
+ if ( ! gve->cfg ) {
+ rc = -ENODEV;
+ goto err_cfg;
+ }
+
+ /* Map doorbell registers */
+ db_start = pci_bar_start ( pci, GVE_DB_BAR );
+ db_size = pci_bar_size ( pci, GVE_DB_BAR );
+ gve->db = pci_ioremap ( pci, db_start, db_size );
+ if ( ! gve->db ) {
+ rc = -ENODEV;
+ goto err_db;
+ }
+
+ /* Configure DMA */
+ gve->dma = &pci->dma;
+ dma_set_mask_64bit ( gve->dma );
+ assert ( netdev->dma == NULL );
+
+ /* Allocate admin queue */
+ if ( ( rc = gve_admin_alloc ( gve ) ) != 0 )
+ goto err_admin;
+
+ /* Set up the device */
+ if ( ( rc = gve_setup ( gve ) ) != 0 )
+ goto err_setup;
+
+ /* Register network device */
+ if ( ( rc = register_netdev ( netdev ) ) != 0 )
+ goto err_register_netdev;
+
+ return 0;
+
+ unregister_netdev ( netdev );
+ err_register_netdev:
+ err_setup:
+ gve_reset ( gve );
+ gve_admin_free ( gve );
+ err_admin:
+ iounmap ( gve->db );
+ err_db:
+ iounmap ( gve->cfg );
+ err_cfg:
+ netdev_nullify ( netdev );
+ netdev_put ( netdev );
+ err_alloc:
+ return rc;
+}
+
+/**
+ * Remove PCI device
+ *
+ * @v pci PCI device
+ */
+static void gve_remove ( struct pci_device *pci ) {
+ struct net_device *netdev = pci_get_drvdata ( pci );
+ struct gve_nic *gve = netdev->priv;
+
+ /* Unregister network device */
+ unregister_netdev ( netdev );
+
+ /* Reset device */
+ gve_reset ( gve );
+
+ /* Free admin queue */
+ gve_admin_free ( gve );
+
+ /* Unmap registers */
+ iounmap ( gve->db );
+ iounmap ( gve->cfg );
+
+ /* Free network device */
+ netdev_nullify ( netdev );
+ netdev_put ( netdev );
+}
+
+/** GVE PCI device IDs */
+static struct pci_device_id gve_nics[] = {
+ PCI_ROM ( 0x1ae0, 0x0042, "gve", "gVNIC", 0 ),
+};
+
+/** GVE PCI driver */
+struct pci_driver gve_driver __pci_driver = {
+ .ids = gve_nics,
+ .id_count = ( sizeof ( gve_nics ) / sizeof ( gve_nics[0] ) ),
+ .probe = gve_probe,
+ .remove = gve_remove,
+};
diff --git a/src/drivers/net/gve.h b/src/drivers/net/gve.h
new file mode 100644
index 0000000..2845699
--- /dev/null
+++ b/src/drivers/net/gve.h
@@ -0,0 +1,702 @@
+#ifndef _GVE_H
+#define _GVE_H
+
+/** @file
+ *
+ * Google Virtual Ethernet network driver
+ *
+ * The Google Virtual Ethernet NIC (GVE or gVNIC) is found only in
+ * Google Cloud instances. There is essentially zero documentation
+ * available beyond the mostly uncommented source code in the Linux
+ * kernel.
+ */
+
+FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
+
+#include <stdint.h>
+#include <ipxe/dma.h>
+#include <ipxe/pci.h>
+#include <ipxe/in.h>
+#include <ipxe/uaccess.h>
+#include <ipxe/process.h>
+#include <ipxe/retry.h>
+
+struct gve_nic;
+
+/**
+ * A Google Cloud MAC address
+ *
+ * Google Cloud locally assigned MAC addresses encode the local IPv4
+ * address in the trailing 32 bits, presumably as a performance
+ * optimisation to allow ARP resolution to be skipped by a suitably
+ * aware network stack.
+ */
+struct google_mac {
+ /** Reserved */
+ uint8_t reserved[2];
+ /** Local IPv4 address */
+ struct in_addr in;
+} __attribute__ (( packed ));
+
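+/* For example (assuming the usual Google Cloud numbering), a VM whose
+ * internal IPv4 address is 10.0.0.2 would typically be assigned the
+ * locally administered MAC address 42:01:0a:00:00:02, with the
+ * trailing 32 bits (the "in" field above) holding 10.0.0.2.
+ */
+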
+/** Page size */
+#define GVE_PAGE_SIZE 0x1000
+
+/**
+ * Address alignment
+ *
+ * All DMA data structure base addresses seem to need to be aligned to
+ * a page boundary. (This is not documented anywhere, but is inferred
+ * from existing source code and experimentation.)
+ */
+#define GVE_ALIGN GVE_PAGE_SIZE
+
+/**
+ * Length alignment
+ *
+ * All DMA data structure lengths seem to need to be aligned to a
+ * multiple of 64 bytes. (This is not documented anywhere, but is
+ * inferred from existing source code and experimentation.)
+ */
+#define GVE_LEN_ALIGN 64
+
+/** Maximum number of pages per queue (must be a power of two) */
+#define GVE_QPL_MAX 16
+
+/** Configuration BAR */
+#define GVE_CFG_BAR PCI_BASE_ADDRESS_0
+
+/**
+ * Configuration BAR size
+ *
+ * All registers within the configuration BAR are big-endian.
+ */
+#define GVE_CFG_SIZE 0x1000
+
+/** Device status */
+#define GVE_CFG_DEVSTAT 0x0000
+#define GVE_CFG_DEVSTAT_RESET 0x00000010UL /**< Device is reset */
+
+/** Driver status */
+#define GVE_CFG_DRVSTAT 0x0004
+#define GVE_CFG_DRVSTAT_RUN 0x00000001UL /**< Run admin queue */
+
+/** Maximum time to wait for reset */
+#define GVE_RESET_MAX_WAIT_MS 500
+
+/** Admin queue page frame number (for older devices) */
+#define GVE_CFG_ADMIN_PFN 0x0010
+
+/** Admin queue doorbell */
+#define GVE_CFG_ADMIN_DB 0x0014
+
+/** Admin queue event counter */
+#define GVE_CFG_ADMIN_EVT 0x0018
+
+/** Driver version (8-bit register) */
+#define GVE_CFG_VERSION 0x001f
+
+/** Admin queue base address high 32 bits */
+#define GVE_CFG_ADMIN_BASE_HI 0x0020
+
+/** Admin queue base address low 32 bits */
+#define GVE_CFG_ADMIN_BASE_LO 0x0024
+
+/** Admin queue length (16-bit register) */
+#define GVE_CFG_ADMIN_LEN 0x0028
+
+/** Doorbell BAR */
+#define GVE_DB_BAR PCI_BASE_ADDRESS_2
+
+/**
+ * Admin queue entry header
+ *
+ * All values within admin queue entries are big-endian.
+ */
+struct gve_admin_header {
+ /** Reserved */
+ uint8_t reserved[3];
+ /** Operation code */
+ uint8_t opcode;
+ /** Status */
+ uint32_t status;
+} __attribute__ (( packed ));
+
+/** Command succeeded */
+#define GVE_ADMIN_STATUS_OK 0x00000001
+
+/** Simple admin command */
+struct gve_admin_simple {
+ /** Header */
+ struct gve_admin_header hdr;
+ /** ID */
+ uint32_t id;
+} __attribute__ (( packed ));
+
+/** Describe device command */
+#define GVE_ADMIN_DESCRIBE 0x0001
+
+/** Describe device command */
+struct gve_admin_describe {
+ /** Header */
+ struct gve_admin_header hdr;
+ /** Descriptor buffer address */
+ uint64_t addr;
+ /** Descriptor version */
+ uint32_t ver;
+ /** Descriptor maximum length */
+ uint32_t len;
+} __attribute__ (( packed ));
+
+/** Device descriptor version */
+#define GVE_ADMIN_DESCRIBE_VER 1
+
+/** Device descriptor */
+struct gve_device_descriptor {
+ /** Reserved */
+ uint8_t reserved_a[10];
+ /** Number of transmit queue entries */
+ uint16_t tx_count;
+ /** Number of receive queue entries */
+ uint16_t rx_count;
+ /** Reserved */
+ uint8_t reserved_b[2];
+ /** Maximum transmit unit */
+ uint16_t mtu;
+ /** Number of event counters */
+ uint16_t counters;
+ /** Reserved */
+ uint8_t reserved_c[4];
+ /** MAC address */
+ struct google_mac mac;
+ /** Reserved */
+ uint8_t reserved_d[10];
+} __attribute__ (( packed ));
+
+/** Configure device resources command */
+#define GVE_ADMIN_CONFIGURE 0x0002
+
+/** Configure device resources command */
+struct gve_admin_configure {
+ /** Header */
+ struct gve_admin_header hdr;
+ /** Event counter array */
+ uint64_t events;
+ /** IRQ doorbell address */
+ uint64_t irqs;
+ /** Number of event counters */
+ uint32_t num_events;
+ /** Number of IRQ doorbells */
+ uint32_t num_irqs;
+ /** IRQ doorbell stride */
+ uint32_t irq_stride;
+} __attribute__ (( packed ));
+
+/** Register page list command */
+#define GVE_ADMIN_REGISTER 0x0003
+
+/** Register page list command */
+struct gve_admin_register {
+ /** Header */
+ struct gve_admin_header hdr;
+ /** Page list ID */
+ uint32_t id;
+ /** Number of pages */
+ uint32_t count;
+ /** Address list address */
+ uint64_t addr;
+ /** Page size */
+ uint64_t size;
+} __attribute__ (( packed ));
+
+/** Page list */
+struct gve_pages {
+ /** Page address */
+ uint64_t addr[GVE_QPL_MAX];
+} __attribute__ (( packed ));
+
+/** Unregister page list command */
+#define GVE_ADMIN_UNREGISTER 0x0004
+
+/** Create transmit queue command */
+#define GVE_ADMIN_CREATE_TX 0x0005
+
+/** Create transmit queue command */
+struct gve_admin_create_tx {
+ /** Header */
+ struct gve_admin_header hdr;
+ /** Queue ID */
+ uint32_t id;
+ /** Reserved */
+ uint8_t reserved_a[4];
+ /** Queue resources address */
+ uint64_t res;
+ /** Descriptor ring address */
+ uint64_t desc;
+ /** Queue page list ID */
+ uint32_t qpl_id;
+ /** Notification channel ID */
+ uint32_t notify_id;
+} __attribute__ (( packed ));
+
+/** Create receive queue command */
+#define GVE_ADMIN_CREATE_RX 0x0006
+
+/** Create receive queue command */
+struct gve_admin_create_rx {
+ /** Header */
+ struct gve_admin_header hdr;
+ /** Queue ID */
+ uint32_t id;
+ /** Index */
+ uint32_t index;
+ /** Reserved */
+ uint8_t reserved_a[4];
+ /** Notification channel ID */
+ uint32_t notify_id;
+ /** Queue resources address */
+ uint64_t res;
+ /** Completion ring address */
+ uint64_t cmplt;
+ /** Descriptor ring address */
+ uint64_t desc;
+ /** Queue page list ID */
+ uint32_t qpl_id;
+ /** Reserved */
+ uint8_t reserved_b[2];
+ /** Packet buffer size */
+ uint16_t bufsz;
+} __attribute__ (( packed ));
+
+/** Destroy transmit queue command */
+#define GVE_ADMIN_DESTROY_TX 0x0007
+
+/** Destroy receive queue command */
+#define GVE_ADMIN_DESTROY_RX 0x0008
+
+/** Deconfigure device resources command */
+#define GVE_ADMIN_DECONFIGURE 0x0009
+
+/** An admin queue command */
+union gve_admin_command {
+ /** Header */
+ struct gve_admin_header hdr;
+ /** Simple command */
+ struct gve_admin_simple simple;
+ /** Describe device */
+ struct gve_admin_describe desc;
+ /** Configure device resources */
+ struct gve_admin_configure conf;
+ /** Register page list */
+ struct gve_admin_register reg;
+ /** Create transmit queue */
+ struct gve_admin_create_tx create_tx;
+ /** Create receive queue */
+ struct gve_admin_create_rx create_rx;
+ /** Padding */
+ uint8_t pad[64];
+};
+
+/**
+ * Number of admin queue commands
+ *
+ * This is theoretically a policy decision. However, older revisions
+ * of the hardware seem to have only the "admin queue page frame
+ * number" register and no "admin queue length" register, with the
+ * implication that the admin queue must be exactly one page in
+ * length.
+ *
+ * Choose to use a one page (4kB) admin queue for both older and newer
+ * versions of the hardware, to minimise variability.
+ */
+#define GVE_ADMIN_COUNT ( GVE_PAGE_SIZE / sizeof ( union gve_admin_command ) )
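+
+/* (With 4kB pages and 64-byte commands, this works out to 64 slots.) */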
+
+/** Admin queue */
+struct gve_admin {
+ /** Commands */
+ union gve_admin_command *cmd;
+ /** Producer counter */
+ uint32_t prod;
+ /** DMA mapping */
+ struct dma_mapping map;
+};
+
+/** Scratch buffer for admin queue commands */
+struct gve_scratch {
+ /** Buffer contents */
+ union {
+ /** Device descriptor */
+ struct gve_device_descriptor desc;
+ /** Page address list */
+ struct gve_pages pages;
+ } *buf;
+ /** DMA mapping */
+ struct dma_mapping map;
+};
+
+/**
+ * An event counter
+ *
+ * Written by the device to indicate completions. The device chooses
+ * which counter to use for each transmit queue, and stores the index
+ * of the chosen counter in the queue resources.
+ */
+struct gve_event {
+ /** Number of events that have occurred */
+ volatile uint32_t count;
+} __attribute__ (( packed ));
+
+/**
+ * Maximum number of event counters
+ *
+ * We tell the device how many event counters we have provided via the
+ * "configure device resources" admin queue command. The device will
+ * accept being given only a single counter, but will subsequently
+ * fail to create a receive queue.
+ *
+ * There is, of course, no documentation indicating how many event
+ * counters actually need to be provided. In the absence of evidence
+ * to the contrary, assume that 16 counters (i.e. the smallest number
+ * we can allocate, given the length alignment constraint on
+ * allocations) will be sufficient.
+ */
+#define GVE_EVENT_MAX ( GVE_LEN_ALIGN / sizeof ( struct gve_event ) )
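+
+/* (With the 64-byte length alignment and 4-byte counters, this works
+ * out to the 16 counters mentioned above.) */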
+
+/** Event counter array */
+struct gve_events {
+ /** Event counters */
+ struct gve_event *event;
+ /** DMA mapping */
+ struct dma_mapping map;
+ /** Actual number of event counters */
+ unsigned int count;
+};
+
+/** An interrupt channel */
+struct gve_irq {
+ /** Interrupt doorbell index (within doorbell BAR) */
+ uint32_t db_idx;
+ /** Reserved */
+ uint8_t reserved[60];
+} __attribute__ (( packed ));
+
+/**
+ * Number of interrupt channels
+ *
+ * We tell the device how many interrupt channels we have provided via
+ * the "configure device resources" admin queue command. The device
+ * will accept being given zero interrupt channels, but will
+ * subsequently fail to create more than a single queue (either
+ * transmit or receive).
+ *
+ * There is, of course, no documentation indicating how many interrupt
+ * channels actually need to be provided. In the absence of evidence
+ * to the contrary, assume that two channels (one for transmit, one
+ * for receive) will be sufficient.
+ */
+#define GVE_IRQ_COUNT 2
+
+/** Interrupt channel array */
+struct gve_irqs {
+ /** Interrupt channels */
+ struct gve_irq *irq;
+ /** DMA mapping */
+ struct dma_mapping map;
+ /** Interrupt doorbells */
+ volatile uint32_t *db[GVE_IRQ_COUNT];
+};
+
+/** Disable interrupts */
+#define GVE_IRQ_DISABLE 0x40000000UL
+
+/**
+ * Queue resources
+ *
+ * Written by the device to indicate the indices of the chosen event
+ * counter and descriptor doorbell register.
+ *
+ * This appears to be a largely pointless data structure: the relevant
+ * information is static for the lifetime of the queue and could
+ * trivially have been returned in the response for the "create
+ * transmit/receive queue" command, instead of requiring yet another
+ * page-aligned coherent DMA buffer allocation.
+ */
+struct gve_resources {
+ /** Descriptor doorbell index (within doorbell BAR) */
+ uint32_t db_idx;
+ /** Event counter index (within event counter array) */
+ uint32_t evt_idx;
+ /** Reserved */
+ uint8_t reserved[56];
+} __attribute__ (( packed ));
+
+/**
+ * Queue data buffer size
+ *
+ * In theory, we may specify the size of receive buffers. However,
+ * the original version of the device seems not to have a parameter
+ * for this, and assumes the use of half-page (2kB) buffers. Choose
+ * to use this as the buffer size, on the assumption that older
+ * devices will not support any other buffer size.
+ */
+#define GVE_BUF_SIZE ( GVE_PAGE_SIZE / 2 )
+
+/** Number of data buffers per page */
+#define GVE_BUF_PER_PAGE ( GVE_PAGE_SIZE / GVE_BUF_SIZE )
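+
+/* For example, the receive queue's fill level of 16 buffers (see
+ * GVE_RX_FILL below) therefore requires a queue page list of
+ * 16 / 2 = 8 pages.
+ */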
+
+/**
+ * Queue page list
+ *
+ * The device uses preregistered pages for fast-path DMA operations
+ * (i.e. transmit and receive buffers). A list of device addresses
+ * for each page must be registered before the transmit or receive
+ * queue is created, and cannot subsequently be modified.
+ *
+ * The Linux driver allocates pages as DMA_TO_DEVICE or
+ * DMA_FROM_DEVICE as appropriate, and uses dma_sync_single_for_cpu()
+ * etc to ensure that data is copied to/from bounce buffers as needed.
+ *
+ * Unfortunately there is no such sync operation available within our
+ * DMA API, since we are constrained by the limitations imposed by
+ * EFI_PCI_IO_PROTOCOL. There is no way to synchronise a buffer
+ * without also [un]mapping it, and no way to force the reuse of the
+ * same device address for a subsequent remapping. We are therefore
+ * constrained to use only DMA-coherent buffers, since this is the
+ * only way we can repeatedly reuse the same device address.
+ *
+ * Newer versions of the gVNIC device support "raw DMA addressing
+ * (RDA)", which is essentially a prebuilt queue page list covering
+ * the whole of the guest address space. Unfortunately we cannot rely
+ * on this, since older versions will not support it.
+ *
+ * Experimentation suggests that the device will accept a request to
+ * create a queue page list covering the whole of the guest address
+ * space via two giant "pages" of 2^63 bytes each. However,
+ * experimentation also suggests that the device will accept any old
+ * garbage value as the "page size". In the total absence of any
+ * documentation, it is probably unsafe to conclude that the device is
+ * bothering to look at or respect the "page size" parameter: it is
+ * most likely just presuming the use of 4kB pages.
+ */
+struct gve_qpl {
+ /** Pages (allocated as a single contiguous block) */
+ userptr_t data;
+ /** Page mapping */
+ struct dma_mapping map;
+ /** Number of pages */
+ unsigned int count;
+ /** Queue page list ID */
+ unsigned int id;
+};
+
+/**
+ * Maximum number of transmit buffers
+ *
+ * This is a policy decision.
+ */
+#define GVE_TX_FILL 8
+
+/** Transmit queue page list ID */
+#define GVE_TX_QPL 0x18ae5458
+
+/** Transmit queue interrupt channel */
+#define GVE_TX_IRQ 0
+
+/** A transmit or receive buffer descriptor */
+struct gve_buffer {
+ /** Address (within queue page list address space) */
+ uint64_t addr;
+} __attribute__ (( packed ));
+
+/** A transmit packet descriptor */
+struct gve_tx_packet {
+ /** Type */
+ uint8_t type;
+ /** Reserved */
+ uint8_t reserved_a[2];
+ /** Number of descriptors in this packet */
+ uint8_t count;
+ /** Total length of this packet */
+ uint16_t total;
+ /** Length of this descriptor */
+ uint16_t len;
+} __attribute__ (( packed ));
+
+/** A transmit descriptor */
+struct gve_tx_descriptor {
+ /** Packet descriptor */
+ struct gve_tx_packet pkt;
+ /** Buffer descriptor */
+ struct gve_buffer buf;
+} __attribute__ (( packed ));
+
+/** Start of packet transmit descriptor type */
+#define GVE_TX_TYPE_START 0x00
+
+/** Continuation of packet transmit descriptor type */
+#define GVE_TX_TYPE_CONT 0x20
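+
+/* As an illustration of how gve_transmit() fragments packets: a
+ * 3000-byte frame split into 2kB buffers occupies two descriptors, a
+ * GVE_TX_TYPE_START descriptor with count 2, total 3000 and len 2048,
+ * followed by a GVE_TX_TYPE_CONT descriptor with len 952.
+ */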
+
+/**
+ * Maximum number of receive buffers
+ *
+ * This is a policy decision.
+ */
+#define GVE_RX_FILL 16
+
+/** Receive queue page list ID */
+#define GVE_RX_QPL 0x18ae5258
+
+/** Receive queue interrupt channel */
+#define GVE_RX_IRQ 1
+
+/** A receive descriptor */
+struct gve_rx_descriptor {
+ /** Buffer descriptor */
+ struct gve_buffer buf;
+} __attribute__ (( packed ));
+
+/** A receive packet descriptor */
+struct gve_rx_packet {
+ /** Length */
+ uint16_t len;
+ /** Flags */
+ uint8_t flags;
+ /** Sequence number */
+ uint8_t seq;
+} __attribute__ (( packed ));
+
+/** Receive error */
+#define GVE_RXF_ERROR 0x08
+
+/** Receive packet continues into next descriptor */
+#define GVE_RXF_MORE 0x20
+
+/** Receive sequence number mask */
+#define GVE_RX_SEQ_MASK 0x07
+
+/** A receive completion descriptor */
+struct gve_rx_completion {
+ /** Reserved */
+ uint8_t reserved[60];
+ /** Packet descriptor */
+ struct gve_rx_packet pkt;
+} __attribute__ (( packed ));
+
+/** Padding at the start of all received packets */
+#define GVE_RX_PAD 2
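+
+/* (Presumably present so that the 14-byte Ethernet header leaves the
+ * IP header four-byte aligned within the receive buffer.) */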
+
+/** A descriptor queue */
+struct gve_queue {
+ /** Descriptor ring */
+ userptr_t desc;
+ /** Completion ring */
+ userptr_t cmplt;
+ /** Queue resources */
+ struct gve_resources *res;
+
+ /** Queue type */
+ const struct gve_queue_type *type;
+ /** Number of descriptors (must be a power of two) */
+ unsigned int count;
+ /** Maximum fill level (must be a power of two) */
+ unsigned int fill;
+
+ /** Descriptor mapping */
+ struct dma_mapping desc_map;
+ /** Completion mapping */
+ struct dma_mapping cmplt_map;
+ /** Queue resources mapping */
+ struct dma_mapping res_map;
+
+ /** Doorbell register */
+ volatile uint32_t *db;
+ /** Event counter */
+ struct gve_event *event;
+
+ /** Producer counter */
+ uint32_t prod;
+ /** Consumer counter */
+ uint32_t cons;
+
+ /** Queue page list */
+ struct gve_qpl qpl;
+};
+
+/** A descriptor queue type */
+struct gve_queue_type {
+ /** Name */
+ const char *name;
+ /**
+ * Populate command parameters to create queue
+ *
+ * @v queue Descriptor queue
+ * @v cmd Admin queue command
+ */
+ void ( * param ) ( struct gve_queue *queue,
+ union gve_admin_command *cmd );
+ /** Queue page list ID */
+ uint32_t qpl;
+ /** Interrupt channel */
+ uint8_t irq;
+ /** Maximum fill level */
+ uint8_t fill;
+ /** Descriptor size */
+ uint8_t desc_len;
+ /** Completion size */
+ uint8_t cmplt_len;
+ /** Command to create queue */
+ uint8_t create;
+ /** Command to destroy queue */
+ uint8_t destroy;
+};
+
+/** A Google Virtual Ethernet NIC */
+struct gve_nic {
+ /** Configuration registers */
+ void *cfg;
+ /** Doorbell registers */
+ void *db;
+ /** PCI revision */
+ uint8_t revision;
+ /** Network device */
+ struct net_device *netdev;
+ /** DMA device */
+ struct dma_device *dma;
+
+ /** Admin queue */
+ struct gve_admin admin;
+ /** Interrupt channels */
+ struct gve_irqs irqs;
+ /** Event counters */
+ struct gve_events events;
+ /** Scratch buffer */
+ struct gve_scratch scratch;
+
+ /** Transmit queue */
+ struct gve_queue tx;
+ /** Receive queue */
+ struct gve_queue rx;
+ /** Transmit I/O buffers */
+ struct io_buffer *tx_iobuf[GVE_TX_FILL];
+ /** Receive sequence number */
+ unsigned int seq;
+
+ /** Startup process */
+ struct process startup;
+ /** Startup process retry counter */
+ unsigned int retries;
+ /** Reset recovery watchdog timer */
+ struct retry_timer watchdog;
+ /** Reset recovery recorded activity counter */
+ uint32_t activity;
+};
+
+/** Maximum time to wait for admin queue commands */
+#define GVE_ADMIN_MAX_WAIT_MS 500
+
+/** Maximum number of times to reattempt device reset */
+#define GVE_RESET_MAX_RETRY 5
+
+/** Time between reset recovery checks */
+#define GVE_WATCHDOG_TIMEOUT ( 1 * TICKS_PER_SEC )
+
+#endif /* _GVE_H */
diff --git a/src/include/ipxe/errfile.h b/src/include/ipxe/errfile.h
index fcb4f0e..7615075 100644
--- a/src/include/ipxe/errfile.h
+++ b/src/include/ipxe/errfile.h
@@ -224,6 +224,7 @@ FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
#define ERRFILE_ecam ( ERRFILE_DRIVER | 0x00d30000 )
#define ERRFILE_pcibridge ( ERRFILE_DRIVER | 0x00d40000 )
#define ERRFILE_mnpnet ( ERRFILE_DRIVER | 0x00d50000 )
+#define ERRFILE_gve ( ERRFILE_DRIVER | 0x00d60000 )
#define ERRFILE_aoe ( ERRFILE_NET | 0x00000000 )
#define ERRFILE_arp ( ERRFILE_NET | 0x00010000 )