author     Michael Brown <mcb30@ipxe.org>   2024-07-24 14:30:58 +0100
committer  Michael Brown <mcb30@ipxe.org>   2024-07-24 14:45:46 +0100
commit     c7b76e3adc3b4365aa3b490f24ae22375901c559
tree       201e7d44fedbee4d2f6b20afc2c73f907b5b6dee
parent     5a9f476d4f1395e69cbb845d7379b0e3591028c0
[gve] Add driver for Google Virtual Ethernet NIC

The Google Virtual Ethernet NIC (GVE or gVNIC) is found only in Google
Cloud instances.  There is essentially zero documentation available
beyond the mostly uncommented source code in the Linux kernel.

Signed-off-by: Michael Brown <mcb30@ipxe.org>
-rw-r--r--   src/config/fault.h           |    3
-rw-r--r--   src/drivers/net/gve.c        | 1607
-rw-r--r--   src/drivers/net/gve.h        |  702
-rw-r--r--   src/include/ipxe/errfile.h   |    1
4 files changed, 2313 insertions, 0 deletions
diff --git a/src/config/fault.h b/src/config/fault.h
index 5024a8f..b6ee3c9 100644
--- a/src/config/fault.h
+++ b/src/config/fault.h
@@ -29,6 +29,9 @@ FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
/* Corrupt every N received PeerDist packets */
#define PEERBLK_CORRUPT_RATE 0
+/* Experience virtual machine migration on every N watchdog checks */
+#define VM_MIGRATED_RATE 0
+
#include <config/local/fault.h>
#endif /* CONFIG_FAULT_H */
diff --git a/src/drivers/net/gve.c b/src/drivers/net/gve.c
new file mode 100644
index 0000000..03edc08
--- /dev/null
+++ b/src/drivers/net/gve.c
@@ -0,0 +1,1607 @@
+/*
+ * Copyright (C) 2024 Michael Brown <mbrown@fensystems.co.uk>.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * You can also choose to distribute this program under the terms of
+ * the Unmodified Binary Distribution Licence (as given in the file
+ * COPYING.UBDL), provided that you have satisfied its requirements.
+ */
+
+FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
+
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#include <assert.h>
+#include <byteswap.h>
+#include <ipxe/netdevice.h>
+#include <ipxe/ethernet.h>
+#include <ipxe/if_ether.h>
+#include <ipxe/iobuf.h>
+#include <ipxe/dma.h>
+#include <ipxe/pci.h>
+#include <ipxe/fault.h>
+#include "gve.h"
+
+/** @file
+ *
+ * Google Virtual Ethernet network driver
+ *
+ */
+
+/* Disambiguate the various error causes */
+#define EINFO_EIO_ADMIN_UNSET \
+ __einfo_uniqify ( EINFO_EIO, 0x00, "Uncompleted" )
+#define EIO_ADMIN_UNSET \
+ __einfo_error ( EINFO_EIO_ADMIN_UNSET )
+#define EINFO_EIO_ADMIN_ABORTED \
+ __einfo_uniqify ( EINFO_EIO, 0x10, "Aborted" )
+#define EIO_ADMIN_ABORTED \
+ __einfo_error ( EINFO_EIO_ADMIN_ABORTED )
+#define EINFO_EIO_ADMIN_EXISTS \
+ __einfo_uniqify ( EINFO_EIO, 0x11, "Already exists" )
+#define EIO_ADMIN_EXISTS \
+ __einfo_error ( EINFO_EIO_ADMIN_EXISTS )
+#define EINFO_EIO_ADMIN_CANCELLED \
+ __einfo_uniqify ( EINFO_EIO, 0x12, "Cancelled" )
+#define EIO_ADMIN_CANCELLED \
+ __einfo_error ( EINFO_EIO_ADMIN_CANCELLED )
+#define EINFO_EIO_ADMIN_DATALOSS \
+ __einfo_uniqify ( EINFO_EIO, 0x13, "Data loss" )
+#define EIO_ADMIN_DATALOSS \
+ __einfo_error ( EINFO_EIO_ADMIN_DATALOSS )
+#define EINFO_EIO_ADMIN_DEADLINE \
+ __einfo_uniqify ( EINFO_EIO, 0x14, "Deadline exceeded" )
+#define EIO_ADMIN_DEADLINE \
+ __einfo_error ( EINFO_EIO_ADMIN_DEADLINE )
+#define EINFO_EIO_ADMIN_PRECONDITION \
+ __einfo_uniqify ( EINFO_EIO, 0x15, "Failed precondition" )
+#define EIO_ADMIN_PRECONDITION \
+ __einfo_error ( EINFO_EIO_ADMIN_PRECONDITION )
+#define EINFO_EIO_ADMIN_INTERNAL \
+ __einfo_uniqify ( EINFO_EIO, 0x16, "Internal error" )
+#define EIO_ADMIN_INTERNAL \
+ __einfo_error ( EINFO_EIO_ADMIN_INTERNAL )
+#define EINFO_EIO_ADMIN_INVAL \
+ __einfo_uniqify ( EINFO_EIO, 0x17, "Invalid argument" )
+#define EIO_ADMIN_INVAL \
+ __einfo_error ( EINFO_EIO_ADMIN_INVAL )
+#define EINFO_EIO_ADMIN_NOT_FOUND \
+ __einfo_uniqify ( EINFO_EIO, 0x18, "Not found" )
+#define EIO_ADMIN_NOT_FOUND \
+ __einfo_error ( EINFO_EIO_ADMIN_NOT_FOUND )
+#define EINFO_EIO_ADMIN_RANGE \
+ __einfo_uniqify ( EINFO_EIO, 0x19, "Out of range" )
+#define EIO_ADMIN_RANGE \
+ __einfo_error ( EINFO_EIO_ADMIN_RANGE )
+#define EINFO_EIO_ADMIN_PERM \
+ __einfo_uniqify ( EINFO_EIO, 0x1a, "Permission denied" )
+#define EIO_ADMIN_PERM \
+ __einfo_error ( EINFO_EIO_ADMIN_PERM )
+#define EINFO_EIO_ADMIN_UNAUTH \
+ __einfo_uniqify ( EINFO_EIO, 0x1b, "Unauthenticated" )
+#define EIO_ADMIN_UNAUTH \
+ __einfo_error ( EINFO_EIO_ADMIN_UNAUTH )
+#define EINFO_EIO_ADMIN_RESOURCE \
+ __einfo_uniqify ( EINFO_EIO, 0x1c, "Resource exhausted" )
+#define EIO_ADMIN_RESOURCE \
+ __einfo_error ( EINFO_EIO_ADMIN_RESOURCE )
+#define EINFO_EIO_ADMIN_UNAVAIL \
+ __einfo_uniqify ( EINFO_EIO, 0x1d, "Unavailable" )
+#define EIO_ADMIN_UNAVAIL \
+ __einfo_error ( EINFO_EIO_ADMIN_UNAVAIL )
+#define EINFO_EIO_ADMIN_NOTSUP \
+ __einfo_uniqify ( EINFO_EIO, 0x1e, "Unimplemented" )
+#define EIO_ADMIN_NOTSUP \
+ __einfo_error ( EINFO_EIO_ADMIN_NOTSUP )
+#define EINFO_EIO_ADMIN_UNKNOWN \
+ __einfo_uniqify ( EINFO_EIO, 0x1f, "Unknown error" )
+#define EIO_ADMIN_UNKNOWN \
+ __einfo_error ( EINFO_EIO_ADMIN_UNKNOWN )
+#define EIO_ADMIN( status ) \
+ EUNIQ ( EINFO_EIO, ( (status) & 0x1f ), \
+ EIO_ADMIN_UNSET, EIO_ADMIN_ABORTED, EIO_ADMIN_EXISTS, \
+ EIO_ADMIN_CANCELLED, EIO_ADMIN_DATALOSS, \
+ EIO_ADMIN_DEADLINE, EIO_ADMIN_PRECONDITION, \
+ EIO_ADMIN_INTERNAL, EIO_ADMIN_INVAL, EIO_ADMIN_NOT_FOUND, \
+ EIO_ADMIN_RANGE, EIO_ADMIN_PERM, EIO_ADMIN_UNAUTH, \
+ EIO_ADMIN_RESOURCE, EIO_ADMIN_UNAVAIL, EIO_ADMIN_NOTSUP, \
+ EIO_ADMIN_UNKNOWN )
+
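+/* As an illustrative (hypothetical) example of the mapping above: a
+ * command completing with status 0x11 would be reported as
+ * EIO_ADMIN_EXISTS ("Already exists"), selected by the low five bits
+ * of the status value.
+ */
+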
+/******************************************************************************
+ *
+ * Device reset
+ *
+ ******************************************************************************
+ */
+
+/**
+ * Reset hardware
+ *
+ * @v gve GVE device
+ * @ret rc Return status code
+ */
+static int gve_reset ( struct gve_nic *gve ) {
+ uint32_t pfn;
+ unsigned int i;
+
+ /* Skip reset if admin queue page frame number is already
+ * clear. Triggering a reset on an already-reset device seems
+ * to cause a delayed reset to be scheduled. This can cause
+ * the device to end up in a reset loop, where each attempt to
+ * recover from reset triggers another reset a few seconds
+ * later.
+ */
+ pfn = readl ( gve->cfg + GVE_CFG_ADMIN_PFN );
+ if ( ! pfn ) {
+ DBGC ( gve, "GVE %p skipping reset\n", gve );
+ return 0;
+ }
+
+ /* Clear admin queue page frame number */
+ writel ( 0, gve->cfg + GVE_CFG_ADMIN_PFN );
+ wmb();
+
+ /* Wait for device to reset */
+ for ( i = 0 ; i < GVE_RESET_MAX_WAIT_MS ; i++ ) {
+
+ /* Delay */
+ mdelay ( 1 );
+
+ /* Check for reset completion */
+ pfn = readl ( gve->cfg + GVE_CFG_ADMIN_PFN );
+ if ( ! pfn )
+ return 0;
+ }
+
+ DBGC ( gve, "GVE %p reset timed out (PFN %#08x devstat %#08x)\n",
+ gve, bswap_32 ( pfn ),
+ bswap_32 ( readl ( gve->cfg + GVE_CFG_DEVSTAT ) ) );
+ return -ETIMEDOUT;
+}
+
+/******************************************************************************
+ *
+ * Admin queue
+ *
+ ******************************************************************************
+ */
+
+/**
+ * Allocate admin queue
+ *
+ * @v gve GVE device
+ * @ret rc Return status code
+ */
+static int gve_admin_alloc ( struct gve_nic *gve ) {
+ struct dma_device *dma = gve->dma;
+ struct gve_admin *admin = &gve->admin;
+ struct gve_irqs *irqs = &gve->irqs;
+ struct gve_events *events = &gve->events;
+ struct gve_scratch *scratch = &gve->scratch;
+ size_t admin_len = ( GVE_ADMIN_COUNT * sizeof ( admin->cmd[0] ) );
+ size_t irqs_len = ( GVE_IRQ_COUNT * sizeof ( irqs->irq[0] ) );
+ size_t events_len = ( GVE_EVENT_MAX * sizeof ( events->event[0] ) );
+ size_t scratch_len = sizeof ( *scratch->buf );
+ int rc;
+
+ /* Allocate admin queue */
+ admin->cmd = dma_alloc ( dma, &admin->map, admin_len, GVE_ALIGN );
+ if ( ! admin->cmd ) {
+ rc = -ENOMEM;
+ goto err_admin;
+ }
+
+ /* Allocate interrupt channels */
+ irqs->irq = dma_alloc ( dma, &irqs->map, irqs_len, GVE_ALIGN );
+ if ( ! irqs->irq ) {
+ rc = -ENOMEM;
+ goto err_irqs;
+ }
+
+ /* Allocate event counters */
+ events->event = dma_alloc ( dma, &events->map, events_len, GVE_ALIGN );
+ if ( ! events->event ) {
+ rc = -ENOMEM;
+ goto err_events;
+ }
+
+ /* Allocate scratch buffer */
+ scratch->buf = dma_alloc ( dma, &scratch->map, scratch_len, GVE_ALIGN );
+ if ( ! scratch->buf ) {
+ rc = -ENOMEM;
+ goto err_scratch;
+ }
+
+ DBGC ( gve, "GVE %p AQ at [%08lx,%08lx)\n",
+ gve, virt_to_phys ( admin->cmd ),
+ ( virt_to_phys ( admin->cmd ) + admin_len ) );
+ return 0;
+
+ dma_free ( &scratch->map, scratch->buf, scratch_len );
+ err_scratch:
+ dma_free ( &events->map, events->event, events_len );
+ err_events:
+ dma_free ( &irqs->map, irqs->irq, irqs_len );
+ err_irqs:
+ dma_free ( &admin->map, admin->cmd, admin_len );
+ err_admin:
+ return rc;
+}
+
+/**
+ * Free admin queue
+ *
+ * @v gve GVE device
+ */
+static void gve_admin_free ( struct gve_nic *gve ) {
+ struct gve_admin *admin = &gve->admin;
+ struct gve_irqs *irqs = &gve->irqs;
+ struct gve_events *events = &gve->events;
+ struct gve_scratch *scratch = &gve->scratch;
+ size_t admin_len = ( GVE_ADMIN_COUNT * sizeof ( admin->cmd[0] ) );
+ size_t irqs_len = ( GVE_IRQ_COUNT * sizeof ( irqs->irq[0] ) );
+ size_t events_len = ( GVE_EVENT_MAX * sizeof ( events->event[0] ) );
+ size_t scratch_len = sizeof ( *scratch->buf );
+
+ /* Free scratch buffer */
+ dma_free ( &scratch->map, scratch->buf, scratch_len );
+
+ /* Free event counters */
+ dma_free ( &events->map, events->event, events_len );
+
+ /* Free interrupt channels */
+ dma_free ( &irqs->map, irqs->irq, irqs_len );
+
+ /* Free admin queue */
+ dma_free ( &admin->map, admin->cmd, admin_len );
+}
+
+/**
+ * Enable admin queue
+ *
+ * @v gve GVE device
+ */
+static void gve_admin_enable ( struct gve_nic *gve ) {
+ struct gve_admin *admin = &gve->admin;
+ size_t admin_len = ( GVE_ADMIN_COUNT * sizeof ( admin->cmd[0] ) );
+ physaddr_t base;
+
+ /* Reset queue */
+ admin->prod = 0;
+
+ /* Program queue addresses and capabilities */
+ base = dma ( &admin->map, admin->cmd );
+ writel ( bswap_32 ( base / GVE_PAGE_SIZE ),
+ gve->cfg + GVE_CFG_ADMIN_PFN );
+ writel ( bswap_32 ( base & 0xffffffffUL ),
+ gve->cfg + GVE_CFG_ADMIN_BASE_LO );
+ if ( sizeof ( base ) > sizeof ( uint32_t ) ) {
+ writel ( bswap_32 ( ( ( uint64_t ) base ) >> 32 ),
+ gve->cfg + GVE_CFG_ADMIN_BASE_HI );
+ } else {
+ writel ( 0, gve->cfg + GVE_CFG_ADMIN_BASE_HI );
+ }
+ writew ( bswap_16 ( admin_len ), gve->cfg + GVE_CFG_ADMIN_LEN );
+ writel ( bswap_32 ( GVE_CFG_DRVSTAT_RUN ), gve->cfg + GVE_CFG_DRVSTAT );
+}
+
+/**
+ * Get next available admin queue command slot
+ *
+ * @v gve GVE device
+ * @ret cmd Admin queue command
+ */
+static union gve_admin_command * gve_admin_command ( struct gve_nic *gve ) {
+ struct gve_admin *admin = &gve->admin;
+ union gve_admin_command *cmd;
+ unsigned int index;
+
+ /* Get next command slot */
+ index = admin->prod;
+ cmd = &admin->cmd[ index % GVE_ADMIN_COUNT ];
+
+ /* Initialise request */
+ memset ( cmd, 0, sizeof ( *cmd ) );
+
+ return cmd;
+}
+
+/**
+ * Wait for admin queue command to complete
+ *
+ * @v gve GVE device
+ * @ret rc Return status code
+ */
+static int gve_admin_wait ( struct gve_nic *gve ) {
+ struct gve_admin *admin = &gve->admin;
+ uint32_t evt;
+ uint32_t pfn;
+ unsigned int i;
+
+ /* Wait for any outstanding commands to complete */
+ for ( i = 0 ; i < GVE_ADMIN_MAX_WAIT_MS ; i++ ) {
+
+ /* Check event counter */
+ rmb();
+ evt = bswap_32 ( readl ( gve->cfg + GVE_CFG_ADMIN_EVT ) );
+ if ( evt == admin->prod )
+ return 0;
+
+ /* Check for device reset */
+ pfn = readl ( gve->cfg + GVE_CFG_ADMIN_PFN );
+ if ( ! pfn )
+ break;
+
+ /* Delay */
+ mdelay ( 1 );
+ }
+
+ DBGC ( gve, "GVE %p AQ %#02x %s (completed %#02x, status %#08x)\n",
+ gve, admin->prod, ( pfn ? "timed out" : "saw reset" ), evt,
+ bswap_32 ( readl ( gve->cfg + GVE_CFG_DEVSTAT ) ) );
+ return ( pfn ? -ETIMEDOUT : -ECONNRESET );
+}
+
+/**
+ * Issue admin queue command
+ *
+ * @v gve GVE device
+ * @ret rc Return status code
+ */
+static int gve_admin ( struct gve_nic *gve ) {
+ struct gve_admin *admin = &gve->admin;
+ union gve_admin_command *cmd;
+ unsigned int index;
+ uint32_t opcode;
+ uint32_t status;
+ int rc;
+
+ /* Ensure admin queue is idle */
+ if ( ( rc = gve_admin_wait ( gve ) ) != 0 )
+ return rc;
+
+ /* Get next command slot */
+ index = admin->prod;
+ cmd = &admin->cmd[ index % GVE_ADMIN_COUNT ];
+ opcode = cmd->hdr.opcode;
+ DBGC2 ( gve, "GVE %p AQ %#02x command %#04x request:\n",
+ gve, index, opcode );
+ DBGC2_HDA ( gve, 0, cmd, sizeof ( *cmd ) );
+
+ /* Increment producer counter */
+ admin->prod++;
+
+ /* Ring doorbell */
+ wmb();
+ writel ( bswap_32 ( admin->prod ), gve->cfg + GVE_CFG_ADMIN_DB );
+
+ /* Wait for command to complete */
+ if ( ( rc = gve_admin_wait ( gve ) ) != 0 )
+ return rc;
+
+ /* Check command status */
+ status = be32_to_cpu ( cmd->hdr.status );
+ if ( status != GVE_ADMIN_STATUS_OK ) {
+ rc = -EIO_ADMIN ( status );
+ DBGC ( gve, "GVE %p AQ %#02x command %#04x failed: %#08x\n",
+ gve, index, opcode, status );
+ DBGC_HDA ( gve, 0, cmd, sizeof ( *cmd ) );
+ DBGC ( gve, "GVE %p AQ error: %s\n", gve, strerror ( rc ) );
+ return rc;
+ }
+
+ DBGC2 ( gve, "GVE %p AQ %#02x command %#04x result:\n",
+ gve, index, opcode );
+ DBGC2_HDA ( gve, 0, cmd, sizeof ( *cmd ) );
+ return 0;
+}
+
+/**
+ * Issue simple admin queue command
+ *
+ * @v gve GVE device
+ * @v opcode Operation code
+ * @v id ID parameter (or zero if not applicable)
+ * @ret rc Return status code
+ *
+ * Several admin queue commands take either an empty parameter list or
+ * a single 32-bit ID parameter.
+ */
+static int gve_admin_simple ( struct gve_nic *gve, unsigned int opcode,
+ unsigned int id ) {
+ union gve_admin_command *cmd;
+ int rc;
+
+ /* Construct request */
+ cmd = gve_admin_command ( gve );
+ cmd->hdr.opcode = opcode;
+ cmd->simple.id = cpu_to_be32 ( id );
+
+ /* Issue command */
+ if ( ( rc = gve_admin ( gve ) ) != 0 )
+ return rc;
+
+ return 0;
+}
+
+/**
+ * Get device descriptor
+ *
+ * @v gve GVE device
+ * @ret rc Return status code
+ */
+static int gve_describe ( struct gve_nic *gve ) {
+ struct net_device *netdev = gve->netdev;
+ struct gve_device_descriptor *desc = &gve->scratch.buf->desc;
+ union gve_admin_command *cmd;
+ int rc;
+
+ /* Construct request */
+ cmd = gve_admin_command ( gve );
+ cmd->hdr.opcode = GVE_ADMIN_DESCRIBE;
+ cmd->desc.addr = cpu_to_be64 ( dma ( &gve->scratch.map, desc ) );
+ cmd->desc.ver = cpu_to_be32 ( GVE_ADMIN_DESCRIBE_VER );
+ cmd->desc.len = cpu_to_be32 ( sizeof ( *desc ) );
+
+ /* Issue command */
+ if ( ( rc = gve_admin ( gve ) ) != 0 )
+ return rc;
+ DBGC2 ( gve, "GVE %p device descriptor:\n", gve );
+ DBGC2_HDA ( gve, 0, desc, sizeof ( *desc ) );
+
+ /* Extract queue parameters */
+ gve->events.count = be16_to_cpu ( desc->counters );
+ if ( gve->events.count > GVE_EVENT_MAX )
+ gve->events.count = GVE_EVENT_MAX;
+ gve->tx.count = be16_to_cpu ( desc->tx_count );
+ gve->rx.count = be16_to_cpu ( desc->rx_count );
+ DBGC ( gve, "GVE %p using %d TX, %d RX, %d/%d events\n",
+ gve, gve->tx.count, gve->rx.count, gve->events.count,
+ be16_to_cpu ( desc->counters ) );
+
+ /* Extract network parameters */
+ build_assert ( sizeof ( desc->mac ) == ETH_ALEN );
+ memcpy ( netdev->hw_addr, &desc->mac, sizeof ( desc->mac ) );
+ netdev->mtu = be16_to_cpu ( desc->mtu );
+ netdev->max_pkt_len = ( netdev->mtu + ETH_HLEN );
+ DBGC ( gve, "GVE %p MAC %s (\"%s\") MTU %zd\n",
+ gve, eth_ntoa ( netdev->hw_addr ),
+ inet_ntoa ( desc->mac.in ), netdev->mtu );
+
+ return 0;
+}
+
+/**
+ * Configure device resources
+ *
+ * @v gve GVE device
+ * @ret rc Return status code
+ */
+static int gve_configure ( struct gve_nic *gve ) {
+ struct gve_events *events = &gve->events;
+ struct gve_irqs *irqs = &gve->irqs;
+ union gve_admin_command *cmd;
+ unsigned int db_off;
+ unsigned int i;
+ int rc;
+
+ /* Construct request */
+ cmd = gve_admin_command ( gve );
+ cmd->hdr.opcode = GVE_ADMIN_CONFIGURE;
+ cmd->conf.events =
+ cpu_to_be64 ( dma ( &events->map, events->event ) );
+ cmd->conf.irqs =
+ cpu_to_be64 ( dma ( &irqs->map, irqs->irq ) );
+ cmd->conf.num_events = cpu_to_be32 ( events->count );
+ cmd->conf.num_irqs = cpu_to_be32 ( GVE_IRQ_COUNT );
+ cmd->conf.irq_stride = cpu_to_be32 ( sizeof ( irqs->irq[0] ) );
+
+ /* Issue command */
+ if ( ( rc = gve_admin ( gve ) ) != 0 )
+ return rc;
+
+ /* Disable all interrupts */
+ for ( i = 0 ; i < GVE_IRQ_COUNT ; i++ ) {
+ db_off = ( be32_to_cpu ( irqs->irq[i].db_idx ) *
+ sizeof ( uint32_t ) );
+ DBGC ( gve, "GVE %p IRQ %d doorbell +%#04x\n", gve, i, db_off );
+ irqs->db[i] = ( gve->db + db_off );
+ writel ( bswap_32 ( GVE_IRQ_DISABLE ), irqs->db[i] );
+ }
+
+ return 0;
+}
+
+/**
+ * Deconfigure device resources
+ *
+ * @v gve GVE device
+ * @ret rc Return status code
+ */
+static int gve_deconfigure ( struct gve_nic *gve ) {
+ int rc;
+
+ /* Issue command (with meaningless ID) */
+ if ( ( rc = gve_admin_simple ( gve, GVE_ADMIN_DECONFIGURE, 0 ) ) != 0 )
+ return rc;
+
+ return 0;
+}
+
+/**
+ * Register queue page list
+ *
+ * @v gve GVE device
+ * @v qpl Queue page list
+ * @ret rc Return status code
+ */
+static int gve_register ( struct gve_nic *gve, struct gve_qpl *qpl ) {
+ struct gve_pages *pages = &gve->scratch.buf->pages;
+ union gve_admin_command *cmd;
+ physaddr_t addr;
+ unsigned int i;
+ int rc;
+
+ /* Build page address list */
+ for ( i = 0 ; i < qpl->count ; i++ ) {
+ addr = user_to_phys ( qpl->data, ( i * GVE_PAGE_SIZE ) );
+ pages->addr[i] = cpu_to_be64 ( dma_phys ( &qpl->map, addr ) );
+ }
+
+ /* Construct request */
+ cmd = gve_admin_command ( gve );
+ cmd->hdr.opcode = GVE_ADMIN_REGISTER;
+ cmd->reg.id = cpu_to_be32 ( qpl->id );
+ cmd->reg.count = cpu_to_be32 ( qpl->count );
+ cmd->reg.addr = cpu_to_be64 ( dma ( &gve->scratch.map, pages ) );
+ cmd->reg.size = cpu_to_be64 ( GVE_PAGE_SIZE );
+
+ /* Issue command */
+ if ( ( rc = gve_admin ( gve ) ) != 0 )
+ return rc;
+
+ return 0;
+}
+
+/**
+ * Unregister page list
+ *
+ * @v gve GVE device
+ * @v qpl Queue page list
+ * @ret rc Return status code
+ */
+static int gve_unregister ( struct gve_nic *gve, struct gve_qpl *qpl ) {
+ int rc;
+
+ /* Issue command */
+ if ( ( rc = gve_admin_simple ( gve, GVE_ADMIN_UNREGISTER,
+ qpl->id ) ) != 0 ) {
+ return rc;
+ }
+
+ return 0;
+}
+
+/**
+ * Construct command to create transmit queue
+ *
+ * @v queue Transmit queue
+ * @v cmd Admin queue command
+ */
+static void gve_create_tx_param ( struct gve_queue *queue,
+ union gve_admin_command *cmd ) {
+ struct gve_admin_create_tx *create = &cmd->create_tx;
+ const struct gve_queue_type *type = queue->type;
+ physaddr_t desc = user_to_phys ( queue->desc, 0 );
+
+ /* Construct request parameters */
+ create->res = cpu_to_be64 ( dma ( &queue->res_map, queue->res ) );
+ create->desc = cpu_to_be64 ( dma_phys ( &queue->desc_map, desc ) );
+ create->qpl_id = cpu_to_be32 ( type->qpl );
+ create->notify_id = cpu_to_be32 ( type->irq );
+}
+
+/**
+ * Construct command to create receive queue
+ *
+ * @v queue Receive queue
+ * @v cmd Admin queue command
+ */
+static void gve_create_rx_param ( struct gve_queue *queue,
+ union gve_admin_command *cmd ) {
+ struct gve_admin_create_rx *create = &cmd->create_rx;
+ const struct gve_queue_type *type = queue->type;
+ physaddr_t desc = user_to_phys ( queue->desc, 0 );
+ physaddr_t cmplt = user_to_phys ( queue->cmplt, 0 );
+
+ /* Construct request parameters */
+ create->notify_id = cpu_to_be32 ( type->irq );
+ create->res = cpu_to_be64 ( dma ( &queue->res_map, queue->res ) );
+ create->desc = cpu_to_be64 ( dma_phys ( &queue->desc_map, desc ) );
+ create->cmplt = cpu_to_be64 ( dma_phys ( &queue->cmplt_map, cmplt ) );
+ create->qpl_id = cpu_to_be32 ( type->qpl );
+ create->bufsz = cpu_to_be16 ( GVE_BUF_SIZE );
+}
+
+/**
+ * Create transmit or receive queue
+ *
+ * @v gve GVE device
+ * @v queue Descriptor queue
+ * @ret rc Return status code
+ */
+static int gve_create_queue ( struct gve_nic *gve, struct gve_queue *queue ) {
+ const struct gve_queue_type *type = queue->type;
+ union gve_admin_command *cmd;
+ unsigned int db_off;
+ unsigned int evt_idx;
+ int rc;
+
+ /* Reset queue */
+ queue->prod = 0;
+ queue->cons = 0;
+
+ /* Construct request */
+ cmd = gve_admin_command ( gve );
+ cmd->hdr.opcode = type->create;
+ type->param ( queue, cmd );
+
+ /* Issue command */
+ if ( ( rc = gve_admin ( gve ) ) != 0 )
+ return rc;
+
+ /* Record indices */
+ db_off = ( be32_to_cpu ( queue->res->db_idx ) * sizeof ( uint32_t ) );
+ evt_idx = be32_to_cpu ( queue->res->evt_idx );
+ DBGC ( gve, "GVE %p %s doorbell +%#04x event counter %d\n",
+ gve, type->name, db_off, evt_idx );
+ queue->db = ( gve->db + db_off );
+ assert ( evt_idx < gve->events.count );
+ queue->event = &gve->events.event[evt_idx];
+ assert ( queue->event->count == 0 );
+
+ return 0;
+}
+
+/**
+ * Destroy transmit or receive queue
+ *
+ * @v gve GVE device
+ * @v queue Descriptor queue
+ * @ret rc Return status code
+ */
+static int gve_destroy_queue ( struct gve_nic *gve, struct gve_queue *queue ) {
+ const struct gve_queue_type *type = queue->type;
+ int rc;
+
+ /* Issue command */
+ if ( ( rc = gve_admin_simple ( gve, type->destroy, 0 ) ) != 0 )
+ return rc;
+
+ return 0;
+}
+
+/******************************************************************************
+ *
+ * Network device interface
+ *
+ ******************************************************************************
+ */
+
+/**
+ * Allocate queue page list
+ *
+ * @v gve GVE device
+ * @v qpl Queue page list
+ * @v id Queue page list ID
+ * @v buffers Number of data buffers
+ * @ret rc Return status code
+ */
+static int gve_alloc_qpl ( struct gve_nic *gve, struct gve_qpl *qpl,
+ uint32_t id, unsigned int buffers ) {
+ size_t len;
+
+ /* Record ID */
+ qpl->id = id;
+
+ /* Calculate number of pages required */
+ build_assert ( GVE_BUF_SIZE <= GVE_PAGE_SIZE );
+ qpl->count = ( ( buffers + GVE_BUF_PER_PAGE - 1 ) / GVE_BUF_PER_PAGE );
+
+ /* Allocate pages (as a single block) */
+ len = ( qpl->count * GVE_PAGE_SIZE );
+ qpl->data = dma_umalloc ( gve->dma, &qpl->map, len, GVE_ALIGN );
+ if ( ! qpl->data )
+ return -ENOMEM;
+
+ DBGC ( gve, "GVE %p QPL %#08x at [%08lx,%08lx)\n",
+ gve, qpl->id, user_to_phys ( qpl->data, 0 ),
+ user_to_phys ( qpl->data, len ) );
+ return 0;
+}
+
+/**
+ * Free queue page list
+ *
+ * @v gve GVE device
+ * @v qpl Queue page list
+ */
+static void gve_free_qpl ( struct gve_nic *nic __unused,
+ struct gve_qpl *qpl ) {
+ size_t len = ( qpl->count * GVE_PAGE_SIZE );
+
+ /* Free pages */
+ dma_ufree ( &qpl->map, qpl->data, len );
+}
+
+/**
+ * Get buffer address (within queue page list address space)
+ *
+ * @v queue Descriptor queue
+ * @v index Buffer index
+ * @ret addr Buffer address within queue page list address space
+ */
+static inline __attribute__ (( always_inline )) size_t
+gve_address ( struct gve_queue *queue, unsigned int index ) {
+
+ /* We allocate sufficient pages for the maximum fill level of
+ * buffers, and reuse the pages in strict rotation as we
+ * progress through the queue.
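+ *
+ * As a worked example (assuming the fill level of 8 used by the
+ * transmit queue, and 2kB buffers): buffer index 9 would map to
+ * offset ( 9 & 7 ) * 2048 = 0x800, reusing the second buffer of
+ * the first page.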
+ */
+ return ( ( index & ( queue->fill - 1 ) ) * GVE_BUF_SIZE );
+}
+
+/**
+ * Get buffer address
+ *
+ * @v queue Descriptor queue
+ * @v index Buffer index
+ * @ret addr Buffer address
+ */
+static inline __attribute__ (( always_inline )) userptr_t
+gve_buffer ( struct gve_queue *queue, unsigned int index ) {
+
+ /* Pages are currently allocated as a single contiguous block */
+ return userptr_add ( queue->qpl.data, gve_address ( queue, index ) );
+}
+
+/**
+ * Calculate next receive sequence number
+ *
+ * @v seq Current sequence number, or zero to start sequence
+ * @ret next Next sequence number
+ */
+static inline __attribute__ (( always_inline )) unsigned int
+gve_next ( unsigned int seq ) {
+
+ /* The receive completion sequence number is a modulo 7
+ * counter that cycles through the non-zero three-bit values 1
+ * to 7 inclusive.
+ *
+ * Since 7 is coprime to 2^n, this ensures that the sequence
+ * number changes each time that a new completion is written
+ * to memory.
+ *
+ * Since the counter takes only non-zero values, this ensures
+ * that the sequence number changes whenever a new completion
+ * is first written to a zero-initialised completion ring.
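+ *
+ * As a worked example: starting from zero, successive calls yield
+ * the sequence 1, 2, 3, 4, 5, 6, 7, 1, 2, ... so the value zero is
+ * never produced.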
+ */
+ seq = ( ( seq + 1 ) & GVE_RX_SEQ_MASK );
+ return ( seq ? seq : 1 );
+}
+
+/**
+ * Allocate descriptor queue
+ *
+ * @v gve GVE device
+ * @v queue Descriptor queue
+ * @ret rc Return status code
+ */
+static int gve_alloc_queue ( struct gve_nic *gve, struct gve_queue *queue ) {
+ const struct gve_queue_type *type = queue->type;
+ struct dma_device *dma = gve->dma;
+ size_t desc_len = ( queue->count * type->desc_len );
+ size_t cmplt_len = ( queue->count * type->cmplt_len );
+ size_t res_len = sizeof ( *queue->res );
+ struct gve_buffer buf;
+ size_t offset;
+ unsigned int i;
+ int rc;
+
+ /* Sanity checks */
+ if ( ( queue->count == 0 ) ||
+ ( queue->count & ( queue->count - 1 ) ) ) {
+ DBGC ( gve, "GVE %p %s invalid queue size %d\n",
+ gve, type->name, queue->count );
+ rc = -EINVAL;
+ goto err_sanity;
+ }
+
+ /* Calculate maximum fill level */
+ assert ( ( type->fill & ( type->fill - 1 ) ) == 0 );
+ queue->fill = type->fill;
+ if ( queue->fill > queue->count )
+ queue->fill = queue->count;
+ DBGC ( gve, "GVE %p %s using QPL %#08x with %d/%d descriptors\n",
+ gve, type->name, type->qpl, queue->fill, queue->count );
+
+ /* Allocate queue page list */
+ if ( ( rc = gve_alloc_qpl ( gve, &queue->qpl, type->qpl,
+ queue->fill ) ) != 0 )
+ goto err_qpl;
+
+ /* Allocate descriptors */
+ queue->desc = dma_umalloc ( dma, &queue->desc_map, desc_len,
+ GVE_ALIGN );
+ if ( ! queue->desc ) {
+ rc = -ENOMEM;
+ goto err_desc;
+ }
+ DBGC ( gve, "GVE %p %s descriptors at [%08lx,%08lx)\n",
+ gve, type->name, user_to_phys ( queue->desc, 0 ),
+ user_to_phys ( queue->desc, desc_len ) );
+
+ /* Allocate completions */
+ if ( cmplt_len ) {
+ queue->cmplt = dma_umalloc ( dma, &queue->cmplt_map, cmplt_len,
+ GVE_ALIGN );
+ if ( ! queue->cmplt ) {
+ rc = -ENOMEM;
+ goto err_cmplt;
+ }
+ DBGC ( gve, "GVE %p %s completions at [%08lx,%08lx)\n",
+ gve, type->name, user_to_phys ( queue->cmplt, 0 ),
+ user_to_phys ( queue->cmplt, cmplt_len ) );
+ }
+
+ /* Allocate queue resources */
+ queue->res = dma_alloc ( dma, &queue->res_map, res_len, GVE_ALIGN );
+ if ( ! queue->res ) {
+ rc = -ENOMEM;
+ goto err_res;
+ }
+ memset ( queue->res, 0, res_len );
+
+ /* Populate descriptor offsets */
+ offset = ( type->desc_len - sizeof ( buf ) );
+ for ( i = 0 ; i < queue->count ; i++ ) {
+ buf.addr = cpu_to_be64 ( gve_address ( queue, i ) );
+ copy_to_user ( queue->desc, offset, &buf, sizeof ( buf ) );
+ offset += type->desc_len;
+ }
+
+ return 0;
+
+ dma_free ( &queue->res_map, queue->res, res_len );
+ err_res:
+ if ( cmplt_len )
+ dma_ufree ( &queue->cmplt_map, queue->cmplt, cmplt_len );
+ err_cmplt:
+ dma_ufree ( &queue->desc_map, queue->desc, desc_len );
+ err_desc:
+ gve_free_qpl ( gve, &queue->qpl );
+ err_qpl:
+ err_sanity:
+ return rc;
+}
+
+/**
+ * Free descriptor queue
+ *
+ * @v gve GVE device
+ * @v queue Descriptor queue
+ */
+static void gve_free_queue ( struct gve_nic *gve, struct gve_queue *queue ) {
+ const struct gve_queue_type *type = queue->type;
+ size_t desc_len = ( queue->count * type->desc_len );
+ size_t cmplt_len = ( queue->count * type->cmplt_len );
+ size_t res_len = sizeof ( *queue->res );
+
+ /* Free queue resources */
+ dma_free ( &queue->res_map, queue->res, res_len );
+
+ /* Free completions, if applicable */
+ if ( cmplt_len )
+ dma_ufree ( &queue->cmplt_map, queue->cmplt, cmplt_len );
+
+ /* Free descriptors */
+ dma_ufree ( &queue->desc_map, queue->desc, desc_len );
+
+ /* Free queue page list */
+ gve_free_qpl ( gve, &queue->qpl );
+}
+
+/**
+ * Start up device
+ *
+ * @v gve GVE device
+ * @ret rc Return status code
+ */
+static int gve_start ( struct gve_nic *gve ) {
+ struct net_device *netdev = gve->netdev;
+ struct gve_queue *tx = &gve->tx;
+ struct gve_queue *rx = &gve->rx;
+ struct io_buffer *iobuf;
+ unsigned int i;
+ int rc;
+
+ /* Cancel any pending transmissions */
+ for ( i = 0 ; i < ( sizeof ( gve->tx_iobuf ) /
+ sizeof ( gve->tx_iobuf[0] ) ) ; i++ ) {
+ iobuf = gve->tx_iobuf[i];
+ gve->tx_iobuf[i] = NULL;
+ if ( iobuf )
+ netdev_tx_complete_err ( netdev, iobuf, -ECANCELED );
+ }
+
+ /* Invalidate receive completions */
+ memset_user ( rx->cmplt, 0, 0, ( rx->count * rx->type->cmplt_len ) );
+
+ /* Reset receive sequence */
+ gve->seq = gve_next ( 0 );
+
+ /* Configure device resources */
+ if ( ( rc = gve_configure ( gve ) ) != 0 )
+ goto err_configure;
+
+ /* Register transmit queue page list */
+ if ( ( rc = gve_register ( gve, &tx->qpl ) ) != 0 )
+ goto err_register_tx;
+
+ /* Register receive queue page list */
+ if ( ( rc = gve_register ( gve, &rx->qpl ) ) != 0 )
+ goto err_register_rx;
+
+ /* Create transmit queue */
+ if ( ( rc = gve_create_queue ( gve, tx ) ) != 0 )
+ goto err_create_tx;
+
+ /* Create receive queue */
+ if ( ( rc = gve_create_queue ( gve, rx ) ) != 0 )
+ goto err_create_rx;
+
+ return 0;
+
+ gve_destroy_queue ( gve, rx );
+ err_create_rx:
+ gve_destroy_queue ( gve, tx );
+ err_create_tx:
+ gve_unregister ( gve, &rx->qpl );
+ err_register_rx:
+ gve_unregister ( gve, &tx->qpl );
+ err_register_tx:
+ gve_deconfigure ( gve );
+ err_configure:
+ return rc;
+}
+
+/**
+ * Stop device
+ *
+ * @v gve GVE device
+ */
+static void gve_stop ( struct gve_nic *gve ) {
+ struct gve_queue *tx = &gve->tx;
+ struct gve_queue *rx = &gve->rx;
+
+ /* Destroy queues */
+ gve_destroy_queue ( gve, rx );
+ gve_destroy_queue ( gve, tx );
+
+ /* Unregister page lists */
+ gve_unregister ( gve, &rx->qpl );
+ gve_unregister ( gve, &tx->qpl );
+
+ /* Deconfigure device */
+ gve_deconfigure ( gve );
+}
+
+/**
+ * Device startup process
+ *
+ * @v gve GVE device
+ */
+static void gve_startup ( struct gve_nic *gve ) {
+ struct net_device *netdev = gve->netdev;
+ int rc;
+
+ /* Reset device */
+ if ( ( rc = gve_reset ( gve ) ) != 0 )
+ goto err_reset;
+
+ /* Enable admin queue */
+ gve_admin_enable ( gve );
+
+ /* Start device */
+ if ( ( rc = gve_start ( gve ) ) != 0 )
+ goto err_start;
+
+ /* Reset retry count */
+ gve->retries = 0;
+
+ /* (Ab)use link status to report startup status */
+ netdev_link_up ( netdev );
+
+ return;
+
+ gve_stop ( gve );
+ err_start:
+ err_reset:
+ DBGC ( gve, "GVE %p startup failed: %s\n", gve, strerror ( rc ) );
+ netdev_link_err ( netdev, rc );
+ if ( gve->retries++ < GVE_RESET_MAX_RETRY )
+ process_add ( &gve->startup );
+}
+
+/**
+ * Trigger startup process
+ *
+ * @v gve GVE device
+ */
+static void gve_restart ( struct gve_nic *gve ) {
+ struct net_device *netdev = gve->netdev;
+
+ /* Mark link down to inhibit polling and transmit activity */
+ netdev_link_down ( netdev );
+
+ /* Schedule startup process */
+ process_add ( &gve->startup );
+}
+
+/**
+ * Reset recovery watchdog
+ *
+ * @v timer Reset recovery watchdog timer
+ * @v over Failure indicator
+ */
+static void gve_watchdog ( struct retry_timer *timer, int over __unused ) {
+ struct gve_nic *gve = container_of ( timer, struct gve_nic, watchdog );
+ uint32_t activity;
+ uint32_t pfn;
+ int rc;
+
+ /* Reschedule watchdog */
+ start_timer_fixed ( &gve->watchdog, GVE_WATCHDOG_TIMEOUT );
+
+ /* Reset device (for test purposes) if applicable */
+ if ( ( rc = inject_fault ( VM_MIGRATED_RATE ) ) != 0 ) {
+ DBGC ( gve, "GVE %p synthesising host reset\n", gve );
+ writel ( 0, gve->cfg + GVE_CFG_ADMIN_PFN );
+ }
+
+ /* Check for activity since last timer invocation */
+ activity = ( gve->tx.cons + gve->rx.cons );
+ if ( activity != gve->activity ) {
+ gve->activity = activity;
+ return;
+ }
+
+ /* Check for reset */
+ pfn = readl ( gve->cfg + GVE_CFG_ADMIN_PFN );
+ if ( pfn ) {
+ DBGC2 ( gve, "GVE %p idle but not in reset\n", gve );
+ return;
+ }
+
+ /* Schedule restart */
+ DBGC ( gve, "GVE %p watchdog detected reset by host\n", gve );
+ gve_restart ( gve );
+}
+
+/**
+ * Open network device
+ *
+ * @v netdev Network device
+ * @ret rc Return status code
+ */
+static int gve_open ( struct net_device *netdev ) {
+ struct gve_nic *gve = netdev->priv;
+ struct gve_queue *tx = &gve->tx;
+ struct gve_queue *rx = &gve->rx;
+ int rc;
+
+ /* Allocate and prepopulate transmit queue */
+ if ( ( rc = gve_alloc_queue ( gve, tx ) ) != 0 )
+ goto err_alloc_tx;
+
+ /* Allocate and prepopulate receive queue */
+ if ( ( rc = gve_alloc_queue ( gve, rx ) ) != 0 )
+ goto err_alloc_rx;
+
+ /* Trigger startup */
+ gve_restart ( gve );
+
+ /* Start reset recovery watchdog timer */
+ start_timer_fixed ( &gve->watchdog, GVE_WATCHDOG_TIMEOUT );
+
+ return 0;
+
+ gve_free_queue ( gve, rx );
+ err_alloc_rx:
+ gve_free_queue ( gve, tx );
+ err_alloc_tx:
+ return rc;
+}
+
+/**
+ * Close network device
+ *
+ * @v netdev Network device
+ */
+static void gve_close ( struct net_device *netdev ) {
+ struct gve_nic *gve = netdev->priv;
+ struct gve_queue *tx = &gve->tx;
+ struct gve_queue *rx = &gve->rx;
+
+ /* Stop reset recovery timer */
+ stop_timer ( &gve->watchdog );
+
+ /* Terminate startup process */
+ process_del ( &gve->startup );
+
+ /* Stop and reset device */
+ gve_stop ( gve );
+ gve_reset ( gve );
+
+ /* Free queues */
+ gve_free_queue ( gve, rx );
+ gve_free_queue ( gve, tx );
+}
+
+/**
+ * Transmit packet
+ *
+ * @v netdev Network device
+ * @v iobuf I/O buffer
+ * @ret rc Return status code
+ */
+static int gve_transmit ( struct net_device *netdev, struct io_buffer *iobuf ) {
+ struct gve_nic *gve = netdev->priv;
+ struct gve_queue *tx = &gve->tx;
+ struct gve_tx_descriptor desc;
+ unsigned int count;
+ unsigned int index;
+ size_t frag_len;
+ size_t offset;
+ size_t len;
+
+ /* Do nothing if queues are not yet set up */
+ if ( ! netdev_link_ok ( netdev ) )
+ return -ENETDOWN;
+
+ /* Defer packet if there is no space in the transmit ring */
+ len = iob_len ( iobuf );
+ count = ( ( len + GVE_BUF_SIZE - 1 ) / GVE_BUF_SIZE );
+ if ( ( ( tx->prod - tx->cons ) + count ) > tx->fill ) {
+ netdev_tx_defer ( netdev, iobuf );
+ return 0;
+ }
+
+ /* Copy packet to queue pages and populate descriptors */
+ for ( offset = 0 ; offset < len ; offset += frag_len ) {
+
+ /* Sanity check */
+ assert ( gve->tx_iobuf[ tx->prod % GVE_TX_FILL ] == NULL );
+
+ /* Copy packet fragment */
+ frag_len = ( len - offset );
+ if ( frag_len > GVE_BUF_SIZE )
+ frag_len = GVE_BUF_SIZE;
+ copy_to_user ( gve_buffer ( tx, tx->prod ), 0,
+ ( iobuf->data + offset ), frag_len );
+
+ /* Populate descriptor */
+ index = ( tx->prod++ & ( tx->count - 1 ) );
+ memset ( &desc.pkt, 0, sizeof ( desc.pkt ) );
+ if ( offset ) {
+ desc.pkt.type = GVE_TX_TYPE_CONT;
+ } else {
+ desc.pkt.type = GVE_TX_TYPE_START;
+ desc.pkt.count = count;
+ desc.pkt.total = cpu_to_be16 ( len );
+ }
+ desc.pkt.len = cpu_to_be16 ( frag_len );
+ copy_to_user ( tx->desc, ( index * sizeof ( desc ) ), &desc,
+ sizeof ( desc.pkt ) );
+ DBGC2 ( gve, "GVE %p TX %#04x %#02x:%#02x len %#04x/%#04x at "
+ "%#08zx\n", gve, index, desc.pkt.type, desc.pkt.count,
+ be16_to_cpu ( desc.pkt.len ),
+ be16_to_cpu ( desc.pkt.total ),
+ gve_address ( tx, index ) );
+ }
+ assert ( ( tx->prod - tx->cons ) <= tx->fill );
+
+ /* Record I/O buffer against final descriptor */
+ gve->tx_iobuf[ ( tx->prod - 1U ) % GVE_TX_FILL ] = iobuf;
+
+ /* Ring doorbell */
+ wmb();
+ writel ( bswap_32 ( tx->prod ), tx->db );
+
+ return 0;
+}
+
+/**
+ * Poll for completed transmissions
+ *
+ * @v netdev Network device
+ */
+static void gve_poll_tx ( struct net_device *netdev ) {
+ struct gve_nic *gve = netdev->priv;
+ struct gve_queue *tx = &gve->tx;
+ struct io_buffer *iobuf;
+ uint32_t count;
+
+ /* Read event counter */
+ count = be32_to_cpu ( tx->event->count );
+
+ /* Process transmit completions */
+ while ( count != tx->cons ) {
+ DBGC2 ( gve, "GVE %p TX %#04x complete\n", gve, tx->cons );
+ iobuf = gve->tx_iobuf[ tx->cons % GVE_TX_FILL ];
+ gve->tx_iobuf[ tx->cons % GVE_TX_FILL ] = NULL;
+ tx->cons++;
+ if ( iobuf )
+ netdev_tx_complete ( netdev, iobuf );
+ }
+}
+
+/**
+ * Poll for received packets
+ *
+ * @v netdev Network device
+ */
+static void gve_poll_rx ( struct net_device *netdev ) {
+ struct gve_nic *gve = netdev->priv;
+ struct gve_queue *rx = &gve->rx;
+ struct gve_rx_completion cmplt;
+ struct io_buffer *iobuf;
+ unsigned int index;
+ unsigned int seq;
+ uint32_t cons;
+ size_t offset;
+ size_t total;
+ size_t len;
+ int rc;
+
+ /* Process receive completions */
+ cons = rx->cons;
+ seq = gve->seq;
+ total = 0;
+ while ( 1 ) {
+
+ /* Read next possible completion */
+ index = ( cons++ & ( rx->count - 1 ) );
+ offset = ( ( index * sizeof ( cmplt ) ) +
+ offsetof ( typeof ( cmplt ), pkt ) );
+ copy_from_user ( &cmplt.pkt, rx->cmplt, offset,
+ sizeof ( cmplt.pkt ) );
+
+ /* Check sequence number */
+ if ( ( cmplt.pkt.seq & GVE_RX_SEQ_MASK ) != seq )
+ break;
+ seq = gve_next ( seq );
+
+ /* Parse completion */
+ len = be16_to_cpu ( cmplt.pkt.len );
+ DBGC2 ( gve, "GVE %p RX %#04x %#02x:%#02x len %#04zx at "
+ "%#08zx\n", gve, index, cmplt.pkt.seq, cmplt.pkt.flags,
+ len, gve_address ( rx, index ) );
+
+ /* Accumulate a complete packet */
+ if ( cmplt.pkt.flags & GVE_RXF_ERROR ) {
+ total = 0;
+ } else {
+ total += len;
+ if ( cmplt.pkt.flags & GVE_RXF_MORE )
+ continue;
+ }
+ gve->seq = seq;
+
+ /* Allocate and populate I/O buffer */
+ iobuf = ( total ? alloc_iob ( total ) : NULL );
+ for ( ; rx->cons != cons ; rx->cons++ ) {
+
+ /* Re-read completion length */
+ index = ( rx->cons & ( rx->count - 1 ) );
+ offset = ( ( index * sizeof ( cmplt ) ) +
+ offsetof ( typeof ( cmplt ), pkt.len ) );
+ copy_from_user ( &cmplt.pkt, rx->cmplt, offset,
+ sizeof ( cmplt.pkt.len ) );
+
+ /* Copy data */
+ if ( iobuf ) {
+ len = be16_to_cpu ( cmplt.pkt.len );
+ copy_from_user ( iob_put ( iobuf, len ),
+ gve_buffer ( rx, rx->cons ),
+ 0, len );
+ }
+ }
+ assert ( ( iobuf == NULL ) || ( iob_len ( iobuf ) == total ) );
+ total = 0;
+
+ /* Hand off packet to network stack */
+ if ( iobuf ) {
+ iob_pull ( iobuf, GVE_RX_PAD );
+ netdev_rx ( netdev, iobuf );
+ } else {
+ rc = ( ( cmplt.pkt.flags & GVE_RXF_ERROR ) ?
+ -EIO : -ENOMEM );
+ netdev_rx_err ( netdev, NULL, rc );
+ }
+
+ /* Sanity check */
+ assert ( rx->cons == cons );
+ assert ( gve->seq == seq );
+ assert ( total == 0 );
+ }
+}
+
+/**
+ * Refill receive queue
+ *
+ * @v netdev Network device
+ */
+static void gve_refill_rx ( struct net_device *netdev ) {
+ struct gve_nic *gve = netdev->priv;
+ struct gve_queue *rx = &gve->rx;
+ unsigned int prod;
+
+ /* The receive descriptors are prepopulated at the time of
+ * creating the receive queue (pointing to the preallocated
+ * queue pages). Refilling is therefore just a case of
+ * ringing the doorbell if the device is not yet aware of any
+ * available descriptors.
+ */
+ prod = ( rx->cons + rx->fill );
+ if ( prod != rx->prod ) {
+ rx->prod = prod;
+ writel ( bswap_32 ( prod ), rx->db );
+ DBGC2 ( gve, "GVE %p RX %#04x ready\n", gve, rx->prod );
+ }
+}
+
+/**
+ * Poll for completed and received packets
+ *
+ * @v netdev Network device
+ */
+static void gve_poll ( struct net_device *netdev ) {
+
+ /* Do nothing if queues are not yet set up */
+ if ( ! netdev_link_ok ( netdev ) )
+ return;
+
+ /* Poll for transmit completions */
+ gve_poll_tx ( netdev );
+
+ /* Poll for receive completions */
+ gve_poll_rx ( netdev );
+
+ /* Refill receive queue */
+ gve_refill_rx ( netdev );
+}
+
+/** GVE network device operations */
+static struct net_device_operations gve_operations = {
+ .open = gve_open,
+ .close = gve_close,
+ .transmit = gve_transmit,
+ .poll = gve_poll,
+};
+
+/******************************************************************************
+ *
+ * PCI interface
+ *
+ ******************************************************************************
+ */
+
+/** Transmit descriptor queue type */
+static const struct gve_queue_type gve_tx_type = {
+ .name = "TX",
+ .param = gve_create_tx_param,
+ .qpl = GVE_TX_QPL,
+ .irq = GVE_TX_IRQ,
+ .fill = GVE_TX_FILL,
+ .desc_len = sizeof ( struct gve_tx_descriptor ),
+ .create = GVE_ADMIN_CREATE_TX,
+ .destroy = GVE_ADMIN_DESTROY_TX,
+};
+
+/** Receive descriptor queue type */
+static const struct gve_queue_type gve_rx_type = {
+ .name = "RX",
+ .param = gve_create_rx_param,
+ .qpl = GVE_RX_QPL,
+ .irq = GVE_RX_IRQ,
+ .fill = GVE_RX_FILL,
+ .desc_len = sizeof ( struct gve_rx_descriptor ),
+ .cmplt_len = sizeof ( struct gve_rx_completion ),
+ .create = GVE_ADMIN_CREATE_RX,
+ .destroy = GVE_ADMIN_DESTROY_RX,
+};
+
+/**
+ * Set up admin queue and get device description
+ *
+ * @v gve GVE device
+ * @ret rc Return status code
+ */
+static int gve_setup ( struct gve_nic *gve ) {
+ unsigned int i;
+ int rc;
+
+ /* Attempt several times, since the device may decide to add
+ * in a few spurious resets.
+ */
+ for ( i = 0 ; i < GVE_RESET_MAX_RETRY ; i++ ) {
+
+ /* Reset device */
+ if ( ( rc = gve_reset ( gve ) ) != 0 )
+ continue;
+
+ /* Enable admin queue */
+ gve_admin_enable ( gve );
+
+ /* Fetch MAC address */
+ if ( ( rc = gve_describe ( gve ) ) != 0 )
+ continue;
+
+ /* Success */
+ return 0;
+ }
+
+ DBGC ( gve, "GVE %p failed to get device description: %s\n",
+ gve, strerror ( rc ) );
+ return rc;
+}
+
+/** Device startup process descriptor */
+static struct process_descriptor gve_startup_desc =
+ PROC_DESC_ONCE ( struct gve_nic, startup, gve_startup );
+
+/**
+ * Probe PCI device
+ *
+ * @v pci PCI device
+ * @ret rc Return status code
+ */
+static int gve_probe ( struct pci_device *pci ) {
+ struct net_device *netdev;
+ struct gve_nic *gve;
+ unsigned long cfg_start;
+ unsigned long db_start;
+ unsigned long db_size;
+ int rc;
+
+ /* Allocate and initialise net device */
+ netdev = alloc_etherdev ( sizeof ( *gve ) );
+ if ( ! netdev ) {
+ rc = -ENOMEM;
+ goto err_alloc;
+ }
+ netdev_init ( netdev, &gve_operations );
+ gve = netdev->priv;
+ pci_set_drvdata ( pci, netdev );
+ netdev->dev = &pci->dev;
+ memset ( gve, 0, sizeof ( *gve ) );
+ gve->netdev = netdev;
+ gve->tx.type = &gve_tx_type;
+ gve->rx.type = &gve_rx_type;
+ process_init ( &gve->startup, &gve_startup_desc, &netdev->refcnt );
+ timer_init ( &gve->watchdog, gve_watchdog, &netdev->refcnt );
+
+ /* Fix up PCI device */
+ adjust_pci_device ( pci );
+
+ /* Check PCI revision */
+ pci_read_config_byte ( pci, PCI_REVISION, &gve->revision );
+ DBGC ( gve, "GVE %p is revision %#02x\n", gve, gve->revision );
+
+ /* Map configuration registers */
+ cfg_start = pci_bar_start ( pci, GVE_CFG_BAR );
+ gve->cfg = pci_ioremap ( pci, cfg_start, GVE_CFG_SIZE );
+ if ( ! gve->cfg ) {
+ rc = -ENODEV;
+ goto err_cfg;
+ }
+
+ /* Map doorbell registers */
+ db_start = pci_bar_start ( pci, GVE_DB_BAR );
+ db_size = pci_bar_size ( pci, GVE_DB_BAR );
+ gve->db = pci_ioremap ( pci, db_start, db_size );
+ if ( ! gve->db ) {
+ rc = -ENODEV;
+ goto err_db;
+ }
+
+ /* Configure DMA */
+ gve->dma = &pci->dma;
+ dma_set_mask_64bit ( gve->dma );
+ assert ( netdev->dma == NULL );
+
+ /* Allocate admin queue */
+ if ( ( rc = gve_admin_alloc ( gve ) ) != 0 )
+ goto err_admin;
+
+ /* Set up the device */
+ if ( ( rc = gve_setup ( gve ) ) != 0 )
+ goto err_setup;
+
+ /* Register network device */
+ if ( ( rc = register_netdev ( netdev ) ) != 0 )
+ goto err_register_netdev;
+
+ return 0;
+
+ unregister_netdev ( netdev );
+ err_register_netdev:
+ err_setup:
+ gve_reset ( gve );
+ gve_admin_free ( gve );
+ err_admin:
+ iounmap ( gve->db );
+ err_db:
+ iounmap ( gve->cfg );
+ err_cfg:
+ netdev_nullify ( netdev );
+ netdev_put ( netdev );
+ err_alloc:
+ return rc;
+}
+
+/**
+ * Remove PCI device
+ *
+ * @v pci PCI device
+ */
+static void gve_remove ( struct pci_device *pci ) {
+ struct net_device *netdev = pci_get_drvdata ( pci );
+ struct gve_nic *gve = netdev->priv;
+
+ /* Unregister network device */
+ unregister_netdev ( netdev );
+
+ /* Reset device */
+ gve_reset ( gve );
+
+ /* Free admin queue */
+ gve_admin_free ( gve );
+
+ /* Unmap registers */
+ iounmap ( gve->db );
+ iounmap ( gve->cfg );
+
+ /* Free network device */
+ netdev_nullify ( netdev );
+ netdev_put ( netdev );
+}
+
+/** GVE PCI device IDs */
+static struct pci_device_id gve_nics[] = {
+ PCI_ROM ( 0x1ae0, 0x0042, "gve", "gVNIC", 0 ),
+};
+
+/** GVE PCI driver */
+struct pci_driver gve_driver __pci_driver = {
+ .ids = gve_nics,
+ .id_count = ( sizeof ( gve_nics ) / sizeof ( gve_nics[0] ) ),
+ .probe = gve_probe,
+ .remove = gve_remove,
+};
diff --git a/src/drivers/net/gve.h b/src/drivers/net/gve.h
new file mode 100644
index 0000000..2845699
--- /dev/null
+++ b/src/drivers/net/gve.h
@@ -0,0 +1,702 @@
+#ifndef _GVE_H
+#define _GVE_H
+
+/** @file
+ *
+ * Google Virtual Ethernet network driver
+ *
+ * The Google Virtual Ethernet NIC (GVE or gVNIC) is found only in
+ * Google Cloud instances. There is essentially zero documentation
+ * available beyond the mostly uncommented source code in the Linux
+ * kernel.
+ */
+
+FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
+
+#include <stdint.h>
+#include <ipxe/dma.h>
+#include <ipxe/pci.h>
+#include <ipxe/in.h>
+#include <ipxe/uaccess.h>
+#include <ipxe/process.h>
+#include <ipxe/retry.h>
+
+struct gve_nic;
+
+/**
+ * A Google Cloud MAC address
+ *
+ * Google Cloud locally assigned MAC addresses encode the local IPv4
+ * address in the trailing 32 bits, presumably as a performance
+ * optimisation to allow ARP resolution to be skipped by a suitably
+ * aware network stack.
+ */
+struct google_mac {
+ /** Reserved */
+ uint8_t reserved[2];
+ /** Local IPv4 address */
+ struct in_addr in;
+} __attribute__ (( packed ));
+
+/** Page size */
+#define GVE_PAGE_SIZE 0x1000
+
+/**
+ * Address alignment
+ *
+ * All DMA data structure base addresses seem to need to be aligned to
+ * a page boundary. (This is not documented anywhere, but is inferred
+ * from existing source code and experimentation.)
+ */
+#define GVE_ALIGN GVE_PAGE_SIZE
+
+/**
+ * Length alignment
+ *
+ * All DMA data structure lengths seem to need to be aligned to a
+ * multiple of 64 bytes. (This is not documented anywhere, but is
+ * inferred from existing source code and experimentation.)
+ */
+#define GVE_LEN_ALIGN 64
+
+/** Maximum number of pages per queue (must be a power of two) */
+#define GVE_QPL_MAX 16
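+
+/* Note: with half-page (2kB) data buffers, the deepest queue used by
+ * this driver (the 16-buffer receive fill level) needs only 8 pages,
+ * so a 16-page list is sufficient.
+ */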
+
+/** Configuration BAR */
+#define GVE_CFG_BAR PCI_BASE_ADDRESS_0
+
+/**
+ * Configuration BAR size
+ *
+ * All registers within the configuration BAR are big-endian.
+ */
+#define GVE_CFG_SIZE 0x1000
+
+/** Device status */
+#define GVE_CFG_DEVSTAT 0x0000
+#define GVE_CFG_DEVSTAT_RESET 0x00000010UL /**< Device is reset */
+
+/** Driver status */
+#define GVE_CFG_DRVSTAT 0x0004
+#define GVE_CFG_DRVSTAT_RUN 0x00000001UL /**< Run admin queue */
+
+/** Maximum time to wait for reset */
+#define GVE_RESET_MAX_WAIT_MS 500
+
+/** Admin queue page frame number (for older devices) */
+#define GVE_CFG_ADMIN_PFN 0x0010
+
+/** Admin queue doorbell */
+#define GVE_CFG_ADMIN_DB 0x0014
+
+/** Admin queue event counter */
+#define GVE_CFG_ADMIN_EVT 0x0018
+
+/** Driver version (8-bit register) */
+#define GVE_CFG_VERSION 0x001f
+
+/** Admin queue base address high 32 bits */
+#define GVE_CFG_ADMIN_BASE_HI 0x0020
+
+/** Admin queue base address low 32 bits */
+#define GVE_CFG_ADMIN_BASE_LO 0x0024
+
+/** Admin queue length (16-bit register) */
+#define GVE_CFG_ADMIN_LEN 0x0028
+
+/** Doorbell BAR */
+#define GVE_DB_BAR PCI_BASE_ADDRESS_2
+
+/**
+ * Admin queue entry header
+ *
+ * All values within admin queue entries are big-endian.
+ */
+struct gve_admin_header {
+ /** Reserved */
+ uint8_t reserved[3];
+ /** Operation code */
+ uint8_t opcode;
+ /** Status */
+ uint32_t status;
+} __attribute__ (( packed ));
+
+/** Command succeeded */
+#define GVE_ADMIN_STATUS_OK 0x00000001
+
+/** Simple admin command */
+struct gve_admin_simple {
+ /** Header */
+ struct gve_admin_header hdr;
+ /** ID */
+ uint32_t id;
+} __attribute__ (( packed ));
+
+/** Describe device command */
+#define GVE_ADMIN_DESCRIBE 0x0001
+
+/** Describe device command */
+struct gve_admin_describe {
+ /** Header */
+ struct gve_admin_header hdr;
+ /** Descriptor buffer address */
+ uint64_t addr;
+ /** Descriptor version */
+ uint32_t ver;
+ /** Descriptor maximum length */
+ uint32_t len;
+} __attribute__ (( packed ));
+
+/** Device descriptor version */
+#define GVE_ADMIN_DESCRIBE_VER 1
+
+/** Device descriptor */
+struct gve_device_descriptor {
+ /** Reserved */
+ uint8_t reserved_a[10];
+ /** Number of transmit queue entries */
+ uint16_t tx_count;
+ /** Number of receive queue entries */
+ uint16_t rx_count;
+ /** Reserved */
+ uint8_t reserved_b[2];
+ /** Maximum transmit unit */
+ uint16_t mtu;
+ /** Number of event counters */
+ uint16_t counters;
+ /** Reserved */
+ uint8_t reserved_c[4];
+ /** MAC address */
+ struct google_mac mac;
+ /** Reserved */
+ uint8_t reserved_d[10];
+} __attribute__ (( packed ));
+
+/** Configure device resources command */
+#define GVE_ADMIN_CONFIGURE 0x0002
+
+/** Configure device resources command */
+struct gve_admin_configure {
+ /** Header */
+ struct gve_admin_header hdr;
+ /** Event counter array */
+ uint64_t events;
+ /** IRQ doorbell address */
+ uint64_t irqs;
+ /** Number of event counters */
+ uint32_t num_events;
+ /** Number of IRQ doorbells */
+ uint32_t num_irqs;
+ /** IRQ doorbell stride */
+ uint32_t irq_stride;
+} __attribute__ (( packed ));
+
+/** Register page list command */
+#define GVE_ADMIN_REGISTER 0x0003
+
+/** Register page list command */
+struct gve_admin_register {
+ /** Header */
+ struct gve_admin_header hdr;
+ /** Page list ID */
+ uint32_t id;
+ /** Number of pages */
+ uint32_t count;
+ /** Address list address */
+ uint64_t addr;
+ /** Page size */
+ uint64_t size;
+} __attribute__ (( packed ));
+
+/** Page list */
+struct gve_pages {
+ /** Page address */
+ uint64_t addr[GVE_QPL_MAX];
+} __attribute__ (( packed ));
+
+/** Unregister page list command */
+#define GVE_ADMIN_UNREGISTER 0x0004
+
+/** Create transmit queue command */
+#define GVE_ADMIN_CREATE_TX 0x0005
+
+/** Create transmit queue command */
+struct gve_admin_create_tx {
+ /** Header */
+ struct gve_admin_header hdr;
+ /** Queue ID */
+ uint32_t id;
+ /** Reserved */
+ uint8_t reserved_a[4];
+ /** Queue resources address */
+ uint64_t res;
+ /** Descriptor ring address */
+ uint64_t desc;
+ /** Queue page list ID */
+ uint32_t qpl_id;
+ /** Notification channel ID */
+ uint32_t notify_id;
+} __attribute__ (( packed ));
+
+/** Create receive queue command */
+#define GVE_ADMIN_CREATE_RX 0x0006
+
+/** Create receive queue command */
+struct gve_admin_create_rx {
+ /** Header */
+ struct gve_admin_header hdr;
+ /** Queue ID */
+ uint32_t id;
+ /** Index */
+ uint32_t index;
+ /** Reserved */
+ uint8_t reserved_a[4];
+ /** Notification channel ID */
+ uint32_t notify_id;
+ /** Queue resources address */
+ uint64_t res;
+ /** Completion ring address */
+ uint64_t cmplt;
+ /** Descriptor ring address */
+ uint64_t desc;
+ /** Queue page list ID */
+ uint32_t qpl_id;
+ /** Reserved */
+ uint8_t reserved_b[2];
+ /** Packet buffer size */
+ uint16_t bufsz;
+} __attribute__ (( packed ));
+
+/** Destroy transmit queue command */
+#define GVE_ADMIN_DESTROY_TX 0x0007
+
+/** Destroy receive queue command */
+#define GVE_ADMIN_DESTROY_RX 0x0008
+
+/** Deconfigure device resources command */
+#define GVE_ADMIN_DECONFIGURE 0x0009
+
+/** An admin queue command */
+union gve_admin_command {
+ /** Header */
+ struct gve_admin_header hdr;
+ /** Simple command */
+ struct gve_admin_simple simple;
+ /** Describe device */
+ struct gve_admin_describe desc;
+ /** Configure device resources */
+ struct gve_admin_configure conf;
+ /** Register page list */
+ struct gve_admin_register reg;
+ /** Create transmit queue */
+ struct gve_admin_create_tx create_tx;
+ /** Create receive queue */
+ struct gve_admin_create_rx create_rx;
+ /** Padding */
+ uint8_t pad[64];
+};
+
+/**
+ * Number of admin queue commands
+ *
+ * This is theoretically a policy decision. However, older revisions
+ * of the hardware seem to have only the "admin queue page frame
+ * number" register and no "admin queue length" register, with the
+ * implication that the admin queue must be exactly one page in
+ * length.
+ *
+ * Choose to use a one page (4kB) admin queue for both older and newer
+ * versions of the hardware, to minimise variability.
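+ *
+ * With 4kB pages and 64-byte (padded) admin queue commands, this
+ * works out to 64 command slots.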
+ */
+#define GVE_ADMIN_COUNT ( GVE_PAGE_SIZE / sizeof ( union gve_admin_command ) )
+
+/** Admin queue */
+struct gve_admin {
+ /** Commands */
+ union gve_admin_command *cmd;
+ /** Producer counter */
+ uint32_t prod;
+ /** DMA mapping */
+ struct dma_mapping map;
+};
+
+/** Scratch buffer for admin queue commands */
+struct gve_scratch {
+ /** Buffer contents */
+ union {
+ /** Device descriptor */
+ struct gve_device_descriptor desc;
+ /** Page address list */
+ struct gve_pages pages;
+ } *buf;
+ /** DMA mapping */
+ struct dma_mapping map;
+};
+
+/**
+ * An event counter
+ *
+ * Written by the device to indicate completions. The device chooses
+ * which counter to use for each transmit queue, and stores the index
+ * of the chosen counter in the queue resources.
+ */
+struct gve_event {
+ /** Number of events that have occurred */
+ volatile uint32_t count;
+} __attribute__ (( packed ));
+
+/**
+ * Maximum number of event counters
+ *
+ * We tell the device how many event counters we have provided via the
+ * "configure device resources" admin queue command. The device will
+ * accept being given only a single counter, but will subsequently
+ * fail to create a receive queue.
+ *
+ * There is, of course, no documentation indicating how many event
+ * counters actually need to be provided. In the absence of evidence
+ * to the contrary, assume that 16 counters (i.e. the smallest number
+ * we can allocate, given the length alignment constraint on
+ * allocations) will be sufficient.
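+ *
+ * With the 64-byte length alignment and 4-byte event counters, this
+ * works out to 16 counters.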
+ */
+#define GVE_EVENT_MAX ( GVE_LEN_ALIGN / sizeof ( struct gve_event ) )
+
+/** Event counter array */
+struct gve_events {
+ /** Event counters */
+ struct gve_event *event;
+ /** DMA mapping */
+ struct dma_mapping map;
+ /** Actual number of event counters */
+ unsigned int count;
+};
+
+/** An interrupt channel */
+struct gve_irq {
+ /** Interrupt doorbell index (within doorbell BAR) */
+ uint32_t db_idx;
+ /** Reserved */
+ uint8_t reserved[60];
+} __attribute__ (( packed ));
+
+/**
+ * Number of interrupt channels
+ *
+ * We tell the device how many interrupt channels we have provided via
+ * the "configure device resources" admin queue command. The device
+ * will accept being given zero interrupt channels, but will
+ * subsequently fail to create more than a single queue (either
+ * transmit or receive).
+ *
+ * There is, of course, no documentation indicating how many interrupt
+ * channels actually need to be provided. In the absence of evidence
+ * to the contrary, assume that two channels (one for transmit, one
+ * for receive) will be sufficient.
+ */
+#define GVE_IRQ_COUNT 2
+
+/** Interrupt channel array */
+struct gve_irqs {
+ /** Interrupt channels */
+ struct gve_irq *irq;
+ /** DMA mapping */
+ struct dma_mapping map;
+ /** Interrupt doorbells */
+ volatile uint32_t *db[GVE_IRQ_COUNT];
+};
+
+/** Disable interrupts */
+#define GVE_IRQ_DISABLE 0x40000000UL
+
+/**
+ * Queue resources
+ *
+ * Written by the device to indicate the indices of the chosen event
+ * counter and descriptor doorbell register.
+ *
+ * This appears to be a largely pointless data structure: the relevant
+ * information is static for the lifetime of the queue and could
+ * trivially have been returned in the response for the "create
+ * transmit/receive queue" command, instead of requiring yet another
+ * page-aligned coherent DMA buffer allocation.
+ */
+struct gve_resources {
+ /** Descriptor doorbell index (within doorbell BAR) */
+ uint32_t db_idx;
+ /** Event counter index (within event counter array) */
+ uint32_t evt_idx;
+ /** Reserved */
+ uint8_t reserved[56];
+} __attribute__ (( packed ));
+
+/**
+ * Queue data buffer size
+ *
+ * In theory, we may specify the size of receive buffers. However,
+ * the original version of the device seems not to have a parameter
+ * for this, and assumes the use of half-page (2kB) buffers. Choose
+ * to use this as the buffer size, on the assumption that older
+ * devices will not support any other buffer size.
+ */
+#define GVE_BUF_SIZE ( GVE_PAGE_SIZE / 2 )
+
+/** Number of data buffers per page */
+#define GVE_BUF_PER_PAGE ( GVE_PAGE_SIZE / GVE_BUF_SIZE )
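+
+/* For example: with a 4kB page size this yields 2kB buffers, two per
+ * page, and (assuming that registered pages occupy consecutive
+ * page-sized regions of the queue page list address space) buffer
+ * number N starts at byte offset ( N * GVE_BUF_SIZE ).
+ */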
+
+/**
+ * Queue page list
+ *
+ * The device uses preregistered pages for fast-path DMA operations
+ * (i.e. transmit and receive buffers). A list of device addresses
+ * for each page must be registered before the transmit or receive
+ * queue is created, and cannot subsequently be modified.
+ *
+ * The Linux driver allocates pages as DMA_TO_DEVICE or
+ * DMA_FROM_DEVICE as appropriate, and uses dma_sync_single_for_cpu()
+ * etc to ensure that data is copied to/from bounce buffers as needed.
+ *
+ * Unfortunately there is no such sync operation available within our
+ * DMA API, since we are constrained by the limitations imposed by
+ * EFI_PCI_IO_PROTOCOL. There is no way to synchronise a buffer
+ * without also [un]mapping it, and no way to force the reuse of the
+ * same device address for a subsequent remapping. We are therefore
+ * constrained to use only DMA-coherent buffers, since this is the
+ * only way we can repeatedly reuse the same device address.
+ *
+ * Newer versions of the gVNIC device support "raw DMA addressing
+ * (RDA)", which is essentially a prebuilt queue page list covering
+ * the whole of the guest address space. Unfortunately we cannot rely
+ * on this, since older versions will not support it.
+ *
+ * Experimentation suggests that the device will accept a request to
+ * create a queue page list covering the whole of the guest address
+ * space via two giant "pages" of 2^63 bytes each. However,
+ * experimentation also suggests that the device will accept any old
+ * garbage value as the "page size". In the total absence of any
+ * documentation, it is probably unsafe to conclude that the device is
+ * bothering to look at or respect the "page size" parameter: it is
+ * most likely just presuming the use of 4kB pages.
+ */
+struct gve_qpl {
+ /** Page addresses */
+ userptr_t data;
+ /** Page mapping */
+ struct dma_mapping map;
+ /** Number of pages */
+ unsigned int count;
+ /** Queue page list ID */
+ unsigned int id;
+};
+
+/**
+ * Maximum number of transmit buffers
+ *
+ * This is a policy decision.
+ */
+#define GVE_TX_FILL 8
+
+/** Transmit queue page list ID */
+#define GVE_TX_QPL 0x18ae5458
+
+/** Transmit queue interrupt channel */
+#define GVE_TX_IRQ 0
+
+/** A transmit or receive buffer descriptor */
+struct gve_buffer {
+ /** Address (within queue page list address space) */
+ uint64_t addr;
+} __attribute__ (( packed ));
+
+/** A transmit packet descriptor */
+struct gve_tx_packet {
+ /** Type */
+ uint8_t type;
+ /** Reserved */
+ uint8_t reserved_a[2];
+ /** Number of descriptors in this packet */
+ uint8_t count;
+ /** Total length of this packet */
+ uint16_t total;
+ /** Length of this descriptor */
+ uint16_t len;
+} __attribute__ (( packed ));
+
+/** A transmit descriptor */
+struct gve_tx_descriptor {
+ /** Packet descriptor */
+ struct gve_tx_packet pkt;
+ /** Buffer descriptor */
+ struct gve_buffer buf;
+} __attribute__ (( packed ));
+
+/** Start of packet transmit descriptor type */
+#define GVE_TX_TYPE_START 0x00
+
+/** Continuation of packet transmit descriptor type */
+#define GVE_TX_TYPE_CONT 0x20
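+
+/* A minimal sketch (illustrative only; "desc", "index" and "len" are
+ * hypothetical locals, and big-endian multi-byte fields are assumed)
+ * of filling in a packet that fits within a single descriptor:
+ *
+ *	desc->pkt.type = GVE_TX_TYPE_START;
+ *	desc->pkt.count = 1;
+ *	desc->pkt.total = cpu_to_be16 ( len );
+ *	desc->pkt.len = cpu_to_be16 ( len );
+ *	desc->buf.addr = cpu_to_be64 ( index * GVE_BUF_SIZE );
+ */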
+
+/**
+ * Maximum number of receive buffers
+ *
+ * This is a policy decision.
+ */
+#define GVE_RX_FILL 16
+
+/** Receive queue page list ID */
+#define GVE_RX_QPL 0x18ae5258
+
+/** Receive queue interrupt channel */
+#define GVE_RX_IRQ 1
+
+/** A receive descriptor */
+struct gve_rx_descriptor {
+ /** Buffer descriptor */
+ struct gve_buffer buf;
+} __attribute__ (( packed ));
+
+/** A receive packet descriptor */
+struct gve_rx_packet {
+ /** Length */
+ uint16_t len;
+ /** Flags */
+ uint8_t flags;
+ /** Sequence number */
+ uint8_t seq;
+} __attribute__ (( packed ));
+
+/** Receive error */
+#define GVE_RXF_ERROR 0x08
+
+/** Receive packet continues into next descriptor */
+#define GVE_RXF_MORE 0x20
+
+/** Receive sequence number mask */
+#define GVE_RX_SEQ_MASK 0x07
+
+/** A receive completion descriptor */
+struct gve_rx_completion {
+ /** Reserved */
+ uint8_t reserved[60];
+ /** Packet descriptor */
+ struct gve_rx_packet pkt;
+} __attribute__ (( packed ));
+
+/** Padding at the start of all received packets */
+#define GVE_RX_PAD 2
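+
+/* A minimal sketch (illustrative only; "cmplt" is a hypothetical
+ * completion pointer, and big-endian multi-byte fields are assumed)
+ * of inspecting a completed receive descriptor:
+ *
+ *	unsigned int seq = ( cmplt->pkt.seq & GVE_RX_SEQ_MASK );
+ *	size_t len = be16_to_cpu ( cmplt->pkt.len );
+ *
+ *	if ( cmplt->pkt.flags & GVE_RXF_ERROR )
+ *		... drop the packet ...
+ *	else if ( cmplt->pkt.flags & GVE_RXF_MORE )
+ *		... packet continues in the next descriptor ...
+ */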
+
+/** A descriptor queue */
+struct gve_queue {
+ /** Descriptor ring */
+ userptr_t desc;
+ /** Completion ring */
+ userptr_t cmplt;
+ /** Queue resources */
+ struct gve_resources *res;
+
+ /** Queue type */
+ const struct gve_queue_type *type;
+ /** Number of descriptors (must be a power of two) */
+ unsigned int count;
+ /** Maximum fill level (must be a power of two) */
+ unsigned int fill;
+
+ /** Descriptor mapping */
+ struct dma_mapping desc_map;
+ /** Completion mapping */
+ struct dma_mapping cmplt_map;
+ /** Queue resources mapping */
+ struct dma_mapping res_map;
+
+ /** Doorbell register */
+ volatile uint32_t *db;
+ /** Event counter */
+ struct gve_event *event;
+
+ /** Producer counter */
+ uint32_t prod;
+ /** Consumer counter */
+ uint32_t cons;
+
+ /** Queue page list */
+ struct gve_qpl qpl;
+};
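+
+/* Since the descriptor count is a power of two, the usual free-running
+ * counter arithmetic applies.  For example (illustrative only):
+ *
+ *	unsigned int fill = ( queue->prod - queue->cons );
+ *	unsigned int index = ( queue->prod & ( queue->count - 1 ) );
+ */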
+
+/** A descriptor queue type */
+struct gve_queue_type {
+ /** Name */
+ const char *name;
+ /**
+ * Populate command parameters to create queue
+ *
+ * @v queue Descriptor queue
+ * @v cmd Admin queue command
+ */
+ void ( * param ) ( struct gve_queue *queue,
+ union gve_admin_command *cmd );
+ /** Queue page list ID */
+ uint32_t qpl;
+ /** Interrupt channel */
+ uint8_t irq;
+ /** Maximum fill level */
+ uint8_t fill;
+ /** Descriptor size */
+ uint8_t desc_len;
+ /** Completion size */
+ uint8_t cmplt_len;
+ /** Command to create queue */
+ uint8_t create;
+ /** Command to destroy queue */
+ uint8_t destroy;
+};
+
+/** A Google Virtual Ethernet NIC */
+struct gve_nic {
+ /** Configuration registers */
+ void *cfg;
+ /** Doorbell registers */
+ void *db;
+ /** PCI revision */
+ uint8_t revision;
+ /** Network device */
+ struct net_device *netdev;
+ /** DMA device */
+ struct dma_device *dma;
+
+ /** Admin queue */
+ struct gve_admin admin;
+ /** Interrupt channels */
+ struct gve_irqs irqs;
+ /** Event counters */
+ struct gve_events events;
+ /** Scratch buffer */
+ struct gve_scratch scratch;
+
+ /** Transmit queue */
+ struct gve_queue tx;
+ /** Receive queue */
+ struct gve_queue rx;
+ /** Transmit I/O buffers */
+ struct io_buffer *tx_iobuf[GVE_TX_FILL];
+ /** Receive sequence number */
+ unsigned int seq;
+
+ /** Startup process */
+ struct process startup;
+ /** Startup process retry counter */
+ unsigned int retries;
+ /** Reset recovery watchdog timer */
+ struct retry_timer watchdog;
+ /** Reset recovery recorded activity counter */
+ uint32_t activity;
+};
+
+/** Maximum time to wait for admin queue commands */
+#define GVE_ADMIN_MAX_WAIT_MS 500
+
+/** Maximum number of times to reattempt device reset */
+#define GVE_RESET_MAX_RETRY 5
+
+/** Time between reset recovery checks */
+#define GVE_WATCHDOG_TIMEOUT ( 1 * TICKS_PER_SEC )
+
+#endif /* _GVE_H */
diff --git a/src/include/ipxe/errfile.h b/src/include/ipxe/errfile.h
index fcb4f0e..7615075 100644
--- a/src/include/ipxe/errfile.h
+++ b/src/include/ipxe/errfile.h
@@ -224,6 +224,7 @@ FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
#define ERRFILE_ecam ( ERRFILE_DRIVER | 0x00d30000 )
#define ERRFILE_pcibridge ( ERRFILE_DRIVER | 0x00d40000 )
#define ERRFILE_mnpnet ( ERRFILE_DRIVER | 0x00d50000 )
+#define ERRFILE_gve ( ERRFILE_DRIVER | 0x00d60000 )
#define ERRFILE_aoe ( ERRFILE_NET | 0x00000000 )
#define ERRFILE_arp ( ERRFILE_NET | 0x00010000 )