aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Akihiko Odaki <akihiko.odaki@daynix.com> 2023-02-23 19:20:09 +0900
committer Jason Wang <jasowang@redhat.com> 2023-03-10 15:35:38 +0800
commit 02ef5fdc092bd495d6afd3c0212ff2e45931886d (patch)
tree 0f0869f8222e0708164fb2694447825e0a420a4b
parent ffbd2dbd8e647b68406179697c06d2668438b789 (diff)
download qemu-02ef5fdc092bd495d6afd3c0212ff2e45931886d.zip
qemu-02ef5fdc092bd495d6afd3c0212ff2e45931886d.tar.gz
qemu-02ef5fdc092bd495d6afd3c0212ff2e45931886d.tar.bz2
hw/net/net_tx_pkt: Implement TCP segmentation
There was no proper implementation of TCP segmentation before this change, and net_tx_pkt relied solely on IPv4 fragmentation. Not only is this not aligned with the specification, but it also resulted in corrupted IPv6 packets. This is particularly problematic for igb, a newly proposed device implementation: igb provides a loopback feature for VMDq, and that feature relies on software segmentation. Implement proper TCP segmentation in net_tx_pkt to fix such a scenario.

Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
-rw-r--r-- hw/net/net_tx_pkt.c 248
-rw-r--r-- include/net/eth.h 5
-rw-r--r-- net/eth.c 27
3 files changed, 206 insertions, 74 deletions
diff --git a/hw/net/net_tx_pkt.c b/hw/net/net_tx_pkt.c
index 6afd3f6..4a35e84 100644
--- a/hw/net/net_tx_pkt.c
+++ b/hw/net/net_tx_pkt.c
@@ -326,7 +326,8 @@ bool net_tx_pkt_build_vheader(struct NetTxPkt *pkt, bool tso_enable,
case VIRTIO_NET_HDR_GSO_TCPV6:
bytes_read = iov_to_buf(&pkt->vec[NET_TX_PKT_PL_START_FRAG],
pkt->payload_frags, 0, &l4hdr, sizeof(l4hdr));
- if (bytes_read < sizeof(l4hdr)) {
+ if (bytes_read < sizeof(l4hdr) ||
+ l4hdr.th_off * sizeof(uint32_t) < sizeof(l4hdr)) {
return false;
}
@@ -466,15 +467,14 @@ void net_tx_pkt_reset(struct NetTxPkt *pkt)
pkt->l4proto = 0;
}
-static void net_tx_pkt_do_sw_csum(struct NetTxPkt *pkt)
+static void net_tx_pkt_do_sw_csum(struct NetTxPkt *pkt,
+ struct iovec *iov, uint32_t iov_len,
+ uint16_t csl)
{
- struct iovec *iov = &pkt->vec[NET_TX_PKT_L2HDR_FRAG];
uint32_t csum_cntr;
uint16_t csum = 0;
uint32_t cso;
/* num of iovec without vhdr */
- uint32_t iov_len = pkt->payload_frags + NET_TX_PKT_PL_START_FRAG - 1;
- uint16_t csl;
size_t csum_offset = pkt->virt_hdr.csum_start + pkt->virt_hdr.csum_offset;
uint16_t l3_proto = eth_get_l3_proto(iov, 1, iov->iov_len);
@@ -482,8 +482,6 @@ static void net_tx_pkt_do_sw_csum(struct NetTxPkt *pkt)
iov_from_buf(iov, iov_len, csum_offset, &csum, sizeof csum);
/* Calculate L4 TCP/UDP checksum */
- csl = pkt->payload_len;
-
csum_cntr = 0;
cso = 0;
/* add pseudo header to csum */
@@ -509,14 +507,13 @@ static void net_tx_pkt_do_sw_csum(struct NetTxPkt *pkt)
#define NET_MAX_FRAG_SG_LIST (64)
static size_t net_tx_pkt_fetch_fragment(struct NetTxPkt *pkt,
- int *src_idx, size_t *src_offset, struct iovec *dst, int *dst_idx)
+ int *src_idx, size_t *src_offset, size_t src_len,
+ struct iovec *dst, int *dst_idx)
{
size_t fetched = 0;
struct iovec *src = pkt->vec;
- *dst_idx = NET_TX_PKT_PL_START_FRAG;
-
- while (fetched < IP_FRAG_ALIGN_SIZE(pkt->virt_hdr.gso_size)) {
+ while (fetched < src_len) {
/* no more place in fragment iov */
if (*dst_idx == NET_MAX_FRAG_SG_LIST) {
@@ -531,7 +528,7 @@ static size_t net_tx_pkt_fetch_fragment(struct NetTxPkt *pkt,
dst[*dst_idx].iov_base = src[*src_idx].iov_base + *src_offset;
dst[*dst_idx].iov_len = MIN(src[*src_idx].iov_len - *src_offset,
- IP_FRAG_ALIGN_SIZE(pkt->virt_hdr.gso_size) - fetched);
+ src_len - fetched);
*src_offset += dst[*dst_idx].iov_len;
fetched += dst[*dst_idx].iov_len;
@@ -560,58 +557,223 @@ static void net_tx_pkt_sendv(
}
}
+static bool net_tx_pkt_tcp_fragment_init(struct NetTxPkt *pkt,
+ struct iovec *fragment,
+ int *pl_idx,
+ size_t *l4hdr_len,
+ int *src_idx,
+ size_t *src_offset,
+ size_t *src_len)
+{
+ struct iovec *l4 = fragment + NET_TX_PKT_PL_START_FRAG;
+ size_t bytes_read = 0;
+ struct tcp_hdr *th;
+
+ if (!pkt->payload_frags) {
+ return false;
+ }
+
+ l4->iov_len = pkt->virt_hdr.hdr_len - pkt->hdr_len;
+ l4->iov_base = g_malloc(l4->iov_len);
+
+ *src_idx = NET_TX_PKT_PL_START_FRAG;
+ while (pkt->vec[*src_idx].iov_len < l4->iov_len - bytes_read) {
+ memcpy((char *)l4->iov_base + bytes_read, pkt->vec[*src_idx].iov_base,
+ pkt->vec[*src_idx].iov_len);
+
+ bytes_read += pkt->vec[*src_idx].iov_len;
+
+ (*src_idx)++;
+ if (*src_idx >= pkt->payload_frags + NET_TX_PKT_PL_START_FRAG) {
+ g_free(l4->iov_base);
+ return false;
+ }
+ }
+
+ *src_offset = l4->iov_len - bytes_read;
+ memcpy((char *)l4->iov_base + bytes_read, pkt->vec[*src_idx].iov_base,
+ *src_offset);
+
+ th = l4->iov_base;
+ th->th_flags &= ~(TH_FIN | TH_PUSH);
+
+ *pl_idx = NET_TX_PKT_PL_START_FRAG + 1;
+ *l4hdr_len = l4->iov_len;
+ *src_len = pkt->virt_hdr.gso_size;
+
+ return true;
+}
+
+static void net_tx_pkt_tcp_fragment_deinit(struct iovec *fragment)
+{
+ g_free(fragment[NET_TX_PKT_PL_START_FRAG].iov_base);
+}
+
+static void net_tx_pkt_tcp_fragment_fix(struct NetTxPkt *pkt,
+ struct iovec *fragment,
+ size_t fragment_len,
+ uint8_t gso_type)
+{
+ struct iovec *l3hdr = fragment + NET_TX_PKT_L3HDR_FRAG;
+ struct iovec *l4hdr = fragment + NET_TX_PKT_PL_START_FRAG;
+ struct ip_header *ip = l3hdr->iov_base;
+ struct ip6_header *ip6 = l3hdr->iov_base;
+ size_t len = l3hdr->iov_len + l4hdr->iov_len + fragment_len;
+
+ switch (gso_type) {
+ case VIRTIO_NET_HDR_GSO_TCPV4:
+ ip->ip_len = cpu_to_be16(len);
+ eth_fix_ip4_checksum(l3hdr->iov_base, l3hdr->iov_len);
+ break;
+
+ case VIRTIO_NET_HDR_GSO_TCPV6:
+ len -= sizeof(struct ip6_header);
+ ip6->ip6_ctlun.ip6_un1.ip6_un1_plen = cpu_to_be16(len);
+ break;
+ }
+}
+
+static void net_tx_pkt_tcp_fragment_advance(struct NetTxPkt *pkt,
+ struct iovec *fragment,
+ size_t fragment_len,
+ uint8_t gso_type)
+{
+ struct iovec *l3hdr = fragment + NET_TX_PKT_L3HDR_FRAG;
+ struct iovec *l4hdr = fragment + NET_TX_PKT_PL_START_FRAG;
+ struct ip_header *ip = l3hdr->iov_base;
+ struct tcp_hdr *th = l4hdr->iov_base;
+
+ if (gso_type == VIRTIO_NET_HDR_GSO_TCPV4) {
+ ip->ip_id = cpu_to_be16(be16_to_cpu(ip->ip_id) + 1);
+ }
+
+ th->th_seq = cpu_to_be32(be32_to_cpu(th->th_seq) + fragment_len);
+ th->th_flags &= ~TH_CWR;
+}
+
+static void net_tx_pkt_udp_fragment_init(struct NetTxPkt *pkt,
+ int *pl_idx,
+ size_t *l4hdr_len,
+ int *src_idx, size_t *src_offset,
+ size_t *src_len)
+{
+ *pl_idx = NET_TX_PKT_PL_START_FRAG;
+ *l4hdr_len = 0;
+ *src_idx = NET_TX_PKT_PL_START_FRAG;
+ *src_offset = 0;
+ *src_len = IP_FRAG_ALIGN_SIZE(pkt->virt_hdr.gso_size);
+}
+
+static void net_tx_pkt_udp_fragment_fix(struct NetTxPkt *pkt,
+ struct iovec *fragment,
+ size_t fragment_offset,
+ size_t fragment_len)
+{
+ bool more_frags = fragment_offset + fragment_len < pkt->payload_len;
+ uint16_t orig_flags;
+ struct iovec *l3hdr = fragment + NET_TX_PKT_L3HDR_FRAG;
+ struct ip_header *ip = l3hdr->iov_base;
+ uint16_t frag_off_units = fragment_offset / IP_FRAG_UNIT_SIZE;
+ uint16_t new_ip_off;
+
+ assert(fragment_offset % IP_FRAG_UNIT_SIZE == 0);
+ assert((frag_off_units & ~IP_OFFMASK) == 0);
+
+ orig_flags = be16_to_cpu(ip->ip_off) & ~(IP_OFFMASK | IP_MF);
+ new_ip_off = frag_off_units | orig_flags | (more_frags ? IP_MF : 0);
+ ip->ip_off = cpu_to_be16(new_ip_off);
+ ip->ip_len = cpu_to_be16(l3hdr->iov_len + fragment_len);
+
+ eth_fix_ip4_checksum(l3hdr->iov_base, l3hdr->iov_len);
+}
+
static bool net_tx_pkt_do_sw_fragmentation(struct NetTxPkt *pkt,
NetTxPktCallback callback,
void *context)
{
+ uint8_t gso_type = pkt->virt_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN;
+
struct iovec fragment[NET_MAX_FRAG_SG_LIST];
- size_t fragment_len = 0;
- bool more_frags = false;
-
- /* some pointers for shorter code */
- void *l2_iov_base, *l3_iov_base;
- size_t l2_iov_len, l3_iov_len;
- int src_idx = NET_TX_PKT_PL_START_FRAG, dst_idx;
- size_t src_offset = 0;
+ size_t fragment_len;
+ size_t l4hdr_len;
+ size_t src_len;
+
+ int src_idx, dst_idx, pl_idx;
+ size_t src_offset;
size_t fragment_offset = 0;
struct virtio_net_hdr virt_hdr = {
.flags = pkt->virt_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM ?
VIRTIO_NET_HDR_F_DATA_VALID : 0
};
- l2_iov_base = pkt->vec[NET_TX_PKT_L2HDR_FRAG].iov_base;
- l2_iov_len = pkt->vec[NET_TX_PKT_L2HDR_FRAG].iov_len;
- l3_iov_base = pkt->vec[NET_TX_PKT_L3HDR_FRAG].iov_base;
- l3_iov_len = pkt->vec[NET_TX_PKT_L3HDR_FRAG].iov_len;
-
/* Copy headers */
fragment[NET_TX_PKT_VHDR_FRAG].iov_base = &virt_hdr;
fragment[NET_TX_PKT_VHDR_FRAG].iov_len = sizeof(virt_hdr);
- fragment[NET_TX_PKT_L2HDR_FRAG].iov_base = l2_iov_base;
- fragment[NET_TX_PKT_L2HDR_FRAG].iov_len = l2_iov_len;
- fragment[NET_TX_PKT_L3HDR_FRAG].iov_base = l3_iov_base;
- fragment[NET_TX_PKT_L3HDR_FRAG].iov_len = l3_iov_len;
+ fragment[NET_TX_PKT_L2HDR_FRAG] = pkt->vec[NET_TX_PKT_L2HDR_FRAG];
+ fragment[NET_TX_PKT_L3HDR_FRAG] = pkt->vec[NET_TX_PKT_L3HDR_FRAG];
+ switch (gso_type) {
+ case VIRTIO_NET_HDR_GSO_TCPV4:
+ case VIRTIO_NET_HDR_GSO_TCPV6:
+ if (!net_tx_pkt_tcp_fragment_init(pkt, fragment, &pl_idx, &l4hdr_len,
+ &src_idx, &src_offset, &src_len)) {
+ return false;
+ }
+ break;
- /* Put as much data as possible and send */
- do {
- fragment_len = net_tx_pkt_fetch_fragment(pkt, &src_idx, &src_offset,
- fragment, &dst_idx);
+ case VIRTIO_NET_HDR_GSO_UDP:
+ net_tx_pkt_do_sw_csum(pkt, &pkt->vec[NET_TX_PKT_L2HDR_FRAG],
+ pkt->payload_frags + NET_TX_PKT_PL_START_FRAG - 1,
+ pkt->payload_len);
+ net_tx_pkt_udp_fragment_init(pkt, &pl_idx, &l4hdr_len,
+ &src_idx, &src_offset, &src_len);
+ break;
- more_frags = (fragment_offset + fragment_len < pkt->payload_len);
+ default:
+ abort();
+ }
- eth_setup_ip4_fragmentation(l2_iov_base, l2_iov_len, l3_iov_base,
- l3_iov_len, fragment_len, fragment_offset, more_frags);
+ /* Put as much data as possible and send */
+ while (true) {
+ dst_idx = pl_idx;
+ fragment_len = net_tx_pkt_fetch_fragment(pkt,
+ &src_idx, &src_offset, src_len, fragment, &dst_idx);
+ if (!fragment_len) {
+ break;
+ }
- eth_fix_ip4_checksum(l3_iov_base, l3_iov_len);
+ switch (gso_type) {
+ case VIRTIO_NET_HDR_GSO_TCPV4:
+ case VIRTIO_NET_HDR_GSO_TCPV6:
+ net_tx_pkt_tcp_fragment_fix(pkt, fragment, fragment_len, gso_type);
+ net_tx_pkt_do_sw_csum(pkt, fragment + NET_TX_PKT_L2HDR_FRAG,
+ dst_idx - NET_TX_PKT_L2HDR_FRAG,
+ l4hdr_len + fragment_len);
+ break;
+
+ case VIRTIO_NET_HDR_GSO_UDP:
+ net_tx_pkt_udp_fragment_fix(pkt, fragment, fragment_offset,
+ fragment_len);
+ break;
+ }
callback(context,
fragment + NET_TX_PKT_L2HDR_FRAG, dst_idx - NET_TX_PKT_L2HDR_FRAG,
fragment + NET_TX_PKT_VHDR_FRAG, dst_idx - NET_TX_PKT_VHDR_FRAG);
+ if (gso_type == VIRTIO_NET_HDR_GSO_TCPV4 ||
+ gso_type == VIRTIO_NET_HDR_GSO_TCPV6) {
+ net_tx_pkt_tcp_fragment_advance(pkt, fragment, fragment_len,
+ gso_type);
+ }
+
fragment_offset += fragment_len;
+ }
- } while (fragment_len && more_frags);
+ if (gso_type == VIRTIO_NET_HDR_GSO_TCPV4 ||
+ gso_type == VIRTIO_NET_HDR_GSO_TCPV6) {
+ net_tx_pkt_tcp_fragment_deinit(fragment);
+ }
return true;
}
@@ -627,10 +789,6 @@ bool net_tx_pkt_send_custom(struct NetTxPkt *pkt, bool offload,
{
assert(pkt);
- if (!offload && pkt->virt_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
- net_tx_pkt_do_sw_csum(pkt);
- }
-
/*
* Since underlying infrastructure does not support IP datagrams longer
* than 64K we should drop such packets and don't even try to send
@@ -644,6 +802,12 @@ bool net_tx_pkt_send_custom(struct NetTxPkt *pkt, bool offload,
}
if (offload || pkt->virt_hdr.gso_type == VIRTIO_NET_HDR_GSO_NONE) {
+ if (!offload && pkt->virt_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
+ net_tx_pkt_do_sw_csum(pkt, &pkt->vec[NET_TX_PKT_L2HDR_FRAG],
+ pkt->payload_frags + NET_TX_PKT_PL_START_FRAG - 1,
+ pkt->payload_len);
+ }
+
net_tx_pkt_fix_ip6_payload_len(pkt);
callback(context, pkt->vec + NET_TX_PKT_L2HDR_FRAG,
pkt->payload_frags + NET_TX_PKT_PL_START_FRAG - NET_TX_PKT_L2HDR_FRAG,
diff --git a/include/net/eth.h b/include/net/eth.h
index 6e699b0..2b4374f 100644
--- a/include/net/eth.h
+++ b/include/net/eth.h
@@ -400,11 +400,6 @@ void eth_get_protocols(const struct iovec *iov, int iovcnt,
eth_ip4_hdr_info *ip4hdr_info,
eth_l4_hdr_info *l4hdr_info);
-void eth_setup_ip4_fragmentation(const void *l2hdr, size_t l2hdr_len,
- void *l3hdr, size_t l3hdr_len,
- size_t l3payload_len,
- size_t frag_offset, bool more_frags);
-
void
eth_fix_ip4_checksum(void *l3hdr, size_t l3hdr_len);
diff --git a/net/eth.c b/net/eth.c
index f074b2f..36d39b4 100644
--- a/net/eth.c
+++ b/net/eth.c
@@ -315,33 +315,6 @@ eth_strip_vlan_ex(const struct iovec *iov, int iovcnt, size_t iovoff,
}
void
-eth_setup_ip4_fragmentation(const void *l2hdr, size_t l2hdr_len,
- void *l3hdr, size_t l3hdr_len,
- size_t l3payload_len,
- size_t frag_offset, bool more_frags)
-{
- const struct iovec l2vec = {
- .iov_base = (void *) l2hdr,
- .iov_len = l2hdr_len
- };
-
- if (eth_get_l3_proto(&l2vec, 1, l2hdr_len) == ETH_P_IP) {
- uint16_t orig_flags;
- struct ip_header *iphdr = (struct ip_header *) l3hdr;
- uint16_t frag_off_units = frag_offset / IP_FRAG_UNIT_SIZE;
- uint16_t new_ip_off;
-
- assert(frag_offset % IP_FRAG_UNIT_SIZE == 0);
- assert((frag_off_units & ~IP_OFFMASK) == 0);
-
- orig_flags = be16_to_cpu(iphdr->ip_off) & ~(IP_OFFMASK|IP_MF);
- new_ip_off = frag_off_units | orig_flags | (more_frags ? IP_MF : 0);
- iphdr->ip_off = cpu_to_be16(new_ip_off);
- iphdr->ip_len = cpu_to_be16(l3payload_len + l3hdr_len);
- }
-}
-
-void
eth_fix_ip4_checksum(void *l3hdr, size_t l3hdr_len)
{
struct ip_header *iphdr = (struct ip_header *) l3hdr;