From: Jason Wang <jasowang@redhat.com>
To: peter.maydell@linaro.org, qemu-devel@nongnu.org
Cc: Ed Swierk <eswierk@skyportsystems.com>, Jason Wang <jasowang@redhat.com>
Subject: [Qemu-devel] [PULL 02/18] e1000: Separate TSO and non-TSO contexts, fixing UDP TX corruption
Date: Fri, 22 Dec 2017 10:15:21 +0800 [thread overview]
Message-ID: <1513908937-16034-3-git-send-email-jasowang@redhat.com> (raw)
In-Reply-To: <1513908937-16034-1-git-send-email-jasowang@redhat.com>
From: Ed Swierk via Qemu-devel <qemu-devel@nongnu.org>
The device is supposed to maintain two distinct contexts for transmit
offloads: one has parameters for both segmentation and checksum
offload, the other only for checksum offload. The guest driver can
send two context descriptors, one for each context (the TSE flag
specifies which). Then the guest can refer to one or the other context
in subsequent transmit data descriptors, depending on what offloads it
wants applied to each packet.
Currently the e1000 device stores just one context, and misinterprets
the TSE flags in the context and data descriptors. This is often okay:
Linux happens to send a fresh context descriptor before every data
descriptor, so forgetting the other context doesn't matter. Windows
does rely on separate contexts for TSO vs. non-TSO packets, but for
mostly-TCP traffic the two contexts have identical TCP-specific
offload parameters so confusing them doesn't matter.
One case where this confusion matters is when a Windows guest sets up
a TSO context for TCP and a non-TSO context for UDP, and then
transmits both TCP and UDP traffic in parallel. The e1000 device
sometimes ends up using TCP-specific parameters while doing checksum
offload on a UDP datagram: it writes the checksum to offset 16 (the
correct location for a TCP checksum), stomping on two bytes of UDP
data, and leaving the wrong value in the actual UDP checksum field at
offset 6. (Even worse, the host network stack may then recompute the
UDP checksum, "correcting" it to match the corrupt data before sending
it out a physical interface.)
Correct this by tracking the TSO context independently of the non-TSO
context, and selecting the appropriate context based on the TSE flag
in each transmit data descriptor.
Signed-off-by: Ed Swierk <eswierk@skyportsystems.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
hw/net/e1000.c | 70 +++++++++++++++++++++++++++++++++-------------------------
1 file changed, 40 insertions(+), 30 deletions(-)
diff --git a/hw/net/e1000.c b/hw/net/e1000.c
index 30aef93..804ec08 100644
--- a/hw/net/e1000.c
+++ b/hw/net/e1000.c
@@ -101,6 +101,7 @@ typedef struct E1000State_st {
unsigned char sum_needed;
bool cptse;
e1000x_txd_props props;
+ e1000x_txd_props tso_props;
uint16_t tso_frames;
} tx;
@@ -541,35 +542,37 @@ xmit_seg(E1000State *s)
uint16_t len;
unsigned int frames = s->tx.tso_frames, css, sofar;
struct e1000_tx *tp = &s->tx;
+ struct e1000x_txd_props *props = tp->cptse ? &tp->tso_props : &tp->props;
- if (tp->props.tse && tp->cptse) {
- css = tp->props.ipcss;
+ if (tp->cptse) {
+ css = props->ipcss;
DBGOUT(TXSUM, "frames %d size %d ipcss %d\n",
frames, tp->size, css);
- if (tp->props.ip) { /* IPv4 */
+ if (props->ip) { /* IPv4 */
stw_be_p(tp->data+css+2, tp->size - css);
stw_be_p(tp->data+css+4,
lduw_be_p(tp->data + css + 4) + frames);
} else { /* IPv6 */
stw_be_p(tp->data+css+4, tp->size - css);
}
- css = tp->props.tucss;
+ css = props->tucss;
len = tp->size - css;
- DBGOUT(TXSUM, "tcp %d tucss %d len %d\n", tp->props.tcp, css, len);
- if (tp->props.tcp) {
- sofar = frames * tp->props.mss;
+ DBGOUT(TXSUM, "tcp %d tucss %d len %d\n", props->tcp, css, len);
+ if (props->tcp) {
+ sofar = frames * props->mss;
stl_be_p(tp->data+css+4, ldl_be_p(tp->data+css+4)+sofar); /* seq */
- if (tp->props.paylen - sofar > tp->props.mss) {
+ if (props->paylen - sofar > props->mss) {
tp->data[css + 13] &= ~9; /* PSH, FIN */
} else if (frames) {
e1000x_inc_reg_if_not_full(s->mac_reg, TSCTC);
}
- } else /* UDP */
+ } else { /* UDP */
stw_be_p(tp->data+css+4, len);
+ }
if (tp->sum_needed & E1000_TXD_POPTS_TXSM) {
unsigned int phsum;
// add pseudo-header length before checksum calculation
- void *sp = tp->data + tp->props.tucso;
+ void *sp = tp->data + props->tucso;
phsum = lduw_be_p(sp) + len;
phsum = (phsum >> 16) + (phsum & 0xffff);
@@ -579,12 +582,10 @@ xmit_seg(E1000State *s)
}
if (tp->sum_needed & E1000_TXD_POPTS_TXSM) {
- putsum(tp->data, tp->size, tp->props.tucso,
- tp->props.tucss, tp->props.tucse);
+ putsum(tp->data, tp->size, props->tucso, props->tucss, props->tucse);
}
if (tp->sum_needed & E1000_TXD_POPTS_IXSM) {
- putsum(tp->data, tp->size, tp->props.ipcso,
- tp->props.ipcss, tp->props.ipcse);
+ putsum(tp->data, tp->size, props->ipcso, props->ipcss, props->ipcse);
}
if (tp->vlan_needed) {
memmove(tp->vlan, tp->data, 4);
@@ -616,11 +617,11 @@ process_tx_desc(E1000State *s, struct e1000_tx_desc *dp)
s->mit_ide |= (txd_lower & E1000_TXD_CMD_IDE);
if (dtype == E1000_TXD_CMD_DEXT) { /* context descriptor */
- e1000x_read_tx_ctx_descr(xp, &tp->props);
- tp->tso_frames = 0;
- if (tp->props.tucso == 0) { /* this is probably wrong */
- DBGOUT(TXSUM, "TCP/UDP: cso 0!\n");
- tp->props.tucso = tp->props.tucss + (tp->props.tcp ? 16 : 6);
+ if (le32_to_cpu(xp->cmd_and_length) & E1000_TXD_CMD_TSE) {
+ e1000x_read_tx_ctx_descr(xp, &tp->tso_props);
+ tp->tso_frames = 0;
+ } else {
+ e1000x_read_tx_ctx_descr(xp, &tp->props);
}
return;
} else if (dtype == (E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D)) {
@@ -645,8 +646,8 @@ process_tx_desc(E1000State *s, struct e1000_tx_desc *dp)
}
addr = le64_to_cpu(dp->buffer_addr);
- if (tp->props.tse && tp->cptse) {
- msh = tp->props.hdr_len + tp->props.mss;
+ if (tp->cptse) {
+ msh = tp->tso_props.hdr_len + tp->tso_props.mss;
do {
bytes = split_size;
if (tp->size + bytes > msh)
@@ -655,21 +656,19 @@ process_tx_desc(E1000State *s, struct e1000_tx_desc *dp)
bytes = MIN(sizeof(tp->data) - tp->size, bytes);
pci_dma_read(d, addr, tp->data + tp->size, bytes);
sz = tp->size + bytes;
- if (sz >= tp->props.hdr_len && tp->size < tp->props.hdr_len) {
- memmove(tp->header, tp->data, tp->props.hdr_len);
+ if (sz >= tp->tso_props.hdr_len
+ && tp->size < tp->tso_props.hdr_len) {
+ memmove(tp->header, tp->data, tp->tso_props.hdr_len);
}
tp->size = sz;
addr += bytes;
if (sz == msh) {
xmit_seg(s);
- memmove(tp->data, tp->header, tp->props.hdr_len);
- tp->size = tp->props.hdr_len;
+ memmove(tp->data, tp->header, tp->tso_props.hdr_len);
+ tp->size = tp->tso_props.hdr_len;
}
split_size -= bytes;
} while (bytes && split_size);
- } else if (!tp->props.tse && tp->cptse) {
- // context descriptor TSE is not set, while data descriptor TSE is set
- DBGOUT(TXERR, "TCP segmentation error\n");
} else {
split_size = MIN(sizeof(tp->data) - tp->size, split_size);
pci_dma_read(d, addr, tp->data + tp->size, split_size);
@@ -678,7 +677,7 @@ process_tx_desc(E1000State *s, struct e1000_tx_desc *dp)
if (!(txd_lower & E1000_TXD_CMD_EOP))
return;
- if (!(tp->props.tse && tp->cptse && tp->size < tp->props.hdr_len)) {
+ if (!(tp->cptse && tp->size < tp->tso_props.hdr_len)) {
xmit_seg(s);
}
tp->tso_frames = 0;
@@ -1437,7 +1436,7 @@ static const VMStateDescription vmstate_e1000_full_mac_state = {
static const VMStateDescription vmstate_e1000 = {
.name = "e1000",
- .version_id = 2,
+ .version_id = 3,
.minimum_version_id = 1,
.pre_save = e1000_pre_save,
.post_load = e1000_post_load,
@@ -1510,6 +1509,17 @@ static const VMStateDescription vmstate_e1000 = {
VMSTATE_UINT32_SUB_ARRAY(mac_reg, E1000State, RA, 32),
VMSTATE_UINT32_SUB_ARRAY(mac_reg, E1000State, MTA, 128),
VMSTATE_UINT32_SUB_ARRAY(mac_reg, E1000State, VFTA, 128),
+ VMSTATE_UINT8_V(tx.tso_props.ipcss, E1000State, 3),
+ VMSTATE_UINT8_V(tx.tso_props.ipcso, E1000State, 3),
+ VMSTATE_UINT16_V(tx.tso_props.ipcse, E1000State, 3),
+ VMSTATE_UINT8_V(tx.tso_props.tucss, E1000State, 3),
+ VMSTATE_UINT8_V(tx.tso_props.tucso, E1000State, 3),
+ VMSTATE_UINT16_V(tx.tso_props.tucse, E1000State, 3),
+ VMSTATE_UINT32_V(tx.tso_props.paylen, E1000State, 3),
+ VMSTATE_UINT8_V(tx.tso_props.hdr_len, E1000State, 3),
+ VMSTATE_UINT16_V(tx.tso_props.mss, E1000State, 3),
+ VMSTATE_INT8_V(tx.tso_props.ip, E1000State, 3),
+ VMSTATE_INT8_V(tx.tso_props.tcp, E1000State, 3),
VMSTATE_END_OF_LIST()
},
.subsections = (const VMStateDescription*[]) {
--
2.7.4
next prev parent reply other threads:[~2017-12-22 2:15 UTC|newest]
Thread overview: 26+ messages / expand[flat|nested] mbox.gz Atom feed top
2017-12-22 2:15 [Qemu-devel] [PULL 00/18] Net patches Jason Wang
2017-12-22 2:15 ` [Qemu-devel] [PULL 01/18] e1000, e1000e: Move per-packet TX offload flags out of context state Jason Wang
2017-12-22 2:15 ` Jason Wang [this message]
2017-12-22 2:15 ` [Qemu-devel] [PULL 03/18] net: move CRC32 calculation from compute_mcast_idx() into its own net_crc32() function Jason Wang
2017-12-22 2:15 ` [Qemu-devel] [PULL 04/18] net: introduce net_crc32_le() function Jason Wang
2017-12-22 2:15 ` [Qemu-devel] [PULL 05/18] pcnet: switch pcnet over to use net_crc32_le() Jason Wang
2017-12-22 2:15 ` [Qemu-devel] [PULL 06/18] eepro100: switch eepro100 e100_compute_mcast_idx() over to use net_crc32() Jason Wang
2017-12-22 2:15 ` [Qemu-devel] [PULL 07/18] sunhme: switch sunhme over to use net_crc32_le() Jason Wang
2017-12-22 2:15 ` [Qemu-devel] [PULL 08/18] sungem: fix multicast filter CRC calculation Jason Wang
2017-12-22 2:15 ` [Qemu-devel] [PULL 09/18] eepro100: use inline net_crc32() and bitshift instead of compute_mcast_idx() Jason Wang
2017-12-22 2:15 ` [Qemu-devel] [PULL 10/18] opencores_eth: " Jason Wang
2017-12-22 2:15 ` [Qemu-devel] [PULL 11/18] lan9118: " Jason Wang
2017-12-22 2:15 ` [Qemu-devel] [PULL 12/18] ftgmac100: " Jason Wang
2017-12-22 2:15 ` [Qemu-devel] [PULL 13/18] ne2000: " Jason Wang
2017-12-22 2:15 ` [Qemu-devel] [PULL 14/18] rtl8139: " Jason Wang
2017-12-22 2:15 ` [Qemu-devel] [PULL 15/18] net: remove unused compute_mcast_idx() function Jason Wang
2017-12-22 2:15 ` [Qemu-devel] [PULL 16/18] net: Remove the legacy "-net channel" parameter Jason Wang
2017-12-22 2:15 ` [Qemu-devel] [PULL 17/18] qemu-doc: The "-net nic" option can be used with "netdev=...", too Jason Wang
2017-12-22 2:15 ` [Qemu-devel] [PULL 18/18] qemu-doc: Update the deprecation information of -tftp, -bootp, -redir and -smb Jason Wang
2018-01-08 10:16 ` [Qemu-devel] [PULL 00/18] Net patches Peter Maydell
2018-01-08 13:30 ` Ed Swierk
2018-01-08 15:10 ` Eric Blake
2018-01-08 15:18 ` Eric Blake
2018-01-08 15:25 ` Eric Blake
2018-01-08 15:54 ` Ed Swierk
2018-01-08 16:33 ` Eric Blake
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1513908937-16034-3-git-send-email-jasowang@redhat.com \
--to=jasowang@redhat.com \
--cc=eswierk@skyportsystems.com \
--cc=peter.maydell@linaro.org \
--cc=qemu-devel@nongnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).