linuxppc-dev.lists.ozlabs.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 2/2] ehea: Receive SKB Aggregation
@ 2007-05-31 11:54 Thomas Klein
  2007-05-31 13:41 ` Christoph Hellwig
  2007-05-31 16:37 ` Stephen Hemminger
  0 siblings, 2 replies; 10+ messages in thread
From: Thomas Klein @ 2007-05-31 11:54 UTC (permalink / raw)
  To: Jeff Garzik
  Cc: Thomas Klein, Jan-Bernd Themann, netdev, linux-kernel,
	Christoph Raisch, Stefan Roscher, linux-ppc, Marcus Eder

After there were no technical concerns about this patch I'm resending it with
all whitespace issues fixed which were mentioned by Stephen Rothwell.

This patch enables the receive side processing to aggregate TCP packets within
the HEA device driver. It analyses the packets already received after an
interrupt arrived and forwards these as chains of SKBs for the same TCP
connection with modified header field. We have seen a lower CPU load and
improved throughput for small numbers of parallel TCP connections.

We added a disabled module parameter to prevent disruption of normal driver
operation.
We currently consider this as "experimental" until further review and tests
have been passed.


Signed-off-by: Thomas Klein <tklein@de.ibm.com>
---


diff -Nurp -X dontdiff linux-2.6.22-rc3/drivers/net/ehea/ehea.h patched_kernel/drivers/net/ehea/ehea.h
--- linux-2.6.22-rc3/drivers/net/ehea/ehea.h	2007-05-30 13:25:43.000000000 +0200
+++ patched_kernel/drivers/net/ehea/ehea.h	2007-05-30 13:28:16.000000000 +0200
@@ -39,7 +39,7 @@
 #include <asm/io.h>
 
 #define DRV_NAME	"ehea"
-#define DRV_VERSION	"EHEA_0062"
+#define DRV_VERSION	"EHEA_0063"
 
 #define EHEA_MSG_DEFAULT (NETIF_MSG_LINK | NETIF_MSG_TIMER \
 	| NETIF_MSG_RX_ERR | NETIF_MSG_TX_ERR)
@@ -49,6 +49,7 @@
 #define EHEA_MAX_ENTRIES_RQ3 16383
 #define EHEA_MAX_ENTRIES_SQ  32767
 #define EHEA_MIN_ENTRIES_QP  127
+#define EHEA_LRO_MAX_PKTS 60
 
 #define EHEA_SMALL_QUEUES
 #define EHEA_NUM_TX_QP 1
@@ -78,6 +79,9 @@
 #define EHEA_RQ2_PKT_SIZE       1522
 #define EHEA_L_PKT_SIZE         256	/* low latency */
 
+#define MAX_LRO_DESCRIPTORS 8
+#define LRO_DESC_MASK 0xFFFFFFFF
+
 /* Send completion signaling */
 
 /* Protection Domain Identifier */
@@ -334,6 +338,29 @@ struct ehea_q_skb_arr {
 };
 
 /*
+ * Large Receive Offload (LRO) descriptor for a tcp seesion
+ */
+struct ehea_lro {
+	struct sk_buff *parent;
+	struct sk_buff *last_skb;
+	struct iphdr *iph;
+	struct tcphdr *tcph;
+
+	u32 tcp_rcv_tsecr;
+	u32 tcp_rcv_tsval;
+	u32 tcp_ack;
+	u32 tcp_next_seq;
+	u32 skb_tot_frags_len;
+	u16 ip_tot_len;
+	u16 tcp_saw_tstamp; 		/* timestamps enabled */
+	u16 tcp_window;
+	u16 vlan_tag;
+	int skb_sg_cnt;			/* counts aggregated skbs */
+	int vlan_packet;
+	int active;
+};
+
+/*
  * Port resources
  */
 struct ehea_port_res {
@@ -362,6 +389,9 @@ struct ehea_port_res {
 	u64 tx_packets;
 	u64 rx_packets;
 	u32 poll_counter;
+	struct ehea_lro lro[MAX_LRO_DESCRIPTORS];
+	u64 lro_desc;
+	struct port_stats p_state;
 };
 
 
@@ -411,6 +441,7 @@ struct ehea_port {
 	u32 msg_enable;
 	u32 sig_comp_iv;
 	u32 state;
+	u32 lro_max_aggr;
 	u8 full_duplex;
 	u8 autoneg;
 	u8 num_def_qps;
diff -Nurp -X dontdiff linux-2.6.22-rc3/drivers/net/ehea/ehea_main.c patched_kernel/drivers/net/ehea/ehea_main.c
--- linux-2.6.22-rc3/drivers/net/ehea/ehea_main.c	2007-05-30 13:25:43.000000000 +0200
+++ patched_kernel/drivers/net/ehea/ehea_main.c	2007-05-30 13:28:16.000000000 +0200
@@ -34,6 +34,7 @@
 #include <linux/list.h>
 #include <linux/if_ether.h>
 #include <net/ip.h>
+#include <net/tcp.h>
 
 #include "ehea.h"
 #include "ehea_qmr.h"
@@ -52,6 +53,8 @@ static int rq2_entries = EHEA_DEF_ENTRIE
 static int rq3_entries = EHEA_DEF_ENTRIES_RQ3;
 static int sq_entries = EHEA_DEF_ENTRIES_SQ;
 static int use_mcs = 0;
+static int use_lro = 0;
+static int lro_max_pkts = EHEA_LRO_MAX_PKTS;
 static int num_tx_qps = EHEA_NUM_TX_QP;
 
 module_param(msg_level, int, 0);
@@ -60,6 +63,8 @@ module_param(rq2_entries, int, 0);
 module_param(rq3_entries, int, 0);
 module_param(sq_entries, int, 0);
 module_param(use_mcs, int, 0);
+module_param(use_lro, int, 0);
+module_param(lro_max_pkts, int, 0);
 module_param(num_tx_qps, int, 0);
 
 MODULE_PARM_DESC(num_tx_qps, "Number of TX-QPS");
@@ -77,6 +82,9 @@ MODULE_PARM_DESC(sq_entries, " Number of
 		 "[2^x - 1], x = [6..14]. Default = "
 		 __MODULE_STRING(EHEA_DEF_ENTRIES_SQ) ")");
 MODULE_PARM_DESC(use_mcs, " 0:NAPI, 1:Multiple receive queues, Default = 1 ");
+MODULE_PARM_DESC(lro_max_pkts, "LRO: Max packets to be aggregated. Default = "
+		 __MODULE_STRING(EHEA_LRO_MAX_PKTS));
+MODULE_PARM_DESC(use_lro, "1: enable, 0: disable Large Reveive Offload ");
 
 static int port_name_cnt = 0;
 
@@ -380,6 +388,282 @@ static int ehea_treat_poll_error(struct 
 	return 0;
 }
 
+static int try_get_ip_tcp_hdr(struct ehea_cqe *cqe, struct sk_buff *skb,
+			      struct iphdr **iph, struct tcphdr **tcph)
+{
+	int ip_len;
+
+	/* non tcp/udp packets */
+	if (!cqe->header_length)
+		return -1;
+
+	/* non tcp packet */
+	*iph = (struct iphdr *)(skb->data);
+	if ((*iph)->protocol != IPPROTO_TCP)
+		return -1;
+
+	ip_len = (u8)((*iph)->ihl);
+	ip_len <<= 2;
+	*tcph = (struct tcphdr *)(((u64)*iph) + ip_len);
+
+	return 0;
+}
+
+#define TCP_PAYLOAD_LENGTH(iph, tcph) \
+(ntohs(iph->tot_len) - (iph->ihl << 2) - (tcph->doff << 2))
+
+#define IPH_LEN_WO_OPTIONS 5
+#define TCPH_LEN_WO_OPTIONS 5
+#define TCPH_LEN_W_TIMESTAMP 8
+
+static int lro_tcp_check(struct iphdr *iph, struct tcphdr *tcph,
+			 int tcp_data_len, struct ehea_lro *lro)
+{
+	if (tcp_data_len == 0)
+		return -1;
+
+	if (iph->ihl != IPH_LEN_WO_OPTIONS)
+		return -1;
+
+	if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack || tcph->psh
+	    || tcph->rst || tcph->syn || tcph->fin)
+		return -1;
+
+	if (INET_ECN_is_ce(ipv4_get_dsfield(iph)))
+		return -1;
+
+	if (tcph->doff != TCPH_LEN_WO_OPTIONS
+	    && tcph->doff != TCPH_LEN_W_TIMESTAMP)
+		return -1;
+
+	/* check tcp options (only timestamp allowed) */
+	if (tcph->doff == TCPH_LEN_W_TIMESTAMP) {
+		u32 *topt = (u32 *)(tcph + 1);
+
+		if (*topt != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
+				   | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP))
+			return -1;
+
+		/* timestamp should be in right order */
+		topt++;
+		if (lro && (ntohl(lro->tcp_rcv_tsval) > ntohl(*topt)))
+			return -1;
+
+		/* timestamp reply should not be zero */
+		topt++;
+		if (*topt == 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+static void update_tcp_ip_header(struct ehea_lro *lro)
+{
+	struct iphdr *iph = lro->iph;
+	struct tcphdr *tcph = lro->tcph;
+	u32 *p;
+
+	tcph->ack_seq = lro->tcp_ack;
+	tcph->window = lro->tcp_window;
+
+	if (lro->tcp_saw_tstamp) {
+		p = (u32 *)(tcph + 1);
+		*(p+2) = lro->tcp_rcv_tsecr;
+	}
+
+	iph->tot_len = htons(lro->ip_tot_len);
+	iph->check = 0;
+	iph->check = ip_fast_csum((u8 *)lro->iph, iph->ihl);
+}
+
+static void init_lro_desc(struct ehea_lro *lro, struct ehea_cqe *cqe,
+			  struct sk_buff *skb, struct iphdr *iph,
+			  struct tcphdr *tcph, u32 tcp_data_len)
+{
+	u32 *ptr;
+
+	lro->parent = skb;
+	lro->iph = iph;
+	lro->tcph = tcph;
+	lro->tcp_next_seq = ntohl(tcph->seq) + tcp_data_len;
+	lro->tcp_ack = ntohl(tcph->ack_seq);
+	
+	lro->skb_sg_cnt = 1;
+	lro->ip_tot_len = ntohs(iph->tot_len);
+	
+	if (tcph->doff == 8) {
+		ptr = (u32 *)(tcph+1);
+		lro->tcp_saw_tstamp = 1;
+		lro->tcp_rcv_tsval = *(ptr+1);
+		lro->tcp_rcv_tsecr = *(ptr+2);
+	}
+
+	if (cqe->status & EHEA_CQE_VLAN_TAG_XTRACT) {
+		lro->vlan_packet = 1;
+		lro->vlan_tag = cqe->vlan_tag;
+	}
+
+	lro->active = 1;
+}
+
+static inline void clear_lro_desc(struct ehea_lro *lro)
+{
+	memset(lro, 0, sizeof(struct ehea_lro));
+}
+
+static void lro_add_packet(struct ehea_lro *lro, struct sk_buff *skb,
+			   struct tcphdr *tcph, u32 tcp_len)
+{
+	struct sk_buff *parent = lro->parent;
+	u32 *topt;
+
+	lro->skb_sg_cnt++;
+	
+	lro->ip_tot_len += tcp_len;
+	lro->tcp_next_seq += tcp_len;
+	lro->tcp_window = lro->tcph->window;
+	lro->tcp_ack = lro->tcph->ack_seq;
+	
+	if (lro->tcp_saw_tstamp) {
+		topt = (u32 *) (tcph + 1);
+		lro->tcp_rcv_tsval = *(topt + 1);
+		lro->tcp_rcv_tsecr = *(topt + 2);
+	}
+
+	parent->len += tcp_len;
+	parent->data_len += tcp_len;
+
+	skb_pull(skb, (skb->len - tcp_len));
+	parent->truesize += skb->truesize;
+
+	if (lro->last_skb)
+		lro->last_skb->next = skb;
+	else
+		skb_shinfo(parent)->frag_list = skb;
+
+	lro->last_skb = skb;
+
+	return;
+}
+
+static int check_tcp_conn(struct ehea_lro *lro, struct iphdr *iph,
+			 struct tcphdr *tcph)
+{
+	// fixme: compare source and destination at the same time? u64 compare?
+	if ((lro->iph->saddr != iph->saddr) || (lro->iph->daddr != iph->daddr) ||
+	    (lro->tcph->source != tcph->source) || (lro->tcph->dest != tcph->dest))
+		return -1;
+	return 0;
+}
+
+static void flush_lro(struct ehea_port_res *pr, struct ehea_lro *lro)
+{
+	update_tcp_ip_header(lro);
+
+	if (lro->vlan_packet)
+		vlan_hwaccel_receive_skb(lro->parent, pr->port->vgrp,
+					 lro->vlan_tag);
+	else
+		netif_receive_skb(lro->parent);
+
+	clear_lro_desc(lro);
+}
+
+static void flush_all_lro(struct ehea_port_res *pr)
+{
+	int i;
+	struct ehea_lro *lro;
+
+	for (i = 0; i < MAX_LRO_DESCRIPTORS; i++) {
+		lro = &pr->lro[i];
+		if (lro->active)
+			flush_lro(pr, lro);
+	}
+}
+
+static struct ehea_lro *ehea_get_lro(struct ehea_port_res *pr,
+				     struct iphdr *iph, struct tcphdr *tcph)
+{
+	struct ehea_lro *lro = NULL;
+	struct ehea_lro *tmp;
+	int i;
+
+	for (i = 0; i < MAX_LRO_DESCRIPTORS; i++) {
+		tmp = &pr->lro[i];
+		if (tmp->active)
+			if (!check_tcp_conn(tmp, iph, tcph)) {
+				lro = tmp;
+				goto out;
+			}
+	}
+
+	for (i = 0; i < MAX_LRO_DESCRIPTORS; i++) {
+		if(!pr->lro[i].active) {
+			lro = &pr->lro[i];
+			goto out;
+		}
+	}
+
+out:
+	return lro;
+}
+
+static void ehea_proc_skb(struct ehea_port_res *pr, struct ehea_cqe *cqe,
+		     struct sk_buff *skb)
+{
+	struct iphdr *iph;
+	struct tcphdr *tcph;
+	struct ehea_lro *lro;
+	int tcp_data_len;
+	int skip_orig_skb = 0;
+
+	if (use_lro) {
+		if (try_get_ip_tcp_hdr(cqe, skb, &iph, &tcph))
+			goto out;
+
+		lro = ehea_get_lro(pr, iph, tcph);
+		if (!lro)
+			goto out;
+
+		tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);
+
+		if (!lro->active) {
+			if (lro_tcp_check(iph, tcph, tcp_data_len, NULL))
+				goto out;
+
+			init_lro_desc(lro, cqe, skb, iph, tcph, tcp_data_len);
+			return;
+		}
+
+		if (lro->tcp_next_seq != ntohl(tcph->seq)) {
+			flush_lro(pr, lro);
+			goto out;
+		}
+
+		if (lro_tcp_check(iph, tcph, tcp_data_len, lro)) {
+			flush_lro(pr, lro);
+			goto out;
+		}
+
+		lro_add_packet(lro, skb, tcph, tcp_data_len);
+
+		if (lro->skb_sg_cnt > pr->port->lro_max_aggr)
+			flush_lro(pr, lro);
+
+		skip_orig_skb = 1;
+	}
+
+out:
+	if (skip_orig_skb)
+		return;
+
+	if (cqe->status & EHEA_CQE_VLAN_TAG_XTRACT)
+		vlan_hwaccel_receive_skb(skb, pr->port->vgrp, cqe->vlan_tag);
+	else
+		netif_receive_skb(skb);
+}
+
 static struct ehea_cqe *ehea_proc_rwqes(struct net_device *dev,
 					struct ehea_port_res *pr,
 					int *budget)
@@ -426,6 +710,7 @@ static struct ehea_cqe *ehea_proc_rwqes(
 					if (!skb)
 						break;
 				}
+				skb_reserve(skb, NET_IP_ALIGN);
 				skb_copy_to_linear_data(skb, ((char*)cqe) + 64,
 						 cqe->num_bytes_transfered - 4);
 				ehea_fill_skb(port->netdev, skb, cqe);
@@ -451,11 +736,7 @@ static struct ehea_cqe *ehea_proc_rwqes(
 				processed_rq3++;
 			}
 
-			if (cqe->status & EHEA_CQE_VLAN_TAG_XTRACT)
-				vlan_hwaccel_receive_skb(skb, port->vgrp,
-							 cqe->vlan_tag);
-			else
-				netif_receive_skb(skb);
+			ehea_proc_skb(pr, cqe, skb);
 		} else {
 			pr->p_stats.poll_receive_errors++;
 			port_reset = ehea_treat_poll_error(pr, rq, cqe,
@@ -467,6 +748,8 @@ static struct ehea_cqe *ehea_proc_rwqes(
 		cqe = ehea_poll_rq1(qp, &wqe_index);
 	}
 
+	flush_all_lro(pr);
+
 	pr->rx_packets += processed;
 	*budget -= processed;
 
@@ -1683,9 +1966,15 @@ out:
 
 static int ehea_change_mtu(struct net_device *dev, int new_mtu)
 {
+	struct ehea_port *port = netdev_priv(dev);
+
 	if ((new_mtu < 68) || (new_mtu > EHEA_MAX_PACKET_SIZE))
 		return -EINVAL;
 	dev->mtu = new_mtu;
+
+	if (use_lro)
+		port->lro_max_aggr = (0xFFFF / new_mtu);
+
 	return 0;
 }
 
@@ -2493,6 +2782,7 @@ struct ehea_port *ehea_setup_single_port
 	struct ehea_port *port;
 	struct device *port_dev;
 	int jumbo;
+	int lro_pkts;
 
 	/* allocate memory for the port structures */
 	dev = alloc_etherdev(sizeof(struct ehea_port));
@@ -2567,6 +2857,12 @@ struct ehea_port *ehea_setup_single_port
 		goto out_unreg_port;
 	}
 
+	lro_pkts = (0xFFFF / dev->mtu);
+	if (lro_pkts < lro_max_pkts)
+		port->lro_max_aggr = lro_pkts;
+	else
+		port->lro_max_aggr = lro_max_pkts;
+
 	ret = ehea_get_jumboframe_status(port, &jumbo);
 	if (ret)
 		ehea_error("failed determining jumbo frame status for %s",

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] ehea: Receive SKB Aggregation
  2007-05-31 11:54 [PATCH 2/2] ehea: " Thomas Klein
@ 2007-05-31 13:41 ` Christoph Hellwig
  2007-06-04 12:09   ` Christoph Raisch
  2007-05-31 16:37 ` Stephen Hemminger
  1 sibling, 1 reply; 10+ messages in thread
From: Christoph Hellwig @ 2007-05-31 13:41 UTC (permalink / raw)
  To: Thomas Klein
  Cc: Thomas Klein, Jeff Garzik, Jan-Bernd Themann, netdev,
	linux-kernel, Christoph Raisch, Stefan Roscher, linux-ppc,
	Marcus Eder

I'm still very unhappy with having all this in various drivers.  There's
a lot of code that can be turned into generic library functions, and even
more code that could be made generic with some amount of refactoring.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] ehea: Receive SKB Aggregation
  2007-05-31 11:54 [PATCH 2/2] ehea: " Thomas Klein
  2007-05-31 13:41 ` Christoph Hellwig
@ 2007-05-31 16:37 ` Stephen Hemminger
  2007-06-04 12:09   ` Christoph Raisch
  1 sibling, 1 reply; 10+ messages in thread
From: Stephen Hemminger @ 2007-05-31 16:37 UTC (permalink / raw)
  To: Thomas Klein
  Cc: Thomas Klein, Jeff Garzik, Jan-Bernd Themann, netdev, Stefan,
	linux-kernel, Christoph Raisch, Roscher, linux-ppc, Marcus Eder


> 
>  
> +static int try_get_ip_tcp_hdr(struct ehea_cqe *cqe, struct sk_buff *skb,
> +			      struct iphdr **iph, struct tcphdr **tcph)
> +{
> +	int ip_len;
> +
> +	/* non tcp/udp packets */
> +	if (!cqe->header_length)
> +		return -1;
> +
> +	/* non tcp packet */
> +	*iph = (struct iphdr *)(skb->data);

Why the indirection, copying of headers..

> +	if ((*iph)->protocol != IPPROTO_TCP)
> +		return -1;
> +
> +	ip_len = (u8)((*iph)->ihl);
> +	ip_len <<= 2;
> +	*tcph = (struct tcphdr *)(((u64)*iph) + ip_len);
> +
> +	return 0;
> +}
> +
> 

This code seems to be duplicating a lot  (but not all) of the TCP/IP
input path validation checks. This is a security problem if nothing else...

Also, how do you prevent DoS attacks from hostile TCP senders that send
huge number of back to back frames?

-- 
Stephen Hemminger <shemminger@linux-foundation.org>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] ehea: Receive SKB Aggregation
  2007-05-31 16:37 ` Stephen Hemminger
@ 2007-06-04 12:09   ` Christoph Raisch
  0 siblings, 0 replies; 10+ messages in thread
From: Christoph Raisch @ 2007-06-04 12:09 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Thomas Q Klein, ossthema, Jeff Garzik, Jan-Bernd Themann, netdev,
	linux-kernel, Stefan Roscher, linux-ppc, Marcus Eder, tklein



Stephen Hemminger  wrote on 31.05.2007 18:37:03:

>
> >
> >
> > +static int try_get_ip_tcp_hdr(struct ehea_cqe *cqe, struct sk_buff
*skb,
> > +               struct iphdr **iph, struct tcphdr **tcph)
> > +{
> > +   int ip_len;
> > +
> > +   /* non tcp/udp packets */
> > +   if (!cqe->header_length)
> > +      return -1;
> > +
> > +   /* non tcp packet */
> > +   *iph = (struct iphdr *)(skb->data);
>
> Why the indirection, copying of headers..
This interacts with the header split function in the hardware.
>
> > +   if ((*iph)->protocol != IPPROTO_TCP)
> > +      return -1;
> > +
> > +   ip_len = (u8)((*iph)->ihl);
> > +   ip_len <<= 2;
> > +   *tcph = (struct tcphdr *)(((u64)*iph) + ip_len);
> > +
> > +   return 0;
> > +}
> > +
> >
>
> This code seems to be duplicating a lot  (but not all) of the TCP/IP
> input path validation checks. This is a security problem if nothing
else...
>
We should only do aggregation in the driver if this really is a TCP header,
otherwise things will get worse.
You're right, we should at least check that tcph is within the received
frame.

> Also, how do you prevent DoS attacks from hostile TCP senders that send
> huge number of back to back frames?

Actually a huge number of back to back frames is what we would want to
receive
at 10 gbit ;-)
How is it possible to figure out if this is what you want or just DoS?
It doesn't change anything compared to a non LRO driver, we process a
certain
maximum amount of frames before waiting for the next interrupt,
the packet filters/DoS should still see all traffic (which is above the
driver).
Any suggestions how to handle this better/different?

>
> --
> Stephen Hemminger

Gruss / Regards
Christoph Raisch

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] ehea: Receive SKB Aggregation
  2007-05-31 13:41 ` Christoph Hellwig
@ 2007-06-04 12:09   ` Christoph Raisch
  0 siblings, 0 replies; 10+ messages in thread
From: Christoph Raisch @ 2007-06-04 12:09 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Thomas Q Klein, ossthema, Jeff Garzik, Jan-Bernd Themann, netdev,
	linux-kernel, Stefan Roscher, linux-ppc, Marcus Eder, tklein


Christoph Hellwig wrote on 31.05.2007 15:41:18:

> I'm still very unhappy with having all this in various drivers.  There's
> a lot of code that can be turned into generic library functions, and even
> more code that could be made generic with some amount of refactoring.

Yes, we'd also prefer to use a generic function, but we first would want to
get
some "real world" experience how our driver behaves with LRO to be even
able to
define requirements for such a generic function. A lot of this is tied into
pathlengths,
caching, and why does that help compared to a different TCP receive side
processing?
In a perfect world we shouldn't see a diffference if this is enabled or
not,
but measurements indicate something completely different at 10gbit.

Gruss / Regards
Christoph Raisch

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH 2/2] eHEA: Receive SKB Aggregation
@ 2007-07-04 10:09 Jan-Bernd Themann
  2007-07-10 17:00 ` Jeff Garzik
  0 siblings, 1 reply; 10+ messages in thread
From: Jan-Bernd Themann @ 2007-07-04 10:09 UTC (permalink / raw)
  To: Jeff Garzik
  Cc: Thomas Klein, Jan-Bernd Themann, netdev, linux-kernel, linux-ppc,
	Christoph Raisch, Marcus Eder, Stefan Roscher

This patch enables the receive side processing to aggregate TCP packets within
the HEA device driver. It analyses the packets already received after an
interrupt arrived and forwards these as chains of SKBs for the same TCP
connection with modified header field. We have seen a lower CPU load and
improved throughput for small numbers of parallel TCP connections.
As this feature is considered as experimental it is switched off by default
and can be activated via a module parameter.

Some additional security checks have been added since the last posting.

Signed-off-by: Jan-Bernd Themann <themann@de.ibm.com>
---
 drivers/net/ehea/ehea.h      |   30 ++++
 drivers/net/ehea/ehea_main.c |  324 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 348 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ehea/ehea.h b/drivers/net/ehea/ehea.h
index 809e3f2..5a53b79 100644
--- a/drivers/net/ehea/ehea.h
+++ b/drivers/net/ehea/ehea.h
@@ -55,6 +55,7 @@
 #define EHEA_MAX_ENTRIES_RQ3 16383
 #define EHEA_MAX_ENTRIES_SQ  32767
 #define EHEA_MIN_ENTRIES_QP  127
+#define EHEA_LRO_MAX_PKTS 60
 
 #define EHEA_SMALL_QUEUES
 #define EHEA_NUM_TX_QP 1
@@ -84,6 +85,8 @@
 #define EHEA_RQ2_PKT_SIZE       1522
 #define EHEA_L_PKT_SIZE         256	/* low latency */
 
+#define MAX_LRO_DESCRIPTORS 8
+
 /* Send completion signaling */
 
 /* Protection Domain Identifier */
@@ -340,6 +343,29 @@ struct ehea_q_skb_arr {
 };
 
 /*
+ * Large Receive Offload (LRO) descriptor for a tcp session
+ */
+struct ehea_lro {
+	struct sk_buff *parent;
+	struct sk_buff *last_skb;
+	struct iphdr *iph;
+	struct tcphdr *tcph;
+
+	u32 tcp_rcv_tsecr;
+	u32 tcp_rcv_tsval;
+	u32 tcp_ack;
+	u32 tcp_next_seq;
+	u32 skb_tot_frags_len;
+	u16 ip_tot_len;
+	u16 tcp_saw_tstamp; 		/* timestamps enabled */
+	u16 tcp_window;
+	u16 vlan_tag;
+	int skb_sg_cnt;			/* counts aggregated skbs */
+	int vlan_packet;
+	int active;
+};
+
+/*
  * Port resources
  */
 struct ehea_port_res {
@@ -368,6 +394,9 @@ struct ehea_port_res {
 	u64 tx_packets;
 	u64 rx_packets;
 	u32 poll_counter;
+	struct ehea_lro lro[MAX_LRO_DESCRIPTORS];
+	u64 lro_desc;
+	struct port_stats p_state;
 };
 
 
@@ -417,6 +446,7 @@ struct ehea_port {
 	u32 msg_enable;
 	u32 sig_comp_iv;
 	u32 state;
+	u32 lro_max_aggr;
 	u8 full_duplex;
 	u8 autoneg;
 	u8 num_def_qps;
diff --git a/drivers/net/ehea/ehea_main.c b/drivers/net/ehea/ehea_main.c
index f8c0908..5f889af 100644
--- a/drivers/net/ehea/ehea_main.c
+++ b/drivers/net/ehea/ehea_main.c
@@ -34,6 +34,7 @@
 #include <linux/list.h>
 #include <linux/if_ether.h>
 #include <net/ip.h>
+#include <net/tcp.h>
 
 #include "ehea.h"
 #include "ehea_qmr.h"
@@ -52,6 +53,8 @@ static int rq2_entries = EHEA_DEF_ENTRIES_RQ2;
 static int rq3_entries = EHEA_DEF_ENTRIES_RQ3;
 static int sq_entries = EHEA_DEF_ENTRIES_SQ;
 static int use_mcs = 0;
+static int use_lro = 0;
+static int lro_max_pkts = EHEA_LRO_MAX_PKTS;
 static int num_tx_qps = EHEA_NUM_TX_QP;
 
 module_param(msg_level, int, 0);
@@ -60,6 +63,8 @@ module_param(rq2_entries, int, 0);
 module_param(rq3_entries, int, 0);
 module_param(sq_entries, int, 0);
 module_param(use_mcs, int, 0);
+module_param(use_lro, int, 0);
+module_param(lro_max_pkts, int, 0);
 module_param(num_tx_qps, int, 0);
 
 MODULE_PARM_DESC(num_tx_qps, "Number of TX-QPS");
@@ -77,6 +82,10 @@ MODULE_PARM_DESC(sq_entries, " Number of entries for the Send Queue  "
 		 "[2^x - 1], x = [6..14]. Default = "
 		 __MODULE_STRING(EHEA_DEF_ENTRIES_SQ) ")");
 MODULE_PARM_DESC(use_mcs, " 0:NAPI, 1:Multiple receive queues, Default = 1 ");
+MODULE_PARM_DESC(lro_max_pkts, " LRO: Max packets to be aggregated. Default = "
+		 __MODULE_STRING(EHEA_LRO_MAX_PKTS));
+MODULE_PARM_DESC(use_lro, " Large Receive Offload, 1: enable, 0: disable, "
+		 "Default = 0");
 
 static int port_name_cnt = 0;
 
@@ -380,6 +389,297 @@ static int ehea_treat_poll_error(struct ehea_port_res *pr, int rq,
 	return 0;
 }
 
+static int try_get_ip_tcp_hdr(struct ehea_cqe *cqe, struct sk_buff *skb,
+			      struct iphdr **iphdr, struct tcphdr **tcph)
+{
+	unsigned int ip_hdrlength;
+	struct iphdr *iph;
+
+	/* non tcp/udp packets */
+	if (!cqe->header_length)
+		return -1;
+
+	/* non tcp packet */
+	skb_reset_network_header(skb);
+	iph = ip_hdr(skb);
+	if (iph->protocol != IPPROTO_TCP)
+		return -1;
+
+	ip_hdrlength = ip_hdrlen(skb);
+
+	/* check ip header: packet length */
+	if (iph->tot_len > cqe->num_bytes_transfered - ETH_HLEN)
+		return -1;
+
+	/* check ip header: minimal ip header received */
+	if (ip_hdrlength > iph->tot_len - 20)
+		return -1;
+
+	skb_set_transport_header(skb, ip_hdrlength);
+	*tcph = tcp_hdr(skb);
+
+	/* check if ip header and tcp header are complete */
+	if (iph->tot_len < ip_hdrlength + tcp_hdrlen(skb))
+		return -1;
+
+	*iphdr = iph;
+	return 0;
+}
+
+#define TCP_PAYLOAD_LENGTH(iph, tcph) \
+(ntohs(iph->tot_len) - (iph->ihl << 2) - (tcph->doff << 2))
+
+#define IPH_LEN_WO_OPTIONS 5
+#define TCPH_LEN_WO_OPTIONS 5
+#define TCPH_LEN_W_TIMESTAMP 8
+
+static int lro_tcp_check(struct iphdr *iph, struct tcphdr *tcph,
+			 int tcp_data_len, struct ehea_lro *lro)
+{
+	if (tcp_data_len == 0)
+		return -1;
+
+	if (iph->ihl != IPH_LEN_WO_OPTIONS)
+		return -1;
+
+	if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack || tcph->psh
+	    || tcph->rst || tcph->syn || tcph->fin)
+		return -1;
+
+	if (INET_ECN_is_ce(ipv4_get_dsfield(iph)))
+		return -1;
+
+	if (tcph->doff != TCPH_LEN_WO_OPTIONS
+	    && tcph->doff != TCPH_LEN_W_TIMESTAMP)
+		return -1;
+
+	/* check tcp options (only timestamp allowed) */
+	if (tcph->doff == TCPH_LEN_W_TIMESTAMP) {
+		u32 *topt = (u32 *)(tcph + 1);
+
+		if (*topt != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
+				   | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP))
+			return -1;
+
+		/* timestamp should be in right order */
+		topt++;
+		if (lro && (ntohl(lro->tcp_rcv_tsval) > ntohl(*topt)))
+			return -1;
+
+		/* timestamp reply should not be zero */
+		topt++;
+		if (*topt == 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+static void update_tcp_ip_header(struct ehea_lro *lro)
+{
+	struct iphdr *iph = lro->iph;
+	struct tcphdr *tcph = lro->tcph;
+	u32 *p;
+
+	tcph->ack_seq = lro->tcp_ack;
+	tcph->window = lro->tcp_window;
+
+	if (lro->tcp_saw_tstamp) {
+		p = (u32 *)(tcph + 1);
+		*(p+2) = lro->tcp_rcv_tsecr;
+	}
+
+	iph->tot_len = htons(lro->ip_tot_len);
+	iph->check = 0;
+	iph->check = ip_fast_csum((u8 *)lro->iph, iph->ihl);
+}
+
+static void init_lro_desc(struct ehea_lro *lro, struct ehea_cqe *cqe,
+			  struct sk_buff *skb, struct iphdr *iph,
+			  struct tcphdr *tcph, u32 tcp_data_len)
+{
+	u32 *ptr;
+
+	lro->parent = skb;
+	lro->iph = iph;
+	lro->tcph = tcph;
+	lro->tcp_next_seq = ntohl(tcph->seq) + tcp_data_len;
+	lro->tcp_ack = ntohl(tcph->ack_seq);
+
+	lro->skb_sg_cnt = 1;
+	lro->ip_tot_len = ntohs(iph->tot_len);
+
+	if (tcph->doff == 8) {
+		ptr = (u32 *)(tcph+1);
+		lro->tcp_saw_tstamp = 1;
+		lro->tcp_rcv_tsval = *(ptr+1);
+		lro->tcp_rcv_tsecr = *(ptr+2);
+	}
+
+	if (cqe->status & EHEA_CQE_VLAN_TAG_XTRACT) {
+		lro->vlan_packet = 1;
+		lro->vlan_tag = cqe->vlan_tag;
+	}
+
+	lro->active = 1;
+}
+
+static inline void clear_lro_desc(struct ehea_lro *lro)
+{
+	memset(lro, 0, sizeof(struct ehea_lro));
+}
+
+static void lro_add_packet(struct ehea_lro *lro, struct sk_buff *skb,
+			   struct tcphdr *tcph, u32 tcp_len)
+{
+	struct sk_buff *parent = lro->parent;
+	u32 *topt;
+
+	lro->skb_sg_cnt++;
+
+	lro->ip_tot_len += tcp_len;
+	lro->tcp_next_seq += tcp_len;
+	lro->tcp_window = lro->tcph->window;
+	lro->tcp_ack = lro->tcph->ack_seq;
+
+	if (lro->tcp_saw_tstamp) {
+		topt = (u32 *) (tcph + 1);
+		lro->tcp_rcv_tsval = *(topt + 1);
+		lro->tcp_rcv_tsecr = *(topt + 2);
+	}
+
+	parent->len += tcp_len;
+	parent->data_len += tcp_len;
+
+	skb_pull(skb, (skb->len - tcp_len));
+	parent->truesize += skb->truesize;
+
+	if (lro->last_skb)
+		lro->last_skb->next = skb;
+	else
+		skb_shinfo(parent)->frag_list = skb;
+
+	lro->last_skb = skb;
+
+	return;
+}
+
+static int check_tcp_conn(struct ehea_lro *lro, struct iphdr *iph,
+			 struct tcphdr *tcph)
+{
+	if ((lro->iph->saddr != iph->saddr) || (lro->iph->daddr != iph->daddr) ||
+	    (lro->tcph->source != tcph->source) || (lro->tcph->dest != tcph->dest))
+		return -1;
+	return 0;
+}
+
+static void flush_lro(struct ehea_port_res *pr, struct ehea_lro *lro)
+{
+	update_tcp_ip_header(lro);
+
+	if (lro->vlan_packet && pr->port->vgrp)
+		vlan_hwaccel_receive_skb(lro->parent, pr->port->vgrp,
+					 lro->vlan_tag);
+	else
+		netif_receive_skb(lro->parent);
+
+	clear_lro_desc(lro);
+}
+
+static void flush_all_lro(struct ehea_port_res *pr)
+{
+	int i;
+	struct ehea_lro *lro;
+
+	for (i = 0; i < MAX_LRO_DESCRIPTORS; i++) {
+		lro = &pr->lro[i];
+		if (lro->active)
+			flush_lro(pr, lro);
+	}
+}
+
+static struct ehea_lro *ehea_get_lro(struct ehea_port_res *pr,
+				     struct iphdr *iph, struct tcphdr *tcph)
+{
+	struct ehea_lro *lro = NULL;
+	struct ehea_lro *tmp;
+	int i;
+
+	for (i = 0; i < MAX_LRO_DESCRIPTORS; i++) {
+		tmp = &pr->lro[i];
+		if (tmp->active)
+			if (!check_tcp_conn(tmp, iph, tcph)) {
+				lro = tmp;
+				goto out;
+			}
+	}
+
+	for (i = 0; i < MAX_LRO_DESCRIPTORS; i++) {
+		if(!pr->lro[i].active) {
+			lro = &pr->lro[i];
+			goto out;
+		}
+	}
+
+out:
+	return lro;
+}
+
+static void ehea_proc_skb(struct ehea_port_res *pr, struct ehea_cqe *cqe,
+		     struct sk_buff *skb)
+{
+	struct iphdr *iph;
+	struct tcphdr *tcph;
+	struct ehea_lro *lro;
+	int tcp_data_len;
+	int skip_orig_skb = 0;
+
+	if (use_lro) {
+		if (try_get_ip_tcp_hdr(cqe, skb, &iph, &tcph))
+			goto out;
+
+		lro = ehea_get_lro(pr, iph, tcph);
+		if (!lro)
+			goto out;
+
+		tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);
+
+		if (!lro->active) {
+			if (lro_tcp_check(iph, tcph, tcp_data_len, NULL))
+				goto out;
+
+			init_lro_desc(lro, cqe, skb, iph, tcph, tcp_data_len);
+			return;
+		}
+
+		if (lro->tcp_next_seq != ntohl(tcph->seq)) {
+			flush_lro(pr, lro);
+			goto out;
+		}
+
+		if (lro_tcp_check(iph, tcph, tcp_data_len, lro)) {
+			flush_lro(pr, lro);
+			goto out;
+		}
+
+		lro_add_packet(lro, skb, tcph, tcp_data_len);
+
+		if (lro->skb_sg_cnt > pr->port->lro_max_aggr)
+			flush_lro(pr, lro);
+
+		skip_orig_skb = 1;
+	}
+
+out:
+	if (skip_orig_skb)
+		return;
+
+	if ((cqe->status & EHEA_CQE_VLAN_TAG_XTRACT) && pr->port->vgrp)
+		vlan_hwaccel_receive_skb(skb, pr->port->vgrp, cqe->vlan_tag);
+	else
+		netif_receive_skb(skb);
+}
+
 static struct ehea_cqe *ehea_proc_rwqes(struct net_device *dev,
 					struct ehea_port_res *pr,
 					int *budget)
@@ -426,6 +726,7 @@ static struct ehea_cqe *ehea_proc_rwqes(struct net_device *dev,
 					if (!skb)
 						break;
 				}
+				skb_reserve(skb, NET_IP_ALIGN);
 				skb_copy_to_linear_data(skb, ((char*)cqe) + 64,
 						 cqe->num_bytes_transfered - 4);
 				ehea_fill_skb(port->netdev, skb, cqe);
@@ -451,12 +752,7 @@ static struct ehea_cqe *ehea_proc_rwqes(struct net_device *dev,
 				processed_rq3++;
 			}
 
-			if ((cqe->status & EHEA_CQE_VLAN_TAG_XTRACT)
-			    && port->vgrp)
-				vlan_hwaccel_receive_skb(skb, port->vgrp,
-							 cqe->vlan_tag);
-			else
-				netif_receive_skb(skb);
+			ehea_proc_skb(pr, cqe, skb);
 		} else {
 			pr->p_stats.poll_receive_errors++;
 			port_reset = ehea_treat_poll_error(pr, rq, cqe,
@@ -468,6 +764,9 @@ static struct ehea_cqe *ehea_proc_rwqes(struct net_device *dev,
 		cqe = ehea_poll_rq1(qp, &wqe_index);
 	}
 
+	if (use_lro)
+		flush_all_lro(pr);
+
 	pr->rx_packets += processed;
 	*budget -= processed;
 
@@ -1684,9 +1983,15 @@ out:
 
 static int ehea_change_mtu(struct net_device *dev, int new_mtu)
 {
+	struct ehea_port *port = netdev_priv(dev);
+
 	if ((new_mtu < 68) || (new_mtu > EHEA_MAX_PACKET_SIZE))
 		return -EINVAL;
 	dev->mtu = new_mtu;
+
+	if (use_lro)
+		port->lro_max_aggr = (0xFFFF / new_mtu);
+
 	return 0;
 }
 
@@ -2491,6 +2796,7 @@ struct ehea_port *ehea_setup_single_port(struct ehea_adapter *adapter,
 	struct ehea_port *port;
 	struct device *port_dev;
 	int jumbo;
+	int lro_pkts;
 
 	/* allocate memory for the port structures */
 	dev = alloc_etherdev(sizeof(struct ehea_port));
@@ -2565,6 +2871,12 @@ struct ehea_port *ehea_setup_single_port(struct ehea_adapter *adapter,
 		goto out_unreg_port;
 	}
 
+	lro_pkts = (0xFFFF / dev->mtu);
+	if (lro_pkts < lro_max_pkts)
+		port->lro_max_aggr = lro_pkts;
+	else
+		port->lro_max_aggr = lro_max_pkts;
+
 	ret = ehea_get_jumboframe_status(port, &jumbo);
 	if (ret)
 		ehea_error("failed determining jumbo frame status for %s",
-- 
1.5.2

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 2/2] eHEA: Receive SKB Aggregation
@ 2007-07-05  7:26 Jan-Bernd Themann
       [not found] ` <20070705082056.GA358@2ka.mipt.ru>
  0 siblings, 1 reply; 10+ messages in thread
From: Jan-Bernd Themann @ 2007-07-05  7:26 UTC (permalink / raw)
  To: Jeff Garzik
  Cc: Thomas Klein, Jan-Bernd Themann, netdev, linux-kernel, linux-ppc,
	Christoph Raisch, Marcus Eder, Stefan Roscher

This patch enables the receive side processing to aggregate TCP packets within
the HEA device driver. It analyses the packets already received after an
interrupt arrived and forwards these as chains of SKBs for the same TCP
connection with modified header field. We have seen a lower CPU load and
improved throughput for small numbers of parallel TCP connections.
As this feature is considered as experimental it is switched off by default
and can be activated via a module parameter.

Signed-off-by: Jan-Bernd Themann <themann@de.ibm.com>
---
 drivers/net/ehea/ehea.h      |   30 ++++
 drivers/net/ehea/ehea_main.c |  324 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 348 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ehea/ehea.h b/drivers/net/ehea/ehea.h
index f03f070..65e6c8e 100644
--- a/drivers/net/ehea/ehea.h
+++ b/drivers/net/ehea/ehea.h
@@ -55,6 +55,7 @@
 #define EHEA_MAX_ENTRIES_RQ3 16383
 #define EHEA_MAX_ENTRIES_SQ  32767
 #define EHEA_MIN_ENTRIES_QP  127
+#define EHEA_LRO_MAX_PKTS 60
 
 #define EHEA_SMALL_QUEUES
 #define EHEA_NUM_TX_QP 1
@@ -84,6 +85,8 @@
 #define EHEA_RQ2_PKT_SIZE       1522
 #define EHEA_L_PKT_SIZE         256	/* low latency */
 
+#define MAX_LRO_DESCRIPTORS 8
+
 /* Send completion signaling */
 
 /* Protection Domain Identifier */
@@ -340,6 +343,29 @@ struct ehea_q_skb_arr {
 };
 
 /*
+ * Large Receive Offload (LRO) descriptor for a tcp session
+ */
+struct ehea_lro {
+	struct sk_buff *parent;
+	struct sk_buff *last_skb;
+	struct iphdr *iph;
+	struct tcphdr *tcph;
+
+	u32 tcp_rcv_tsecr;
+	u32 tcp_rcv_tsval;
+	u32 tcp_ack;
+	u32 tcp_next_seq;
+	u32 skb_tot_frags_len;
+	u16 ip_tot_len;
+	u16 tcp_saw_tstamp; 		/* timestamps enabled */
+	u16 tcp_window;
+	u16 vlan_tag;
+	int skb_sg_cnt;			/* counts aggregated skbs */
+	int vlan_packet;
+	int active;
+};
+
+/*
  * Port resources
  */
 struct ehea_port_res {
@@ -368,6 +394,9 @@ struct ehea_port_res {
 	u64 tx_packets;
 	u64 rx_packets;
 	u32 poll_counter;
+	struct ehea_lro lro[MAX_LRO_DESCRIPTORS];
+	u64 lro_desc;
+	struct port_stats p_state;
 };
 
 
@@ -417,6 +446,7 @@ struct ehea_port {
 	u32 msg_enable;
 	u32 sig_comp_iv;
 	u32 state;
+	u32 lro_max_aggr;
 	u8 full_duplex;
 	u8 autoneg;
 	u8 num_def_qps;
diff --git a/drivers/net/ehea/ehea_main.c b/drivers/net/ehea/ehea_main.c
index 383144d..c283643 100644
--- a/drivers/net/ehea/ehea_main.c
+++ b/drivers/net/ehea/ehea_main.c
@@ -34,6 +34,7 @@
 #include <linux/list.h>
 #include <linux/if_ether.h>
 #include <net/ip.h>
+#include <net/tcp.h>
 
 #include "ehea.h"
 #include "ehea_qmr.h"
@@ -52,6 +53,8 @@ static int rq2_entries = EHEA_DEF_ENTRIES_RQ2;
 static int rq3_entries = EHEA_DEF_ENTRIES_RQ3;
 static int sq_entries = EHEA_DEF_ENTRIES_SQ;
 static int use_mcs = 0;
+static int use_lro = 0;
+static int lro_max_pkts = EHEA_LRO_MAX_PKTS;
 static int num_tx_qps = EHEA_NUM_TX_QP;
 
 module_param(msg_level, int, 0);
@@ -60,6 +63,8 @@ module_param(rq2_entries, int, 0);
 module_param(rq3_entries, int, 0);
 module_param(sq_entries, int, 0);
 module_param(use_mcs, int, 0);
+module_param(use_lro, int, 0);
+module_param(lro_max_pkts, int, 0);
 module_param(num_tx_qps, int, 0);
 
 MODULE_PARM_DESC(num_tx_qps, "Number of TX-QPS");
@@ -77,6 +82,10 @@ MODULE_PARM_DESC(sq_entries, " Number of entries for the Send Queue  "
 		 "[2^x - 1], x = [6..14]. Default = "
 		 __MODULE_STRING(EHEA_DEF_ENTRIES_SQ) ")");
 MODULE_PARM_DESC(use_mcs, " 0:NAPI, 1:Multiple receive queues, Default = 1 ");
+MODULE_PARM_DESC(lro_max_pkts, " LRO: Max packets to be aggregated. Default = "
+		 __MODULE_STRING(EHEA_LRO_MAX_PKTS));
+MODULE_PARM_DESC(use_lro, " Large Receive Offload, 1: enable, 0: disable, "
+		 "Default = 0");
 
 static int port_name_cnt = 0;
 
@@ -380,6 +389,297 @@ static int ehea_treat_poll_error(struct ehea_port_res *pr, int rq,
 	return 0;
 }
 
+static int try_get_ip_tcp_hdr(struct ehea_cqe *cqe, struct sk_buff *skb,
+			      struct iphdr **iphdr, struct tcphdr **tcph)
+{
+	unsigned int ip_hdrlength;
+	struct iphdr *iph;
+
+	/* non tcp/udp packets */
+	if (!cqe->header_length)
+		return -1;
+
+	/* non tcp packet */
+	skb_reset_network_header(skb);
+	iph = ip_hdr(skb);
+	if (iph->protocol != IPPROTO_TCP)
+		return -1;
+
+	ip_hdrlength = ip_hdrlen(skb);
+
+	/* check ip header: packet length */
+	if (iph->tot_len > cqe->num_bytes_transfered - ETH_HLEN)
+		return -1;
+
+	/* check ip header: minimal ip header received */
+	if (ip_hdrlength > iph->tot_len - 20)
+		return -1;
+
+	skb_set_transport_header(skb, ip_hdrlength);
+	*tcph = tcp_hdr(skb);
+
+	/* check if ip header and tcp header are complete */
+	if (iph->tot_len < ip_hdrlength + tcp_hdrlen(skb))
+		return -1;
+
+	*iphdr = iph;
+	return 0;
+}
+
+#define TCP_PAYLOAD_LENGTH(iph, tcph) \
+(ntohs(iph->tot_len) - (iph->ihl << 2) - (tcph->doff << 2))
+
+#define IPH_LEN_WO_OPTIONS 5
+#define TCPH_LEN_WO_OPTIONS 5
+#define TCPH_LEN_W_TIMESTAMP 8
+
+static int lro_tcp_check(struct iphdr *iph, struct tcphdr *tcph,
+			 int tcp_data_len, struct ehea_lro *lro)
+{
+	if (tcp_data_len == 0)
+		return -1;
+
+	if (iph->ihl != IPH_LEN_WO_OPTIONS)
+		return -1;
+
+	if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack || tcph->psh
+	    || tcph->rst || tcph->syn || tcph->fin)
+		return -1;
+
+	if (INET_ECN_is_ce(ipv4_get_dsfield(iph)))
+		return -1;
+
+	if (tcph->doff != TCPH_LEN_WO_OPTIONS
+	    && tcph->doff != TCPH_LEN_W_TIMESTAMP)
+		return -1;
+
+	/* check tcp options (only timestamp allowed) */
+	if (tcph->doff == TCPH_LEN_W_TIMESTAMP) {
+		u32 *topt = (u32 *)(tcph + 1);
+
+		if (*topt != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
+				   | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP))
+			return -1;
+
+		/* timestamp should be in right order */
+		topt++;
+		if (lro && (ntohl(lro->tcp_rcv_tsval) > ntohl(*topt)))
+			return -1;
+
+		/* timestamp reply should not be zero */
+		topt++;
+		if (*topt == 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+static void update_tcp_ip_header(struct ehea_lro *lro)
+{
+	struct iphdr *iph = lro->iph;
+	struct tcphdr *tcph = lro->tcph;
+	u32 *p;
+
+	tcph->ack_seq = lro->tcp_ack;
+	tcph->window = lro->tcp_window;
+
+	if (lro->tcp_saw_tstamp) {
+		p = (u32 *)(tcph + 1);
+		*(p+2) = lro->tcp_rcv_tsecr;
+	}
+
+	iph->tot_len = htons(lro->ip_tot_len);
+	iph->check = 0;
+	iph->check = ip_fast_csum((u8 *)lro->iph, iph->ihl);
+}
+
+static void init_lro_desc(struct ehea_lro *lro, struct ehea_cqe *cqe,
+			  struct sk_buff *skb, struct iphdr *iph,
+			  struct tcphdr *tcph, u32 tcp_data_len)
+{
+	u32 *ptr;
+
+	lro->parent = skb;
+	lro->iph = iph;
+	lro->tcph = tcph;
+	lro->tcp_next_seq = ntohl(tcph->seq) + tcp_data_len;
+	lro->tcp_ack = ntohl(tcph->ack_seq);
+
+	lro->skb_sg_cnt = 1;
+	lro->ip_tot_len = ntohs(iph->tot_len);
+
+	if (tcph->doff == 8) {
+		ptr = (u32 *)(tcph+1);
+		lro->tcp_saw_tstamp = 1;
+		lro->tcp_rcv_tsval = *(ptr+1);
+		lro->tcp_rcv_tsecr = *(ptr+2);
+	}
+
+	if (cqe->status & EHEA_CQE_VLAN_TAG_XTRACT) {
+		lro->vlan_packet = 1;
+		lro->vlan_tag = cqe->vlan_tag;
+	}
+
+	lro->active = 1;
+}
+
+static inline void clear_lro_desc(struct ehea_lro *lro)
+{
+	memset(lro, 0, sizeof(struct ehea_lro));
+}
+
+static void lro_add_packet(struct ehea_lro *lro, struct sk_buff *skb,
+			   struct tcphdr *tcph, u32 tcp_len)
+{
+	struct sk_buff *parent = lro->parent;
+	u32 *topt;
+
+	lro->skb_sg_cnt++;
+
+	lro->ip_tot_len += tcp_len;
+	lro->tcp_next_seq += tcp_len;
+	lro->tcp_window = lro->tcph->window;
+	lro->tcp_ack = lro->tcph->ack_seq;
+
+	if (lro->tcp_saw_tstamp) {
+		topt = (u32 *) (tcph + 1);
+		lro->tcp_rcv_tsval = *(topt + 1);
+		lro->tcp_rcv_tsecr = *(topt + 2);
+	}
+
+	parent->len += tcp_len;
+	parent->data_len += tcp_len;
+
+	skb_pull(skb, (skb->len - tcp_len));
+	parent->truesize += skb->truesize;
+
+	if (lro->last_skb)
+		lro->last_skb->next = skb;
+	else
+		skb_shinfo(parent)->frag_list = skb;
+
+	lro->last_skb = skb;
+
+	return;
+}
+
+static int check_tcp_conn(struct ehea_lro *lro, struct iphdr *iph,
+			 struct tcphdr *tcph)
+{
+	if ((lro->iph->saddr != iph->saddr) || (lro->iph->daddr != iph->daddr) ||
+	    (lro->tcph->source != tcph->source) || (lro->tcph->dest != tcph->dest))
+		return -1;
+	return 0;
+}
+
+static void flush_lro(struct ehea_port_res *pr, struct ehea_lro *lro)
+{
+	update_tcp_ip_header(lro);
+
+	if (lro->vlan_packet && pr->port->vgrp)
+		vlan_hwaccel_receive_skb(lro->parent, pr->port->vgrp,
+					 lro->vlan_tag);
+	else
+		netif_receive_skb(lro->parent);
+
+	clear_lro_desc(lro);
+}
+
+static void flush_all_lro(struct ehea_port_res *pr)
+{
+	int i;
+	struct ehea_lro *lro;
+
+	for (i = 0; i < MAX_LRO_DESCRIPTORS; i++) {
+		lro = &pr->lro[i];
+		if (lro->active)
+			flush_lro(pr, lro);
+	}
+}
+
+static struct ehea_lro *ehea_get_lro(struct ehea_port_res *pr,
+				     struct iphdr *iph, struct tcphdr *tcph)
+{
+	struct ehea_lro *lro = NULL;
+	struct ehea_lro *tmp;
+	int i;
+
+	for (i = 0; i < MAX_LRO_DESCRIPTORS; i++) {
+		tmp = &pr->lro[i];
+		if (tmp->active)
+			if (!check_tcp_conn(tmp, iph, tcph)) {
+				lro = tmp;
+				goto out;
+			}
+	}
+
+	for (i = 0; i < MAX_LRO_DESCRIPTORS; i++) {
+		if(!pr->lro[i].active) {
+			lro = &pr->lro[i];
+			goto out;
+		}
+	}
+
+out:
+	return lro;
+}
+
+static void ehea_proc_skb(struct ehea_port_res *pr, struct ehea_cqe *cqe,
+		     struct sk_buff *skb)
+{
+	struct iphdr *iph;
+	struct tcphdr *tcph;
+	struct ehea_lro *lro;
+	int tcp_data_len;
+	int skip_orig_skb = 0;
+
+	if (use_lro) {
+		if (try_get_ip_tcp_hdr(cqe, skb, &iph, &tcph))
+			goto out;
+
+		lro = ehea_get_lro(pr, iph, tcph);
+		if (!lro)
+			goto out;
+
+		tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);
+
+		if (!lro->active) {
+			if (lro_tcp_check(iph, tcph, tcp_data_len, NULL))
+				goto out;
+
+			init_lro_desc(lro, cqe, skb, iph, tcph, tcp_data_len);
+			return;
+		}
+
+		if (lro->tcp_next_seq != ntohl(tcph->seq)) {
+			flush_lro(pr, lro);
+			goto out;
+		}
+
+		if (lro_tcp_check(iph, tcph, tcp_data_len, lro)) {
+			flush_lro(pr, lro);
+			goto out;
+		}
+
+		lro_add_packet(lro, skb, tcph, tcp_data_len);
+
+		if (lro->skb_sg_cnt > pr->port->lro_max_aggr)
+			flush_lro(pr, lro);
+
+		skip_orig_skb = 1;
+	}
+
+out:
+	if (skip_orig_skb)
+		return;
+
+	if ((cqe->status & EHEA_CQE_VLAN_TAG_XTRACT) && pr->port->vgrp)
+		vlan_hwaccel_receive_skb(skb, pr->port->vgrp, cqe->vlan_tag);
+	else
+		netif_receive_skb(skb);
+}
+
 static struct ehea_cqe *ehea_proc_rwqes(struct net_device *dev,
 					struct ehea_port_res *pr,
 					int *budget)
@@ -426,6 +726,7 @@ static struct ehea_cqe *ehea_proc_rwqes(struct net_device *dev,
 					if (!skb)
 						break;
 				}
+				skb_reserve(skb, NET_IP_ALIGN);
 				skb_copy_to_linear_data(skb, ((char*)cqe) + 64,
 						 cqe->num_bytes_transfered - 4);
 				ehea_fill_skb(port->netdev, skb, cqe);
@@ -451,12 +752,7 @@ static struct ehea_cqe *ehea_proc_rwqes(struct net_device *dev,
 				processed_rq3++;
 			}
 
-			if ((cqe->status & EHEA_CQE_VLAN_TAG_XTRACT)
-			    && port->vgrp)
-				vlan_hwaccel_receive_skb(skb, port->vgrp,
-							 cqe->vlan_tag);
-			else
-				netif_receive_skb(skb);
+			ehea_proc_skb(pr, cqe, skb);
 		} else {
 			pr->p_stats.poll_receive_errors++;
 			port_reset = ehea_treat_poll_error(pr, rq, cqe,
@@ -468,6 +764,9 @@ static struct ehea_cqe *ehea_proc_rwqes(struct net_device *dev,
 		cqe = ehea_poll_rq1(qp, &wqe_index);
 	}
 
+	if (use_lro)
+		flush_all_lro(pr);
+
 	pr->rx_packets += processed;
 	*budget -= processed;
 
@@ -1684,9 +1983,15 @@ out:
 
 static int ehea_change_mtu(struct net_device *dev, int new_mtu)
 {
+	struct ehea_port *port = netdev_priv(dev);
+
 	if ((new_mtu < 68) || (new_mtu > EHEA_MAX_PACKET_SIZE))
 		return -EINVAL;
 	dev->mtu = new_mtu;
+
+	if (use_lro)
+		port->lro_max_aggr = (0xFFFF / new_mtu);
+
 	return 0;
 }
 
@@ -2491,6 +2796,7 @@ struct ehea_port *ehea_setup_single_port(struct ehea_adapter *adapter,
 	struct ehea_port *port;
 	struct device *port_dev;
 	int jumbo;
+	int lro_pkts;
 
 	/* allocate memory for the port structures */
 	dev = alloc_etherdev(sizeof(struct ehea_port));
@@ -2565,6 +2871,12 @@ struct ehea_port *ehea_setup_single_port(struct ehea_adapter *adapter,
 		goto out_unreg_port;
 	}
 
+	lro_pkts = (0xFFFF / dev->mtu);
+	if (lro_pkts < lro_max_pkts)
+		port->lro_max_aggr = lro_pkts;
+	else
+		port->lro_max_aggr = lro_max_pkts;
+
 	ret = ehea_get_jumboframe_status(port, &jumbo);
 	if (ret)
 		ehea_error("failed determining jumbo frame status for %s",
-- 
1.5.2

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] eHEA: Receive SKB Aggregation, generic LRO helper functions
       [not found] ` <20070705082056.GA358@2ka.mipt.ru>
@ 2007-07-05 14:24   ` Jan-Bernd Themann
  2007-07-05 15:45     ` Evgeniy Polyakov
  0 siblings, 1 reply; 10+ messages in thread
From: Jan-Bernd Themann @ 2007-07-05 14:24 UTC (permalink / raw)
  To: Evgeniy Polyakov
  Cc: Thomas Klein, Jeff Garzik, Jan-Bernd Themann, netdev,
	linux-kernel, Christoph Hellwig, linux-ppc, Christoph Raisch,
	Marcus Eder, Stefan Roscher

Hi, 

thanks for your comment. 

Jan-Bernd

On Thursday 05 July 2007 10:20, you wrote:
> Hi Jan-Bernd.
> 
> [ Dropped spambot/i.e. unrelated mail lists ]
> 
> On Thu, Jul 05, 2007 at 09:26:30AM +0200, Jan-Bernd Themann (ossthema@de.ibm.com) wrote:
> > This patch enables the receive side processing to aggregate TCP packets within
> > the HEA device driver. It analyses the packets already received after an
> > interrupt arrived and forwards these as chains of SKBs for the same TCP
> > connection with modified header field. We have seen a lower CPU load and
> > improved throughput for small numbers of parallel TCP connections.
> > As this feature is considered as experimental it is switched off by default
> > and can be activated via a module parameter.
> 
> I've couple of comments on the driver, but mainly the fact of decreased
> CPU usage itself - what was the magnitude of the win with this driver,
> it looks like because of per-packet receive code path invocation is the
> place of the latency...
> Your implementation looks generic enough to be used by any driver, don't
> you want to push it separately from eHEA driver?
> 

We can try to come up with a generic file with these helperfunctions.
What do you think about putting them into /net/ipv4/inet_lro.c and 
/include/linux/inet_lro.h ?

Latency: We didn't measure it so far, but it leads to a significant improvement
concerning the throughput. Our LRO algorithm only handles X packets a time
(depends on MTU and budget), so the upper bound delay is X*processing a single
packet from driver perspective. 

> 
> > +static int lro_tcp_check(struct iphdr *iph, struct tcphdr *tcph,
> > +			 int tcp_data_len, struct ehea_lro *lro)
> > +{
> > +	if (tcp_data_len == 0)
> > +		return -1;
> > +
> > +	if (iph->ihl != IPH_LEN_WO_OPTIONS)
> > +		return -1;
> > +
> > +	if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack || tcph->psh
> > +	    || tcph->rst || tcph->syn || tcph->fin)
> > +		return -1;
> > +
> > +	if (INET_ECN_is_ce(ipv4_get_dsfield(iph)))
> > +		return -1;
> > +
> > +	if (tcph->doff != TCPH_LEN_WO_OPTIONS
> > +	    && tcph->doff != TCPH_LEN_W_TIMESTAMP)
> > +		return -1;
> > +
> > +	/* check tcp options (only timestamp allowed) */
> > +	if (tcph->doff == TCPH_LEN_W_TIMESTAMP) {
> > +		u32 *topt = (u32 *)(tcph + 1);
> > +
> > +		if (*topt != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
> > +				   | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP))
> > +			return -1;
> > +
> > +		/* timestamp should be in right order */
> > +		topt++;
> > +		if (lro && (ntohl(lro->tcp_rcv_tsval) > ntohl(*topt)))
> > +			return -1;
> 
> This should use before/after technique like PAWS in TCP code or there will be a
> problem with wrapper timestamps.
> 

good point, will look into this

> > +
> > +		/* timestamp reply should not be zero */
> > +		topt++;
> > +		if (*topt == 0)
> > +			return -1;
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> > +static void update_tcp_ip_header(struct ehea_lro *lro)
> > +{
> > +	struct iphdr *iph = lro->iph;
> > +	struct tcphdr *tcph = lro->tcph;
> > +	u32 *p;
> > +
> > +	tcph->ack_seq = lro->tcp_ack;
> > +	tcph->window = lro->tcp_window;
> > +
> > +	if (lro->tcp_saw_tstamp) {
> > +		p = (u32 *)(tcph + 1);
> > +		*(p+2) = lro->tcp_rcv_tsecr;
> > +	}
> > +
> > +	iph->tot_len = htons(lro->ip_tot_len);
> > +	iph->check = 0;
> > +	iph->check = ip_fast_csum((u8 *)lro->iph, iph->ihl);
> > +}
> > +
> > +static void init_lro_desc(struct ehea_lro *lro, struct ehea_cqe *cqe,
> > +			  struct sk_buff *skb, struct iphdr *iph,
> > +			  struct tcphdr *tcph, u32 tcp_data_len)
> > +{
> > +	u32 *ptr;
> > +
> > +	lro->parent = skb;
> > +	lro->iph = iph;
> > +	lro->tcph = tcph;
> > +	lro->tcp_next_seq = ntohl(tcph->seq) + tcp_data_len;
> > +	lro->tcp_ack = ntohl(tcph->ack_seq);
> 
> How do you handle misordering or duplicate acks or resends?
> 

we just forward these packets and leave it to the network stack to
handle them. As soon as we get a packet which does not match we
just flush the LRO session and the current packet 
(forward to stack as separate SKBs)

> > +	lro->skb_sg_cnt = 1;
> > +	lro->ip_tot_len = ntohs(iph->tot_len);
> > +
> > +	if (tcph->doff == 8) {
> > +		ptr = (u32 *)(tcph+1);
> > +		lro->tcp_saw_tstamp = 1;
> > +		lro->tcp_rcv_tsval = *(ptr+1);
> > +		lro->tcp_rcv_tsecr = *(ptr+2);
> > +	}
> > +
> > +	if (cqe->status & EHEA_CQE_VLAN_TAG_XTRACT) {
> > +		lro->vlan_packet = 1;
> > +		lro->vlan_tag = cqe->vlan_tag;
> > +	}
> > +
> > +	lro->active = 1;
> > +}
> 

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] eHEA: Receive SKB Aggregation, generic LRO helper functions
  2007-07-05 14:24   ` [PATCH 2/2] eHEA: Receive SKB Aggregation, generic LRO helper functions Jan-Bernd Themann
@ 2007-07-05 15:45     ` Evgeniy Polyakov
  0 siblings, 0 replies; 10+ messages in thread
From: Evgeniy Polyakov @ 2007-07-05 15:45 UTC (permalink / raw)
  To: Jan-Bernd Themann
  Cc: Thomas Klein, Jeff Garzik, Jan-Bernd Themann, netdev,
	linux-kernel, Christoph Hellwig, linux-ppc, Christoph Raisch,
	Marcus Eder, Stefan Roscher

On Thu, Jul 05, 2007 at 04:24:46PM +0200, Jan-Bernd Themann (ossthema@de.ibm.com) wrote:
> > I've couple of comments on the driver, but mainly the fact of decreased
> > CPU usage itself - what was the magnitude of the win with this driver,
> > it looks like because of per-packet receive code path invocation is the
> > place of the latency...
> > Your implementation looks generic enough to be used by any driver, don't
> > you want to push it separately from eHEA driver?
> > 
> 
> We can try to come up with a generic file with these helperfunctions.
> What do you think about putting them into /net/ipv4/inet_lro.c and 
> /include/linux/inet_lro.h ?

The more I think, the more it looks as appropriate to be used by all hardware
drivers with proper API. As far as I recall this is third implementation
in the linux drivers :)

-- 
	Evgeniy Polyakov

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/2] eHEA: Receive SKB Aggregation
  2007-07-04 10:09 [PATCH 2/2] eHEA: Receive SKB Aggregation Jan-Bernd Themann
@ 2007-07-10 17:00 ` Jeff Garzik
  0 siblings, 0 replies; 10+ messages in thread
From: Jeff Garzik @ 2007-07-10 17:00 UTC (permalink / raw)
  To: Jan-Bernd Themann
  Cc: Thomas Klein, Jan-Bernd Themann, netdev, linux-kernel, linux-ppc,
	Christoph Raisch, Marcus Eder, Stefan Roscher

Jan-Bernd Themann wrote:
> This patch enables the receive side processing to aggregate TCP packets within
> the HEA device driver. It analyses the packets already received after an
> interrupt arrived and forwards these as chains of SKBs for the same TCP
> connection with modified header field. We have seen a lower CPU load and
> improved throughput for small numbers of parallel TCP connections.
> As this feature is considered as experimental it is switched off by default
> and can be activated via a module parameter.
> 
> Some additional security checks have been added since the last posting.
> 
> Signed-off-by: Jan-Bernd Themann <themann@de.ibm.com>

as noted in comments, we don't need to recreate this in every driver. 
do something more generic.

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2007-07-10 17:00 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-07-05  7:26 [PATCH 2/2] eHEA: Receive SKB Aggregation Jan-Bernd Themann
     [not found] ` <20070705082056.GA358@2ka.mipt.ru>
2007-07-05 14:24   ` [PATCH 2/2] eHEA: Receive SKB Aggregation, generic LRO helper functions Jan-Bernd Themann
2007-07-05 15:45     ` Evgeniy Polyakov
  -- strict thread matches above, loose matches on Subject: below --
2007-07-04 10:09 [PATCH 2/2] eHEA: Receive SKB Aggregation Jan-Bernd Themann
2007-07-10 17:00 ` Jeff Garzik
2007-05-31 11:54 [PATCH 2/2] ehea: " Thomas Klein
2007-05-31 13:41 ` Christoph Hellwig
2007-06-04 12:09   ` Christoph Raisch
2007-05-31 16:37 ` Stephen Hemminger
2007-06-04 12:09   ` Christoph Raisch

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).