All of lore.kernel.org
 help / color / mirror / Atom feed
From: Ed Cashin <ecashin@coraid.com>
To: linux-kernel@vger.kernel.org
Subject: Re: [PATCH 01/14] aoe: for performance support larger packet payloads
Date: Thu, 23 Aug 2012 16:32:44 -0400	[thread overview]
Message-ID: <m2lih5l577.fsf@coraid.com> (raw)
In-Reply-To: 7c895879cfd1e15dd76c2469b417b48d0462d7df.1345743801.git.ecashin@coraid.com

I probably should have Cc'ed the netdev list for this patch.

Ed Cashin <ecashin@coraid.com> writes:

> This patch adds the ability to work with large packets composed of a
> number of segments, using the scatter gather feature of the block
> layer (biovecs) and the network layer (skb frag array).  The
> motivation is the performance gained by using a packet data payload
> greater than a page size and by using the network card's scatter
> gather feature.
>
> Users of the out-of-tree aoe driver already had these changes, but
> since early 2011, they have complained of increased memory utilization
> and higher CPU utilization during heavy writes.[1] The commit below
> appears related, as it disables scatter gather on non-IP protocols
> inside the harmonize_features function, even when the NIC supports sg.
>
>   commit f01a5236bd4b140198fbcc550f085e8361fd73fa
>   Author: Jesse Gross <jesse@nicira.com>
>   Date:   Sun Jan 9 06:23:31 2011 +0000
>
>       net offloading: Generalize netif_get_vlan_features().
>
> With that regression in place, transmits always linearize sg AoE
> packets, but in-kernel users did not have this patch.  Before 2.6.38,
> though, these changes were working to allow sg to increase
> performance.
>
> 1. http://www.spinics.net/lists/linux-mm/msg15184.html
>
> Signed-off-by: Ed Cashin <ecashin@coraid.com>
> ---
>  drivers/block/aoe/aoe.h    |    2 +
>  drivers/block/aoe/aoeblk.c |    3 +
>  drivers/block/aoe/aoecmd.c |  138 ++++++++++++++++++++++++++++++-------------
>  drivers/block/aoe/aoedev.c |    1 +
>  drivers/block/aoe/aoenet.c |   13 +++-
>  5 files changed, 111 insertions(+), 46 deletions(-)
>
> diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h
> index db195ab..8ca8c8a 100644
> --- a/drivers/block/aoe/aoe.h
> +++ b/drivers/block/aoe/aoe.h
> @@ -119,6 +119,8 @@ struct frame {
>  	ulong bcnt;
>  	sector_t lba;
>  	struct sk_buff *skb;
> +	struct bio_vec *bv;
> +	ulong bv_off;
>  };
>  
>  struct aoeif {
> diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
> index 321de7b..1471f81 100644
> --- a/drivers/block/aoe/aoeblk.c
> +++ b/drivers/block/aoe/aoeblk.c
> @@ -279,6 +279,9 @@ aoeblk_gdalloc(void *vp)
>  	if (bdi_init(&d->blkq->backing_dev_info))
>  		goto err_blkq;
>  	spin_lock_irqsave(&d->lock, flags);
> +	blk_queue_max_hw_sectors(d->blkq, BLK_DEF_MAX_SECTORS);
> +	d->blkq->backing_dev_info.ra_pages = BLK_DEF_MAX_SECTORS * 1024;
> +	d->blkq->backing_dev_info.ra_pages /= PAGE_CACHE_SIZE;
>  	gd->major = AOE_MAJOR;
>  	gd->first_minor = d->sysminor * AOE_PARTITIONS;
>  	gd->fops = &aoe_bdops;
> diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
> index de0435e..f10ab49 100644
> --- a/drivers/block/aoe/aoecmd.c
> +++ b/drivers/block/aoe/aoecmd.c
> @@ -164,7 +164,8 @@ freeframe(struct aoedev *d)
>  						rf = f;
>  					continue;
>  				}
> -gotone:				skb_shinfo(skb)->nr_frags = skb->data_len = 0;
> +gotone:				skb->truesize -= skb->data_len;
> +				skb_shinfo(skb)->nr_frags = skb->data_len = 0;
>  				skb_trim(skb, 0);
>  				d->tgt = t;
>  				ifrotate(*t);
> @@ -200,6 +201,24 @@ gotone:				skb_shinfo(skb)->nr_frags = skb->data_len = 0;
>  	return NULL;
>  }
>  
> +static void
> +skb_fillup(struct sk_buff *skb, struct bio_vec *bv, ulong off, ulong cnt)
> +{
> +	int frag = 0;
> +	ulong fcnt;
> +loop:
> +	fcnt = bv->bv_len - (off - bv->bv_offset);
> +	if (fcnt > cnt)
> +		fcnt = cnt;
> +	skb_fill_page_desc(skb, frag++, bv->bv_page, off, fcnt);
> +	cnt -= fcnt;
> +	if (cnt <= 0)
> +		return;
> +	bv++;
> +	off = bv->bv_offset;
> +	goto loop;
> +}
> +
>  static int
>  aoecmd_ata_rw(struct aoedev *d)
>  {
> @@ -210,7 +229,7 @@ aoecmd_ata_rw(struct aoedev *d)
>  	struct bio_vec *bv;
>  	struct aoetgt *t;
>  	struct sk_buff *skb;
> -	ulong bcnt;
> +	ulong bcnt, fbcnt;
>  	char writebit, extbit;
>  
>  	writebit = 0x10;
> @@ -225,8 +244,28 @@ aoecmd_ata_rw(struct aoedev *d)
>  	bcnt = t->ifp->maxbcnt;
>  	if (bcnt == 0)
>  		bcnt = DEFAULTBCNT;
> -	if (bcnt > buf->bv_resid)
> -		bcnt = buf->bv_resid;
> +	if (bcnt > buf->resid)
> +		bcnt = buf->resid;
> +	fbcnt = bcnt;
> +	f->bv = buf->bv;
> +	f->bv_off = f->bv->bv_offset + (f->bv->bv_len - buf->bv_resid);
> +	do {
> +		if (fbcnt < buf->bv_resid) {
> +			buf->bv_resid -= fbcnt;
> +			buf->resid -= fbcnt;
> +			break;
> +		}
> +		fbcnt -= buf->bv_resid;
> +		buf->resid -= buf->bv_resid;
> +		if (buf->resid == 0) {
> +			d->inprocess = NULL;
> +			break;
> +		}
> +		buf->bv++;
> +		buf->bv_resid = buf->bv->bv_len;
> +		WARN_ON(buf->bv_resid == 0);
> +	} while (fbcnt);
> +
>  	/* initialize the headers & frame */
>  	skb = f->skb;
>  	h = (struct aoe_hdr *) skb_mac_header(skb);
> @@ -237,7 +276,6 @@ aoecmd_ata_rw(struct aoedev *d)
>  	t->nout++;
>  	f->waited = 0;
>  	f->buf = buf;
> -	f->bufaddr = page_address(bv->bv_page) + buf->bv_off;
>  	f->bcnt = bcnt;
>  	f->lba = buf->sector;
>  
> @@ -252,10 +290,11 @@ aoecmd_ata_rw(struct aoedev *d)
>  		ah->lba3 |= 0xe0;	/* LBA bit + obsolete 0xa0 */
>  	}
>  	if (bio_data_dir(buf->bio) == WRITE) {
> -		skb_fill_page_desc(skb, 0, bv->bv_page, buf->bv_off, bcnt);
> +		skb_fillup(skb, f->bv, f->bv_off, bcnt);
>  		ah->aflags |= AOEAFL_WRITE;
>  		skb->len += bcnt;
>  		skb->data_len = bcnt;
> +		skb->truesize += bcnt;
>  		t->wpkts++;
>  	} else {
>  		t->rpkts++;
> @@ -266,18 +305,7 @@ aoecmd_ata_rw(struct aoedev *d)
>  
>  	/* mark all tracking fields and load out */
>  	buf->nframesout += 1;
> -	buf->bv_off += bcnt;
> -	buf->bv_resid -= bcnt;
> -	buf->resid -= bcnt;
>  	buf->sector += bcnt >> 9;
> -	if (buf->resid == 0) {
> -		d->inprocess = NULL;
> -	} else if (buf->bv_resid == 0) {
> -		buf->bv = ++bv;
> -		buf->bv_resid = bv->bv_len;
> -		WARN_ON(buf->bv_resid == 0);
> -		buf->bv_off = bv->bv_offset;
> -	}
>  
>  	skb->dev = t->ifp->nd;
>  	skb = skb_clone(skb, GFP_ATOMIC);
> @@ -364,14 +392,12 @@ resend(struct aoedev *d, struct aoetgt *t, struct frame *f)
>  		put_lba(ah, f->lba);
>  
>  		n = f->bcnt;
> -		if (n > DEFAULTBCNT)
> -			n = DEFAULTBCNT;
>  		ah->scnt = n >> 9;
>  		if (ah->aflags & AOEAFL_WRITE) {
> -			skb_fill_page_desc(skb, 0, virt_to_page(f->bufaddr),
> -				offset_in_page(f->bufaddr), n);
> +			skb_fillup(skb, f->bv, f->bv_off, n);
>  			skb->len = sizeof *h + sizeof *ah + n;
>  			skb->data_len = n;
> +			skb->truesize += n;
>  		}
>  	}
>  	skb->dev = t->ifp->nd;
> @@ -530,20 +556,6 @@ rexmit_timer(ulong vp)
>  				ejectif(t, ifp);
>  				ifp = NULL;
>  			}
> -
> -			if (ata_scnt(skb_mac_header(f->skb)) > DEFAULTBCNT / 512
> -			&& ifp && ++ifp->lostjumbo > (t->nframes << 1)
> -			&& ifp->maxbcnt != DEFAULTBCNT) {
> -				printk(KERN_INFO
> -					"aoe: e%ld.%d: "
> -					"too many lost jumbo on "
> -					"%s:%pm - "
> -					"falling back to %d frames.\n",
> -					d->aoemajor, d->aoeminor,
> -					ifp->nd->name, t->addr,
> -					DEFAULTBCNT);
> -				ifp->maxbcnt = 0;
> -			}
>  			resend(d, t, f);
>  		}
>  
> @@ -736,6 +748,45 @@ diskstats(struct gendisk *disk, struct bio *bio, ulong duration, sector_t sector
>  	part_stat_unlock();
>  }
>  
> +static void
> +bvcpy(struct bio_vec *bv, ulong off, struct sk_buff *skb, ulong cnt)
> +{
> +	ulong fcnt;
> +	char *p;
> +	int soff = 0;
> +loop:
> +	fcnt = bv->bv_len - (off - bv->bv_offset);
> +	if (fcnt > cnt)
> +		fcnt = cnt;
> +	p = page_address(bv->bv_page) + off;
> +	skb_copy_bits(skb, soff, p, fcnt);
> +	soff += fcnt;
> +	cnt -= fcnt;
> +	if (cnt <= 0)
> +		return;
> +	bv++;
> +	off = bv->bv_offset;
> +	goto loop;
> +}
> +
> +static void
> +fadvance(struct frame *f, ulong cnt)
> +{
> +	ulong fcnt;
> +
> +	f->lba += cnt >> 9;
> +loop:
> +	fcnt = f->bv->bv_len - (f->bv_off - f->bv->bv_offset);
> +	if (fcnt > cnt) {
> +		f->bv_off += cnt;
> +		return;
> +	}
> +	cnt -= fcnt;
> +	f->bv++;
> +	f->bv_off = f->bv->bv_offset;
> +	goto loop;
> +}
> +
>  void
>  aoecmd_ata_rsp(struct sk_buff *skb)
>  {
> @@ -753,6 +804,7 @@ aoecmd_ata_rsp(struct sk_buff *skb)
>  	u16 aoemajor;
>  
>  	hin = (struct aoe_hdr *) skb_mac_header(skb);
> +	skb_pull(skb, sizeof(*hin));
>  	aoemajor = get_unaligned_be16(&hin->major);
>  	d = aoedev_by_aoeaddr(aoemajor, hin->minor);
>  	if (d == NULL) {
> @@ -790,7 +842,8 @@ aoecmd_ata_rsp(struct sk_buff *skb)
>  
>  	calc_rttavg(d, tsince(f->tag));
>  
> -	ahin = (struct aoe_atahdr *) (hin+1);
> +	ahin = (struct aoe_atahdr *) skb->data;
> +	skb_pull(skb, sizeof(*ahin));
>  	hout = (struct aoe_hdr *) skb_mac_header(f->skb);
>  	ahout = (struct aoe_atahdr *) (hout+1);
>  	buf = f->buf;
> @@ -809,7 +862,7 @@ aoecmd_ata_rsp(struct sk_buff *skb)
>  		switch (ahout->cmdstat) {
>  		case ATA_CMD_PIO_READ:
>  		case ATA_CMD_PIO_READ_EXT:
> -			if (skb->len - sizeof *hin - sizeof *ahin < n) {
> +			if (skb->len < n) {
>  				printk(KERN_ERR
>  					"aoe: %s.  skb->len=%d need=%ld\n",
>  					"runt data size in read", skb->len, n);
> @@ -817,7 +870,7 @@ aoecmd_ata_rsp(struct sk_buff *skb)
>  				spin_unlock_irqrestore(&d->lock, flags);
>  				return;
>  			}
> -			memcpy(f->bufaddr, ahin+1, n);
> +			bvcpy(f->bv, f->bv_off, skb, n);
>  		case ATA_CMD_PIO_WRITE:
>  		case ATA_CMD_PIO_WRITE_EXT:
>  			ifp = getif(t, skb->dev);
> @@ -827,21 +880,22 @@ aoecmd_ata_rsp(struct sk_buff *skb)
>  					ifp->lostjumbo = 0;
>  			}
>  			if (f->bcnt -= n) {
> -				f->lba += n >> 9;
> -				f->bufaddr += n;
> +				fadvance(f, n);
>  				resend(d, t, f);
>  				goto xmit;
>  			}
>  			break;
>  		case ATA_CMD_ID_ATA:
> -			if (skb->len - sizeof *hin - sizeof *ahin < 512) {
> +			if (skb->len < 512) {
>  				printk(KERN_INFO
>  					"aoe: runt data size in ataid.  skb->len=%d\n",
>  					skb->len);
>  				spin_unlock_irqrestore(&d->lock, flags);
>  				return;
>  			}
> -			ataid_complete(d, t, (char *) (ahin+1));
> +			if (skb_linearize(skb))
> +				break;
> +			ataid_complete(d, t, skb->data);
>  			break;
>  		default:
>  			printk(KERN_INFO
> diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c
> index 6b5110a..b2d1fd3 100644
> --- a/drivers/block/aoe/aoedev.c
> +++ b/drivers/block/aoe/aoedev.c
> @@ -182,6 +182,7 @@ skbfree(struct sk_buff *skb)
>  			"cannot free skb -- memory leaked.");
>  		return;
>  	}
> +	skb->truesize -= skb->data_len;
>  	skb_shinfo(skb)->nr_frags = skb->data_len = 0;
>  	skb_trim(skb, 0);
>  	dev_kfree_skb(skb);
> diff --git a/drivers/block/aoe/aoenet.c b/drivers/block/aoe/aoenet.c
> index 4d3bc0d..0787807 100644
> --- a/drivers/block/aoe/aoenet.c
> +++ b/drivers/block/aoe/aoenet.c
> @@ -102,7 +102,9 @@ static int
>  aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt, struct net_device *orig_dev)
>  {
>  	struct aoe_hdr *h;
> +	struct aoe_atahdr *ah;
>  	u32 n;
> +	int sn;
>  
>  	if (dev_net(ifp) != &init_net)
>  		goto exit;
> @@ -110,13 +112,16 @@ aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt,
>  	skb = skb_share_check(skb, GFP_ATOMIC);
>  	if (skb == NULL)
>  		return 0;
> -	if (skb_linearize(skb))
> -		goto exit;
>  	if (!is_aoe_netif(ifp))
>  		goto exit;
>  	skb_push(skb, ETH_HLEN);	/* (1) */
> -
> -	h = (struct aoe_hdr *) skb_mac_header(skb);
> +	sn = sizeof(*h) + sizeof(*ah);
> +	if (skb->len >= sn) {
> +		sn -= skb_headlen(skb);
> +		if (sn > 0 && !__pskb_pull_tail(skb, sn))
> +			goto exit;
> +	}
> +	h = (struct aoe_hdr *) skb->data;
>  	n = get_unaligned_be32(&h->tag);
>  	if ((h->verfl & AOEFL_RSP) == 0 || (n & 1<<31))
>  		goto exit;

-- 
  Ed


  reply	other threads:[~2012-08-23 20:35 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-08-23 17:43 [PATCH 00/14] aoe driver v49 performance and usability improvements Ed Cashin
2012-08-18  1:18 ` [PATCH 01/14] aoe: for performance support larger packet payloads Ed Cashin
2012-08-18  1:18 ` Ed Cashin
2012-08-23 20:32   ` Ed Cashin [this message]
2012-08-23 20:46   ` Ed Cashin
2012-08-24 21:11   ` Andrew Morton
2012-08-18  1:24 ` [PATCH 02/14] aoe: kernel thread handles I/O completions for simple locking Ed Cashin
2012-08-24 21:22   ` Andrew Morton
2012-08-25  0:35     ` Ed Cashin
2012-08-26  1:31     ` ecashin
2012-08-18  1:26 ` [PATCH 03/14] aoe: become I/O request queue handler for increased user control Ed Cashin
2012-08-18  1:26 ` [PATCH 04/14] aoe: use a kernel thread for transmissions Ed Cashin
2012-08-18  1:27 ` [PATCH 05/14] aoe: use packets that work with the smallest-MTU local interface Ed Cashin
2012-08-18  1:27 ` [PATCH 06/14] aoe: failover remote interface based on aoe_deadsecs parameter Ed Cashin
2012-08-18  1:27 ` [PATCH 07/14] aoe: do revalidation steps in order Ed Cashin
2012-08-18  1:27 ` [PATCH 08/14] aoe: disallow unsupported AoE minor addresses Ed Cashin
2012-08-18  1:27 ` [PATCH 11/14] aoe: remove unused code and add cosmetic improvements Ed Cashin
2012-08-18  1:28 ` [PATCH 12/14] aoe: update internal version number to 49 Ed Cashin
2012-08-18  1:28 ` [PATCH 13/14] aoe: update copyright year in touched files Ed Cashin
2012-08-21 14:58 ` [PATCH 09/14] aoe: associate frames with the AoE storage target Ed Cashin
2012-08-21 15:07 ` [PATCH 10/14] aoe: increase net_device reference count while using it Ed Cashin
2012-08-23 17:36 ` [PATCH 14/14] aoe: update documentation with new URL and VM settings reference Ed Cashin
2012-08-24 23:57 ` [PATCH 00/14] aoe driver v49 performance and usability improvements Ed Cashin
  -- strict thread matches above, loose matches on Subject: below --
2012-08-28 12:53 Ed Cashin
2012-08-25 14:39 ` [PATCH 01/14] aoe: for performance support larger packet payloads Ed Cashin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=m2lih5l577.fsf@coraid.com \
    --to=ecashin@coraid.com \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.