Netdev List
 help / color / mirror / Atom feed
* Re: [PATCH] dm9000: control debug level of the driver
From: Vladimir Zapolskiy @ 2011-08-19 20:26 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, ben-linux
In-Reply-To: <20110818.220902.1663745747317150778.davem@davemloft.net>

On 19.08.2011 08:09, David Miller wrote:
> From: Vladimir Zapolskiy<vz@mleia.com>
> Date: Mon, 15 Aug 2011 19:38:34 +0300
>
>> This change allows to get driver specific debug messages output
>> setting a default value for db->debug_level. As far as the maximum
>> level of verbosity is too high, it is demoted by default.
>>
>> Signed-off-by: Vladimir Zapolskiy<vz@mleia.com>
>> Cc: Ben Dooks<ben-linux@fluff.org>
>
> I would much rather see this config option eliminated entirely.
>
> The default can be set by the user at kernel boot or module
> load time with command line settings.

This definitely should be an improvement, initially I wanted to fix a 
particular problem, but let's do it even better.

--
With best wishes,
Vladimir

^ permalink raw reply

* [PATCH net-next v5 2/2] af-packet: TPACKET_V3 flexible buffer implementation.
From: Chetan Loke @ 2011-08-19 20:18 UTC (permalink / raw)
  To: netdev, davem; +Cc: Chetan Loke
In-Reply-To: <1313785096-911-1-git-send-email-loke.chetan@gmail.com>

1) Blocks can be configured with non-static frame-size.
2) Read/poll is at a block-level(as opposed to packet-level).
3) Added poll timeout to avoid indefinite user-space wait on idle links.
4) Added user-configurable knobs:
   4.1) block::timeout.
   4.2) tpkt_hdr::sk_rxhash.


Changes:
C1) tpacket_rcv()
    C1.1) packet_current_frame() is replaced by packet_current_rx_frame()
          The bulk of the processing is then moved in the following chain:
          packet_current_rx_frame()
            __packet_lookup_frame_in_block
              fill_curr_block()
              or
                retire_current_block
                dispatch_next_block
              or
              return NULL(queue is plugged/paused)

Signed-off-by: Chetan Loke <loke.chetan@gmail.com>
---
 net/packet/af_packet.c |  937 +++++++++++++++++++++++++++++++++++++++++++++---
 1 files changed, 891 insertions(+), 46 deletions(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index c698cec..4371e3a 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -40,6 +40,10 @@
  *					byte arrays at the end of sockaddr_ll
  *					and packet_mreq.
  *		Johann Baudy	:	Added TX RING.
+ *		Chetan Loke	:	Implemented TPACKET_V3 block abstraction
+ *					layer.
+ *					Copyright (C) 2011, <lokec@ccs.neu.edu>
+ *
  *
  *		This program is free software; you can redistribute it and/or
  *		modify it under the terms of the GNU General Public License
@@ -161,9 +165,56 @@ struct packet_mreq_max {
 	unsigned char	mr_address[MAX_ADDR_LEN];
 };
 
-static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
+static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 		int closing, int tx_ring);
 
+
+#define V3_ALIGNMENT	(8)
+
+#define BLK_HDR_LEN	(ALIGN(sizeof(struct block_desc), V3_ALIGNMENT))
+
+#define BLK_PLUS_PRIV(sz_of_priv) \
+	(BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
+
+/* kbdq - kernel block descriptor queue */
+struct kbdq_core {
+	struct pgv	*pkbdq;
+	unsigned int	feature_req_word;
+	unsigned int	hdrlen;
+	unsigned char	reset_pending_on_curr_blk;
+	unsigned char   delete_blk_timer;
+	unsigned short	kactive_blk_num;
+	unsigned short	blk_sizeof_priv;
+
+	/* last_kactive_blk_num:
+	 * trick to see if user-space has caught up
+	 * in order to avoid refreshing timer when every single pkt arrives.
+	 */
+	unsigned short	last_kactive_blk_num;
+
+	char		*pkblk_start;
+	char		*pkblk_end;
+	int		kblk_size;
+	unsigned int	knum_blocks;
+	uint64_t	knxt_seq_num;
+	char		*prev;
+	char		*nxt_offset;
+	struct sk_buff	*skb;
+
+	atomic_t	blk_fill_in_prog;
+
+	/* Default is set to 8ms */
+#define DEFAULT_PRB_RETIRE_TOV	(8)
+
+	unsigned short  retire_blk_tov;
+	unsigned short  version;
+	unsigned long	tov_in_jiffies;
+
+	/* timer to retire an outstanding block */
+	struct timer_list retire_blk_timer;
+};
+
+#define PGV_FROM_VMALLOC 1
 struct pgv {
 	char *buffer;
 };
@@ -179,12 +230,40 @@ struct packet_ring_buffer {
 	unsigned int		pg_vec_pages;
 	unsigned int		pg_vec_len;
 
+	struct kbdq_core	prb_bdqc;
 	atomic_t		pending;
 };
 
+#define BLOCK_STATUS(x)	((x)->hdr.bh1.block_status)
+#define BLOCK_NUM_PKTS(x)	((x)->hdr.bh1.num_pkts)
+#define BLOCK_O2FP(x)		((x)->hdr.bh1.offset_to_first_pkt)
+#define BLOCK_LEN(x)		((x)->hdr.bh1.blk_len)
+#define BLOCK_SNUM(x)		((x)->hdr.bh1.seq_num)
+#define BLOCK_O2PRIV(x)	((x)->offset_to_priv)
+#define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))
+
 struct packet_sock;
 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
 
+static void *packet_previous_frame(struct packet_sock *po,
+		struct packet_ring_buffer *rb,
+		int status);
+static void packet_increment_head(struct packet_ring_buffer *buff);
+static int prb_curr_blk_in_use(struct kbdq_core *,
+			struct block_desc *);
+static void *prb_dispatch_next_block(struct kbdq_core *,
+			struct packet_sock *);
+static void prb_retire_current_block(struct kbdq_core *,
+		struct packet_sock *, unsigned int status);
+static int prb_queue_frozen(struct kbdq_core *);
+static void prb_open_block(struct kbdq_core *, struct block_desc *);
+static void prb_retire_rx_blk_timer_expired(unsigned long);
+static void _prb_refresh_rx_retire_blk_timer(struct kbdq_core *);
+static void prb_init_blk_timer(struct packet_sock *, struct kbdq_core *,
+				void (*func) (unsigned long));
+static void prb_fill_rxhash(struct kbdq_core *, struct tpacket3_hdr *);
+static void prb_clear_rxhash(struct kbdq_core *, struct tpacket3_hdr *);
+static void prb_fill_vlan_info(struct kbdq_core *, struct tpacket3_hdr *);
 static void packet_flush_mclist(struct sock *sk);
 
 struct packet_fanout;
@@ -193,6 +272,7 @@ struct packet_sock {
 	struct sock		sk;
 	struct packet_fanout	*fanout;
 	struct tpacket_stats	stats;
+	union  tpacket_stats_u	stats_u;
 	struct packet_ring_buffer	rx_ring;
 	struct packet_ring_buffer	tx_ring;
 	int			copy_thresh;
@@ -242,6 +322,15 @@ struct packet_skb_cb {
 
 #define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))
 
+#define GET_PBDQC_FROM_RB(x)	((struct kbdq_core *)(&(x)->prb_bdqc))
+#define GET_PBLOCK_DESC(x, bid)	\
+	((struct block_desc *)((x)->pkbdq[(bid)].buffer))
+#define GET_CURR_PBLOCK_DESC_FROM_CORE(x)	\
+	((struct block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
+#define GET_NEXT_PRB_BLK_NUM(x) \
+	(((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
+	((x)->kactive_blk_num+1) : 0)
+
 static inline struct packet_sock *pkt_sk(struct sock *sk)
 {
 	return (struct packet_sock *)sk;
@@ -325,8 +414,9 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
 		h.h2->tp_status = status;
 		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
 		break;
+	case TPACKET_V3:
 	default:
-		pr_err("TPACKET version not supported\n");
+		WARN(1, "TPACKET version not supported.\n");
 		BUG();
 	}
 
@@ -351,8 +441,9 @@ static int __packet_get_status(struct packet_sock *po, void *frame)
 	case TPACKET_V2:
 		flush_dcache_page(pgv_to_page(&h.h2->tp_status));
 		return h.h2->tp_status;
+	case TPACKET_V3:
 	default:
-		pr_err("TPACKET version not supported\n");
+		WARN(1, "TPACKET version not supported.\n");
 		BUG();
 		return 0;
 	}
@@ -389,6 +480,665 @@ static inline void *packet_current_frame(struct packet_sock *po,
 	return packet_lookup_frame(po, rb, rb->head, status);
 }
 
+static void prb_del_retire_blk_timer(struct kbdq_core *pkc)
+{
+	del_timer_sync(&pkc->retire_blk_timer);
+}
+
+static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
+		int tx_ring,
+		struct sk_buff_head *rb_queue)
+{
+	struct kbdq_core *pkc;
+
+	pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
+
+	spin_lock(&rb_queue->lock);
+	pkc->delete_blk_timer = 1;
+	spin_unlock(&rb_queue->lock);
+
+	prb_del_retire_blk_timer(pkc);
+}
+
+static void prb_init_blk_timer(struct packet_sock *po,
+		struct kbdq_core *pkc,
+		void (*func) (unsigned long))
+{
+	init_timer(&pkc->retire_blk_timer);
+	pkc->retire_blk_timer.data = (long)po;
+	pkc->retire_blk_timer.function = func;
+	pkc->retire_blk_timer.expires = jiffies;
+}
+
+static void prb_setup_retire_blk_timer(struct packet_sock *po, int tx_ring)
+{
+	struct kbdq_core *pkc;
+
+	if (tx_ring)
+		BUG();
+
+	pkc = tx_ring ? &po->tx_ring.prb_bdqc : &po->rx_ring.prb_bdqc;
+	prb_init_blk_timer(po, pkc, prb_retire_rx_blk_timer_expired);
+}
+
+static int prb_calc_retire_blk_tmo(struct packet_sock *po,
+				int blk_size_in_bytes)
+{
+	struct net_device *dev;
+	unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
+
+	dev = dev_get_by_index(sock_net(&po->sk), po->ifindex);
+	if (unlikely(dev == NULL))
+		return DEFAULT_PRB_RETIRE_TOV;
+
+	if (dev->ethtool_ops && dev->ethtool_ops->get_settings) {
+		struct ethtool_cmd ecmd = { .cmd = ETHTOOL_GSET, };
+
+		if (!dev->ethtool_ops->get_settings(dev, &ecmd)) {
+			switch (ecmd.speed) {
+			case SPEED_10000:
+				msec = 1;
+				div = 10000/1000;
+				break;
+			case SPEED_1000:
+				msec = 1;
+				div = 1000/1000;
+				break;
+			/*
+			 * If the link speed is so slow you don't really
+			 * need to worry about perf anyways
+			 */
+			case SPEED_100:
+			case SPEED_10:
+			default:
+				return DEFAULT_PRB_RETIRE_TOV;
+			}
+		}
+	}
+
+	mbits = (blk_size_in_bytes * 8) / (1024 * 1024);
+
+	if (div)
+		mbits /= div;
+
+	tmo = mbits * msec;
+
+	if (div)
+		return tmo+1;
+	return tmo;
+}
+
+static void prb_init_ft_ops(struct kbdq_core *p1,
+			union tpacket_req_u *req_u)
+{
+	p1->feature_req_word = req_u->req3.tp_feature_req_word;
+}
+
+static void init_prb_bdqc(struct packet_sock *po,
+			struct packet_ring_buffer *rb,
+			struct pgv *pg_vec,
+			union tpacket_req_u *req_u, int tx_ring)
+{
+	struct kbdq_core *p1 = &rb->prb_bdqc;
+	struct block_desc *pbd;
+
+	memset(p1, 0x0, sizeof(*p1));
+
+	p1->knxt_seq_num = 1;
+	p1->pkbdq = pg_vec;
+	pbd = (struct block_desc *)pg_vec[0].buffer;
+	p1->pkblk_start	= (char *)pg_vec[0].buffer;
+	p1->kblk_size = req_u->req3.tp_block_size;
+	p1->knum_blocks	= req_u->req3.tp_block_nr;
+	p1->hdrlen = po->tp_hdrlen;
+	p1->version = po->tp_version;
+	p1->last_kactive_blk_num = 0;
+	po->stats_u.stats3.tp_freeze_q_cnt = 0;
+	if (req_u->req3.tp_retire_blk_tov)
+		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
+	else
+		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
+						req_u->req3.tp_block_size);
+	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
+	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
+
+	prb_init_ft_ops(p1, req_u);
+	prb_setup_retire_blk_timer(po, tx_ring);
+	prb_open_block(p1, pbd);
+}
+
+/*  Do NOT update the last_blk_num first.
+ *  Assumes sk_buff_head lock is held.
+ */
+static void _prb_refresh_rx_retire_blk_timer(struct kbdq_core *pkc)
+{
+	mod_timer(&pkc->retire_blk_timer,
+			jiffies + pkc->tov_in_jiffies);
+	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
+}
+
+/*
+ * Timer logic:
+ * 1) We refresh the timer only when we open a block.
+ *    By doing this we don't waste cycles refreshing the timer
+ *	  on packet-by-packet basis.
+ *
+ * With a 1MB block-size, on a 1Gbps line, it will take
+ * i) ~8 ms to fill a block + ii) memcpy etc.
+ * In this cut we are not accounting for the memcpy time.
+ *
+ * So, if the user sets the 'tmo' to 10ms then the timer
+ * will never fire while the block is still getting filled
+ * (which is what we want). However, the user could choose
+ * to close a block early and that's fine.
+ *
+ * But when the timer does fire, we check whether or not to refresh it.
+ * Since the tmo granularity is in msecs, it is not too expensive
+ * to refresh the timer, lets say every '8' msecs.
+ * Either the user can set the 'tmo' or we can derive it based on
+ * a) line-speed and b) block-size.
+ * prb_calc_retire_blk_tmo() calculates the tmo.
+ *
+ */
+static void prb_retire_rx_blk_timer_expired(unsigned long data)
+{
+	struct packet_sock *po = (struct packet_sock *)data;
+	struct kbdq_core *pkc = &po->rx_ring.prb_bdqc;
+	unsigned int frozen;
+	struct block_desc *pbd;
+
+	spin_lock(&po->sk.sk_receive_queue.lock);
+
+	frozen = prb_queue_frozen(pkc);
+	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
+
+	if (unlikely(pkc->delete_blk_timer))
+		goto out;
+
+	/* We only need to plug the race when the block is partially filled.
+	 * tpacket_rcv:
+	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
+	 *		copy_bits() is in progress ...
+	 *		timer fires on other cpu:
+	 *		we can't retire the current block because copy_bits
+	 *		is in progress.
+	 *
+	 */
+	if (BLOCK_NUM_PKTS(pbd)) {
+		while (atomic_read(&pkc->blk_fill_in_prog)) {
+			/* Waiting for skb_copy_bits to finish... */
+			cpu_relax();
+		}
+	}
+
+	if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
+		if (!frozen) {
+			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
+			if (!prb_dispatch_next_block(pkc, po))
+				goto refresh_timer;
+			else
+				goto out;
+		} else {
+			/* Case 1. Queue was frozen because user-space was
+			 *	   lagging behind.
+			 */
+			if (prb_curr_blk_in_use(pkc, pbd)) {
+				/*
+				 * Ok, user-space is still behind.
+				 * So just refresh the timer.
+				 */
+				goto refresh_timer;
+			} else {
+			       /* Case 2. queue was frozen,user-space caught up,
+				* now the link went idle && the timer fired.
+				* We don't have a block to close.So we open this
+				* block and restart the timer.
+				* opening a block thaws the queue,restarts timer
+				* Thawing/timer-refresh is a side effect.
+				*/
+				prb_open_block(pkc, pbd);
+				goto out;
+			}
+		}
+	}
+
+refresh_timer:
+	_prb_refresh_rx_retire_blk_timer(pkc);
+
+out:
+	spin_unlock(&po->sk.sk_receive_queue.lock);
+}
+
+static inline void prb_flush_block(struct kbdq_core *pkc1,
+		struct block_desc *pbd1, __u32 status)
+{
+	/* Flush everything minus the block header */
+
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
+	u8 *start, *end;
+
+	start = (u8 *)pbd1;
+
+	/* Skip the block header(we know header WILL fit in 4K) */
+	start += PAGE_SIZE;
+
+	end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
+	for (; start < end; start += PAGE_SIZE)
+		flush_dcache_page(pgv_to_page(start));
+
+	smp_wmb();
+#endif
+
+	/* Now update the block status. */
+
+	BLOCK_STATUS(pbd1) = status;
+
+	/* Flush the block header */
+
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
+	start = (u8 *)pbd1;
+	flush_dcache_page(pgv_to_page(start));
+
+	smp_wmb();
+#endif
+}
+
+/*
+ * Side effect:
+ *
+ * 1) flush the block
+ * 2) Increment active_blk_num
+ *
+ * Note:We DONT refresh the timer on purpose.
+ *	Because almost always the next block will be opened.
+ */
+static void prb_close_block(struct kbdq_core *pkc1, struct block_desc *pbd1,
+		struct packet_sock *po, unsigned int stat)
+{
+	__u32 status = TP_STATUS_USER | stat;
+
+	struct tpacket3_hdr *last_pkt;
+	struct hdr_v1 *h1 = &pbd1->hdr.bh1;
+
+	if (po->stats.tp_drops)
+		status |= TP_STATUS_LOSING;
+
+	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
+	last_pkt->tp_next_offset = 0;
+
+	/* Get the ts of the last pkt */
+	if (BLOCK_NUM_PKTS(pbd1)) {
+		h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
+		h1->ts_last_pkt.ts_nsec	= last_pkt->tp_nsec;
+	} else {
+		/* Ok, we tmo'd - so get the current time */
+		struct timespec ts;
+		getnstimeofday(&ts);
+		h1->ts_last_pkt.ts_sec = ts.tv_sec;
+		h1->ts_last_pkt.ts_nsec	= ts.tv_nsec;
+	}
+
+	smp_wmb();
+
+	/* Flush the block */
+	prb_flush_block(pkc1, pbd1, status);
+
+	pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
+}
+
+static inline void prb_thaw_queue(struct kbdq_core *pkc)
+{
+	pkc->reset_pending_on_curr_blk = 0;
+}
+
+/*
+ * Side effect of opening a block:
+ *
+ * 1) prb_queue is thawed.
+ * 2) retire_blk_timer is refreshed.
+ *
+ */
+static void prb_open_block(struct kbdq_core *pkc1, struct block_desc *pbd1)
+{
+	struct timespec ts;
+	struct hdr_v1 *h1 = &pbd1->hdr.bh1;
+
+	smp_rmb();
+
+	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd1))) {
+
+		/* We could have just memset this but we will lose the
+		 * flexibility of making the priv area sticky
+		 */
+		BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
+		BLOCK_NUM_PKTS(pbd1) = 0;
+		BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
+		getnstimeofday(&ts);
+		h1->ts_first_pkt.ts_sec = ts.tv_sec;
+		h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
+		pkc1->pkblk_start = (char *)pbd1;
+		pkc1->nxt_offset = (char *)(pkc1->pkblk_start +
+		BLK_PLUS_PRIV(pkc1->blk_sizeof_priv));
+		BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
+		BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
+		pbd1->version = pkc1->version;
+		pkc1->prev = pkc1->nxt_offset;
+		pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
+		prb_thaw_queue(pkc1);
+		_prb_refresh_rx_retire_blk_timer(pkc1);
+
+		smp_wmb();
+
+		return;
+	}
+
+	WARN(1, "ERROR block:%p is NOT FREE status:%d kactive_blk_num:%d\n",
+		pbd1, BLOCK_STATUS(pbd1), pkc1->kactive_blk_num);
+	dump_stack();
+	BUG();
+}
+
+/*
+ * Queue freeze logic:
+ * 1) Assume tp_block_nr = 8 blocks.
+ * 2) At time 't0', user opens Rx ring.
+ * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
+ * 4) user-space is either sleeping or processing block '0'.
+ * 5) tpacket_rcv is currently filling block '7', since there is no space left,
+ *    it will close block-7,loop around and try to fill block '0'.
+ *    call-flow:
+ *    __packet_lookup_frame_in_block
+ *      prb_retire_current_block()
+ *      prb_dispatch_next_block()
+ *        |->(BLOCK_STATUS == USER) evaluates to true
+ *    5.1) Since block-0 is currently in-use, we just freeze the queue.
+ * 6) Now there are two cases:
+ *    6.1) Link goes idle right after the queue is frozen.
+ *         But remember, the last open_block() refreshed the timer.
+ *         When this timer expires,it will refresh itself so that we can
+ *         re-open block-0 in near future.
+ *    6.2) Link is busy and keeps on receiving packets. This is a simple
+ *         case and __packet_lookup_frame_in_block will check if block-0
+ *         is free and can now be re-used.
+ */
+static inline void prb_freeze_queue(struct kbdq_core *pkc,
+				  struct packet_sock *po)
+{
+	pkc->reset_pending_on_curr_blk = 1;
+	po->stats_u.stats3.tp_freeze_q_cnt++;
+}
+
+#define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
+
+/*
+ * If the next block is free then we will dispatch it
+ * and return a good offset.
+ * Else, we will freeze the queue.
+ * So, caller must check the return value.
+ */
+static void *prb_dispatch_next_block(struct kbdq_core *pkc,
+		struct packet_sock *po)
+{
+	struct block_desc *pbd;
+
+	smp_rmb();
+
+	/* 1. Get current block num */
+	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
+
+	/* 2. If this block is currently in_use then freeze the queue */
+	if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
+		prb_freeze_queue(pkc, po);
+		return NULL;
+	}
+
+	/*
+	 * 3.
+	 * open this block and return the offset where the first packet
+	 * needs to get stored.
+	 */
+	prb_open_block(pkc, pbd);
+	return (void *)pkc->nxt_offset;
+}
+
+static void prb_retire_current_block(struct kbdq_core *pkc,
+		struct packet_sock *po, unsigned int status)
+{
+	struct block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
+
+	/* retire/close the current block */
+	if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
+		/*
+		 * Plug the case where copy_bits() is in progress on
+		 * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
+		 * have space to copy the pkt in the current block and
+		 * called prb_retire_current_block()
+		 *
+		 * We don't need to worry about the TMO case because
+		 * the timer-handler already handled this case.
+		 */
+		if (!(status & TP_STATUS_BLK_TMO)) {
+			while (atomic_read(&pkc->blk_fill_in_prog)) {
+				/* Waiting for skb_copy_bits to finish... */
+				cpu_relax();
+			}
+		}
+		prb_close_block(pkc, pbd, po, status);
+		return;
+	}
+
+	WARN(1, "ERROR-pbd[%d]:%p\n", pkc->kactive_blk_num, pbd);
+	dump_stack();
+	BUG();
+}
+
+static inline int prb_curr_blk_in_use(struct kbdq_core *pkc,
+				      struct block_desc *pbd)
+{
+	return TP_STATUS_USER & BLOCK_STATUS(pbd);
+}
+
+static inline int prb_queue_frozen(struct kbdq_core *pkc)
+{
+	return pkc->reset_pending_on_curr_blk;
+}
+
+static inline void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
+{
+	struct kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb);
+	atomic_dec(&pkc->blk_fill_in_prog);
+}
+
+static inline void prb_fill_rxhash(struct kbdq_core *pkc,
+			struct tpacket3_hdr *ppd)
+{
+	ppd->hv1.tp_rxhash = skb_get_rxhash(pkc->skb);
+}
+
+static inline void prb_clear_rxhash(struct kbdq_core *pkc,
+			struct tpacket3_hdr *ppd)
+{
+	ppd->hv1.tp_rxhash = 0;
+}
+
+static inline void prb_fill_vlan_info(struct kbdq_core *pkc,
+			struct tpacket3_hdr *ppd)
+{
+	if (vlan_tx_tag_present(pkc->skb)) {
+		ppd->hv1.tp_vlan_tci = vlan_tx_tag_get(pkc->skb);
+		ppd->tp_status = TP_STATUS_VLAN_VALID;
+	} else {
+		ppd->hv1.tp_vlan_tci = ppd->tp_status = 0;
+	}
+}
+
+static void prb_run_all_ft_ops(struct kbdq_core *pkc,
+			struct tpacket3_hdr *ppd)
+{
+	prb_fill_vlan_info(pkc, ppd);
+
+	if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
+		prb_fill_rxhash(pkc, ppd);
+	else
+		prb_clear_rxhash(pkc, ppd);
+}
+
+static inline void prb_fill_curr_block(char *curr, struct kbdq_core *pkc,
+				struct block_desc *pbd,
+				unsigned int len)
+{
+	struct tpacket3_hdr *ppd;
+
+	ppd  = (struct tpacket3_hdr *)curr;
+	ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
+	pkc->prev = curr;
+	pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
+	BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
+	BLOCK_NUM_PKTS(pbd) += 1;
+	atomic_inc(&pkc->blk_fill_in_prog);
+	prb_run_all_ft_ops(pkc, ppd);
+}
+
+/* Assumes caller has the sk->rx_queue.lock */
+static void *__packet_lookup_frame_in_block(struct packet_sock *po,
+					    struct sk_buff *skb,
+						int status,
+					    unsigned int len
+					    )
+{
+	struct kbdq_core *pkc;
+	struct block_desc *pbd;
+	char *curr, *end;
+
+	pkc = GET_PBDQC_FROM_RB(((struct packet_ring_buffer *)&po->rx_ring));
+	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
+
+	/* Queue is frozen when user space is lagging behind */
+	if (prb_queue_frozen(pkc)) {
+		/*
+		 * Check if that last block which caused the queue to freeze,
+		 * is still in_use by user-space.
+		 */
+		if (prb_curr_blk_in_use(pkc, pbd)) {
+			/* Can't record this packet */
+			return NULL;
+		} else {
+			/*
+			 * Ok, the block was released by user-space.
+			 * Now let's open that block.
+			 * opening a block also thaws the queue.
+			 * Thawing is a side effect.
+			 */
+			prb_open_block(pkc, pbd);
+		}
+	}
+
+	smp_mb();
+	curr = pkc->nxt_offset;
+	pkc->skb = skb;
+	end = (char *) ((char *)pbd + pkc->kblk_size);
+
+	/* first try the current block */
+	if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
+		prb_fill_curr_block(curr, pkc, pbd, len);
+		return (void *)curr;
+	}
+
+	/* Ok, close the current block */
+	prb_retire_current_block(pkc, po, 0);
+
+	/* Now, try to dispatch the next block */
+	curr = (char *)prb_dispatch_next_block(pkc, po);
+	if (curr) {
+		pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
+		prb_fill_curr_block(curr, pkc, pbd, len);
+		return (void *)curr;
+	}
+
+	/*
+	 * No free blocks are available.user_space hasn't caught up yet.
+	 * Queue was just frozen and now this packet will get dropped.
+	 */
+	return NULL;
+}
+
+static inline void *packet_current_rx_frame(struct packet_sock *po,
+					    struct sk_buff *skb,
+					    int status, unsigned int len)
+{
+	char *curr = NULL;
+	switch (po->tp_version) {
+	case TPACKET_V1:
+	case TPACKET_V2:
+		curr = packet_lookup_frame(po, &po->rx_ring,
+					po->rx_ring.head, status);
+		return curr;
+	case TPACKET_V3:
+		return __packet_lookup_frame_in_block(po, skb, status, len);
+	default:
+		WARN(1, "TPACKET version not supported\n");
+		BUG();
+		return 0;
+	}
+}
+
+static inline void *prb_lookup_block(struct packet_sock *po,
+				     struct packet_ring_buffer *rb,
+				     unsigned int previous,
+				     int status)
+{
+	struct kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb);
+	struct block_desc *pbd = GET_PBLOCK_DESC(pkc, previous);
+
+	if (status != BLOCK_STATUS(pbd))
+		return NULL;
+	return pbd;
+}
+
+static inline int prb_previous_blk_num(struct packet_ring_buffer *rb)
+{
+	unsigned int prev;
+	if (rb->prb_bdqc.kactive_blk_num)
+		prev = rb->prb_bdqc.kactive_blk_num-1;
+	else
+		prev = rb->prb_bdqc.knum_blocks-1;
+	return prev;
+}
+
+/* Assumes caller has held the rx_queue.lock */
+static inline void *__prb_previous_block(struct packet_sock *po,
+					 struct packet_ring_buffer *rb,
+					 int status)
+{
+	unsigned int previous = prb_previous_blk_num(rb);
+	return prb_lookup_block(po, rb, previous, status);
+}
+
+static inline void *packet_previous_rx_frame(struct packet_sock *po,
+					     struct packet_ring_buffer *rb,
+					     int status)
+{
+	if (po->tp_version <= TPACKET_V2)
+		return packet_previous_frame(po, rb, status);
+
+	return __prb_previous_block(po, rb, status);
+}
+
+static inline void packet_increment_rx_head(struct packet_sock *po,
+					    struct packet_ring_buffer *rb)
+{
+	switch (po->tp_version) {
+	case TPACKET_V1:
+	case TPACKET_V2:
+		return packet_increment_head(rb);
+	case TPACKET_V3:
+	default:
+		WARN(1, "TPACKET version not supported.\n");
+		BUG();
+		return;
+	}
+}
+
 static inline void *packet_previous_frame(struct packet_sock *po,
 		struct packet_ring_buffer *rb,
 		int status)
@@ -982,12 +1732,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	union {
 		struct tpacket_hdr *h1;
 		struct tpacket2_hdr *h2;
+		struct tpacket3_hdr *h3;
 		void *raw;
 	} h;
 	u8 *skb_head = skb->data;
 	int skb_len = skb->len;
 	unsigned int snaplen, res;
-	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
+	unsigned long status = TP_STATUS_USER;
 	unsigned short macoff, netoff, hdrlen;
 	struct sk_buff *copy_skb = NULL;
 	struct timeval tv;
@@ -1033,37 +1784,46 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 			po->tp_reserve;
 		macoff = netoff - maclen;
 	}
-
-	if (macoff + snaplen > po->rx_ring.frame_size) {
-		if (po->copy_thresh &&
-		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
-		    (unsigned)sk->sk_rcvbuf) {
-			if (skb_shared(skb)) {
-				copy_skb = skb_clone(skb, GFP_ATOMIC);
-			} else {
-				copy_skb = skb_get(skb);
-				skb_head = skb->data;
+	if (po->tp_version <= TPACKET_V2) {
+		if (macoff + snaplen > po->rx_ring.frame_size) {
+			if (po->copy_thresh &&
+				atomic_read(&sk->sk_rmem_alloc) + skb->truesize
+				< (unsigned)sk->sk_rcvbuf) {
+				if (skb_shared(skb)) {
+					copy_skb = skb_clone(skb, GFP_ATOMIC);
+				} else {
+					copy_skb = skb_get(skb);
+					skb_head = skb->data;
+				}
+				if (copy_skb)
+					skb_set_owner_r(copy_skb, sk);
 			}
-			if (copy_skb)
-				skb_set_owner_r(copy_skb, sk);
+			snaplen = po->rx_ring.frame_size - macoff;
+			if ((int)snaplen < 0)
+				snaplen = 0;
 		}
-		snaplen = po->rx_ring.frame_size - macoff;
-		if ((int)snaplen < 0)
-			snaplen = 0;
 	}
-
 	spin_lock(&sk->sk_receive_queue.lock);
-	h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
+	h.raw = packet_current_rx_frame(po, skb,
+					TP_STATUS_KERNEL, (macoff+snaplen));
 	if (!h.raw)
 		goto ring_is_full;
-	packet_increment_head(&po->rx_ring);
+	if (po->tp_version <= TPACKET_V2) {
+		packet_increment_rx_head(po, &po->rx_ring);
+	/*
+	 * LOSING will be reported till you read the stats,
+	 * because it's COR - Clear On Read.
+	 * Anyways, moving it for V1/V2 only as V3 doesn't need this
+	 * at packet level.
+	 */
+		if (po->stats.tp_drops)
+			status |= TP_STATUS_LOSING;
+	}
 	po->stats.tp_packets++;
 	if (copy_skb) {
 		status |= TP_STATUS_COPY;
 		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
 	}
-	if (!po->stats.tp_drops)
-		status &= ~TP_STATUS_LOSING;
 	spin_unlock(&sk->sk_receive_queue.lock);
 
 	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
@@ -1114,6 +1874,29 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		h.h2->tp_padding = 0;
 		hdrlen = sizeof(*h.h2);
 		break;
+	case TPACKET_V3:
+		/* tp_nxt_offset,vlan are already populated above.
+		 * So DONT clear those fields here
+		 */
+		h.h3->tp_status |= status;
+		h.h3->tp_len = skb->len;
+		h.h3->tp_snaplen = snaplen;
+		h.h3->tp_mac = macoff;
+		h.h3->tp_net = netoff;
+		if ((po->tp_tstamp & SOF_TIMESTAMPING_SYS_HARDWARE)
+				&& shhwtstamps->syststamp.tv64)
+			ts = ktime_to_timespec(shhwtstamps->syststamp);
+		else if ((po->tp_tstamp & SOF_TIMESTAMPING_RAW_HARDWARE)
+				&& shhwtstamps->hwtstamp.tv64)
+			ts = ktime_to_timespec(shhwtstamps->hwtstamp);
+		else if (skb->tstamp.tv64)
+			ts = ktime_to_timespec(skb->tstamp);
+		else
+			getnstimeofday(&ts);
+		h.h3->tp_sec  = ts.tv_sec;
+		h.h3->tp_nsec = ts.tv_nsec;
+		hdrlen = sizeof(*h.h3);
+		break;
 	default:
 		BUG();
 	}
@@ -1134,13 +1917,19 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	{
 		u8 *start, *end;
 
-		end = (u8 *)PAGE_ALIGN((unsigned long)h.raw + macoff + snaplen);
-		for (start = h.raw; start < end; start += PAGE_SIZE)
-			flush_dcache_page(pgv_to_page(start));
+		if (po->tp_version <= TPACKET_V2) {
+			end = (u8 *)PAGE_ALIGN((unsigned long)h.raw
+				+ macoff + snaplen);
+			for (start = h.raw; start < end; start += PAGE_SIZE)
+				flush_dcache_page(pgv_to_page(start));
+		}
 		smp_wmb();
 	}
 #endif
-	__packet_set_status(po, h.raw, status);
+	if (po->tp_version <= TPACKET_V2)
+		__packet_set_status(po, h.raw, status);
+	else
+		prb_clear_blk_fill_status(&po->rx_ring);
 
 	sk->sk_data_ready(sk, 0);
 
@@ -1631,7 +2420,7 @@ static int packet_release(struct socket *sock)
 	struct sock *sk = sock->sk;
 	struct packet_sock *po;
 	struct net *net;
-	struct tpacket_req req;
+	union tpacket_req_u req_u;
 
 	if (!sk)
 		return 0;
@@ -1654,13 +2443,13 @@ static int packet_release(struct socket *sock)
 
 	packet_flush_mclist(sk);
 
-	memset(&req, 0, sizeof(req));
+	memset(&req_u, 0, sizeof(req_u));
 
 	if (po->rx_ring.pg_vec)
-		packet_set_ring(sk, &req, 1, 0);
+		packet_set_ring(sk, &req_u, 1, 0);
 
 	if (po->tx_ring.pg_vec)
-		packet_set_ring(sk, &req, 1, 1);
+		packet_set_ring(sk, &req_u, 1, 1);
 
 	fanout_release(sk);
 
@@ -2280,15 +3069,27 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 	case PACKET_RX_RING:
 	case PACKET_TX_RING:
 	{
-		struct tpacket_req req;
+		union tpacket_req_u req_u;
+		int len;
 
-		if (optlen < sizeof(req))
+		switch (po->tp_version) {
+		case TPACKET_V1:
+		case TPACKET_V2:
+			len = sizeof(req_u.req);
+			break;
+		case TPACKET_V3:
+		default:
+			len = sizeof(req_u.req3);
+			break;
+		}
+		if (optlen < len)
 			return -EINVAL;
 		if (pkt_sk(sk)->has_vnet_hdr)
 			return -EINVAL;
-		if (copy_from_user(&req, optval, sizeof(req)))
+		if (copy_from_user(&req_u.req, optval, len))
 			return -EFAULT;
-		return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
+		return packet_set_ring(sk, &req_u, 0,
+			optname == PACKET_TX_RING);
 	}
 	case PACKET_COPY_THRESH:
 	{
@@ -2315,6 +3116,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 		switch (val) {
 		case TPACKET_V1:
 		case TPACKET_V2:
+		case TPACKET_V3:
 			po->tp_version = val;
 			return 0;
 		default:
@@ -2424,6 +3226,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 	struct packet_sock *po = pkt_sk(sk);
 	void *data;
 	struct tpacket_stats st;
+	union tpacket_stats_u st_u;
 
 	if (level != SOL_PACKET)
 		return -ENOPROTOOPT;
@@ -2436,15 +3239,27 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 
 	switch (optname) {
 	case PACKET_STATISTICS:
-		if (len > sizeof(struct tpacket_stats))
-			len = sizeof(struct tpacket_stats);
+		if (po->tp_version == TPACKET_V3) {
+			len = sizeof(struct tpacket_stats_v3);
+		} else {
+			if (len > sizeof(struct tpacket_stats))
+				len = sizeof(struct tpacket_stats);
+		}
 		spin_lock_bh(&sk->sk_receive_queue.lock);
-		st = po->stats;
+		if (po->tp_version == TPACKET_V3) {
+			memcpy(&st_u.stats3, &po->stats,
+			sizeof(struct tpacket_stats));
+			st_u.stats3.tp_freeze_q_cnt =
+			po->stats_u.stats3.tp_freeze_q_cnt;
+			st_u.stats3.tp_packets += po->stats.tp_drops;
+			data = &st_u.stats3;
+		} else {
+			st = po->stats;
+			st.tp_packets += st.tp_drops;
+			data = &st;
+		}
 		memset(&po->stats, 0, sizeof(st));
 		spin_unlock_bh(&sk->sk_receive_queue.lock);
-		st.tp_packets += st.tp_drops;
-
-		data = &st;
 		break;
 	case PACKET_AUXDATA:
 		if (len > sizeof(int))
@@ -2485,6 +3300,9 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 		case TPACKET_V2:
 			val = sizeof(struct tpacket2_hdr);
 			break;
+		case TPACKET_V3:
+			val = sizeof(struct tpacket3_hdr);
+			break;
 		default:
 			return -EINVAL;
 		}
@@ -2641,7 +3459,8 @@ static unsigned int packet_poll(struct file *file, struct socket *sock,
 
 	spin_lock_bh(&sk->sk_receive_queue.lock);
 	if (po->rx_ring.pg_vec) {
-		if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
+		if (!packet_previous_rx_frame(po, &po->rx_ring,
+			TP_STATUS_KERNEL))
 			mask |= POLLIN | POLLRDNORM;
 	}
 	spin_unlock_bh(&sk->sk_receive_queue.lock);
@@ -2760,7 +3579,7 @@ out_free_pgvec:
 	goto out;
 }
 
-static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
+static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 		int closing, int tx_ring)
 {
 	struct pgv *pg_vec = NULL;
@@ -2769,7 +3588,15 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 	struct packet_ring_buffer *rb;
 	struct sk_buff_head *rb_queue;
 	__be16 num;
-	int err;
+	int err = -EINVAL;
+	/* Added to avoid minimal code churn */
+	struct tpacket_req *req = &req_u->req;
+
+	/* Opening a Tx-ring is NOT supported in TPACKET_V3 */
+	if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
+		WARN(1, "Tx-ring is not supported.\n");
+		goto out;
+	}
 
 	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
 	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
@@ -2795,6 +3622,9 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 		case TPACKET_V2:
 			po->tp_hdrlen = TPACKET2_HDRLEN;
 			break;
+		case TPACKET_V3:
+			po->tp_hdrlen = TPACKET3_HDRLEN;
+			break;
 		}
 
 		err = -EINVAL;
@@ -2820,6 +3650,17 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 		pg_vec = alloc_pg_vec(req, order);
 		if (unlikely(!pg_vec))
 			goto out;
+		switch (po->tp_version) {
+		case TPACKET_V3:
+		/* Transmit path is not supported. We checked
+		 * it above but just being paranoid
+		 */
+			if (!tx_ring)
+				init_prb_bdqc(po, rb, pg_vec, req_u, tx_ring);
+				break;
+		default:
+			break;
+		}
 	}
 	/* Done */
 	else {
@@ -2872,7 +3713,11 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
 		register_prot_hook(sk);
 	}
 	spin_unlock(&po->bind_lock);
-
+	if (closing && (po->tp_version > TPACKET_V2)) {
+		/* Because we don't support block-based V3 on tx-ring */
+		if (!tx_ring)
+			prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
+	}
 	release_sock(sk);
 
 	if (pg_vec)
-- 
1.7.5.2


^ permalink raw reply related

* [PATCH net-next v5 1/2] af-packet: Added TPACKET_V3 headers.
From: Chetan Loke @ 2011-08-19 20:18 UTC (permalink / raw)
  To: netdev, davem; +Cc: Chetan Loke
In-Reply-To: <1313785096-911-1-git-send-email-loke.chetan@gmail.com>

Added TPACKET_V3 definitions.

Signed-off-by: Chetan Loke <loke.chetan@gmail.com>
---
 include/linux/if_packet.h |  119 +++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 119 insertions(+), 0 deletions(-)

diff --git a/include/linux/if_packet.h b/include/linux/if_packet.h
index c148606..5926d59 100644
--- a/include/linux/if_packet.h
+++ b/include/linux/if_packet.h
@@ -61,6 +61,17 @@ struct tpacket_stats {
 	unsigned int	tp_drops;
 };
 
+struct tpacket_stats_v3 {
+	unsigned int	tp_packets;
+	unsigned int	tp_drops;
+	unsigned int	tp_freeze_q_cnt;
+};
+
+union tpacket_stats_u {
+	struct tpacket_stats stats1;
+	struct tpacket_stats_v3 stats3;
+};
+
 struct tpacket_auxdata {
 	__u32		tp_status;
 	__u32		tp_len;
@@ -78,6 +89,7 @@ struct tpacket_auxdata {
 #define TP_STATUS_LOSING	0x4
 #define TP_STATUS_CSUMNOTREADY	0x8
 #define TP_STATUS_VLAN_VALID   0x10 /* auxdata has valid tp_vlan_tci */
+#define TP_STATUS_BLK_TMO	0x20
 
 /* Tx ring - header status */
 #define TP_STATUS_AVAILABLE	0x0
@@ -85,6 +97,9 @@ struct tpacket_auxdata {
 #define TP_STATUS_SENDING	0x2
 #define TP_STATUS_WRONG_FORMAT	0x4
 
+/* Rx ring - feature request bits */
+#define TP_FT_REQ_FILL_RXHASH	0x1
+
 struct tpacket_hdr {
 	unsigned long	tp_status;
 	unsigned int	tp_len;
@@ -111,11 +126,100 @@ struct tpacket2_hdr {
 	__u16		tp_padding;
 };
 
+struct hdr_variant1 {
+	__u32	tp_rxhash;
+	__u32	tp_vlan_tci;
+};
+
+struct tpacket3_hdr {
+	__u32		tp_next_offset;
+	__u32		tp_sec;
+	__u32		tp_nsec;
+	__u32		tp_snaplen;
+	__u32		tp_len;
+	__u32		tp_status;
+	__u16		tp_mac;
+	__u16		tp_net;
+	/* pkt_hdr variants */
+	union {
+		struct hdr_variant1 hv1;
+	};
+};
+
+struct bd_ts {
+	unsigned int ts_sec;
+	union {
+		unsigned int ts_usec;
+		unsigned int ts_nsec;
+	};
+};
+
+struct hdr_v1 {
+	__u32	block_status;
+	__u32	num_pkts;
+	__u32	offset_to_first_pkt;
+
+	/* Number of valid bytes (including padding)
+	 * blk_len <= tp_block_size
+	 */
+	__u32	blk_len;
+
+	/*
+	 * Quite a few uses of sequence number:
+	 * 1. Make sure cache flush etc worked.
+	 *    Well, one can argue - why not use the increasing ts below?
+	 *    But look at 2. below first.
+	 * 2. When you pass around blocks to other user space decoders,
+	 *    you can see which blk[s] is[are] outstanding etc.
+	 * 3. Validate kernel code.
+	 */
+	aligned_u64	seq_num;
+
+	/*
+	 * ts_last_pkt:
+	 *
+	 * Case 1.	Block has 'N'(N >=1) packets and TMO'd(timed out)
+	 *		ts_last_pkt == 'time-stamp of last packet' and NOT the
+	 *		time when the timer fired and the block was closed.
+	 *		By providing the ts of the last packet we can absolutely
+	 *		guarantee that time-stamp wise, the first packet in the
+	 *		next block will never precede the last packet of the
+	 *		previous block.
+	 * Case 2.	Block has zero packets and TMO'd
+	 *		ts_last_pkt = time when the timer fired and the block
+	 *		was closed.
+	 * Case 3.	Block has 'N' packets and NO TMO.
+	 *		ts_last_pkt = time-stamp of the last pkt in the block.
+	 *
+	 * ts_first_pkt:
+	 *		Is always the time-stamp when the block was opened.
+	 *		Case a)	ZERO packets
+	 *			No packets to deal with but atleast you know the
+	 *			time-interval of this block.
+	 *		Case b) Non-zero packets
+	 *			Use the ts of the first packet in the block.
+	 *
+	 */
+	struct bd_ts	ts_first_pkt, ts_last_pkt;
+};
+
+union bd_header_u {
+	struct hdr_v1 bh1;
+};
+
+struct block_desc {
+	__u32 version;
+	__u32 offset_to_priv;
+	union bd_header_u hdr;
+};
+
 #define TPACKET2_HDRLEN		(TPACKET_ALIGN(sizeof(struct tpacket2_hdr)) + sizeof(struct sockaddr_ll))
+#define TPACKET3_HDRLEN		(TPACKET_ALIGN(sizeof(struct tpacket3_hdr)) + sizeof(struct sockaddr_ll))
 
 enum tpacket_versions {
 	TPACKET_V1,
 	TPACKET_V2,
+	TPACKET_V3
 };
 
 /*
@@ -138,6 +242,21 @@ struct tpacket_req {
 	unsigned int	tp_frame_nr;	/* Total number of frames */
 };
 
+struct tpacket_req3 {
+	unsigned int	tp_block_size;	/* Minimal size of contiguous block */
+	unsigned int	tp_block_nr;	/* Number of blocks */
+	unsigned int	tp_frame_size;	/* Size of frame */
+	unsigned int	tp_frame_nr;	/* Total number of frames */
+	unsigned int	tp_retire_blk_tov; /* timeout in msecs */
+	unsigned int	tp_sizeof_priv; /* offset to private data area */
+	unsigned int	tp_feature_req_word;
+};
+
+union tpacket_req_u {
+	struct tpacket_req	req;
+	struct tpacket_req3	req3;
+};
+
 struct packet_mreq {
 	int		mr_ifindex;
 	unsigned short	mr_type;
-- 
1.7.5.2


^ permalink raw reply related

* [PATCH net-next v5 0/2] af-packet: Enhance af-packet to provide a flexible mmap ring buffer scheme.
From: Chetan Loke @ 2011-08-19 20:18 UTC (permalink / raw)
  To: netdev, davem; +Cc: Chetan Loke


Changes in v5:
1) Provide accurate patch description.			(Dave Miller)
   Tightened up patch descriptions.
2) Replace indirect calls with inline tests.		(Dave Miller)
3) Use distinct subject-line per patch.			(Dave Miller)

Changes in v4:
1) Used ALIGN macro                                     (Joe Perches)
2) Deleted duplicate field                              (Eric Dumazet)
3) Re-aligned tpacket fields for disk-capture

Changes in v3:
1) Stripped __packed__ attribute.                       (Dave Miller)
   Replaced with aligned_u64 and padding.
2) Added 'feature_request_word'.
3) Added rx_hash field to the v3-header.                (Chetan L)

Changes in v2:

1) Aligned bdqc members, pr_err to WARN, sob email      (Joe Perches)
2) Added tp_padding                                     (Eric Dumazet)
3) Nuked useless ;) white space                         (Stephen H)
4) Use __u types in headers                             (Ben Hutchings)
5) Added field for creating private area                (Chetan Loke)

Enhanced af-packet to provide a flexible mmap ring buffer scheme by:
A) eliminating fixed frame-size requirement.
B) providing block-level read/poll.

Benefits:
  B1) ~15-20% reduction in cpu-usage.
  B2) ~20% increase in packet capture rate.
  B3) ~2x  increase in packet density(higher capture visibility).
  B4) Capture entire packet payload.
  B5) Captures 99% 64-byte traffic as seen by the kernel.

Detailed description of the enhancement-need/test-setup/etc can be viewed at:
http://thread.gmane.org/gmane.linux.kernel/1158216

Test-suite:
git://lolpcap.git.sourceforge.net/gitroot/lolpcap/lolpcap


----------------------------------------

 include/linux/if_packet.h |  119 ++++++
 net/packet/af_packet.c    |  937 ++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 1010 insertions(+), 46 deletions(-)

-- 
1.7.5.2


^ permalink raw reply

* Re: [PATCH] net: add Documentation/networking/scaling.txt
From: Will de Bruijn @ 2011-08-19 19:50 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: Rick Jones, Tom Herbert, rdunlap, linux-doc, davem, netdev
In-Reply-To: <1313524404.2725.50.camel@bwh-desktop>

> As I'm sure you're aware, there is often a trade-off between throughput
> and latency.  It might be useful to provide some guidelines for
> optimising each of the above.

The suggested configuration section on RSS already gives some vague
heuristics. In general, I hesitate to write down best guesses, and I
lack the hard experimental data that would make such advise more
sound.

> The default affinity for most IRQs is all-CPUs.  At least on x86, that
> really means CPU 0 only, so far as I can see.

Yes, I believe so, but since the specified behavior is all-CPUs and
other architectures are free to implement this differently, I prefer
to leave the weaker statement. Also, even on x86 there is the
possibility of migration, so starting on CPU0 is not equivalent to
having the affinity set to that CPU.

>> Requires that ntuple filtering be enabled?
> [...]
>
> As a matter of fact, n-tuple filtering is enabled by default where
> available.  So it might actually make more sense to say that RFS
> acceleration can be *disabled* by disabling n-tuple filtering using
> ethtool.

I didn't know (or check, clearly). I'll clarify that in a next
revision. Since the text is still factually correct and I don't want
to spam the list with frequent minor changes, I'll batch them.

Thanks, Ben.

^ permalink raw reply

* Re: Linux vs FreeBSD Which is correct.
From: Stephen Clark @ 2011-08-19 19:10 UTC (permalink / raw)
  To: Chris Friesen
  Cc: Pascal Hambourg, Rémi Denis-Courmont,
	Linux Kernel Network Developers
In-Reply-To: <4E4E8CEE.102@genband.com>

On 08/19/2011 12:18 PM, Chris Friesen wrote:
> On 08/18/2011 06:42 AM, Stephen Clark wrote:
>
>> I guess I don't really understand what reverse path filter stuff is all
>> about, much less making it weaker.
>> But using 2 made the pings responses be seen.
>
> It's described in RFC3704.  The idea is to block spoofed packets.
>
> From Documentation/networking/ip-sysctl.txt:
>
> rp_filter - INTEGER
> 0 - No source validation.
> 1 - Strict mode as defined in RFC3704 Strict Reverse Path
>     Each incoming packet is tested against the FIB and if the interface
>     is not the best reverse path the packet check will fail.
>     By default failed packets are discarded.
> 2 - Loose mode as defined in RFC3704 Loose Reverse Path
>     Each incoming packet's source address is also tested against the FIB
>     and if the source address is not reachable via any interface
>     the packet check will fail.
>
>    Current recommended practice in RFC3704 is to enable strict mode
>    to prevent IP spoofing from DDos attacks. If using asymmetric routing
>    or other complicated routing, then loose mode is recommended.
>
>    The max value from conf/{all,interface}/rp_filter is used
>    when doing source validation on the {interface}.
>
>    Default value is 0. Note that some distributions enable it
>    in startup scripts.
>
>
>
Thanks for taking the time to explain this. Much appreciated.


-- 

"They that give up essential liberty to obtain temporary safety,
deserve neither liberty nor safety."  (Ben Franklin)

"The course of history shows that as a government grows, liberty
decreases."  (Thomas Jefferson)




^ permalink raw reply

* [RFC][PATCH] Failed neighbors attached to routes are not released
From: Guy Yur @ 2011-08-19 18:53 UTC (permalink / raw)
  To: netdev

Hi,

The issue I am seeing is with a neighbor used as a gateway in a route,
when the neighbor gets nud FAILED it will not be removed from the
neighbor cache.
The reference count for the neighbor remains > 1
when neigh_periodic_work() is called.

Issue noticed with IPv6 neighbors on Arch Linux with kernel 3.0.3
kernel config includes CONFIG_IPV6_ROUTER_PREF

The problem affects routing when the neighbor loses connectivity and returns.

Scenario: Using a default route and a static route through different interfaces.
When the neighbor gateway of the static route goes down the traffic
will move to the default gateway as expected.
Once the static route neighbor comes back up it is not asked for
neighbor solicitation
because the route is marked as FAILED and the traffic will continue to
pass through the default gateway.

Steps to reproduce the route not being removed:
1. add an IPv6 address on an interface
2. add a route to a network through a gateway on the interface's network
3. make sure the gateway address is not reachable
4. ping6 a host in the route network
5. "ip -6 nei" will show the gateway neighbor as FAILED and it won't be released

Steps to reproduce the routing problem:
1. client and two gateway machines (A and B)
2. on the client define a static route through A and a default route through B
3. disconnect eth on A
4. ping6 a host in the network that should go through A
   after a while the traffic will move through B which is the default route
5. reconnect eth on A
6. ping6 a host in the network again, the traffic will still go through B
   "ip -6 nei" on the client will still show A as FAILED


Patch to change the nud state to NONE if the reference count > 1
allowing the neighbor to be released in a future pass.

--- linux/net/core/neighbour.c.orig	2011-08-19 13:15:35.041524169 +0300
+++ linux/net/core/neighbour.c	2011-08-19 18:19:07.271276096 +0300
@@ -802,14 +802,16 @@ static void neigh_periodic_work(struct w
 			if (time_before(n->used, n->confirmed))
 				n->used = n->confirmed;

-			if (atomic_read(&n->refcnt) == 1 &&
-			    (state == NUD_FAILED ||
+			if ((state == NUD_FAILED ||
 			     time_after(jiffies, n->used + n->parms->gc_staletime))) {
-				*np = n->next;
-				n->dead = 1;
-				write_unlock(&n->lock);
-				neigh_cleanup_and_release(n);
-				continue;
+				if (atomic_read(&n->refcnt) == 1) {
+					*np = n->next;
+					n->dead = 1;
+					write_unlock(&n->lock);
+					neigh_cleanup_and_release(n);
+					continue;
+				} else
+					n->nud_state = NUD_NONE;
 			}
 			write_unlock(&n->lock);

^ permalink raw reply

* Re: [PATCH 67/75] et131x: convert to SKB paged frag API.
From: Mark Einon @ 2011-08-19 17:59 UTC (permalink / raw)
  To: Ian Campbell; +Cc: netdev, linux-kernel, Greg Kroah-Hartman, Alan Cox, devel
In-Reply-To: <1313760467-8598-67-git-send-email-ian.campbell@citrix.com>

On 19 August 2011 14:27, Ian Campbell <ian.campbell@citrix.com> wrote:
> Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
> Cc: Greg Kroah-Hartman <gregkh@suse.de>
> Cc: Mark Einon <mark.einon@gmail.com>
> Cc: Alan Cox <alan@linux.intel.com>
> Cc: devel@driverdev.osuosl.org
> Cc: linux-kernel@vger.kernel.org
> ---
>  drivers/staging/et131x/et1310_tx.c |   12 ++++++------
>  1 files changed, 6 insertions(+), 6 deletions(-)

Just a heads up - This patch is going to create some merge issues
if/when the outstanding changes I've sent for inclusion in the staging
tree get merged back.
Should be fairly trivial to fix though.

Cheers,
Mark

>
> diff --git a/drivers/staging/et131x/et1310_tx.c b/drivers/staging/et131x/et1310_tx.c
> index 8fb3051..03e7a4e 100644
> --- a/drivers/staging/et131x/et1310_tx.c
> +++ b/drivers/staging/et131x/et1310_tx.c
> @@ -519,12 +519,12 @@ static int nic_send_packet(struct et131x_adapter *etdev, struct tcb *tcb)
>                         * returned by pci_map_page() is always 32-bit
>                         * addressable (as defined by the pci/dma subsystem)
>                         */
> -                       desc[frag++].addr_lo =
> -                           pci_map_page(etdev->pdev,
> -                                        frags[i - 1].page,
> -                                        frags[i - 1].page_offset,
> -                                        frags[i - 1].size,
> -                                        PCI_DMA_TODEVICE);
> +                       desc[frag++].addr_lo = skb_frag_dma_map(
> +                                                       &etdev->pdev->dev,
> +                                                       &frags[i - 1],
> +                                                       0,
> +                                                       frags[i - 1].size,
> +                                                       PCI_DMA_TODEVICE);
>                }
>        }
>
> --
> 1.7.2.5
>
>

^ permalink raw reply

* Re: [PATCH] net: add Documentation/networking/scaling.txt
From: Rick Jones @ 2011-08-19 17:38 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: Tom Herbert, rdunlap, linux-doc, davem, netdev, willemb
In-Reply-To: <1313524404.2725.50.camel@bwh-desktop>

On 08/16/2011 12:53 PM, Ben Hutchings wrote:
> On Mon, 2011-08-01 at 11:49 -0700, Rick Jones wrote:
>> On 07/31/2011 11:56 PM, Tom Herbert wrote:
>>> Describes RSS, RPS, RFS, accelerated RFS, and XPS.
>>>
>>> Signed-off-by: Tom Herbert<therbert@google.com>
>>> ---
>>>    Documentation/networking/scaling.txt |  346 ++++++++++++++++++++++++++++++++++
>>>    1 files changed, 346 insertions(+), 0 deletions(-)
>>>    create mode 100644 Documentation/networking/scaling.txt
>>>
>>> diff --git a/Documentation/networking/scaling.txt b/Documentation/networking/scaling.txt
>>> new file mode 100644
>>> index 0000000..aa51f0f
>>> --- /dev/null
>>> +++ b/Documentation/networking/scaling.txt
>>> @@ -0,0 +1,346 @@
>>> +Scaling in the Linux Networking Stack
>>> +
>>> +
>>> +Introduction
>>> +============
>>> +
>>> +This document describes a set of complementary techniques in the Linux
>>> +networking stack to increase parallelism and improve performance (in
>>> +throughput, latency, CPU utilization, etc.) for multi-processor systems.
>>
>> Why not just leave-out the parenthetical lest some picky pedant find a
>> specific example where either of those three are not improved?
>
> As I'm sure you're aware, there is often a trade-off between throughput
> and latency.  It might be useful to provide some guidelines for
> optimising each of the above.

Yes.  I just worry about taking *too* many steps down the slippery slope 
and ending-up ballooning into a broad systems tuning tutorial.

rick

^ permalink raw reply

* Re: A question about MTUs and TCP stack
From: Rick Jones @ 2011-08-19 17:33 UTC (permalink / raw)
  To: Pawan Singh; +Cc: netdev@vger.kernel.org
In-Reply-To: <A55791AD2520D54CA92150D16630D03B36116A5B@VA3DIAXVS1F1.RED001.local>

On 08/16/2011 11:33 AM, Pawan Singh wrote:
> Hi
>
> I am posting this question to "netdev" mailing list because I could
> no longer find "linux-net" mailing list.
>
> I find that the Linux TCP stack consumes huge amount of CPU if the
> MTU of an interface is set to 2400 and it is receiving 1000 byte
> Ethernet packets. On the other hand, if the MTU is set to 1500, the
> CPU consumption is reduced drastically. Increased CPU usage causes
> network throughput to drop considerably (from 800-900 Mbps to 200
> Mbps). My kernel version is fedora core 6 and we are using 1 Gig NICs
> (Intel 82546GB and Broadcom NetXtreme BCM5721):
>
> Linux he7700-tg 2.6.22.14-72.fc6 #1 SMP Wed Nov 21 14:10:25 EST 2007
> x86_64 x86_64 x86_64 GNU/Linux

Based on my experience, you are best off using much more contemporary 
kernels when bringing things to the attention of the netdev list.  The 
list is much more concerned with current bits than those from years past.

Issues with bits from years past are probably best addressed towards the 
distro(s) using them, if any.

Looking into getting a oprofile profile of the behaviour, particularly 
if it still exists in a contemporary kernel would probably be a good idea.

Also good ideas would be checking the netstat statistics before and 
after a test run, and looking at packet traces.  You want to look for 
things like stats on collapsing buffers and packets being dropped.

rick jones

>
> I do not know the TCP buffer management internals and how they are
> affected by MTU. Is there some FAQ/information online or do I have to
> open up the source code and try to identify the source of the
> problem. I guess I can also try newer versions of the kernel and see
> if the issue has been resolved.
>
> -Pawan

^ permalink raw reply

* Re: [Patch] Scm: Remove unnecessary pid & credential references in Unix socket's send and receive path
From: Tim Chen @ 2011-08-19 16:44 UTC (permalink / raw)
  To: David Miller
  Cc: eric.dumazet, viro, ebiederm, ak, matt.fleming, linux-kernel,
	netdev
In-Reply-To: <20110818.220432.1711182086933489213.davem@davemloft.net>

On Thu, 2011-08-18 at 22:04 -0700, David Miller wrote:
> From: Tim Chen <tim.c.chen@linux.intel.com>
> Date: Wed, 17 Aug 2011 16:56:14 -0700
> 
> > +static __inline__ void scm_release(struct scm_cookie *scm)
> > +{
> > +	/* keep ref on pid and cred */
> > +	scm->pid = NULL;
> > +	scm->cred = NULL;
> > +	if (scm && scm->fp)
> > +		__scm_destroy(scm);
> > +}
> 
> After dereferencing scm-> already, it seems a big redundant to test
> it subsequently against NULL.

Thanks for catching it.  I've updated the patch per your comment.

Tim

-----------------

Patch series 109f6e39..7361c36c back in 2.6.36 added functionality to
allow credentials to work across pid namespaces for packets sent via
UNIX sockets.  However, the atomic reference counts on pid and
credentials caused plenty of cache bouncing when there are numerous
threads of the same pid sharing a UNIX socket.  This patch mitigates the
problem by eliminating extraneous reference counts on pid and
credentials on both send and receive path of UNIX sockets. I found a 2x
improvement in hackbench's threaded case.

On the receive path in unix_dgram_recvmsg, currently there is an
increment of reference count on pid and credentials in scm_set_cred.
Then there are two decrement of the reference counts.  Once in scm_recv
and once when skb_free_datagram call skb->destructor function
unix_destruct_scm.  One pair of increment and decrement of ref count on
pid and credentials can be eliminated from the receive path.  Until we
destroy the skb, we already set a reference when we created the skb on
the send side.

On the send path, there are two increments of ref count on pid and
credentials, once in scm_send and once in unix_scm_to_skb.  Then there
is a decrement of the reference counts in scm_destroy's call to
scm_destroy_cred at the end of unix_dgram_send* functions.   One pair of
increment and decrement of the reference counts can be removed so we
only need to increment the ref counts once.

By incorporating these changes, for hackbench running on a 4 socket
NHM-EX machine with 40 cores, the execution of hackbench on
50 groups of 20 threads sped up by factor of 2.

Hackbench command used for testing:
./hackbench 50 thread 2000

Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
---
diff --git a/include/net/scm.h b/include/net/scm.h
index 745460f..68e1e48 100644
--- a/include/net/scm.h
+++ b/include/net/scm.h
@@ -53,6 +53,14 @@ static __inline__ void scm_set_cred(struct scm_cookie *scm,
 	cred_to_ucred(pid, cred, &scm->creds);
 }
 
+static __inline__ void scm_set_cred_noref(struct scm_cookie *scm,
+				    struct pid *pid, const struct cred *cred)
+{
+	scm->pid  = pid;
+	scm->cred = cred;
+	cred_to_ucred(pid, cred, &scm->creds);
+}
+
 static __inline__ void scm_destroy_cred(struct scm_cookie *scm)
 {
 	put_pid(scm->pid);
@@ -70,6 +78,15 @@ static __inline__ void scm_destroy(struct scm_cookie *scm)
 		__scm_destroy(scm);
 }
 
+static __inline__ void scm_release(struct scm_cookie *scm)
+{
+	/* keep ref on pid and cred */
+	scm->pid = NULL;
+	scm->cred = NULL;
+	if (scm->fp)
+		__scm_destroy(scm);
+}
+
 static __inline__ int scm_send(struct socket *sock, struct msghdr *msg,
 			       struct scm_cookie *scm)
 {
@@ -108,15 +125,14 @@ static __inline__ void scm_recv(struct socket *sock, struct msghdr *msg,
 	if (!msg->msg_control) {
 		if (test_bit(SOCK_PASSCRED, &sock->flags) || scm->fp)
 			msg->msg_flags |= MSG_CTRUNC;
-		scm_destroy(scm);
+		if (scm && scm->fp)
+			__scm_destroy(scm);
 		return;
 	}
 
 	if (test_bit(SOCK_PASSCRED, &sock->flags))
 		put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(scm->creds), &scm->creds);
 
-	scm_destroy_cred(scm);
-
 	scm_passec(sock, msg, scm);
 
 	if (!scm->fp)
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 0722a25..bd85c06 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1382,11 +1382,17 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
 	return max_level;
 }
 
-static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
+static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb,
+			   bool send_fds, bool ref)
 {
 	int err = 0;
-	UNIXCB(skb).pid  = get_pid(scm->pid);
-	UNIXCB(skb).cred = get_cred(scm->cred);
+	if (ref) {
+		UNIXCB(skb).pid  = get_pid(scm->pid);
+		UNIXCB(skb).cred = get_cred(scm->cred);
+	} else {
+		UNIXCB(skb).pid  = scm->pid;
+		UNIXCB(skb).cred = scm->cred;
+	}
 	UNIXCB(skb).fp = NULL;
 	if (scm->fp && send_fds)
 		err = unix_attach_fds(scm, skb);
@@ -1411,7 +1417,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	int namelen = 0; /* fake GCC */
 	int err;
 	unsigned hash;
-	struct sk_buff *skb;
+	struct sk_buff *skb = NULL;
 	long timeo;
 	struct scm_cookie tmp_scm;
 	int max_level;
@@ -1452,7 +1458,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	if (skb == NULL)
 		goto out;
 
-	err = unix_scm_to_skb(siocb->scm, skb, true);
+	err = unix_scm_to_skb(siocb->scm, skb, true, false);
 	if (err < 0)
 		goto out_free;
 	max_level = err + 1;
@@ -1548,7 +1554,7 @@ restart:
 	unix_state_unlock(other);
 	other->sk_data_ready(other, len);
 	sock_put(other);
-	scm_destroy(siocb->scm);
+	scm_release(siocb->scm);
 	return len;
 
 out_unlock:
@@ -1558,7 +1564,8 @@ out_free:
 out:
 	if (other)
 		sock_put(other);
-	scm_destroy(siocb->scm);
+	if (skb == NULL)
+		scm_destroy(siocb->scm);
 	return err;
 }
 
@@ -1570,7 +1577,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	struct sock *sk = sock->sk;
 	struct sock *other = NULL;
 	int err, size;
-	struct sk_buff *skb;
+	struct sk_buff *skb = NULL;
 	int sent = 0;
 	struct scm_cookie tmp_scm;
 	bool fds_sent = false;
@@ -1635,11 +1642,14 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		size = min_t(int, size, skb_tailroom(skb));
 
 
-		/* Only send the fds in the first buffer */
-		err = unix_scm_to_skb(siocb->scm, skb, !fds_sent);
+		/* Only send the fds and no ref to pid in the first buffer */
+		if (fds_sent)
+			err = unix_scm_to_skb(siocb->scm, skb, !fds_sent, true);
+		else
+			err = unix_scm_to_skb(siocb->scm, skb, !fds_sent, false);
 		if (err < 0) {
 			kfree_skb(skb);
-			goto out_err;
+			goto out;
 		}
 		max_level = err + 1;
 		fds_sent = true;
@@ -1647,7 +1657,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
 		if (err) {
 			kfree_skb(skb);
-			goto out_err;
+			goto out;
 		}
 
 		unix_state_lock(other);
@@ -1664,7 +1674,10 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		sent += size;
 	}
 
-	scm_destroy(siocb->scm);
+	if (skb)
+		scm_release(siocb->scm);
+	else
+		scm_destroy(siocb->scm);
 	siocb->scm = NULL;
 
 	return sent;
@@ -1677,7 +1690,9 @@ pipe_err:
 		send_sig(SIGPIPE, current, 0);
 	err = -EPIPE;
 out_err:
-	scm_destroy(siocb->scm);
+	if (skb == NULL)
+		scm_destroy(siocb->scm);
+out:
 	siocb->scm = NULL;
 	return sent ? : err;
 }
@@ -1781,7 +1796,7 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
 		siocb->scm = &tmp_scm;
 		memset(&tmp_scm, 0, sizeof(tmp_scm));
 	}
-	scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred);
+	scm_set_cred_noref(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred);
 	unix_set_secdata(siocb->scm, skb);
 
 	if (!(flags & MSG_PEEK)) {
@@ -1943,7 +1958,8 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 			}
 		} else {
 			/* Copy credentials */
-			scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred);
+			scm_set_cred_noref(siocb->scm, UNIXCB(skb).pid,
+					   UNIXCB(skb).cred);
 			check_creds = 1;
 		}
 



^ permalink raw reply related

* Re: [PATCH] virtio-net: Read MAC only after initializing MSI-X
From: Sasha Levin @ 2011-08-19 16:33 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: linux-kernel, Rusty Russell, virtualization, netdev, kvm
In-Reply-To: <20110819152335.GA19489@redhat.com>

On Fri, 2011-08-19 at 18:23 +0300, Michael S. Tsirkin wrote:
> On Sat, Aug 13, 2011 at 11:51:01AM +0300, Sasha Levin wrote:
> > The MAC of a virtio-net device is located at the first field of the device
> > specific header. This header is located at offset 20 if the device doesn't
> > support MSI-X or offset 24 if it does.
> > 
> > Current code in virtnet_probe() used to probe the MAC before checking for
> > MSI-X, which means that the read was always made from offset 20 regardless
> > of whether MSI-X in enabled or not.
> > 
> > This patch moves the MAC probe to after the detection of whether MSI-X is
> > enabled. This way the MAC will be read from offset 24 if the device indeed
> > supports MSI-X.
> > 
> > Cc: Rusty Russell <rusty@rustcorp.com.au>
> > Cc: Michael S. Tsirkin <mst@redhat.com>
> > Cc: virtualization@lists.linux-foundation.org
> > Cc: netdev@vger.kernel.org
> > Cc: kvm@vger.kernel.org
> > Signed-off-by: Sasha Levin <levinsasha928@gmail.com>
> 
> I am not sure I see a bug in virtio: the config pace layout simply
> changes as msix is enabled and disabled (and if you look at the latest
> draft, also on whether 64 bit features are enabled).
> It doesn't depend on msix capability being present in device.
> 
> The spec seems to be explicit enough:
> 	If MSI-X is enabled for the device, two additional fields immediately
> 	follow this header.
> 
> So I'm guessing the bug is in kvm tools which assume
> same layout for when msix is enabled and disabled.
> qemu-kvm seems to do the right thing so the device
> seems to get the correct mac.

We assumed that PCI config space has a static layout like most other
devices. Having a behavior of "First bit 20 does something, but after
enabling MSI-X it does something completely different" sounds strange.

I'm wondering why offsets of the config structure change during run time
and are not statically defined when the device is started.

It's not like VIRTIO_F_FEATURES_HI can be disabled after it was enabled,
or MSI-X can be simply disabled during run time.

Maybe this is better solved by copying the way it was done in PCI itself
with capability linked list?

-- 

Sasha.


^ permalink raw reply

* [PATCH] net: add APIs for manipulating skb page fragments.
From: Ian Campbell @ 2011-08-19 16:25 UTC (permalink / raw)
  To: netdev
  Cc: linux-kernel, Ian Campbell, David S. Miller, Eric Dumazet,
	Michał Mirosław

The primary aim is to add skb_frag_(ref|unref) in order to remove the use of
bare get/put_page on SKB pages fragments and to isolate users from subsequent
changes to the skb_frag_t data structure.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: "Michał Mirosław" <mirq-linux@rere.qmqm.pl>
Cc: netdev@vger.kernel.org
---
 include/linux/skbuff.h |  170 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 files changed, 168 insertions(+), 2 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f902c33..a4b3647 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -29,6 +29,7 @@
 #include <linux/rcupdate.h>
 #include <linux/dmaengine.h>
 #include <linux/hrtimer.h>
+#include <linux/dma-mapping.h>
 
 /* Don't change this without changing skb_csum_unnecessary! */
 #define CHECKSUM_NONE 0
@@ -1127,14 +1128,47 @@ static inline int skb_pagelen(const struct sk_buff *skb)
 	return len + skb_headlen(skb);
 }
 
-static inline void skb_fill_page_desc(struct sk_buff *skb, int i,
-				      struct page *page, int off, int size)
+/**
+ * __skb_fill_page_desc - initialise a paged fragment in an skb
+ * @skb: buffer containing fragment to be initialised
+ * @i: paged fragment index to initialise
+ * @page: the page to use for this fragment
+ * @off: the offset to the data with @page
+ * @size: the length of the data
+ *
+ * Initialises the @i'th fragment of @skb to point to &size bytes at
+ * offset @off within @page.
+ *
+ * Does not take any additional reference on the fragment.
+ */
+static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
+					struct page *page, int off, int size)
 {
 	skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 
 	frag->page		  = page;
 	frag->page_offset	  = off;
 	frag->size		  = size;
+}
+
+/**
+ * skb_fill_page_desc - initialise a paged fragment in an skb
+ * @skb: buffer containing fragment to be initialised
+ * @i: paged fragment index to initialise
+ * @page: the page to use for this fragment
+ * @off: the offset to the data with @page
+ * @size: the length of the data
+ *
+ * As per __skb_fill_page_desc() -- initialises the @i'th fragment of
+ * @skb to point to &size bytes at offset @off within @page. In
+ * addition updates @skb such that @i is the last fragment.
+ *
+ * Does not take any additional reference on the fragment.
+ */
+static inline void skb_fill_page_desc(struct sk_buff *skb, int i,
+				      struct page *page, int off, int size)
+{
+	__skb_fill_page_desc(skb, i, page, off, size);
 	skb_shinfo(skb)->nr_frags = i + 1;
 }
 
@@ -1629,6 +1663,138 @@ static inline void netdev_free_page(struct net_device *dev, struct page *page)
 }
 
 /**
+ * skb_frag_page - retrieve the page refered to by a paged fragment
+ * @frag: the paged fragment
+ *
+ * Returns the &struct page associated with @frag.
+ */
+static inline struct page *skb_frag_page(const skb_frag_t *frag)
+{
+	return frag->page;
+}
+
+/**
+ * __skb_frag_ref - take an addition reference on a paged fragment.
+ * @frag: the paged fragment
+ *
+ * Takes an additional reference on the paged fragment @frag.
+ */
+static inline void __skb_frag_ref(skb_frag_t *frag)
+{
+	get_page(skb_frag_page(frag));
+}
+
+/**
+ * skb_frag_ref - take an addition reference on a paged fragment of an skb.
+ * @skb: the buffer
+ * @f: the fragment offset.
+ *
+ * Takes an additional reference on the @f'th paged fragment of @skb.
+ */
+static inline void skb_frag_ref(struct sk_buff *skb, int f)
+{
+	__skb_frag_ref(&skb_shinfo(skb)->frags[f]);
+}
+
+/**
+ * __skb_frag_unref - release a reference on a paged fragment.
+ * @frag: the paged fragment
+ *
+ * Releases a reference on the paged fragment @frag.
+ */
+static inline void __skb_frag_unref(skb_frag_t *frag)
+{
+	put_page(skb_frag_page(frag));
+}
+
+/**
+ * skb_frag_unref - release a reference on a paged fragment of an skb.
+ * @skb: the buffer
+ * @f: the fragment offset
+ *
+ * Releases a reference on the @f'th paged fragment of @skb.
+ */
+static inline void skb_frag_unref(struct sk_buff *skb, int f)
+{
+	__skb_frag_unref(&skb_shinfo(skb)->frags[f]);
+}
+
+/**
+ * skb_frag_address - gets the address of the data contained in a paged fragment
+ * @frag: the paged fragment buffer
+ *
+ * Returns the address of the data within @frag. The page must already
+ * be mapped.
+ */
+static inline void *skb_frag_address(const skb_frag_t *frag)
+{
+	return page_address(skb_frag_page(frag)) + frag->page_offset;
+}
+
+/**
+ * skb_frag_address_safe - gets the address of the data contained in a paged fragment
+ * @frag: the paged fragment buffer
+ *
+ * Returns the address of the data within @frag. Checks that the page
+ * is mapped and returns %NULL otherwise.
+ */
+static inline void *skb_frag_address_safe(const skb_frag_t *frag)
+{
+	void *ptr = page_address(skb_frag_page(frag));
+	if (unlikely(!ptr))
+		return NULL;
+
+	return ptr + frag->page_offset;
+}
+
+/**
+ * __skb_frag_set_page - sets the page contained in a paged fragment
+ * @frag: the paged fragment
+ * @page: the page to set
+ *
+ * Sets the fragment @frag to contain @page.
+ */
+static inline void __skb_frag_set_page(skb_frag_t *frag, struct page *page)
+{
+	frag->page = page;
+	__skb_frag_ref(frag);
+}
+
+/**
+ * skb_frag_set_page - sets the page contained in a paged fragment of an skb
+ * @skb: the buffer
+ * @f: the fragment offset
+ * @page: the page to set
+ *
+ * Sets the @f'th fragment of @skb to contain @page.
+ */
+static inline void skb_frag_set_page(struct sk_buff *skb, int f,
+				     struct page *page)
+{
+	__skb_frag_set_page(&skb_shinfo(skb)->frags[f], page);
+}
+
+/**
+ * skb_frag_dma_map - maps a paged fragment via the DMA API
+ * @device: the device to map the fragment to
+ * @frag: the paged fragment to map
+ * @offset: the offset within the fragment (starting at the
+ *          fragment's own offset)
+ * @size: the number of bytes to map
+ * @direction: the direction of the mapping (%PCI_DMA_*)
+ *
+ * Maps the page associated with @frag to @device.
+ */
+static inline dma_addr_t skb_frag_dma_map(struct device *dev,
+					  const skb_frag_t *frag,
+					  size_t offset, size_t size,
+					  enum dma_data_direction dir)
+{
+	return dma_map_page(dev, skb_frag_page(frag),
+			    frag->page_offset + offset, size, dir);
+}
+
+/**
  *	skb_clone_writable - is the header of a clone writable
  *	@skb: buffer to check
  *	@len: length up to which to write
-- 
1.7.2.5

^ permalink raw reply related

* Re: Linux vs FreeBSD Which is correct.
From: Chris Friesen @ 2011-08-19 16:18 UTC (permalink / raw)
  To: sclark46
  Cc: Pascal Hambourg, Rémi Denis-Courmont,
	Linux Kernel Network Developers
In-Reply-To: <4E4D08B8.8020309@earthlink.net>

On 08/18/2011 06:42 AM, Stephen Clark wrote:

> I guess I don't really understand what reverse path filter stuff is all
> about, much less making it weaker.
> But using 2 made the pings responses be seen.

It's described in RFC3704.  The idea is to block spoofed packets.

 From Documentation/networking/ip-sysctl.txt:

rp_filter - INTEGER
0 - No source validation.
1 - Strict mode as defined in RFC3704 Strict Reverse Path
     Each incoming packet is tested against the FIB and if the interface
     is not the best reverse path the packet check will fail.
     By default failed packets are discarded.
2 - Loose mode as defined in RFC3704 Loose Reverse Path
     Each incoming packet's source address is also tested against the FIB
     and if the source address is not reachable via any interface
     the packet check will fail.

    Current recommended practice in RFC3704 is to enable strict mode
    to prevent IP spoofing from DDos attacks. If using asymmetric routing
    or other complicated routing, then loose mode is recommended.

    The max value from conf/{all,interface}/rp_filter is used
    when doing source validation on the {interface}.

    Default value is 0. Note that some distributions enable it
    in startup scripts.



-- 
Chris Friesen
Software Developer
GENBAND
chris.friesen@genband.com
www.genband.com

^ permalink raw reply

* Re: [PATCH] virtio-net: Read MAC only after initializing MSI-X
From: Michael S. Tsirkin @ 2011-08-19 15:23 UTC (permalink / raw)
  To: Sasha Levin; +Cc: linux-kernel, Rusty Russell, virtualization, netdev, kvm
In-Reply-To: <1313225461-24458-1-git-send-email-levinsasha928@gmail.com>

On Sat, Aug 13, 2011 at 11:51:01AM +0300, Sasha Levin wrote:
> The MAC of a virtio-net device is located at the first field of the device
> specific header. This header is located at offset 20 if the device doesn't
> support MSI-X or offset 24 if it does.
> 
> Current code in virtnet_probe() used to probe the MAC before checking for
> MSI-X, which means that the read was always made from offset 20 regardless
> of whether MSI-X in enabled or not.
> 
> This patch moves the MAC probe to after the detection of whether MSI-X is
> enabled. This way the MAC will be read from offset 24 if the device indeed
> supports MSI-X.
> 
> Cc: Rusty Russell <rusty@rustcorp.com.au>
> Cc: Michael S. Tsirkin <mst@redhat.com>
> Cc: virtualization@lists.linux-foundation.org
> Cc: netdev@vger.kernel.org
> Cc: kvm@vger.kernel.org
> Signed-off-by: Sasha Levin <levinsasha928@gmail.com>

I am not sure I see a bug in virtio: the config pace layout simply
changes as msix is enabled and disabled (and if you look at the latest
draft, also on whether 64 bit features are enabled).
It doesn't depend on msix capability being present in device.

The spec seems to be explicit enough:
	If MSI-X is enabled for the device, two additional fields immediately
	follow this header.

So I'm guessing the bug is in kvm tools which assume
same layout for when msix is enabled and disabled.
qemu-kvm seems to do the right thing so the device
seems to get the correct mac.

> ---
>  drivers/net/virtio_net.c |   16 ++++++++--------
>  1 files changed, 8 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 0c7321c..55ccf96 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -981,14 +981,6 @@ static int virtnet_probe(struct virtio_device *vdev)
>  		/* (!csum && gso) case will be fixed by register_netdev() */
>  	}
>  
> -	/* Configuration may specify what MAC to use.  Otherwise random. */
> -	if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) {
> -		vdev->config->get(vdev,
> -				  offsetof(struct virtio_net_config, mac),
> -				  dev->dev_addr, dev->addr_len);
> -	} else
> -		random_ether_addr(dev->dev_addr);
> -
>  	/* Set up our device-specific information */
>  	vi = netdev_priv(dev);
>  	netif_napi_add(dev, &vi->napi, virtnet_poll, napi_weight);
> @@ -1032,6 +1024,14 @@ static int virtnet_probe(struct virtio_device *vdev)
>  			dev->features |= NETIF_F_HW_VLAN_FILTER;
>  	}
>  
> +	/* Configuration may specify what MAC to use.  Otherwise random. */
> +	if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) {
> +		vdev->config->get(vdev,
> +				  offsetof(struct virtio_net_config, mac),
> +				  dev->dev_addr, dev->addr_len);
> +	} else
> +		random_ether_addr(dev->dev_addr);
> +
>  	err = register_netdev(dev);
>  	if (err) {
>  		pr_debug("virtio_net: registering device failed\n");
> -- 
> 1.7.6

^ permalink raw reply

* Re: net: rps: support 802.1Q
From: Changli Gao @ 2011-08-19 15:05 UTC (permalink / raw)
  To: Ben Hutchings; +Cc: David S. Miller, Eric Dumazet, Tom Herbert, netdev
In-Reply-To: <1313754853.2814.8.camel@deadeye>

On Fri, Aug 19, 2011 at 7:54 PM, Ben Hutchings
<bhutchings@solarflare.com> wrote:
>
> Should this really be reading an unlimited number of tags?

Not unlimited, but it won't stop until reaching the end of the packet.

>  What if an
> attacker starts sending packets full of VLAN tags?  Since this runs
> before netfilter, there would be no way to prevent those packets burning
> our CPU time.  And if there are legitimately multiple VLAN tags, they
> presumably won't all have the 802.1q Ethertype.
>

Do we need to limit the number of rounds to stop this kind of "bad"
packets from burning our CPU time? Then,  __netif_receive_skb() has to
be update too, so the inspection of tunnel in __skb_get_rxhash() does.
Is there a such limitation in xfrm?

Thanks.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply

* Re: [RFC 0/0] Introducing a generic socket offload framework
From: San Mehat @ 2011-08-19 14:58 UTC (permalink / raw)
  To: jhs
  Cc: davem, mst, rusty, linux-kernel, virtualization, netdev,
	digitaleric, mikew, miche, maccarro
In-Reply-To: <1313758185.16299.43.camel@mojatatu>

On Fri, Aug 19, 2011 at 5:49 AM, jamal <hadi@cyberus.ca> wrote:
>
> On Thu, 2011-08-18 at 15:07 -0700, San Mehat wrote:
> > TL;DR
> > -----
> > In this RFC we propose the introduction of the concept of hardware socket
> > offload to the Linux kernel. Patches will accompany this RFC in a few days,
> > but we felt we had enough on the design to solicit constructive discussion
> > from the community at-large.
> >
>
> [..]
>
> > ALTERNATIVE STRATEGIES
> > ----------------------
> >
> > An alternative strategy for providing similar functionality involves either
> > modifying glibc or using LD_PRELOAD tricks to intercept socket calls. We were
> > forced to rule this out due to the complexity (and fragility) involved with
> > attempting to maintain a general solution compatible across various
> > distributions where platform-libraries differ.
>
> Above should have been in your TL;DR;->
>
> LD_PRELOAD is also horrible because of the granularity of the socket
> calls;
> Having things in the kernel and specifically tagging socket as needing
> this feature is much much more manageable.
>
> Tying things to virtualization may miss the big picture because there
> are many other use cases for intercepting socket calls, example:
> Samir Bellabes <sam@synack.fr> has been trying to get what he refers to
> as "personal firewall" (equivalent to the silly windows firewall) which
> prompts the user "ping from blah, do you want to allow a response?"
> That requires intercepting send/recv calls, prompt the user in possibly
> some pop-up and act based on response. It requires looking at content.
> He is trying to use selinux for that interface,
> but i think this would be the right abstraction.

I agree; there's no reason this needs to be tied to virtualization -
it was just the
driving force behind the design. I will generalize the backend interface types

> I hope you plan to support send/recv.

yes

> I also hope you add support for SOCK_RAW (and maybe SOCK_PACKET).

Can you explain a good use-case for SOCK_RAW in this type of
environment? We were noodling it around locally and couldn't come up
with one that we needed to support.

>
> Q: If you want this to be transparent to the apps, who/what is doing
> the tagging of SOCK_HWASSIST? clearly not the app if you dont want to
> change it.

The decision of whether to tag a socket or not is made by the 'hardware'

>
> I like the uri abstraction if it doesnt come at the expense of the
> app transparency.
>

Thank you

-san

> cheers,
> jamal
>



--
San Mehat | Staff Software Engineer | san@google.com | 415-366-6172

^ permalink raw reply

* [PATCH net-next] net: reserve ooo_okay when copying skb header
From: Changli Gao @ 2011-08-19 14:44 UTC (permalink / raw)
  To: David S. Miller; +Cc: Eric Dumazet, Tom Herbert, netdev, Changli Gao

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
---
 net/core/skbuff.c |    1 +
 1 file changed, 1 insertion(+)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index edb66f3..e27334e 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -529,6 +529,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	new->mac_header		= old->mac_header;
 	skb_dst_copy(new, old);
 	new->rxhash		= old->rxhash;
+	new->ooo_okay		= old->ooo_okay;
 	new->l4_rxhash		= old->l4_rxhash;
 #ifdef CONFIG_XFRM
 	new->sp			= secpath_get(old->sp);

^ permalink raw reply related

* Re: [RFC 0/0] Introducing a generic socket offload framework
From: San Mehat @ 2011-08-19 14:44 UTC (permalink / raw)
  To: jhs
  Cc: mst, netdev, miche, linux-kernel, virtualization, digitaleric,
	mikew, maccarro, davem
In-Reply-To: <1313758185.16299.43.camel@mojatatu>


[-- Attachment #1.1: Type: text/plain, Size: 2605 bytes --]

On Fri, Aug 19, 2011 at 5:49 AM, jamal <hadi@cyberus.ca> wrote:

> On Thu, 2011-08-18 at 15:07 -0700, San Mehat wrote:
> > TL;DR
> > -----
> > In this RFC we propose the introduction of the concept of hardware socket
> > offload to the Linux kernel. Patches will accompany this RFC in a few
> days,
> > but we felt we had enough on the design to solicit constructive
> discussion
> > from the community at-large.
> >
>
> [..]
>
> > ALTERNATIVE STRATEGIES
> > ----------------------
> >
> > An alternative strategy for providing similar functionality involves
> either
> > modifying glibc or using LD_PRELOAD tricks to intercept socket calls. We
> were
> > forced to rule this out due to the complexity (and fragility) involved
> with
> > attempting to maintain a general solution compatible across various
> > distributions where platform-libraries differ.
>
> Above should have been in your TL;DR;->
>
> LD_PRELOAD is also horrible because of the granularity of the socket
> calls;
> Having things in the kernel and specifically tagging socket as needing
> this feature is much much more manageable.
>
> Tying things to virtualization may miss the big picture because there
> are many other use cases for intercepting socket calls, example:
> Samir Bellabes <sam@synack.fr> has been trying to get what he refers to
> as "personal firewall" (equivalent to the silly windows firewall) which
> prompts the user "ping from blah, do you want to allow a response?"
> That requires intercepting send/recv calls, prompt the user in possibly
> some pop-up and act based on response. It requires looking at content.
> He is trying to use selinux for that interface,
> but i think this would be the right abstraction.
>

I agree; there's no reason this needs to be tied to virtualization - it was
just the
driving force behind the design. I will generalize the backend interface
types


> I hope you plan to support send/recv.
>

yes


> I also hope you add support for SOCK_RAW (and maybe SOCK_PACKET).
>
>
 Can you explain a good use-case for SOCK_RAW in this type of environment?
We were noodling it around locally and couldn't come up with one that we
needed to support.

Q: If you want this to be transparent to the apps, who/what is doing
> the tagging of SOCK_HWASSIST? clearly not the app if you dont want to
> change it.
>
>
 The decision of whether to tag a socket or not is made by the 'hardware'

I like the uri abstraction if it doesnt come at the expense of the
> app transparency.
>
>
Thank you,

-san


> cheers,
> jamal
>
>


-- 
San Mehat | Staff Software Engineer | san@google.com | 415-366-6172

[-- Attachment #1.2: Type: text/html, Size: 6533 bytes --]

[-- Attachment #2: Type: text/plain, Size: 184 bytes --]

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization

^ permalink raw reply

* Re: Move interface across network namespaces
From: Renato Westphal @ 2011-08-19 14:38 UTC (permalink / raw)
  To: Eric W. Biederman; +Cc: netdev, kaber, David Lamparter
In-Reply-To: <m14o1e71qe.fsf@fess.ebiederm.org>

>> Well, this regression was introduced by commit a2835763e130c343ac,
>> which was merged into v2.6.34. Reverting parts of this commit makes
>> the problem go away but breaks the support of "specifying device flags
>> during device creation". I don't know the best way to fix this... any
>> ideas?
>
> Everything going through dev_change_net_namespace already needs to be
> in the initialized state.  So it looks like we just need to do:
>
> Does the patch below work for you?
>
> Eric
>
> ---
>
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 17d67b5..bfbde69 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -6108,6 +6108,8 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
>        call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
>        call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
>
> +       rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
> +
>        /*
>         *      Flush the unicast and multicast chains
>         */
>

 This works pretty fine. Thanks! :)

-- 
Renato Westphal

^ permalink raw reply

* Re: [PATCH/RFC v3 0/75] enable SKB paged fragment lifetime visibility
From: Ian Campbell @ 2011-08-19 14:30 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, linux-nfs
In-Reply-To: <20110819.070451.1651299907098952592.davem@davemloft.net>

On Fri, 2011-08-19 at 07:04 -0700, David Miller wrote:
> From: David Miller <davem@davemloft.net>
> Date: Fri, 19 Aug 2011 06:34:10 -0700 (PDT)
> 
> > From: David Miller <davem@davemloft.net>
> > Date: Fri, 19 Aug 2011 06:29:58 -0700 (PDT)
> > 
> >> From: Ian Campbell <Ian.Campbell@citrix.com>
> >> Date: Fri, 19 Aug 2011 14:26:33 +0100
> >> 
> >>> This is v3 of my series to enable visibility into SKB paged fragment's
> >> 
> >> Please tone down the patch count :-/  I'm not going to review anything more
> >> than ~20 or so patches at a time from any one person.
> > 
> > Also none of your patches will even apply to net-next GIT since nearly
> > all the ethernet drivers have been moved under drivers/net/ethernet
> 
> Ian, please acknowledge my grievances here.  I see you replying to other
> people, but not to me and I'm the one who has to process all of this
> stuff eventually.

Sorry, I thought I had replied to you first, did it go missing?

> If you want this series to be taken seriously:
> 
> 1) Make your patches against net-next GIT, none of your driver patches will
>    apply because they have all been moved around to different locations
>    under drivers/net

> 2) Submit this in a _SANE_ way.  This means, get the first patch that adds
>    the new interfaces approved and merged.  Then slowly and carefuly submit
>    small, reasonably sized, sets of patches that convert the drivers over.

That all makes sense, I'll do it that way.

> Otherwise there is no way I'm even looking at this stuff, let alone actually
> apply it.

> Realize that every time you patch bomb rediculiously like this I have
> to click and classify every single patch in your bomb at
> http://patchwork.ozlabs.org/project/netdev/list/

Ah, right, I hadn't considered the patchwork thing, I'm very sorry about
that!

Ian.



^ permalink raw reply

* Re: [PATCH] net: add the comment for skb->l4_hash
From: David Miller @ 2011-08-19 14:27 UTC (permalink / raw)
  To: xiaosuo; +Cc: eric.dumazet, therbert, netdev
In-Reply-To: <20110819.072547.515057198170181830.davem@davemloft.net>

From: David Miller <davem@davemloft.net>
Date: Fri, 19 Aug 2011 07:25:47 -0700 (PDT)

> From: Changli Gao <xiaosuo@gmail.com>
> Date: Fri, 19 Aug 2011 22:22:24 +0800
> 
>> On Fri, Aug 19, 2011 at 10:19 PM, David Miller <davem@davemloft.net> wrote:
>>> From: Changli Gao <xiaosuo@gmail.com>
>>> Date: Fri, 19 Aug 2011 22:14:12 +0800
>>>
>>>> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
>>>
>>> It was obvious to me this time, but please in the future indicate in
>>> "[PATCH]" that a change is targetted for net-next as this one was.
>>>
>>> Applied, thanks.
>>>
>> 
>> Sorry again. Please apply the new one instead, as I made a typo
>> mistake in the previous two patches. Thanks.
> 
> Ok, done.

BTW for next time, "net-next" indication belongs inside of "[]"
brackets, it should look something like:

	Subject: [PATCH net-next] net: blah blah blah

Thanks.

^ permalink raw reply

* Re: [PATCH] net: add the comment for skb->l4_hash
From: David Miller @ 2011-08-19 14:25 UTC (permalink / raw)
  To: xiaosuo; +Cc: eric.dumazet, therbert, netdev
In-Reply-To: <CABa6K_G+cv3z-WoFSu3s017uVa54aHcn_D=R6WX3=-YA2o2j8g@mail.gmail.com>

From: Changli Gao <xiaosuo@gmail.com>
Date: Fri, 19 Aug 2011 22:22:24 +0800

> On Fri, Aug 19, 2011 at 10:19 PM, David Miller <davem@davemloft.net> wrote:
>> From: Changli Gao <xiaosuo@gmail.com>
>> Date: Fri, 19 Aug 2011 22:14:12 +0800
>>
>>> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
>>
>> It was obvious to me this time, but please in the future indicate in
>> "[PATCH]" that a change is targetted for net-next as this one was.
>>
>> Applied, thanks.
>>
> 
> Sorry again. Please apply the new one instead, as I made a typo
> mistake in the previous two patches. Thanks.

Ok, done.

^ permalink raw reply

* Re: [PATCH] net: add the comment for skb->l4_hash
From: Changli Gao @ 2011-08-19 14:22 UTC (permalink / raw)
  To: David Miller; +Cc: eric.dumazet, therbert, netdev
In-Reply-To: <20110819.071918.2148980329707419757.davem@davemloft.net>

On Fri, Aug 19, 2011 at 10:19 PM, David Miller <davem@davemloft.net> wrote:
> From: Changli Gao <xiaosuo@gmail.com>
> Date: Fri, 19 Aug 2011 22:14:12 +0800
>
>> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
>
> It was obvious to me this time, but please in the future indicate in
> "[PATCH]" that a change is targetted for net-next as this one was.
>
> Applied, thanks.
>

Sorry again. Please apply the new one instead, as I made a typo
mistake in the previous two patches. Thanks.

-- 
Regards,
Changli Gao(xiaosuo@gmail.com)

^ permalink raw reply

* Re: [PATCH net-next v4 af-packet 2/2] Enhance af-packet to provide (near zero)lossless packet capture functionality.
From: chetan loke @ 2011-08-19 14:22 UTC (permalink / raw)
  To: David Miller; +Cc: netdev
In-Reply-To: <20110819.033613.1959565749346388734.davem@davemloft.net>

On Fri, Aug 19, 2011 at 6:36 AM, David Miller <davem@davemloft.net> wrote:
> From: Chetan Loke <loke.chetan@gmail.com>
> Date: Thu, 11 Aug 2011 22:31:00 -0400
>
>> +struct kbdq_ft_ops {
>> +     int num_ops;
>> +     void (*ft_ops[2])(void *, void *);
>> +};
> ...
>> +     struct kbdq_ft_ops kfops;
>  ...
>> +static void prb_init_ft_ops(struct kbdq_core *p1,
>> +                     union tpacket_req_u *req_u)
>> +{
>> +     p1->kfops.ft_ops[p1->kfops.num_ops++] = prb_fill_vlan_info;
>> +
>> +     if (req_u->req3.tp_feature_req_word) {
>> +             if (req_u->req3.tp_feature_req_word & TP_FT_REQ_FILL_RXHASH)
>> +                     p1->kfops.ft_ops[p1->kfops.num_ops++] = prb_fill_rxhash;
>> +             else
>> +                     p1->kfops.ft_ops[p1->kfops.num_ops++] =
>> +                     prb_clear_rxhash;
>> +     }
>> +}
>
> It is a lot cheaper to just test the flags in-line than do indirect calls.
> Indirect calls are very expensive on many cpus.
>
> In fact, since the first op (prb_fill_vlan_info) is unconditional, we eat
> the indirect call cost for absolutely no reason at all.

Oki. Will test the flags in-line in prb_run_all_ft_ops().


>
> This kfops stuff was not present in your previous changes.  And I'm
> going to tell you that if you keep adding things each revision instead
> of just fixing the specific items you've received feedback about, a
> set of changes this invasive and of this size will never get merged.
>

First, thanks(to you and others) for reviewing the huge patchset. I
did think about breaking it into multiple patches but since it
involves changes in a single file it was a little difficult. And I
didn't want to create a separate file for this.

> Please resist the urge to further tinker with the code, and just
> address the feedback we give you.
>
Sorry won't happen. These changes should have been present in v3 when
I added the feature-request word because that was the intent of that
word. It was a screw up on my part.


Chetan Loke

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox