* [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier
@ 2005-01-03 12:56 Thomas Graf
2005-01-04 4:13 ` jamal
` (2 more replies)
0 siblings, 3 replies; 44+ messages in thread
From: Thomas Graf @ 2005-01-03 12:56 UTC (permalink / raw)
To: Jamal Hadi Salim; +Cc: netdev
[-- Attachment #1: Type: text/plain, Size: 2531 bytes --]
Jamal et al,
I attached 4 patches of a first ematch implementation. Comments
and suggestions very much appreciated. Compiles but untested.
Patch 1: ematch API
API visible to classifier:
tcf_em_tree_validate(tp, tlv, tree)
tlv: ematches TLV
tree: temporary struct tcf_ematch_tree
Validates the data in the TLV and builds the ematch tree
upon the temporary variable.
tcf_em_tree_change(tp, dst, src)
dst: destination ematch tree (from classifier private data)
src: source ematch tree (temporary tree from _validate)
Replaces the ematch tree in the classifier with the temporary
tree.
tcf_em_tree_destroy(tp, tree)
Destroys an ematch tree
tcf_em_tree_dump(skb, tree, tlv_type)
tlv_type: T of ematches TLV (classifier specific)
Dumps the ematch tree to the skb with given tlv_type.
tcf_em_tree_match(skb, tree, res)
res: struct tcf_result *
Macro returning 1 if no ematches are configured, otherwise
the tree is evaulated and 1 is returned if the tree matches.
The complete API is also visible if ematch is not configured but
will result in empty macros/structures. Those need to be
improved though.
API visible to ematches:
tcf_em_register(ops)
tcf_em_unregister(ops)
ematches must at least provide the following callbacks:
change, match
Optional callbacks are: destroy, dump
kind must be set to a unique ID, i thought about declaring
1..2^16 to ematches within the mainline tree and have the
rest declared as to be used for private use to avoid collisions.
Patch 2: u32 ematch
Is an ematch based on the existing u32 match but allows to
specify the layer and is able to read u32 values if alignment
does not allow direct access. Additionally it supports
the operands, eq, lt, gt. It is a few ticks slower than the
existing match but worth it. However, it does not support
the neat nexthdr via hashing as u32 does which is the main
problem before u32 can make proper use of it.
Patch 3: nbyte ematch
Compares n bytes at specified offset. To be used for IPv6
address matches to avoid 4 ANDed u32 matches.
Patch 4: Basic Classifier
This is the most basic classifier possible doing nothing more
than executing extensions and ematches. It follows the
architecture of u32 and fw by storing a filter list in tp->root.
This eventually makes fw obsolete once meta ematch is available.
I didn't copy the u32/fw code but rather made use of the list.h.
pskbs are completely unhandled so far as I'm still not sure
how to do it properly.
Cheers
[-- Attachment #2: 01_ematch.diff --]
[-- Type: text/plain, Size: 12108 bytes --]
diff -Nru linux-2.6.10-bk4.orig/include/linux/pkt_cls.h linux-2.6.10-bk4/include/linux/pkt_cls.h
--- linux-2.6.10-bk4.orig/include/linux/pkt_cls.h 2005-01-02 21:39:16.000000000 +0100
+++ linux-2.6.10-bk4/include/linux/pkt_cls.h 2005-01-02 21:31:48.000000000 +0100
@@ -319,4 +319,47 @@
#define TCA_TCINDEX_MAX (__TCA_TCINDEX_MAX - 1)
+struct tcf_ematch_tree_hdr
+{
+ __u16 nmatches;
+ __u16 progid;
+};
+
+enum
+{
+ TCA_EMATCH_TREE_UNSPEC,
+ TCA_EMATCH_TREE_HDR,
+ TCA_EMATCH_TREE_LIST,
+ __TCA_EMATCH_TREE_MAX
+};
+#define TCA_EMATCH_TREE_MAX (__TCA_EMATCH_TREE_MAX - 1)
+
+struct tcf_ematch_hdr
+{
+ __u16 handle;
+ __u16 matchID;
+ __u16 kind;
+ __u8 flags;
+ __u8 pad; /* currently unused */
+};
+
+enum
+{
+ TCF_EM_CONTAINER,
+ __TCF_EM_MAX
+};
+
+#define TCF_EM_REL_MASK 3
+#define TCF_EM_REL_VALID(v) \
+ (!(((v) & TCF_EM_REL_MASK) == TCF_EM_REL_MASK))
+#define TCF_EM_LAST_KEY(v) (!((v) & TCF_EM_REL_MASK))
+
+#define TCF_EM_REL_OBVIOUS(v, r) (TCF_EM_LAST_KEY(v) || \
+ (!(r) && ((v) & TCF_EM_REL_AND)) || ((r) && ((v) & TCF_EM_REL_OR)))
+
+#define TCF_EM_REL_END (1<<0)
+#define TCF_EM_REL_AND (1<<1)
+#define TCF_EM_REL_OR (1<<2)
+#define TCF_EM_INVERT (1<<3)
+
#endif
diff -Nru linux-2.6.10-bk4.orig/include/linux/rtnetlink.h linux-2.6.10-bk4/include/linux/rtnetlink.h
--- linux-2.6.10-bk4.orig/include/linux/rtnetlink.h 2005-01-02 21:39:16.000000000 +0100
+++ linux-2.6.10-bk4/include/linux/rtnetlink.h 2005-01-02 21:32:46.000000000 +0100
@@ -779,6 +779,11 @@
goto rtattr_failure; \
__rta_fill(skb, attrtype, attrlen, data); })
+#define RTA_PUT_NOHDR(skb, attrlen, data) \
+({ if (unlikely(skb_tailroom(skb) < (int)(attrlen))) \
+ goto rtattr_failure; \
+ memcpy(skb_put(skb, RTA_ALIGN(attrlen)), data, attrlen); })
+
static inline struct rtattr *
__rta_reserve(struct sk_buff *skb, int attrtype, int attrlen)
{
diff -Nru linux-2.6.10-bk4.orig/include/net/pkt_cls.h linux-2.6.10-bk4/include/net/pkt_cls.h
--- linux-2.6.10-bk4.orig/include/net/pkt_cls.h 2005-01-02 21:39:16.000000000 +0100
+++ linux-2.6.10-bk4/include/net/pkt_cls.h 2005-01-02 21:33:21.000000000 +0100
@@ -149,6 +149,75 @@
extern int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts,
struct tcf_ext_map *map);
+#ifdef CONFIG_NET_EMATCH
+
+struct tcf_ematch_ops;
+
+struct tcf_ematch
+{
+ struct tcf_ematch_hdr hdr;
+ struct tcf_ematch_ops * ops;
+ unsigned long data;
+};
+
+struct tcf_ematch_tree
+{
+ struct tcf_ematch_tree_hdr hdr;
+ struct tcf_ematch * matches;
+
+};
+
+#define em_lookup_match(t, i) (&(t)->matches[(i)])
+
+struct tcf_ematch_ops
+{
+ int kind;
+
+ int (*change)(struct tcf_proto *tp, void *data,
+ int len, struct tcf_ematch *m);
+
+ int (*match)(struct sk_buff *skb, struct tcf_ematch *m);
+ int (*destroy)(struct tcf_proto *tp, struct tcf_ematch *m);
+ int (*dump)(struct sk_buff *skb, struct tcf_ematch *m);
+
+ struct module * owner;
+ struct list_head link;
+};
+
+extern int tcf_em_register(struct tcf_ematch_ops *);
+extern int tcf_em_unregister(struct tcf_ematch_ops *);
+extern int tcf_em_tree_validate(struct tcf_proto *, struct rtattr *,
+ struct tcf_ematch_tree *);
+extern void tcf_em_tree_destroy(struct tcf_proto *, struct tcf_ematch_tree *);
+extern int tcf_em_tree_dump(struct sk_buff *, struct tcf_ematch_tree *, int);
+extern int __tcf_em_tree_match(struct sk_buff *, struct tcf_ematch_tree *);
+
+static inline void
+tcf_em_tree_change(struct tcf_proto *tp, struct tcf_ematch_tree *dst,
+ struct tcf_ematch_tree *src)
+{
+ tcf_tree_lock(tp);
+ memcpy(dst, src, sizeof(*dst));
+ tcf_tree_unlock(tp);
+}
+
+#define tcf_em_tree_match(skb, t) \
+ ((t)->hdr.nmatches ? __tcf_em_tree_match(skb, t) : 1)
+
+#else /* CONFIG_NET_EMATCH */
+
+struct tcf_ematch_tree
+{
+};
+
+#define tcf_em_tree_validate(tp, tb, t) do { } while(0)
+#define tcf_em_tree_destroy(tp, t) do { } while(0)
+#define tcf_em_tree_dump(skb, t, tlv) do { } while(0)
+#define tcf_em_tree_change(tp, dst, src) do { } while(0)
+#define tcf_em_tree_match(skb, t) do { } while(0)
+
+#endif /* CONFIG_NET_EMATCH */
+
#ifdef CONFIG_NET_CLS_IND
static inline int
tcf_change_indev(struct tcf_proto *tp, char *indev, struct rtattr *indev_tlv)
diff -Nru linux-2.6.10-bk4.orig/net/sched/Kconfig linux-2.6.10-bk4/net/sched/Kconfig
--- linux-2.6.10-bk4.orig/net/sched/Kconfig 2005-01-02 21:39:16.000000000 +0100
+++ linux-2.6.10-bk4/net/sched/Kconfig 2005-01-02 21:31:44.000000000 +0100
@@ -375,6 +375,12 @@
To compile this code as a module, choose M here: the
module will be called cls_rsvp6.
+config NET_EMATCH
+ bool "Extended Matches"
+ depends on NET_CLS
+ ---help---
+ TODO
+
config NET_CLS_ACT
bool "Packet ACTION"
depends on EXPERIMENTAL && NET_CLS && NET_QOS
diff -Nru linux-2.6.10-bk4.orig/net/sched/Makefile linux-2.6.10-bk4/net/sched/Makefile
--- linux-2.6.10-bk4.orig/net/sched/Makefile 2005-01-02 21:39:16.000000000 +0100
+++ linux-2.6.10-bk4/net/sched/Makefile 2005-01-02 21:31:48.000000000 +0100
@@ -33,3 +33,4 @@
obj-$(CONFIG_NET_CLS_RSVP) += cls_rsvp.o
obj-$(CONFIG_NET_CLS_TCINDEX) += cls_tcindex.o
obj-$(CONFIG_NET_CLS_RSVP6) += cls_rsvp6.o
+obj-$(CONFIG_NET_EMATCH) += ematch.o
diff -Nru linux-2.6.10-bk4.orig/net/sched/ematch.c linux-2.6.10-bk4/net/sched/ematch.c
--- linux-2.6.10-bk4.orig/net/sched/ematch.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.10-bk4/net/sched/ematch.c 2005-01-01 23:32:13.000000000 +0100
@@ -0,0 +1,300 @@
+/*
+ * net/sched/ematch.c Extended Match API
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <net/pkt_cls.h>
+
+#define EMATCH_STACK_SIZE 32
+
+static LIST_HEAD(ematch_ops);
+static rwlock_t ematch_mod_lock = RW_LOCK_UNLOCKED;
+
+static inline struct tcf_ematch_ops *
+tcf_em_lookup(u16 kind)
+{
+ struct tcf_ematch_ops *e = NULL;
+
+ read_lock(&ematch_mod_lock);
+ list_for_each_entry(e, &ematch_ops, link) {
+ if (kind == e->kind) {
+ if (!try_module_get(e->owner))
+ e = NULL;
+ break;
+ }
+ }
+ read_unlock(&ematch_mod_lock);
+
+ return e;
+}
+
+int tcf_em_register(struct tcf_ematch_ops *ops)
+{
+ int err = -EEXIST;
+ struct tcf_ematch_ops *e;
+
+ write_lock(&ematch_mod_lock);
+ list_for_each_entry(e, &ematch_ops, link)
+ if (ops->kind == e->kind)
+ goto errout;
+
+ list_add_tail(&ops->link, &ematch_ops);
+ err = 0;
+errout:
+ write_unlock(&ematch_mod_lock);
+ return err;
+}
+
+int tcf_em_unregister(struct tcf_ematch_ops *ops)
+{
+ int err = 0;
+ struct tcf_ematch_ops *e;
+
+ write_lock(&ematch_mod_lock);
+ list_for_each_entry(e, &ematch_ops, link) {
+ if (e == ops) {
+ list_del(&e->link);
+ goto out;
+ }
+ }
+
+ err = -ENOENT;
+out:
+ write_unlock(&ematch_mod_lock);
+ return err;
+}
+
+static int tcf_em_validate(struct tcf_proto *tp, struct tcf_ematch_tree_hdr *th,
+ struct tcf_ematch *m, struct rtattr *rta)
+{
+ int err = -EINVAL;
+ struct tcf_ematch_hdr *mh = RTA_DATA(rta);
+ int datalen = RTA_PAYLOAD(rta) - sizeof(*mh);
+ void *data = (void *) data + sizeof(*mh);
+
+ if (!TCF_EM_REL_VALID(mh->flags))
+ goto errout;
+
+ memcpy(&m->hdr, mh, sizeof(*mh));
+
+ if (mh->kind == TCF_EM_CONTAINER) {
+ u32 ref;
+
+ if (datalen < sizeof(ref))
+ goto errout;
+ ref = *(u32 *) data;
+ if (ref >= th->nmatches)
+ goto errout;
+ m->data = ref;
+ } else {
+ struct tcf_ematch_ops *ops = tcf_em_lookup(mh->kind);
+
+ if (ops == NULL) {
+ err = -ENOENT;
+ goto errout;
+ }
+
+ if (ops->change) {
+ err = ops->change(tp, data, datalen, m);
+ if (err < 0)
+ goto errout;
+ }
+ }
+
+ err = 0;
+errout:
+ return err;
+}
+
+int tcf_em_tree_validate(struct tcf_proto *tp, struct rtattr *rta,
+ struct tcf_ematch_tree *tree)
+{
+ int i, len, mlen, err = -EINVAL;
+ struct rtattr *m, *tb[TCA_EMATCH_TREE_MAX];
+ struct tcf_ematch_tree_hdr *th;
+
+ if (!rta || rtattr_parse_nested(tb, TCA_EMATCH_TREE_MAX, rta) < 0)
+ goto errout;
+
+ if (RTA_PAYLOAD(tb[TCA_EMATCH_TREE_HDR-1]) < sizeof(*th) ||
+ RTA_PAYLOAD(tb[TCA_EMATCH_TREE_LIST-1]) < sizeof(*m))
+ goto errout;
+
+ th = RTA_DATA(tb[TCA_EMATCH_TREE_HDR-1]);
+ m = RTA_DATA(tb[TCA_EMATCH_TREE_LIST-1]);
+ len = RTA_PAYLOAD(tb[TCA_EMATCH_TREE_LIST-1]);
+ mlen = th->nmatches * sizeof(struct tcf_ematch);
+
+ memcpy(&tree->hdr, th, sizeof(*th));
+
+ tree->matches = kmalloc(mlen, GFP_KERNEL);
+ if (tree->matches == NULL)
+ goto errout;
+ memset(tree->matches, 0, mlen);
+
+ for (i = 0; RTA_OK(m, len); i++) {
+ if (rta->rta_type != (i+1) || i >= th->nmatches ||
+ RTA_PAYLOAD(rta) < sizeof(struct tcf_ematch_hdr)) {
+ err = -EINVAL;
+ goto errout_abort;
+ }
+
+ err = tcf_em_validate(tp, th, em_lookup_match(tree, i), rta);
+ if (err < 0)
+ goto errout_abort;
+
+ m = RTA_NEXT(m, len);
+ }
+
+ if (i != th->nmatches) {
+ err = -EINVAL;
+ goto errout_abort;
+ }
+
+
+ err = 0;
+errout:
+ return err;
+
+errout_abort:
+ tcf_em_tree_destroy(tp, tree);
+ return err;
+}
+
+
+void tcf_em_tree_destroy(struct tcf_proto *tp, struct tcf_ematch_tree *t)
+{
+ int i;
+
+ if (t->matches == NULL)
+ return;
+
+ for (i = 0; i < t->hdr.nmatches; i++) {
+ struct tcf_ematch *m = em_lookup_match(t, i);
+ if (m->ops) {
+ if (m->ops->destroy)
+ m->ops->destroy(tp, m);
+ module_put(m->ops->owner);
+ }
+ }
+
+ kfree(t->matches);
+}
+
+int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *t, int tlv)
+{
+ int i;
+ struct rtattr * p_rta = (struct rtattr*) skb->tail;
+ struct rtattr * pm_rta;
+
+ RTA_PUT(skb, tlv, 0, NULL);
+ RTA_PUT(skb, TCA_EMATCH_TREE_HDR, sizeof(t->hdr), &t->hdr);
+
+ pm_rta = (struct rtattr *) skb->tail;
+ RTA_PUT(skb, TCA_EMATCH_TREE_LIST, 0, NULL);
+
+ for (i = 0; i < t->hdr.nmatches; i++) {
+ struct rtattr * pd_rta = (struct rtattr*) skb->tail;
+ struct tcf_ematch *m = em_lookup_match(t, i);
+
+ RTA_PUT(skb, i+1, sizeof(m->hdr), &m->hdr);
+ if (m->ops->dump && m->ops->dump(skb, m) < 0)
+ goto rtattr_failure;
+ pd_rta->rta_len = skb->tail - (u8*) pd_rta;
+ }
+
+ pm_rta->rta_len = skb->tail - (u8*)pm_rta;
+ p_rta->rta_len = skb->tail - (u8*)p_rta;
+ return 0;
+rtattr_failure:
+ return -1;
+}
+
+static inline int tcf_em_match(struct sk_buff *skb, struct tcf_ematch *m)
+{
+ int r = 0;
+
+ if (m->ops->match)
+ r = m->ops->match(skb, m);
+
+ return m->hdr.flags & TCF_EM_INVERT ? !r : r;
+}
+
+/* Do not use this function directly, use tcf_em_tree_match */
+int __tcf_em_tree_match(struct sk_buff *skb, struct tcf_ematch_tree *t)
+{
+ int i = 0, n = 0, r = 0;
+ struct tcf_ematch *m;
+ int stack[EMATCH_STACK_SIZE];
+
+ memset(stack, 0, sizeof(stack));
+
+proceed:
+ while (n < t->hdr.nmatches) {
+ m = em_lookup_match(t, n);
+
+ if (m->hdr.kind == TCF_EM_CONTAINER) {
+ if (unlikely(i >= EMATCH_STACK_SIZE)) {
+ if (net_ratelimit())
+ printk("Local stack overflow, " \
+ "increase EMATCH_STACK_SIZE\n");
+ return -1;
+ }
+
+ if (unlikely(m->data <= n)) {
+ if (net_ratelimit())
+ printk("Detected backward precedence " \
+ "jump, fix your filter.\n");
+ return -1;
+ }
+
+ stack[i++] = n;
+ n = m->data;
+ continue;
+ }
+
+ r = tcf_em_match(skb, m);
+ if (TCF_EM_REL_OBVIOUS(m->hdr.flags, r))
+ break;
+ n++;
+ }
+
+pop_stack:
+ if (i > 0) {
+ n = stack[--i];
+ m = em_lookup_match(t, n);
+
+ if (TCF_EM_REL_OBVIOUS(m->hdr.flags, r))
+ goto pop_stack;
+ else {
+ n++;
+ goto proceed;
+ }
+ }
+
+ return r;
+}
+
+EXPORT_SYMBOL(tcf_em_register);
+EXPORT_SYMBOL(tcf_em_unregister);
+EXPORT_SYMBOL(tcf_em_tree_validate);
+EXPORT_SYMBOL(tcf_em_tree_destroy);
+EXPORT_SYMBOL(tcf_em_tree_dump);
+EXPORT_SYMBOL(__tcf_em_tree_match);
+
[-- Attachment #3: 02_em_u32.diff --]
[-- Type: text/plain, Size: 5184 bytes --]
diff -Nru linux-2.6.10-bk4.orig/include/linux/pkt_cls.h linux-2.6.10-bk4/include/linux/pkt_cls.h
--- linux-2.6.10-bk4.orig/include/linux/pkt_cls.h 2005-01-02 22:37:46.000000000 +0100
+++ linux-2.6.10-bk4/include/linux/pkt_cls.h 2005-01-02 22:37:39.000000000 +0100
@@ -346,6 +346,7 @@
enum
{
TCF_EM_CONTAINER,
+ TCF_EM_U32,
__TCF_EM_MAX
};
@@ -362,4 +363,28 @@
#define TCF_EM_REL_OR (1<<2)
#define TCF_EM_INVERT (1<<3)
+struct tcf_em_u32
+{
+ __u32 val;
+ __u32 mask;
+ __u16 off;
+ __u8 align;
+ __u8 layer:4;
+ __u8 opnd:4;
+};
+
+enum
+{
+ TCF_EM_ALIGN_U8 = 1,
+ TCF_EM_ALIGN_U16 = 2,
+ TCF_EM_ALIGN_U32 = 4
+};
+
+enum
+{
+ TCF_EM_OPND_EQ,
+ TCF_EM_OPND_GT,
+ TCF_EM_OPND_LT
+};
+
#endif
diff -Nru linux-2.6.10-bk4.orig/include/net/pkt_cls.h linux-2.6.10-bk4/include/net/pkt_cls.h
--- linux-2.6.10-bk4.orig/include/net/pkt_cls.h 2005-01-02 22:37:46.000000000 +0100
+++ linux-2.6.10-bk4/include/net/pkt_cls.h 2005-01-02 22:23:24.000000000 +0100
@@ -218,6 +218,36 @@
#endif /* CONFIG_NET_EMATCH */
+static inline u32 tcf_read_bucket(u8 *ptr, u8 align)
+{
+ switch (align) {
+ case TCF_EM_ALIGN_U8:
+ return *ptr;
+ case TCF_EM_ALIGN_U16:
+ return (*ptr << 8) | *(ptr+1);
+ case TCF_EM_ALIGN_U32:
+ return (*ptr << 24) | (*(ptr+1) << 16) |
+ (*(ptr+2) << 8) | *(ptr+3);
+ }
+ return 0;
+}
+
+static inline u8 * tcf_get_base_ptr(struct sk_buff *skb, u8 layer)
+{
+ switch (layer) {
+ case 0:
+ return skb->data;
+ case 1:
+ return skb->nh.raw;
+ case 2:
+ return skb->h.raw;
+ }
+ return NULL;
+}
+
+#define tcf_valid_offset(skb, ptr, off, len) \
+ ((ptr + off + len) < skb->tail && (ptr + off) > skb->head)
+
#ifdef CONFIG_NET_CLS_IND
static inline int
tcf_change_indev(struct tcf_proto *tp, char *indev, struct rtattr *indev_tlv)
diff -Nru linux-2.6.10-bk4.orig/net/sched/Kconfig linux-2.6.10-bk4/net/sched/Kconfig
--- linux-2.6.10-bk4.orig/net/sched/Kconfig 2005-01-02 22:37:46.000000000 +0100
+++ linux-2.6.10-bk4/net/sched/Kconfig 2005-01-02 22:37:39.000000000 +0100
@@ -381,6 +381,12 @@
---help---
TODO
+config NET_EMATCH_U32
+ tristate "U32"
+ depends on NET_EMATCH
+ ---help---
+ TODO
+
config NET_CLS_ACT
bool "Packet ACTION"
depends on EXPERIMENTAL && NET_CLS && NET_QOS
diff -Nru linux-2.6.10-bk4.orig/net/sched/Makefile linux-2.6.10-bk4/net/sched/Makefile
--- linux-2.6.10-bk4.orig/net/sched/Makefile 2005-01-02 22:37:46.000000000 +0100
+++ linux-2.6.10-bk4/net/sched/Makefile 2005-01-02 22:37:39.000000000 +0100
@@ -34,3 +34,4 @@
obj-$(CONFIG_NET_CLS_TCINDEX) += cls_tcindex.o
obj-$(CONFIG_NET_CLS_RSVP6) += cls_rsvp6.o
obj-$(CONFIG_NET_EMATCH) += ematch.o
+obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o
diff -Nru linux-2.6.10-bk4.orig/net/sched/em_u32.c linux-2.6.10-bk4/net/sched/em_u32.c
--- linux-2.6.10-bk4.orig/net/sched/em_u32.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.10-bk4/net/sched/em_u32.c 2005-01-02 22:38:27.000000000 +0100
@@ -0,0 +1,98 @@
+/*
+ * net/sched/em_u32.c U32 ematch
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <net/pkt_cls.h>
+
+static int em_u32_change(struct tcf_proto *tp, void *data, int len,
+ struct tcf_ematch *m)
+{
+ if (len != sizeof(struct tcf_em_u32))
+ return -EINVAL;
+
+ m->data = (unsigned long) kmalloc(len, GFP_KERNEL);
+ if (!m->data)
+ return -ENOBUFS;
+
+ memcpy((void *) m->data, data, len);
+ return 0;
+}
+
+static int em_u32_match(struct sk_buff *skb, struct tcf_ematch *m)
+{
+ struct tcf_em_u32 *u = (struct tcf_em_u32 *) m->data;
+ u8 *ptr = tcf_get_base_ptr(skb, u->layer);
+ u32 v;
+
+ if (unlikely(!tcf_valid_offset(skb, ptr, u->off, u->align)))
+ return 0;
+
+ v = tcf_read_bucket(ptr + u->off, u->align) & u->mask;
+
+ switch (u->opnd) {
+ case TCF_EM_OPND_EQ:
+ return v == u->val;
+ case TCF_EM_OPND_LT:
+ return v < u->val;
+ case TCF_EM_OPND_GT:
+ return v > u->val;
+ }
+
+ return 0;
+}
+
+static int em_u32_destroy(struct tcf_proto *tp, struct tcf_ematch *m)
+{
+ kfree((void *) m->data);
+ return 0;
+}
+
+static int em_u32_dump(struct sk_buff *skb, struct tcf_ematch *m)
+{
+ RTA_PUT_NOHDR(skb, sizeof(struct tcf_em_u32), (void *) m->data);
+ return 0;
+rtattr_failure:
+ return -1;
+}
+
+static struct tcf_ematch_ops em_u32_ops = {
+ .kind = TCF_EM_U32,
+ .change = em_u32_change,
+ .match = em_u32_match,
+ .destroy = em_u32_destroy,
+ .dump = em_u32_dump,
+ .owner = THIS_MODULE,
+ .link = LIST_HEAD_INIT(em_u32_ops.link)
+};
+
+static int __init init_em_u32(void)
+{
+ return tcf_em_register(&em_u32_ops);
+}
+
+static void __exit exit_em_u32(void)
+{
+ tcf_em_unregister(&em_u32_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_em_u32);
+module_exit(exit_em_u32);
+
[-- Attachment #4: 03_em_nbyte.diff --]
[-- Type: text/plain, Size: 3968 bytes --]
diff -Nru linux-2.6.10-bk4.orig/include/linux/pkt_cls.h linux-2.6.10-bk4/include/linux/pkt_cls.h
--- linux-2.6.10-bk4.orig/include/linux/pkt_cls.h 2005-01-02 22:44:16.000000000 +0100
+++ linux-2.6.10-bk4/include/linux/pkt_cls.h 2005-01-02 22:44:31.000000000 +0100
@@ -347,6 +347,7 @@
{
TCF_EM_CONTAINER,
TCF_EM_U32,
+ TCF_EM_NBYTE,
__TCF_EM_MAX
};
@@ -387,4 +388,11 @@
TCF_EM_OPND_LT
};
+struct tcf_em_nbyte
+{
+ __u16 off;
+ __u16 len:12;
+ __u8 layer:4;
+};
+
#endif
diff -Nru linux-2.6.10-bk4.orig/net/sched/Kconfig linux-2.6.10-bk4/net/sched/Kconfig
--- linux-2.6.10-bk4.orig/net/sched/Kconfig 2005-01-02 22:44:16.000000000 +0100
+++ linux-2.6.10-bk4/net/sched/Kconfig 2005-01-02 22:44:31.000000000 +0100
@@ -387,6 +387,12 @@
---help---
TODO
+config NET_EMATCH_NBYTE
+ tristate "N-Byte"
+ depends on NET_EMATCH
+ ---help---
+ TODO
+
config NET_CLS_ACT
bool "Packet ACTION"
depends on EXPERIMENTAL && NET_CLS && NET_QOS
diff -Nru linux-2.6.10-bk4.orig/net/sched/Makefile linux-2.6.10-bk4/net/sched/Makefile
--- linux-2.6.10-bk4.orig/net/sched/Makefile 2005-01-02 22:44:16.000000000 +0100
+++ linux-2.6.10-bk4/net/sched/Makefile 2005-01-02 22:44:31.000000000 +0100
@@ -35,3 +35,4 @@
obj-$(CONFIG_NET_CLS_RSVP6) += cls_rsvp6.o
obj-$(CONFIG_NET_EMATCH) += ematch.o
obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o
+obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o
diff -Nru linux-2.6.10-bk4.orig/net/sched/em_nbyte.c linux-2.6.10-bk4/net/sched/em_nbyte.c
--- linux-2.6.10-bk4.orig/net/sched/em_nbyte.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.10-bk4/net/sched/em_nbyte.c 2005-01-02 22:54:20.000000000 +0100
@@ -0,0 +1,95 @@
+/*
+ * net/sched/em_nbyte.c N-Byte ematch
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <net/pkt_cls.h>
+
+struct nbyte_data
+{
+ struct tcf_em_nbyte hdr;
+ char pattern[0];
+};
+
+static int em_nbyte_validate(struct tcf_proto *tp, void *data, int len,
+ struct tcf_ematch *m)
+{
+ struct tcf_em_nbyte *n = data;
+
+ if (len < sizeof(*n) || len < (sizeof(*n) + n->len))
+ return -EINVAL;
+
+ m->data = (unsigned long) kmalloc(sizeof(*n) + n->len, GFP_KERNEL);
+ if (!m->data)
+ return -ENOBUFS;
+
+ memcpy((void *) m->data, data, sizeof(*n) + n->len);
+ return 0;
+}
+
+static int em_nbyte_match(struct sk_buff *skb, struct tcf_ematch *m)
+{
+ struct nbyte_data *d = (struct nbyte_data *) m->data;
+ u8 *ptr = tcf_get_base_ptr(skb, d->hdr.layer);
+
+ if (unlikely(!tcf_valid_offset(skb, ptr, d->hdr.off, d->hdr.len)))
+ return 0;
+
+ return !memcmp(ptr + d->hdr.off, d->pattern, d->hdr.len);
+}
+
+static int em_nbyte_destroy(struct tcf_proto *tp, struct tcf_ematch *m)
+{
+ kfree((void *) m->data);
+ return 0;
+}
+
+static int em_nbyte_dump(struct sk_buff *skb, struct tcf_ematch *m)
+{
+ struct nbyte_data *d = (struct nbyte_data *) m->data;
+ RTA_PUT_NOHDR(skb, sizeof(*d) + d->hdr.len, d);
+ return 0;
+rtattr_failure:
+ return -1;
+}
+
+static struct tcf_ematch_ops em_nbyte_ops = {
+ .kind = TCF_EM_NBYTE,
+ .change = em_nbyte_validate,
+ .match = em_nbyte_match,
+ .destroy = em_nbyte_destroy,
+ .dump = em_nbyte_dump,
+ .owner = THIS_MODULE,
+ .link = LIST_HEAD_INIT(em_nbyte_ops.link)
+};
+
+static int __init init_em_nbyte(void)
+{
+ return tcf_em_register(&em_nbyte_ops);
+}
+
+static void __exit exit_em_nbyte(void)
+{
+ tcf_em_unregister(&em_nbyte_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_em_nbyte);
+module_exit(exit_em_nbyte);
+
[-- Attachment #5: 04_cls_basic.diff --]
[-- Type: text/plain, Size: 8649 bytes --]
diff -Nru linux-2.6.10-bk4.orig/include/linux/pkt_cls.h linux-2.6.10-bk4/include/linux/pkt_cls.h
--- linux-2.6.10-bk4.orig/include/linux/pkt_cls.h 2005-01-02 22:57:36.000000000 +0100
+++ linux-2.6.10-bk4/include/linux/pkt_cls.h 2005-01-03 02:43:05.000000000 +0100
@@ -319,6 +319,20 @@
#define TCA_TCINDEX_MAX (__TCA_TCINDEX_MAX - 1)
+/* Basic filter */
+
+enum
+{
+ TCA_BASIC_UNSPEC,
+ TCA_BASIC_CLASSID,
+ TCA_BASIC_EMATCHES,
+ TCA_BASIC_ACT,
+ TCA_BASIC_POLICE,
+ __TCA_BASIC_MAX
+};
+
+#define TCA_BASIC_MAX (__TCA_BASIC_MAX - 1)
+
struct tcf_ematch_tree_hdr
{
__u16 nmatches;
diff -Nru linux-2.6.10-bk4.orig/net/sched/Kconfig linux-2.6.10-bk4/net/sched/Kconfig
--- linux-2.6.10-bk4.orig/net/sched/Kconfig 2005-01-02 22:57:36.000000000 +0100
+++ linux-2.6.10-bk4/net/sched/Kconfig 2005-01-03 03:27:54.000000000 +0100
@@ -269,6 +269,12 @@
Documentation and software is at
<http://diffserv.sourceforge.net/>.
+config NET_CLS_BASIC
+ tristate "Basic classifier"
+ depends on NET_CLS
+ ---help---
+ TODO
+
config NET_CLS_TCINDEX
tristate "TC index classifier"
depends on NET_CLS
diff -Nru linux-2.6.10-bk4.orig/net/sched/Makefile linux-2.6.10-bk4/net/sched/Makefile
--- linux-2.6.10-bk4.orig/net/sched/Makefile 2005-01-02 22:57:36.000000000 +0100
+++ linux-2.6.10-bk4/net/sched/Makefile 2005-01-03 03:27:21.000000000 +0100
@@ -33,6 +33,7 @@
obj-$(CONFIG_NET_CLS_RSVP) += cls_rsvp.o
obj-$(CONFIG_NET_CLS_TCINDEX) += cls_tcindex.o
obj-$(CONFIG_NET_CLS_RSVP6) += cls_rsvp6.o
+obj-$(CONFIG_NET_CLS_BASIC) += cls_basic.o
obj-$(CONFIG_NET_EMATCH) += ematch.o
obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o
obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o
diff -Nru linux-2.6.10-bk4.orig/net/sched/cls_basic.c linux-2.6.10-bk4/net/sched/cls_basic.c
--- linux-2.6.10-bk4.orig/net/sched/cls_basic.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.10-bk4/net/sched/cls_basic.c 2005-01-03 13:32:00.000000000 +0100
@@ -0,0 +1,300 @@
+/*
+ * net/sched/cls_basic.c Basic Packet Classifier.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <net/act_api.h>
+#include <net/pkt_cls.h>
+
+struct basic_head
+{
+ u32 hgenerator;
+ struct list_head flist;
+};
+
+struct basic_filter
+{
+ u32 handle;
+ struct tcf_exts exts;
+ struct tcf_ematch_tree ematches;
+ struct tcf_result res;
+ struct list_head link;
+};
+
+static struct tcf_ext_map basic_ext_map = {
+ .action = TCA_BASIC_ACT,
+ .police = TCA_BASIC_POLICE
+};
+
+static int basic_classify(struct sk_buff *skb, struct tcf_proto *tp,
+ struct tcf_result *res)
+{
+ int r;
+ struct basic_head *head = (struct basic_head *) tp->root;
+ struct basic_filter *f;
+
+ list_for_each_entry(f, &head->flist, link) {
+ if (!tcf_em_tree_match(skb, &f->ematches))
+ continue;
+ *res = f->res;
+ r = tcf_exts_exec(skb, &f->exts, res);
+ if (r < 0)
+ continue;
+ return r;
+ }
+ return -1;
+}
+
+static unsigned long basic_get(struct tcf_proto *tp, u32 handle)
+{
+ unsigned long l = 0UL;
+ struct basic_head *head = (struct basic_head *) tp->root;
+ struct basic_filter *f;
+
+ list_for_each_entry(f, &head->flist, link)
+ if (f->handle == handle)
+ l = (unsigned long) f;
+
+ return l;
+}
+
+static void basic_put(struct tcf_proto *tp, unsigned long f)
+{
+}
+
+static int basic_init(struct tcf_proto *tp)
+{
+ return 0;
+}
+
+static inline void
+basic_delete_filter(struct tcf_proto *tp, struct basic_filter *f)
+{
+ tcf_unbind_filter(tp, &f->res);
+ tcf_exts_destroy(tp, &f->exts);
+ tcf_em_tree_destroy(tp, &f->ematches);
+ kfree(f);
+}
+
+static void basic_destroy(struct tcf_proto *tp)
+{
+ struct basic_head *head = (struct basic_head *) xchg(&tp->root, NULL);
+ struct basic_filter *f, *n;
+
+ list_for_each_entry_safe(f, n, &head->flist, link) {
+ list_del(&f->link);
+ basic_delete_filter(tp, f);
+ }
+}
+
+static int basic_delete(struct tcf_proto *tp, unsigned long arg)
+{
+ struct basic_head *head = (struct basic_head *) tp->root;
+ struct basic_filter *t, *f = (struct basic_filter *) arg;
+
+ list_for_each_entry(t, &head->flist, link)
+ if (t == f) {
+ tcf_tree_lock(tp);
+ list_del(&t->link);
+ tcf_tree_unlock(tp);
+ basic_delete_filter(tp, t);
+ return 0;
+ }
+
+ return -ENOENT;
+}
+
+static inline int basic_set_parms(struct tcf_proto *tp, struct basic_filter *f,
+ unsigned long base, struct rtattr **tb,
+ struct rtattr *est)
+{
+ int err;
+ struct tcf_exts e;
+ struct tcf_ematch_tree t;
+
+ err = tcf_exts_validate(tp, tb, est, &e, &basic_ext_map);
+ if (err < 0)
+ return err;
+
+ err = tcf_em_tree_validate(tp, tb[TCA_BASIC_EMATCHES-1], &t);
+ if (err < 0)
+ goto errout;
+
+ if (tb[TCA_BASIC_CLASSID-1]) {
+ err = -EINVAL;
+ if (RTA_PAYLOAD(tb[TCA_BASIC_CLASSID-1]) < sizeof(u32))
+ goto errout;
+
+ f->res.classid = *(u32*)RTA_DATA(tb[TCA_BASIC_CLASSID-1]);
+ tcf_bind_filter(tp, &f->res, base);
+ }
+
+ tcf_exts_change(tp, &f->exts, &e);
+ tcf_em_tree_change(tp, &f->ematches, &t);
+
+ return 0;
+errout:
+ tcf_exts_destroy(tp, &e);
+ return err;
+}
+
+static int basic_change(struct tcf_proto *tp, unsigned long base, u32 handle,
+ struct rtattr **tca, unsigned long *arg)
+{
+ int err = -EINVAL;
+ struct basic_head *head = (struct basic_head *) tp->root;
+ struct rtattr *tb[TCA_BASIC_MAX];
+ struct basic_filter *f = (struct basic_filter *) *arg;
+
+ if (!tca[TCA_OPTIONS-1])
+ return -EINVAL;
+
+ if (rtattr_parse_nested(tb, TCA_BASIC_MAX, tca[TCA_OPTIONS-1]) < 0)
+ return -EINVAL;
+
+ if (f != NULL) {
+ if (handle && f->handle != handle)
+ return -EINVAL;
+ return basic_set_parms(tp, f, base, tb, tca[TCA_RATE-1]);
+ }
+
+ err = -ENOBUFS;
+ if (!head) {
+ head = kmalloc(sizeof(*head), GFP_KERNEL);
+ if (!head)
+ goto errout;
+
+ memset(head, 0, sizeof(*head));
+ INIT_LIST_HEAD(&head->flist);
+ tp->root = head;
+ }
+
+ f = kmalloc(sizeof(*f), GFP_KERNEL);
+ if (!f)
+ goto errout;
+ memset(f, 0, sizeof(*f));
+
+ err = -EINVAL;
+ if (handle)
+ f->handle = handle;
+ else {
+ int i = 0x80000000;
+ do {
+ if (++head->hgenerator == 0x7FFFFFFF)
+ head->hgenerator = 1;
+ } while (--i > 0 && basic_get(tp, head->hgenerator));
+
+ if (i <= 0) {
+ printk(KERN_ERR "Insufficient number of handles\n");
+ goto errout;
+ }
+
+ f->handle = head->hgenerator;
+ }
+
+ err = basic_set_parms(tp, f, base, tb, tca[TCA_RATE-1]);
+ if (err < 0)
+ goto errout;
+
+ tcf_tree_lock(tp);
+ list_add(&f->link, &head->flist);
+ tcf_tree_unlock(tp);
+ *arg = (unsigned long) f;
+
+ return 0;
+errout:
+ if (*arg == 0UL && f)
+ kfree(f);
+
+ return err;
+}
+
+static void basic_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+ struct basic_head *head = (struct basic_head *) tp->root;
+ struct basic_filter *f;
+
+ list_for_each_entry(f, &head->flist, link) {
+ if (arg->count < arg->skip)
+ goto skip;
+
+ if (arg->fn(tp, (unsigned long) f, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+skip:
+ arg->count++;
+ }
+}
+
+static int basic_dump(struct tcf_proto *tp, unsigned long fh,
+ struct sk_buff *skb, struct tcmsg *t)
+{
+ struct basic_filter *f = (struct basic_filter *) fh;
+ unsigned char *b = skb->tail;
+ struct rtattr *rta;
+
+ if (!f)
+ return skb->len;
+
+ t->tcm_handle = f->handle;
+
+ rta = (struct rtattr *) b;
+ RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
+
+ if (tcf_exts_dump(skb, &f->exts, &basic_ext_map) < 0 ||
+ tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0)
+ goto rtattr_failure;
+
+ rta->rta_len = (skb->tail - b);
+ return skb->len;
+
+rtattr_failure:
+ skb_trim(skb, b - skb->data);
+ return -1;
+}
+
+static struct tcf_proto_ops cls_basic_ops = {
+ .kind = "basic",
+ .classify = basic_classify,
+ .init = basic_init,
+ .destroy = basic_destroy,
+ .get = basic_get,
+ .put = basic_put,
+ .change = basic_change,
+ .delete = basic_delete,
+ .walk = basic_walk,
+ .dump = basic_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init init_basic(void)
+{
+ return register_tcf_proto_ops(&cls_basic_ops);
+}
+
+static void __exit exit_basic(void)
+{
+ unregister_tcf_proto_ops(&cls_basic_ops);
+}
+
+module_init(init_basic)
+module_exit(exit_basic)
+MODULE_LICENSE("GPL");
+
^ permalink raw reply [flat|nested] 44+ messages in thread* Re: [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier
2005-01-03 12:56 [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier Thomas Graf
@ 2005-01-04 4:13 ` jamal
2005-01-04 12:03 ` Thomas Graf
2005-01-04 12:27 ` Thomas Graf
2005-01-04 22:36 ` Thomas Graf
2005-01-05 13:32 ` [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier Florian Weimer
2 siblings, 2 replies; 44+ messages in thread
From: jamal @ 2005-01-04 4:13 UTC (permalink / raw)
To: Thomas Graf; +Cc: netdev
On Mon, 2005-01-03 at 07:56, Thomas Graf wrote:
> Jamal et al,
>
> I attached 4 patches of a first ematch implementation. Comments
> and suggestions very much appreciated. Compiles but untested.
>
> Patch 1: ematch API
>
> API visible to classifier:
>
> tcf_em_tree_validate(tp, tlv, tree)
> tlv: ematches TLV
> tree: temporary struct tcf_ematch_tree
>
> Validates the data in the TLV and builds the ematch tree
> upon the temporary variable.
struct tcf_ematch_hdr
{
__u16 handle;
__u16 matchID;
__u16 kind;
__u8 flags;
__u8 pad; /* currently unused */
};
you need both matchID and handle?
struct tcf_ematch
{
struct tcf_ematch_hdr hdr;
struct tcf_ematch_ops * ops;
unsigned long data;
};
Both tcf_ematch_ops and tcf_ematch_hdr have kind;
Is data length stored somewhere?
Noticed indev still hanging there ;-> shouldnt that die by this patch?
> tcf_em_tree_change(tp, dst, src)
> dst: destination ematch tree (from classifier private data)
> src: source ematch tree (temporary tree from _validate)
>
> Replaces the ematch tree in the classifier with the temporary
> tree.
Seems to assume some owner of the ematch other than mother classifier.
Recall the idea of ownership by classifier we discussed earlier
which should be default if the ematch doesnt implement a ->change()
BTW, Is the assumption i can have a u32:
match
ematch
match
match
ematch
now gone? I couldnt tell.
> tcf_em_tree_destroy(tp, tree)
> Destroys an ematch tree
>
> tcf_em_tree_dump(skb, tree, tlv_type)
> tlv_type: T of ematches TLV (classifier specific)
>
> Dumps the ematch tree to the skb with given tlv_type.
Same comments as in ->change().
if ematch doesnt implement a destroy or dump then mother classifier is
responsible.
> tcf_em_tree_match(skb, tree, res)
> res: struct tcf_result *
>
> Macro returning 1 if no ematches are configured, otherwise
> the tree is evaulated and 1 is returned if the tree matches.
I think that looks valid for simplicty case. I wasnt so sure about
the INVERT thing you had. It doesnt look like a bad idea, just different
from what i had in mind (where you let the ematch worry about
inversion).
> The complete API is also visible if ematch is not configured but
> will result in empty macros/structures. Those need to be
> improved though.
>
> API visible to ematches:
>
> tcf_em_register(ops)
> tcf_em_unregister(ops)
>
> ematches must at least provide the following callbacks:
>
> change, match
>
> Optional callbacks are: destroy, dump
>
> kind must be set to a unique ID, i thought about declaring
> 1..2^16 to ematches within the mainline tree and have the
> rest declared as to be used for private use to avoid collisions.
With std actions also this was an issue - at the moment dont have
anything in any headers just to make it free for all - This way you
could write a simple module that has zero dependency on an already
compiled kernel. There would be standard practise where say 11 would
mean something - but i suggest not to enforce it; the register()
should probably spit some helpful message of who already has the number.
you are trying to grab.
> Patch 2: u32 ematch
>
> Is an ematch based on the existing u32 match but allows to
> specify the layer and is able to read u32 values if alignment
> does not allow direct access. Additionally it supports
> the operands, eq, lt, gt. It is a few ticks slower than the
> existing match but worth it. However, it does not support
> the neat nexthdr via hashing as u32 does which is the main
> problem before u32 can make proper use of it.
It does emulate a u32 node but not the classifier - which is a _lot_
more sophisticated (with multilevel trees of hashes etc). Maybe you
should change its name to something like 32bit match.
> Patch 3: nbyte ematch
>
> Compares n bytes at specified offset. To be used for IPv6
> address matches to avoid 4 ANDed u32 matches.
This looks useful.
My recommendation would be to have the metamatch as the first thing
so we can kill indev and friends.
> Patch 4: Basic Classifier
>
> This is the most basic classifier possible doing nothing more
> than executing extensions and ematches. It follows the
> architecture of u32 and fw by storing a filter list in tp->root.
> This eventually makes fw obsolete once meta ematch is available.
> I didn't copy the u32/fw code but rather made use of the list.h.
Very inefficient, but serves the purpose of an example.
[Even if you go as basic a hash as fw classifier you will do better]
> pskbs are completely unhandled so far as I'm still not sure
> how to do it properly.
Given where we are doing these things (egress and ingress of stack)
we would mostly be fine (unlike netfilter).
cheers,
jamal
^ permalink raw reply [flat|nested] 44+ messages in thread* Re: [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier
2005-01-04 4:13 ` jamal
@ 2005-01-04 12:03 ` Thomas Graf
2005-01-04 13:19 ` jamal
2005-01-04 12:27 ` Thomas Graf
1 sibling, 1 reply; 44+ messages in thread
From: Thomas Graf @ 2005-01-04 12:03 UTC (permalink / raw)
To: jamal; +Cc: netdev
* jamal <1104812028.1085.50.camel@jzny.localdomain> 2005-01-03 23:13
> struct tcf_ematch_hdr
> {
> __u16 handle;
> __u16 matchID;
> __u16 kind;
> __u8 flags;
> __u8 pad; /* currently unused */
> };
>
> you need both matchID and handle?
No, handle is yet unused and I think we can screw it again.
> Both tcf_ematch_ops and tcf_ematch_hdr have kind;
Correct, I wanted to avoid having to do transformations
but it would save us a few bits.
> Is data length stored somewhere?
Not in this patch as it wasn't needed, I added it to my local
tree yesterday though. It is indeed required if we allocate
in the ematch api instead of having the ematch doing it.
> Noticed indev still hanging there ;-> shouldnt that die by this patch?
As soon as I find the time to write the meta ematch.
> > tcf_em_tree_change(tp, dst, src)
> > dst: destination ematch tree (from classifier private data)
> > src: source ematch tree (temporary tree from _validate)
> >
> > Replaces the ematch tree in the classifier with the temporary
> > tree.
>
> Seems to assume some owner of the ematch other than mother classifier.
> Recall the idea of ownership by classifier we discussed earlier
> which should be default if the ematch doesnt implement a ->change()
The classifier must always be the owner. Splitting the vaildation
and changing into 2 separate functions makes it easy for the classifier
to stay consistent while changing without doing expensive error
recovery.
> BTW, Is the assumption i can have a u32:
> match
> ematch
> match
> match
> ematch
>
> now gone? I couldnt tell.
I can't tell you either but not really. It's still possible
but I'm not sure if it makes sense. My idea was to replace match
with ematch so it would benefit from logic relations. The problem
arises when we get to the nexthdr offset mangling.
I have to look into this but I might have to drop my idea and
do as you state above.
> > tcf_em_tree_destroy(tp, tree)
> > Destroys an ematch tree
> >
> > tcf_em_tree_dump(skb, tree, tlv_type)
> > tlv_type: T of ematches TLV (classifier specific)
> >
> > Dumps the ematch tree to the skb with given tlv_type.
>
> Same comments as in ->change().
> if ematch doesnt implement a destroy or dump then mother classifier is
> responsible.
Right, I changed this already. change/dump/destroy are fully
optional. Here's the latest API to the classifier:
change() (Optional)
Called if provided, otherwise ematch api allocates the data, stores
it in m->data and sets m->datalen. Special Case: If TCF_EM_SIMPLE
is set the ematch data consists of a simple u32 which means no
allocation is required and the value is stored in m->data directly.
Note: I might add a special field to ematch_ops which can be set to
the expected length of the ematch data so we have at least some basic
sanity check. Thoughts?
match() (Must)
...
destroy() (Optional)
Called if provided, otheriwse m->data is freed in ematch api unless
TCF_EM_SIMPLE is set.
dump() (Optional)
Called if provided, otherwise m->data is dumped onto the skb with
m->datalen as L. Special handling again for TCF_EM_SIMPLE.
I think this makes it as simple as it can get while keeping the door
open for complex ematches such as meta ematch.
> With std actions also this was an issue - at the moment dont have
> anything in any headers just to make it free for all - This way you
> could write a simple module that has zero dependency on an already
> compiled kernel. There would be standard practise where say 11 would
> mean something - but i suggest not to enforce it; the register()
> should probably spit some helpful message of who already has the number.
> you are trying to grab.
The warning is a good idea. I don't want to enforce it, a comment
is just fine and it's up to you whehter you want to fix your ematch
everyime a new ematch makes it into the kernel. I added this comment:
/* Ematch type assignments
* 1..32767 Reserved for ematches inside kernel tree
* 32768..65535 Free to use, not reliable
*/
> > Patch 2: u32 ematch
>
> It does emulate a u32 node but not the classifier - which is a _lot_
> more sophisticated (with multilevel trees of hashes etc). Maybe you
> should change its name to something like 32bit match.
Agreed. Note: With the latest API everything except for match can
be screwed. Same for em_nbyte.
> > Patch 3: nbyte ematch
> >
> > Compares n bytes at specified offset. To be used for IPv6
> > address matches to avoid 4 ANDed u32 matches.
>
> This looks useful.
> My recommendation would be to have the metamatch as the first thing
> so we can kill indev and friends.
Right, but those were easy to write in in interruptive working enviroment
and somewhat validated the API. meta ematch will take some time to write
but it sure has top priority.
> > Patch 4: Basic Classifier
>
> Very inefficient, but serves the purpose of an example.
> [Even if you go as basic a hash as fw classifier you will do better]
Fully agreed, nevertheless I think something like this is
required to fill the gaps of u32 and fw.
^ permalink raw reply [flat|nested] 44+ messages in thread* Re: [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier
2005-01-04 12:03 ` Thomas Graf
@ 2005-01-04 13:19 ` jamal
2005-01-04 13:46 ` Thomas Graf
0 siblings, 1 reply; 44+ messages in thread
From: jamal @ 2005-01-04 13:19 UTC (permalink / raw)
To: Thomas Graf; +Cc: netdev
On Tue, 2005-01-04 at 07:03, Thomas Graf wrote:
> * jamal <1104812028.1085.50.camel@jzny.localdomain> 2005-01-03 23:13
> Right, I changed this already. change/dump/destroy are fully
> optional. Here's the latest API to the classifier:
>
> change() (Optional)
> Called if provided, otherwise ematch api allocates the data, stores
> it in m->data and sets m->datalen. Special Case: If TCF_EM_SIMPLE
> is set the ematch data consists of a simple u32 which means no
> allocation is required and the value is stored in m->data directly.
>
> Note: I might add a special field to ematch_ops which can be set to
> the expected length of the ematch data so we have at least some basic
> sanity check. Thoughts?
My thinking is:
It doesnt have to be simple 32 bit data.
If i pass you a struct and tell you what length it is, then you as the
classifier dont know need to know anything about it. You just store
mystruct as data and datalen from the TLV. you then pass the datastruct
to match() - Of course the match() will have to know what that struct
means.
> match() (Must)
> ...
>
> destroy() (Optional)
> Called if provided, otheriwse m->data is freed in ematch api unless
> TCF_EM_SIMPLE is set.
Again using the above logic, destroy becomes just kfree(mystruct)
> dump() (Optional)
> Called if provided, otherwise m->data is dumped onto the skb with
> m->datalen as L. Special handling again for TCF_EM_SIMPLE.
and dump becomes a matter of looking at datalen and encapsulating
mystruct in a TLV without thinking about what the content is.
> I think this makes it as simple as it can get while keeping the door
> open for complex ematches such as meta ematch.
Agreed.
> > > Patch 4: Basic Classifier
> >
> > Very inefficient, but serves the purpose of an example.
> > [Even if you go as basic a hash as fw classifier you will do better]
>
> Fully agreed, nevertheless I think something like this is
> required to fill the gaps of u32 and fw.
agreed.
cheers,
jamal
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier
2005-01-04 13:19 ` jamal
@ 2005-01-04 13:46 ` Thomas Graf
0 siblings, 0 replies; 44+ messages in thread
From: Thomas Graf @ 2005-01-04 13:46 UTC (permalink / raw)
To: jamal; +Cc: netdev
* jamal <1104844753.1085.99.camel@jzny.localdomain> 2005-01-04 08:19
> On Tue, 2005-01-04 at 07:03, Thomas Graf wrote:
> > change() (Optional)
>
> My thinking is:
> It doesnt have to be simple 32 bit data.
> If i pass you a struct and tell you what length it is, then you as the
> classifier dont know need to know anything about it. You just store
> mystruct as data and datalen from the TLV. you then pass the datastruct
> to match() - Of course the match() will have to know what that struct
> means.
That's exactly how it is, basically the logic is:
if (ops->change) {
err = ops->change(tp, data, datalen, m);
if (err < 0)
goto errout;
} else if (datalen > 0) {
if (mh->flags & TCF_EM_SIMPLE) {
if (datalen != sizeof(u32))
goto errout;
m->data = *(u32 *) data;
} else {
void *v = kmalloc(datalen, GFP_KERNEL);
if (v == NULL) {
err = -ENOBUFS;
goto errout;
}
memcpy(v, data, datalen);
m->data = (unsigned long) v;
}
}
m->datalen = datalen;
> > destroy() (Optional)
> > Called if provided, otheriwse m->data is freed in ematch api unless
> > TCF_EM_SIMPLE is set.
>
> Again using the above logic, destroy becomes just kfree(mystruct)
Right, that's exactly how it is
if (m->ops->destroy)
m->ops->destroy(tp, m);
else if (!(m->hdr.flags & TCF_EM_SIMPLE) && m->data)
kfree((void *) m->data);
> > dump() (Optional)
> > Called if provided, otherwise m->data is dumped onto the skb with
> > m->datalen as L. Special handling again for TCF_EM_SIMPLE.
>
> and dump becomes a matter of looking at datalen and encapsulating
> mystruct in a TLV without thinking about what the content is.
Absolutely true, you know about the code before you read it ;->
if (m->ops->dump) {
if (m->ops->dump(skb, m) < 0)
goto rtattr_failure;
} else if (m->hdr.flags & TCF_EM_SIMPLE) {
u32 u = m->data;
RTA_PUT_NOHDR(skb, sizeof(u32), &u);
} else if (m->datalen > 0)
RTA_PUT_NOHDR(skb, m->datalen, (void *) m->data);
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier
2005-01-04 4:13 ` jamal
2005-01-04 12:03 ` Thomas Graf
@ 2005-01-04 12:27 ` Thomas Graf
2005-01-04 13:22 ` jamal
1 sibling, 1 reply; 44+ messages in thread
From: Thomas Graf @ 2005-01-04 12:27 UTC (permalink / raw)
To: jamal; +Cc: netdev
* jamal <1104812028.1085.50.camel@jzny.localdomain> 2005-01-03 23:13
> On Mon, 2005-01-03 at 07:56, Thomas Graf wrote:
> > Patch 4: Basic Classifier
>
> Very inefficient, but serves the purpose of an example.
> [Even if you go as basic a hash as fw classifier you will do better]
Might be worth to mention the motivation for this. fw and u32
will definitely perform much better on complex setups but
many do not use u32 hashing not to mention even understand it
or have large nfmark -> classid fw maps.
Many use u32 to match simple stuff as port numbers, dscp values
or ip subnet addresses and create a new filter for every port/dscp
value and for every address. Some even use temporary classes to
emulate logic relations and this gets really slow. I have to get
numbers first but a single basic filter with ORed matches is probably
faster than a separate u32 filter for every case. Sure, once u32
has ematch support it gets better and the hashing shouldn't have
too much influence even if it's unused.. We can see what to do once
u32 can handle ematches.
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier
2005-01-04 12:27 ` Thomas Graf
@ 2005-01-04 13:22 ` jamal
2005-01-04 13:41 ` Thomas Graf
0 siblings, 1 reply; 44+ messages in thread
From: jamal @ 2005-01-04 13:22 UTC (permalink / raw)
To: Thomas Graf; +Cc: netdev
On Tue, 2005-01-04 at 07:27, Thomas Graf wrote:
> * jamal <1104812028.1085.50.camel@jzny.localdomain> 2005-01-03 23:13
> > Very inefficient, but serves the purpose of an example.
> > [Even if you go as basic a hash as fw classifier you will do better]
>
> Might be worth to mention the motivation for this. fw and u32
> will definitely perform much better on complex setups but
> many do not use u32 hashing not to mention even understand it
> or have large nfmark -> classid fw maps.
agreed.
> Many use u32 to match simple stuff as port numbers, dscp values
> or ip subnet addresses and create a new filter for every port/dscp
> value and for every address. Some even use temporary classes to
> emulate logic relations and this gets really slow. I have to get
> numbers first but a single basic filter with ORed matches is probably
> faster than a separate u32 filter for every case.
I am pretty sure someone who knows u32 well can outperform you (in the
scenarios where u32 works using AND etc).
Start hitting 50K rules then lets talk ;->
> Sure, once u32
> has ematch support it gets better and the hashing shouldn't have
> too much influence even if it's unused.. We can see what to do once
> u32 can handle ematches.
If your intent is to write an ematch holder, then it would be worth to
at least go as far as making it some basic hash - as basic as fw does;
where collision leads toa linked list. If it is just to show an example,
then it is fine.
cheers,
jamal
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier
2005-01-04 13:22 ` jamal
@ 2005-01-04 13:41 ` Thomas Graf
2005-01-05 2:54 ` jamal
0 siblings, 1 reply; 44+ messages in thread
From: Thomas Graf @ 2005-01-04 13:41 UTC (permalink / raw)
To: jamal; +Cc: netdev
* jamal <1104844935.1085.103.camel@jzny.localdomain> 2005-01-04 08:22
> On Tue, 2005-01-04 at 07:27, Thomas Graf wrote:
> > Many use u32 to match simple stuff as port numbers, dscp values
> > or ip subnet addresses and create a new filter for every port/dscp
> > value and for every address. Some even use temporary classes to
> > emulate logic relations and this gets really slow. I have to get
> > numbers first but a single basic filter with ORed matches is probably
> > faster than a separate u32 filter for every case.
>
> I am pretty sure someone who knows u32 well can outperform you (in the
> scenarios where u32 works using AND etc).
> Start hitting 50K rules then lets talk ;->
Sure but I'd call a filter with 50K ANDed rules an unlikely scenario ;->
In most cases logic will beat brute force. I used to have a u32 setup
with 4K matches and hashing, it was not only error prone but could be
replaced with 12 egp filters gaining 90kpps. Why's that? Simply because
it was easier to optimize the logic behind it. egp itself is terribly
slow compared to u32.
> If your intent is to write an ematch holder, then it would be worth to
> at least go as far as making it some basic hash - as basic as fw does;
> where collision leads toa linked list. If it is just to show an example,
> then it is fine.
Using what key? We have no knowledge about what the ematches want to
see or not.
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier
2005-01-04 13:41 ` Thomas Graf
@ 2005-01-05 2:54 ` jamal
2005-01-05 11:09 ` Thomas Graf
0 siblings, 1 reply; 44+ messages in thread
From: jamal @ 2005-01-05 2:54 UTC (permalink / raw)
To: Thomas Graf; +Cc: netdev
On Tue, 2005-01-04 at 08:41, Thomas Graf wrote:
> * jamal <1104844935.1085.103.camel@jzny.localdomain> 2005-01-04 08:22
> > On Tue, 2005-01-04 at 07:27, Thomas Graf wrote:
> > I am pretty sure someone who knows u32 well can outperform you (in the
> > scenarios where u32 works using AND etc).
> > Start hitting 50K rules then lets talk ;->
>
> Sure but I'd call a filter with 50K ANDed rules an unlikely scenario ;->
50K matches is probably senseless - i was talking about rules (which
contain matches).
> In most cases logic will beat brute force. I used to have a u32 setup
> with 4K matches and hashing, it was not only error prone but could be
> replaced with 12 egp filters gaining 90kpps. Why's that? Simply because
> it was easier to optimize the logic behind it. egp itself is terribly
> slow compared to u32.
I think this is a debate that can be easily settled ;->
Agreed logic will beat brute force smartness and u32 is not exactly
for the faint hearted. And its usability is extremely poor - but lets
maintain its power as is.
> > If your intent is to write an ematch holder, then it would be worth to
> > at least go as far as making it some basic hash - as basic as fw does;
> > where collision leads toa linked list. If it is just to show an example,
> > then it is fine.
>
> Using what key? We have no knowledge about what the ematches want to
> see or not.
Ok, good question ;->
Maybe you should have own some 32 bit key?
cheers,
jamal
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier
2005-01-05 2:54 ` jamal
@ 2005-01-05 11:09 ` Thomas Graf
0 siblings, 0 replies; 44+ messages in thread
From: Thomas Graf @ 2005-01-05 11:09 UTC (permalink / raw)
To: jamal; +Cc: netdev
* jamal <1104893694.1124.37.camel@jzny.localdomain> 2005-01-04 21:54
> I think this is a debate that can be easily settled ;->
> Agreed logic will beat brute force smartness and u32 is not exactly
> for the faint hearted. And its usability is extremely poor - but lets
> maintain its power as is.
Absolutely.
> > Using what key? We have no knowledge about what the ematches want to
> > see or not.
>
> Ok, good question ;->
> Maybe you should have own some 32 bit key?
Actually the goal of basic is to be an alternative to u32
if hashing is not required because I think adding hashing
will simply result in a duplication of u32.
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier
2005-01-03 12:56 [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier Thomas Graf
2005-01-04 4:13 ` jamal
@ 2005-01-04 22:36 ` Thomas Graf
2005-01-05 3:12 ` jamal
2005-01-05 13:32 ` [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier Florian Weimer
2 siblings, 1 reply; 44+ messages in thread
From: Thomas Graf @ 2005-01-04 22:36 UTC (permalink / raw)
To: Jamal Hadi Salim; +Cc: netdev
Updated patch with the following changes (still untested)
* destroy/dump/change are not optional (only match is required)
* ematch can set datalen in ematch_ops to have the ematch api
do a data length sanity check. (to at least avoid bogus memory refs)
* better nop macros if ematch is not enabled
* TCF_EM_SIMPLE flag which marks an ematch config as simple, meaning
that the data consists of a u32 value.
* API documentation
* removed handle from ematch_hdr
* userspace visible ematch header is no longer used in ematch tree
and the attributes are copied now to avoid duplications such as kind.
* suggestion comment to use a kind > 2^15 for private/temporary
ematches to avoid collisions on kernel upgrades
* some minor cosmetic fixes to make code look more pretty
* renamed tcf_em_tree_change to tcf_em_tree_replace, it gives a better
impression on what is being done
Jamal, I know it's still not simple enough for you but can you live
with it? ;->
diff -Nru linux-2.6.10-bk6.orig/include/linux/pkt_cls.h linux-2.6.10-bk6/include/linux/pkt_cls.h
--- linux-2.6.10-bk6.orig/include/linux/pkt_cls.h 2005-01-04 18:10:11.000000000 +0100
+++ linux-2.6.10-bk6/include/linux/pkt_cls.h 2005-01-04 18:10:17.000000000 +0100
@@ -319,4 +319,51 @@
#define TCA_TCINDEX_MAX (__TCA_TCINDEX_MAX - 1)
+struct tcf_ematch_tree_hdr
+{
+ __u16 nmatches;
+ __u16 progid;
+};
+
+enum
+{
+ TCA_EMATCH_TREE_UNSPEC,
+ TCA_EMATCH_TREE_HDR,
+ TCA_EMATCH_TREE_LIST,
+ __TCA_EMATCH_TREE_MAX
+};
+#define TCA_EMATCH_TREE_MAX (__TCA_EMATCH_TREE_MAX - 1)
+
+struct tcf_ematch_hdr
+{
+ __u16 matchID;
+ __u16 kind;
+ __u16 flags;
+ __u16 pad; /* currently unused */
+};
+
+/* Ematch type assignments
+ * 1..32767 Reserved for ematches inside kernel tree
+ * 32768..65535 Free to use, not reliable
+ */
+enum
+{
+ TCF_EM_CONTAINER,
+ __TCF_EM_MAX
+};
+
+#define TCF_EM_REL_MASK 3
+#define TCF_EM_REL_VALID(v) \
+ (!(((v) & TCF_EM_REL_MASK) == TCF_EM_REL_MASK))
+#define TCF_EM_LAST_KEY(v) (!((v) & TCF_EM_REL_MASK))
+
+#define TCF_EM_REL_OBVIOUS(v, r) (TCF_EM_LAST_KEY(v) || \
+ (!(r) && ((v) & TCF_EM_REL_AND)) || ((r) && ((v) & TCF_EM_REL_OR)))
+
+#define TCF_EM_REL_END (1<<0)
+#define TCF_EM_REL_AND (1<<1)
+#define TCF_EM_REL_OR (1<<2)
+#define TCF_EM_INVERT (1<<3)
+#define TCF_EM_SIMPLE (1<<4)
+
#endif
diff -Nru linux-2.6.10-bk6.orig/include/linux/rtnetlink.h linux-2.6.10-bk6/include/linux/rtnetlink.h
--- linux-2.6.10-bk6.orig/include/linux/rtnetlink.h 2005-01-04 18:10:11.000000000 +0100
+++ linux-2.6.10-bk6/include/linux/rtnetlink.h 2005-01-04 01:33:32.000000000 +0100
@@ -779,6 +779,11 @@
goto rtattr_failure; \
__rta_fill(skb, attrtype, attrlen, data); })
+#define RTA_PUT_NOHDR(skb, attrlen, data) \
+({ if (unlikely(skb_tailroom(skb) < (int)(attrlen))) \
+ goto rtattr_failure; \
+ memcpy(skb_put(skb, RTA_ALIGN(attrlen)), data, attrlen); })
+
static inline struct rtattr *
__rta_reserve(struct sk_buff *skb, int attrtype, int attrlen)
{
diff -Nru linux-2.6.10-bk6.orig/include/net/pkt_cls.h linux-2.6.10-bk6/include/net/pkt_cls.h
--- linux-2.6.10-bk6.orig/include/net/pkt_cls.h 2005-01-04 18:10:11.000000000 +0100
+++ linux-2.6.10-bk6/include/net/pkt_cls.h 2005-01-04 19:42:59.000000000 +0100
@@ -149,6 +149,127 @@
extern int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts,
struct tcf_ext_map *map);
+#ifdef CONFIG_NET_EMATCH
+
+struct tcf_ematch_ops;
+
+/**
+ * struct tcf_ematch - extended match (ematch)
+ *
+ * @matchID: identifier to allow userspace to reidentify a match
+ * @flags: flags specifying attributes and the relation to other matches
+ * @ops: the operations lookup table of the corresponding ematch module
+ * @datalen: length of the ematch specific configuration data
+ * @data: ematch specific data
+ */
+struct tcf_ematch
+{
+ u16 matchID;
+ u16 flags;
+ struct tcf_ematch_ops * ops;
+ unsigned int datalen;
+ unsigned long data;
+};
+
+/**
+ * struct tcf_ematch_tree - ematch tree handle
+ *
+ * @hdr: ematch tree header supplied by userspace
+ * @matches: array of ematches
+ */
+struct tcf_ematch_tree
+{
+ struct tcf_ematch_tree_hdr hdr;
+ struct tcf_ematch * matches;
+
+};
+
+#define em_lookup_match(t, i) (&(t)->matches[(i)])
+
+/**
+ * struct tcf_ematch_ops - ematch module operations
+ *
+ * @kind: identifier (kind) of this ematch module
+ * @datalen: length of expected configuration data (optional)
+ * @change: called during validation (optional)
+ * @match: called during ematch tree evaluation, must return 1/0
+ * @destroy: called during destroyage (optional)
+ * @dump: called during dumping process (optional)
+ * @owner: owner, must be set to THIS_MODULE
+ * @link: link to previous/next ematch module (internal use)
+ */
+struct tcf_ematch_ops
+{
+ int kind;
+ int datalen;
+ int (*change)(struct tcf_proto *, void *,
+ int, struct tcf_ematch *);
+ int (*match)(struct sk_buff *, struct tcf_ematch *);
+ int (*destroy)(struct tcf_proto *,
+ struct tcf_ematch *);
+ int (*dump)(struct sk_buff *, struct tcf_ematch *);
+ struct module *owner;
+ struct list_head link;
+};
+
+extern int tcf_em_register(struct tcf_ematch_ops *);
+extern int tcf_em_unregister(struct tcf_ematch_ops *);
+extern int tcf_em_tree_validate(struct tcf_proto *, struct rtattr *,
+ struct tcf_ematch_tree *);
+extern void tcf_em_tree_destroy(struct tcf_proto *, struct tcf_ematch_tree *);
+extern int tcf_em_tree_dump(struct sk_buff *, struct tcf_ematch_tree *, int);
+extern int __tcf_em_tree_match(struct sk_buff *, struct tcf_ematch_tree *);
+
+/**
+ * tcf_em_tree_replace - replace ematch tree of a running classifier
+ *
+ * @tp: classifier kind handle
+ * @dst: destination ematch tree variable
+ * @src: source ematch tree (temporary tree from tcf_em_tree_validate)
+ *
+ * This functions replaces the ematch tree in @dst with the ematch
+ * tree in @src. The classifier in charge of the ematch tree may be
+ * running.
+ */
+static inline void
+tcf_em_tree_replace(struct tcf_proto *tp, struct tcf_ematch_tree *dst,
+ struct tcf_ematch_tree *src)
+{
+ tcf_tree_lock(tp);
+ memcpy(dst, src, sizeof(*dst));
+ tcf_tree_unlock(tp);
+}
+
+/**
+ * tcf_em_tree_match - evaulate an ematch tree
+ *
+ * @skb: socket buffer of the packet in question
+ * @t: ematch tree to be used for evaluation
+ *
+ * This function matches @skb against the ematch tree in @t by going
+ * through all ematches respecting their logic relations returning
+ * as soon as the result is obvious.
+ *
+ * Returns 1 if the ematch tree as-one matches, no ematches are configured
+ * or ematch is not enabled in the kernel, otherwise 0 is returned.
+ */
+#define tcf_em_tree_match(skb, t) \
+ ((t)->hdr.nmatches ? __tcf_em_tree_match(skb, t) : 1)
+
+#else /* CONFIG_NET_EMATCH */
+
+struct tcf_ematch_tree
+{
+};
+
+#define tcf_em_tree_validate(tp, tb, t) (0)
+#define tcf_em_tree_destroy(tp, t) do { } while(0)
+#define tcf_em_tree_dump(skb, t, tlv) (0)
+#define tcf_em_tree_change(tp, dst, src) do { } while(0)
+#define tcf_em_tree_match(skb, t) (1)
+
+#endif /* CONFIG_NET_EMATCH */
+
#ifdef CONFIG_NET_CLS_IND
static inline int
tcf_change_indev(struct tcf_proto *tp, char *indev, struct rtattr *indev_tlv)
diff -Nru linux-2.6.10-bk6.orig/net/sched/Kconfig linux-2.6.10-bk6/net/sched/Kconfig
--- linux-2.6.10-bk6.orig/net/sched/Kconfig 2005-01-04 18:10:11.000000000 +0100
+++ linux-2.6.10-bk6/net/sched/Kconfig 2005-01-04 18:10:17.000000000 +0100
@@ -375,6 +375,19 @@
To compile this code as a module, choose M here: the
module will be called cls_rsvp6.
+config NET_EMATCH
+ bool "Extended Matches"
+ depends on NET_CLS
+ ---help---
+ Say Y here if you want to use extended matches on top of classifiers
+ and select the extended matches below.
+
+ Extended matches are small classification helpers not worth writing
+ a separate classifier.
+
+ You must have a recent version of the iproute2 tools in order to use
+ extended matches.
+
config NET_CLS_ACT
bool "Packet ACTION"
depends on EXPERIMENTAL && NET_CLS && NET_QOS
diff -Nru linux-2.6.10-bk6.orig/net/sched/Makefile linux-2.6.10-bk6/net/sched/Makefile
--- linux-2.6.10-bk6.orig/net/sched/Makefile 2005-01-04 18:10:11.000000000 +0100
+++ linux-2.6.10-bk6/net/sched/Makefile 2005-01-04 18:10:17.000000000 +0100
@@ -33,3 +33,4 @@
obj-$(CONFIG_NET_CLS_RSVP) += cls_rsvp.o
obj-$(CONFIG_NET_CLS_TCINDEX) += cls_tcindex.o
obj-$(CONFIG_NET_CLS_RSVP6) += cls_rsvp6.o
+obj-$(CONFIG_NET_EMATCH) += ematch.o
diff -Nru linux-2.6.10-bk6.orig/net/sched/ematch.c linux-2.6.10-bk6/net/sched/ematch.c
--- linux-2.6.10-bk6.orig/net/sched/ematch.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.10-bk6/net/sched/ematch.c 2005-01-04 18:55:40.000000000 +0100
@@ -0,0 +1,396 @@
+/*
+ * net/sched/ematch.c Extended Match API
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <net/pkt_cls.h>
+
+#define EMATCH_STACK_SIZE 32
+
+static LIST_HEAD(ematch_ops);
+static rwlock_t ematch_mod_lock = RW_LOCK_UNLOCKED;
+
+static inline struct tcf_ematch_ops *
+tcf_em_lookup(u16 kind)
+{
+ struct tcf_ematch_ops *e = NULL;
+
+ read_lock(&ematch_mod_lock);
+ list_for_each_entry(e, &ematch_ops, link) {
+ if (kind == e->kind) {
+ if (!try_module_get(e->owner))
+ e = NULL;
+ break;
+ }
+ }
+ read_unlock(&ematch_mod_lock);
+
+ return e;
+}
+
+/**
+ * tcf_em_register - register an extended match
+ *
+ * @ops: ematch operations lookup table
+ *
+ * This function must be called by ematches to announce their presence.
+ * The given @ops must have kind set to a unique identifier and the
+ * callback match() must be implemented. All other callbacks are optional
+ * and a fallback implementation is used instead.
+ *
+ * Returns -EEXISTS if an ematch of the same kind has already registered.
+ */
+int tcf_em_register(struct tcf_ematch_ops *ops)
+{
+ int err = -EEXIST;
+ struct tcf_ematch_ops *e;
+
+ write_lock(&ematch_mod_lock);
+ list_for_each_entry(e, &ematch_ops, link)
+ if (ops->kind == e->kind)
+ goto errout;
+
+ list_add_tail(&ops->link, &ematch_ops);
+ err = 0;
+errout:
+ write_unlock(&ematch_mod_lock);
+ return err;
+}
+
+/**
+ * tcf_em_unregister - unregster and extended match
+ *
+ * @ops: ematch operations lookup table
+ *
+ * This function must be called by ematches to announce their disappearance
+ * for examples when the module gets unloaded. The @ops parameter must be
+ * the same as the one used for registration.
+ *
+ * Returns -ENOENT if no matching ematch was found.
+ */
+int tcf_em_unregister(struct tcf_ematch_ops *ops)
+{
+ int err = 0;
+ struct tcf_ematch_ops *e;
+
+ write_lock(&ematch_mod_lock);
+ list_for_each_entry(e, &ematch_ops, link) {
+ if (e == ops) {
+ list_del(&e->link);
+ goto out;
+ }
+ }
+
+ err = -ENOENT;
+out:
+ write_unlock(&ematch_mod_lock);
+ return err;
+}
+
+static int tcf_em_validate(struct tcf_proto *tp, struct tcf_ematch_tree_hdr *th,
+ struct tcf_ematch *m, struct rtattr *rta)
+{
+ int err = -EINVAL;
+ struct tcf_ematch_hdr *mh = RTA_DATA(rta);
+ int datalen = RTA_PAYLOAD(rta) - sizeof(*mh);
+ void *data = (void *) data + sizeof(*mh);
+
+ if (!TCF_EM_REL_VALID(mh->flags))
+ goto errout;
+
+ if (mh->kind == TCF_EM_CONTAINER) {
+ u32 ref;
+
+ if (datalen < sizeof(ref))
+ goto errout;
+ ref = *(u32 *) data;
+ if (ref >= th->nmatches)
+ goto errout;
+ m->data = ref;
+ } else {
+ struct tcf_ematch_ops *ops = tcf_em_lookup(mh->kind);
+
+ if (ops == NULL) {
+ err = -ENOENT;
+ goto errout;
+ }
+
+ if (ops->datalen && datalen < ops->datalen)
+ goto errout;
+
+ if (ops->change) {
+ err = ops->change(tp, data, datalen, m);
+ if (err < 0)
+ goto errout;
+ } else if (datalen > 0) {
+ if (mh->flags & TCF_EM_SIMPLE) {
+ if (datalen < sizeof(u32))
+ goto errout;
+ m->data = *(u32 *) data;
+ } else {
+ void *v = kmalloc(datalen, GFP_KERNEL);
+ if (v == NULL) {
+ err = -ENOBUFS;
+ goto errout;
+ }
+ memcpy(v, data, datalen);
+ m->data = (unsigned long) v;
+ }
+ }
+ }
+
+ m->matchID = mh->matchID;
+ m->flags = mh->flags;
+ m->datalen = datalen;
+
+ err = 0;
+errout:
+ return err;
+}
+
+/**
+ * tcf_em_tree_validate - validate ematch config TLV and build ematch tree
+ *
+ * @tp: classifier kind handle
+ * @rta: ematch tree configuration TLV
+ * @tree: destination ematch tree variable to store the resulting
+ * ematch tree.
+ *
+ * This function validates the given configuration TLV @rta and builds an
+ * ematch tree in @tree. The resulting tree must later be copied into
+ * the private classifier data using tcf_em_tree_change(). You MUST NOT
+ * provide the ematch tree variable of the private classifier data directly,
+ * the changes would not be locked properly.
+ *
+ * Returns a negative error code if the configuration TLV contains errors.
+ */
+int tcf_em_tree_validate(struct tcf_proto *tp, struct rtattr *rta,
+ struct tcf_ematch_tree *tree)
+{
+ int i, len, mlen, err = -EINVAL;
+ struct rtattr *m, *tb[TCA_EMATCH_TREE_MAX];
+ struct tcf_ematch_tree_hdr *th;
+
+ if (!rta || rtattr_parse_nested(tb, TCA_EMATCH_TREE_MAX, rta) < 0)
+ goto errout;
+
+ if (RTA_PAYLOAD(tb[TCA_EMATCH_TREE_HDR-1]) < sizeof(*th) ||
+ RTA_PAYLOAD(tb[TCA_EMATCH_TREE_LIST-1]) < sizeof(*m))
+ goto errout;
+
+ th = RTA_DATA(tb[TCA_EMATCH_TREE_HDR-1]);
+ m = RTA_DATA(tb[TCA_EMATCH_TREE_LIST-1]);
+ len = RTA_PAYLOAD(tb[TCA_EMATCH_TREE_LIST-1]);
+ mlen = th->nmatches * sizeof(struct tcf_ematch);
+
+ memcpy(&tree->hdr, th, sizeof(*th));
+
+ tree->matches = kmalloc(mlen, GFP_KERNEL);
+ if (tree->matches == NULL)
+ goto errout;
+ memset(tree->matches, 0, mlen);
+
+ for (i = 0; RTA_OK(m, len); i++) {
+ if (rta->rta_type != (i+1) || i >= th->nmatches ||
+ RTA_PAYLOAD(rta) < sizeof(struct tcf_ematch_hdr)) {
+ err = -EINVAL;
+ goto errout_abort;
+ }
+
+ err = tcf_em_validate(tp, th, em_lookup_match(tree, i), rta);
+ if (err < 0)
+ goto errout_abort;
+
+ m = RTA_NEXT(m, len);
+ }
+
+ if (i != th->nmatches) {
+ err = -EINVAL;
+ goto errout_abort;
+ }
+
+
+ err = 0;
+errout:
+ return err;
+
+errout_abort:
+ tcf_em_tree_destroy(tp, tree);
+ return err;
+}
+
+/**
+ * tcf_em_tree_destroy - destroy an ematch tree
+ *
+ * @tp: classifier kind handle
+ * @t: ematch tree to be deleted
+ *
+ * This functions destroys an ematch tree previously created by
+ * tcf_em_tree_validate()/tcf_em_tree_change(). You must ensure that
+ * the ematch tree is not in use before calling this function.
+ */
+void tcf_em_tree_destroy(struct tcf_proto *tp, struct tcf_ematch_tree *t)
+{
+ int i;
+
+ if (t->matches == NULL)
+ return;
+
+ for (i = 0; i < t->hdr.nmatches; i++) {
+ struct tcf_ematch *m = em_lookup_match(t, i);
+ if (m->ops) {
+ if (m->ops->destroy)
+ m->ops->destroy(tp, m);
+ else if (!(m->flags & TCF_EM_SIMPLE) && m->data)
+ kfree((void *) m->data);
+ module_put(m->ops->owner);
+ }
+ }
+
+ t->hdr.nmatches = 0;
+ kfree(t->matches);
+}
+
+/**
+ * tcf_em_tree_dump - dump ematch tree into a rtnl message
+ *
+ * @skb: skb holding the rtnl message
+ * @t: ematch tree to be dumped
+ * @tlv: TLV type to be used to encapsulate the tree
+ *
+ * This function dumps a ematch tree into a rtnl message. It is valid to
+ * call this function while the ematch tree is in use.
+ *
+ * Returns -1 if the skb tailroom is insufficient.
+ */
+int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *t, int tlv)
+{
+ int i;
+ struct rtattr * p_rta = (struct rtattr*) skb->tail;
+ struct rtattr * pm_rta;
+
+ RTA_PUT(skb, tlv, 0, NULL);
+ RTA_PUT(skb, TCA_EMATCH_TREE_HDR, sizeof(t->hdr), &t->hdr);
+
+ pm_rta = (struct rtattr *) skb->tail;
+ RTA_PUT(skb, TCA_EMATCH_TREE_LIST, 0, NULL);
+
+ for (i = 0; i < t->hdr.nmatches; i++) {
+ struct rtattr * pd_rta = (struct rtattr*) skb->tail;
+ struct tcf_ematch *m = em_lookup_match(t, i);
+ struct tcf_ematch_hdr hdr = {
+ .kind = m->ops->kind,
+ .matchID = m->matchID,
+ .flags = m->flags
+ };
+
+ RTA_PUT(skb, i+1, sizeof(hdr), &hdr);
+ if (m->ops->dump) {
+ if (m->ops->dump(skb, m) < 0)
+ goto rtattr_failure;
+ } else if (m->flags & TCF_EM_SIMPLE) {
+ u32 u = m->data;
+ RTA_PUT_NOHDR(skb, sizeof(u32), &u);
+ } else if (m->datalen > 0)
+ RTA_PUT_NOHDR(skb, m->datalen, (void *) m->data);
+
+ pd_rta->rta_len = skb->tail - (u8*) pd_rta;
+ }
+
+ pm_rta->rta_len = skb->tail - (u8*) pm_rta;
+ p_rta->rta_len = skb->tail - (u8*) p_rta;
+ return 0;
+rtattr_failure:
+ return -1;
+}
+
+static inline int tcf_em_match(struct sk_buff *skb, struct tcf_ematch *m)
+{
+ int r = 0;
+
+ if (m->ops->match)
+ r = m->ops->match(skb, m);
+
+ return m->flags & TCF_EM_INVERT ? !r : r;
+}
+
+/* Do not use this function directly, use tcf_em_tree_match instead */
+int __tcf_em_tree_match(struct sk_buff *skb, struct tcf_ematch_tree *t)
+{
+ int i = 0, n = 0, r = 0;
+ struct tcf_ematch *m;
+ int stack[EMATCH_STACK_SIZE];
+
+ memset(stack, 0, sizeof(stack));
+
+proceed:
+ while (n < t->hdr.nmatches) {
+ m = em_lookup_match(t, n);
+
+ if (m->ops->kind == TCF_EM_CONTAINER) {
+ if (unlikely(i >= EMATCH_STACK_SIZE))
+ goto stack_overflow;
+
+ if (unlikely(m->data <= n))
+ goto backward_jump;
+
+ stack[i++] = n;
+ n = m->data;
+ continue;
+ }
+
+ r = tcf_em_match(skb, m);
+ if (TCF_EM_REL_OBVIOUS(m->flags, r))
+ break;
+ n++;
+ }
+
+pop_stack:
+ if (i > 0) {
+ n = stack[--i];
+ m = em_lookup_match(t, n);
+
+ if (TCF_EM_REL_OBVIOUS(m->flags, r))
+ goto pop_stack;
+ else {
+ n++;
+ goto proceed;
+ }
+ }
+
+ return r;
+
+stack_overflow:
+ if (net_ratelimit())
+ printk("Local stack overflow, increase EMATCH_STACK_SIZE\n");
+ return -1;
+
+backward_jump:
+ if (net_ratelimit())
+ printk("Detected backward precedence jump, fix your filter.\n");
+ return -1;
+}
+
+EXPORT_SYMBOL(tcf_em_register);
+EXPORT_SYMBOL(tcf_em_unregister);
+EXPORT_SYMBOL(tcf_em_tree_validate);
+EXPORT_SYMBOL(tcf_em_tree_destroy);
+EXPORT_SYMBOL(tcf_em_tree_dump);
+EXPORT_SYMBOL(__tcf_em_tree_match);
+
^ permalink raw reply [flat|nested] 44+ messages in thread* Re: [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier
2005-01-04 22:36 ` Thomas Graf
@ 2005-01-05 3:12 ` jamal
2005-01-05 11:00 ` Thomas Graf
0 siblings, 1 reply; 44+ messages in thread
From: jamal @ 2005-01-05 3:12 UTC (permalink / raw)
To: Thomas Graf; +Cc: netdev
On Tue, 2005-01-04 at 17:36, Thomas Graf wrote:
> * TCF_EM_SIMPLE flag which marks an ematch config as simple, meaning
> that the data consists of a u32 value.
This is 1 of 2 parts i think thats still an issue; otherwise looks very
good.
Why do i need to signal something as simple? AND why does it have to be
32 bit type - what edge does that give you?
I should be able to specify a struct with two 32 bits and
encap it in a TLV and the classifier can treat it the same way - it
knows the type and length - thats sufficient to create, destroy and
dump.
The other issue is still on the ematch/match interleaving i.e i should
be able to say something along the lines:
//simple slammer-worm or code-red ACL detector rule
//using u32 classifier and ematches
(match ip protocol udp port 1434 AND
ematch packetlen minsize 404 maxsize 404) OR
(match ip protocol tcp http AND
ematch urlscanner "*.ida")
action ipt -j ULOG "Virus detected and dropped"
action drop
Not a very good example - but you can see how powerfull this is when you
can quickly use a string scanner such as the one you have as an ematch
while maintaining u32 as is.
cheers,
jamal
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier
2005-01-05 3:12 ` jamal
@ 2005-01-05 11:00 ` Thomas Graf
2005-01-05 13:33 ` jamal
0 siblings, 1 reply; 44+ messages in thread
From: Thomas Graf @ 2005-01-05 11:00 UTC (permalink / raw)
To: jamal; +Cc: netdev
* jamal <1104894728.1117.56.camel@jzny.localdomain> 2005-01-04 22:12
> On Tue, 2005-01-04 at 17:36, Thomas Graf wrote:
>
> > * TCF_EM_SIMPLE flag which marks an ematch config as simple, meaning
> > that the data consists of a u32 value.
>
> This is 1 of 2 parts i think thats still an issue; otherwise looks very
> good.
> Why do i need to signal something as simple? AND why does it have to be
> 32 bit type - what edge does that give you?
You don't have to, providing a 32bit data chunk without TCF_EM_SIMPLE
set will simply result in allocating & copy. It's an optimization,
nothing more.
> I should be able to specify a struct with two 32 bits and
> encap it in a TLV and the classifier can treat it the same way - it
> knows the type and length - thats sufficient to create, destroy and
> dump.
Correct, maybe you're right and I should drop it again.
> The other issue is still on the ematch/match interleaving i.e i should
> be able to say something along the lines:
>
> //simple slammer-worm or code-red ACL detector rule
> //using u32 classifier and ematches
> (match ip protocol udp port 1434 AND
> ematch packetlen minsize 404 maxsize 404) OR
> (match ip protocol tcp http AND
> ematch urlscanner "*.ida")
> action ipt -j ULOG "Virus detected and dropped"
> action drop
>
> Not a very good example - but you can see how powerfull this is when you
> can quickly use a string scanner such as the one you have as an ematch
> while maintaining u32 as is.
Basically you could do that already with the basic classifier but
I understand your concern and it would be neat to benefit from u32's
hashing. There are 2 options:
1) We make u32 hold multiple ematch trees, a u32 key can be of 3
kinds: u32/ematch/container. It's kind of a hack and not very
fast due to a lot of stack movement.
2) We make the existing u32 match be an ematch which I already did
expect for the nexthdr bits. That is the select will simply be
replaced by an ematch tree. I'll take a look into how we could
have the classifier take influence on the ematches config data.
One possibiliy is to have a struct transfered via map which
contains useful data such as offset to next header (u32/rsvp).
I have to think about this a little more though.
Personally I'm all for 2) because it's just cleaner and easier to
maintain. It's probably the best to not to use the u32 ematch I
wrote (which I renamed to cmp) but to write a new one behaving
exactly the same as the existing u32 match.
Attached cmp ematch (formerly u32), it was on diet for a while
and is quite smallish now.
diff -Nru linux-2.6.10-bk6.orig/include/linux/pkt_cls.h linux-2.6.10-bk6/include/linux/pkt_cls.h
--- linux-2.6.10-bk6.orig/include/linux/pkt_cls.h 2005-01-05 01:09:29.000000000 +0100
+++ linux-2.6.10-bk6/include/linux/pkt_cls.h 2005-01-05 01:46:34.000000000 +0100
@@ -349,6 +349,7 @@
enum
{
TCF_EM_CONTAINER,
+ TCF_EM_CMP,
__TCF_EM_MAX
};
@@ -366,4 +367,28 @@
#define TCF_EM_INVERT (1<<3)
#define TCF_EM_SIMPLE (1<<4)
+struct tcf_em_cmp
+{
+ __u32 val;
+ __u32 mask;
+ __u16 off;
+ __u8 align;
+ __u8 layer:4;
+ __u8 opnd:4;
+};
+
+enum
+{
+ TCF_EM_ALIGN_U8 = 1,
+ TCF_EM_ALIGN_U16 = 2,
+ TCF_EM_ALIGN_U32 = 4
+};
+
+enum
+{
+ TCF_EM_OPND_EQ,
+ TCF_EM_OPND_GT,
+ TCF_EM_OPND_LT
+};
+
#endif
diff -Nru linux-2.6.10-bk6.orig/include/net/pkt_cls.h linux-2.6.10-bk6/include/net/pkt_cls.h
--- linux-2.6.10-bk6.orig/include/net/pkt_cls.h 2005-01-05 01:09:29.000000000 +0100
+++ linux-2.6.10-bk6/include/net/pkt_cls.h 2005-01-05 01:26:03.000000000 +0100
@@ -270,6 +270,36 @@
#endif /* CONFIG_NET_EMATCH */
+static inline u32 tcf_read_bucket(u8 *ptr, u8 align)
+{
+ switch (align) {
+ case TCF_EM_ALIGN_U8:
+ return *ptr;
+ case TCF_EM_ALIGN_U16:
+ return (*ptr << 8) | *(ptr+1);
+ case TCF_EM_ALIGN_U32:
+ return (*ptr << 24) | (*(ptr+1) << 16) |
+ (*(ptr+2) << 8) | *(ptr+3);
+ }
+ return 0;
+}
+
+static inline u8 * tcf_get_base_ptr(struct sk_buff *skb, u8 layer)
+{
+ switch (layer) {
+ case 0:
+ return skb->data;
+ case 1:
+ return skb->nh.raw;
+ case 2:
+ return skb->h.raw;
+ }
+ return NULL;
+}
+
+#define tcf_valid_offset(skb, ptr, off, len) \
+ ((ptr + off + len) < skb->tail && (ptr + off) > skb->head)
+
#ifdef CONFIG_NET_CLS_IND
static inline int
tcf_change_indev(struct tcf_proto *tp, char *indev, struct rtattr *indev_tlv)
diff -Nru linux-2.6.10-bk6.orig/net/sched/Kconfig linux-2.6.10-bk6/net/sched/Kconfig
--- linux-2.6.10-bk6.orig/net/sched/Kconfig 2005-01-05 01:09:29.000000000 +0100
+++ linux-2.6.10-bk6/net/sched/Kconfig 2005-01-05 01:34:38.000000000 +0100
@@ -388,6 +388,16 @@
You must have a recent version of the iproute2 tools in order to use
extended matches.
+config NET_EMATCH_CMP
+ tristate "Simple packet data comparison"
+ depends on NET_EMATCH
+ ---help---
+ Say Y here if you want to be able to classify packets based on
+ simple packet data comparisons for 8, 16, and 32bit values.
+
+ To compile this code as a module, choose M here: the
+ module will be called em_cmp.
+
config NET_CLS_ACT
bool "Packet ACTION"
depends on EXPERIMENTAL && NET_CLS && NET_QOS
diff -Nru linux-2.6.10-bk6.orig/net/sched/Makefile linux-2.6.10-bk6/net/sched/Makefile
--- linux-2.6.10-bk6.orig/net/sched/Makefile 2005-01-05 01:09:29.000000000 +0100
+++ linux-2.6.10-bk6/net/sched/Makefile 2005-01-05 01:28:03.000000000 +0100
@@ -34,3 +34,4 @@
obj-$(CONFIG_NET_CLS_TCINDEX) += cls_tcindex.o
obj-$(CONFIG_NET_CLS_RSVP6) += cls_rsvp6.o
obj-$(CONFIG_NET_EMATCH) += ematch.o
+obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o
diff -Nru linux-2.6.10-bk6.orig/net/sched/em_cmp.c linux-2.6.10-bk6/net/sched/em_cmp.c
--- linux-2.6.10-bk6.orig/net/sched/em_cmp.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.10-bk6/net/sched/em_cmp.c 2005-01-05 01:47:00.000000000 +0100
@@ -0,0 +1,66 @@
+/*
+ * net/sched/em_cmp.c Simple packet data comparison ematch
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <net/pkt_cls.h>
+
+static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *m)
+{
+ struct tcf_em_cmp *u = (struct tcf_em_cmp *) m->data;
+ u8 *ptr = tcf_get_base_ptr(skb, u->layer);
+ u32 v;
+
+ if (unlikely(!tcf_valid_offset(skb, ptr, u->off, u->align)))
+ return 0;
+
+ v = tcf_read_bucket(ptr + u->off, u->align) & u->mask;
+
+ /* FIXME: byte order issues */
+
+ switch (u->opnd) {
+ case TCF_EM_OPND_EQ:
+ return v == u->val;
+ case TCF_EM_OPND_LT:
+ return v < u->val;
+ case TCF_EM_OPND_GT:
+ return v > u->val;
+ }
+
+ return 0;
+}
+
+static struct tcf_ematch_ops em_cmp_ops = {
+ .kind = TCF_EM_CMP,
+ .datalen = sizeof(struct tcf_em_cmp),
+ .match = em_cmp_match,
+ .owner = THIS_MODULE,
+ .link = LIST_HEAD_INIT(em_cmp_ops.link)
+};
+
+static int __init init_em_cmp(void)
+{
+ return tcf_em_register(&em_cmp_ops);
+}
+
+static void __exit exit_em_cmp(void)
+{
+ tcf_em_unregister(&em_cmp_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_em_cmp);
+module_exit(exit_em_cmp);
+
^ permalink raw reply [flat|nested] 44+ messages in thread* Re: [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier
2005-01-05 11:00 ` Thomas Graf
@ 2005-01-05 13:33 ` jamal
2005-01-05 14:45 ` Thomas Graf
0 siblings, 1 reply; 44+ messages in thread
From: jamal @ 2005-01-05 13:33 UTC (permalink / raw)
To: Thomas Graf; +Cc: netdev
On Wed, 2005-01-05 at 06:00, Thomas Graf wrote:
> * jamal <1104894728.1117.56.camel@jzny.localdomain> 2005-01-04 22:12
> > Why do i need to signal something as simple? AND why does it have to be
> > 32 bit type - what edge does that give you?
>
> You don't have to, providing a 32bit data chunk without TCF_EM_SIMPLE
> set will simply result in allocating & copy. It's an optimization,
> nothing more.
Sorry i missed that. Isnt it still unecessary though? You should be able
to pass L=4 and not need the speacial treatment, no?
> > Not a very good example - but you can see how powerfull this is when you
> > can quickly use a string scanner such as the one you have as an ematch
> > while maintaining u32 as is.
>
> Basically you could do that already with the basic classifier but
> I understand your concern and it would be neat to benefit from u32's
> hashing. There are 2 options:
> 1) We make u32 hold multiple ematch trees, a u32 key can be of 3
> kinds: u32/ematch/container. It's kind of a hack and not very
> fast due to a lot of stack movement.
Indeed this is what i was thinking of.
The only added overhead I can think of is when processing a series of
u32 keys _within the same selector_ (not across selectors), you check if
its u32 native and execute localy (as is done now) or transfer the check
to the ematch defined in that key and continue to next key based on the
ematchs return code + the logical operation.
> 2) We make the existing u32 match be an ematch which I already did
> expect for the nexthdr bits. That is the select will simply be
> replaced by an ematch tree. I'll take a look into how we could
> have the classifier take influence on the ematches config data.
> One possibiliy is to have a struct transfered via map which
> contains useful data such as offset to next header (u32/rsvp).
> I have to think about this a little more though.
This could be done in addition to #1. I see #1 as more important for u32
but not so for things like fwmark, tcindex which should fizzle away with
meta ematch. I think the danger is in trying to replicate u32 as an
ematch; if somehow you can loop back and use the real u32 code, then
fine. I feel it being non-trivial to do so.
Like i said earlier, theres a lot of power in u32 other than in basic
matching.
> Personally I'm all for 2) because it's just cleaner and easier to
> maintain.
Aha, thats why we were not converging then ;->
I think #2 is better for other classifiers which were already doomed
anyways. #1 is important for u32 and perhaps other classifiers like rsvp
and route. And yes, #1 is more work ;->
> It's probably the best to not to use the u32 ematch I
> wrote (which I renamed to cmp) but to write a new one behaving
> exactly the same as the existing u32 match.
>
hang on:-> No dont rewrite u32 please. Your cmp is good for the basic
matches that u32 does, but is _nowhere_ close to being able to do what
u32 can when used properly.
> Attached cmp ematch (formerly u32), it was on diet for a while
> and is quite smallish now.
Looks very appetizing. Two general comments:
> diff -Nru linux-2.6.10-bk6.orig/include/linux/pkt_cls.h linux-2.6.10-bk6/include/linux/pkt_cls.h
> --- linux-2.6.10-bk6.orig/include/linux/pkt_cls.h 2005-01-05 01:09:29.000000000 +0100
> +++ linux-2.6.10-bk6/include/linux/pkt_cls.h 2005-01-05 01:46:34.000000000 +0100
> @@ -349,6 +349,7 @@
> enum
> {
> TCF_EM_CONTAINER,
> + TCF_EM_CMP,
> __TCF_EM_MAX
> };
I Should be able to compile a new ematch as a module in an already
running kernel. So, other than generic stuff for ematches, things like
TCF_EM_CMP should not be in that enumeration.
> @@ -366,4 +367,28 @@
> #define TCF_EM_INVERT (1<<3)
> #define TCF_EM_SIMPLE (1<<4)
>
> +struct tcf_em_cmp
> +{
> + __u32 val;
> + __u32 mask;
> + __u16 off;
> + __u8 align;
> + __u8 layer:4;
> + __u8 opnd:4;
> +};
> +
> +enum
> +{
> + TCF_EM_ALIGN_U8 = 1,
> + TCF_EM_ALIGN_U16 = 2,
> + TCF_EM_ALIGN_U32 = 4
> +};
> +
> +enum
> +{
> + TCF_EM_OPND_EQ,
> + TCF_EM_OPND_GT,
> + TCF_EM_OPND_LT
> +};
> +
Which means the above should go in its own file; probably create a
include/linux/tc_ematch directory with a file of sort tcf_em_cmp.h
which holds all the above.
> #endif
> diff -Nru linux-2.6.10-bk6.orig/include/net/pkt_cls.h linux-2.6.10-bk6/include/net/pkt_cls.h
> --- linux-2.6.10-bk6.orig/include/net/pkt_cls.h 2005-01-05 01:09:29.000000000 +0100
> +++ linux-2.6.10-bk6/include/net/pkt_cls.h 2005-01-05 01:26:03.000000000 +0100
> @@ -270,6 +270,36 @@
>
> #endif /* CONFIG_NET_EMATCH */
>
> +static inline u32 tcf_read_bucket(u8 *ptr, u8 align)
> +{
> + switch (align) {
> + case TCF_EM_ALIGN_U8:
> + return *ptr;
> + case TCF_EM_ALIGN_U16:
> + return (*ptr << 8) | *(ptr+1);
> + case TCF_EM_ALIGN_U32:
> + return (*ptr << 24) | (*(ptr+1) << 16) |
> + (*(ptr+2) << 8) | *(ptr+3);
> + }
> + return 0;
> +}
> +
> +static inline u8 * tcf_get_base_ptr(struct sk_buff *skb, u8 layer)
> +{
> + switch (layer) {
> + case 0:
> + return skb->data;
> + case 1:
> + return skb->nh.raw;
> + case 2:
> + return skb->h.raw;
> + }
> + return NULL;
> +}
> +
> +#define tcf_valid_offset(skb, ptr, off, len) \
> + ((ptr + off + len) < skb->tail && (ptr + off) > skb->head)
> +
And all this also goes in that header file as well.
cheers,
jamal
^ permalink raw reply [flat|nested] 44+ messages in thread* Re: [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier
2005-01-05 13:33 ` jamal
@ 2005-01-05 14:45 ` Thomas Graf
2005-01-05 16:48 ` Thomas Graf
2005-01-06 13:47 ` jamal
0 siblings, 2 replies; 44+ messages in thread
From: Thomas Graf @ 2005-01-05 14:45 UTC (permalink / raw)
To: jamal; +Cc: netdev
* jamal <1104931991.1117.152.camel@jzny.localdomain> 2005-01-05 08:33
> On Wed, 2005-01-05 at 06:00, Thomas Graf wrote:
> > * jamal <1104894728.1117.56.camel@jzny.localdomain> 2005-01-04 22:12
>
> > > Why do i need to signal something as simple? AND why does it have to be
> > > 32 bit type - what edge does that give you?
> >
> > You don't have to, providing a 32bit data chunk without TCF_EM_SIMPLE
> > set will simply result in allocating & copy. It's an optimization,
> > nothing more.
>
> Sorry i missed that. Isnt it still unecessary though? You should be able
> to pass L=4 and not need the speacial treatment, no?
Agreed, but the ematch might expect an allocated block. Assuming the
data is variable and sometimes is L=4, sometimes L=16 the ematch
requires special handling because m->data might hold the value directly
or a pointer depending on datalen.
> > 1) We make u32 hold multiple ematch trees, a u32 key can be of 3
> > kinds: u32/ematch/container. It's kind of a hack and not very
> > fast due to a lot of stack movement.
>
> Indeed this is what i was thinking of.
> The only added overhead I can think of is when processing a series of
> u32 keys _within the same selector_ (not across selectors), you check if
> its u32 native and execute localy (as is done now) or transfer the check
> to the ematch defined in that key and continue to next key based on the
> ematchs return code + the logical operation.
Exactly, does the performance gap come with any advantage? No. That's
why I don't like it.
> > 2) We make the existing u32 match be an ematch which I already did
> > expect for the nexthdr bits. That is the select will simply be
> > replaced by an ematch tree. I'll take a look into how we could
> > have the classifier take influence on the ematches config data.
> > One possibiliy is to have a struct transfered via map which
> > contains useful data such as offset to next header (u32/rsvp).
> > I have to think about this a little more though.
>
> This could be done in addition to #1. I see #1 as more important for u32
> but not so for things like fwmark, tcindex which should fizzle away with
> meta ematch. I think the danger is in trying to replicate u32 as an
> ematch; if somehow you can loop back and use the real u32 code, then
> fine. I feel it being non-trivial to do so.
> Like i said earlier, theres a lot of power in u32 other than in basic
> matching.
Most importantly I don't want to touch any of the hashing code in u32.
I really like and it should stay as it is. The existing u32 match can
be easly made an ematch so this would safe us the extra work in u32 to
implement logic relations again and to fiddle with complicated selector
TLVs. The only problem with this is the nexthdr bits because it relies
on the hashing code. So we have to make this data available to the
ematch which is actually not a bad idea anyway. So I'm thinking about
introducing a new structure tcf_em_pkt_info or alike which carries
some additional information found out by the classifiers which can be
used by ematches. This can be information about the next header,
already extracted dscp values, etc.
This would give us the chance to add a very small em_u32.c (~40 lines)
doing exactly the same as the current u32 match and have the u32
selector replaced with an ematch tree at no additional cost. Backward
compatibility is as easy as creating a flat ANDed ematch tree.
Note: The u32 ematch I'm talking about is not the cmp ematch, cmp is
more advanced but also slightly slower.
Thoughts?
> I think #2 is better for other classifiers which were already doomed
> anyways. #1 is important for u32 and perhaps other classifiers like rsvp
> and route. And yes, #1 is more work ;->
Why is it better? What's the advantage?
> hang on:-> No dont rewrite u32 please. Your cmp is good for the basic
> matches that u32 does, but is _nowhere_ close to being able to do what
> u32 can when used properly.
Right, that's why I now call it cmp and the existing u32 match becomes
the u32 ematch.
> I Should be able to compile a new ematch as a module in an already
> running kernel. So, other than generic stuff for ematches, things like
> TCF_EM_CMP should not be in that enumeration.
It's not a requirement to put it there but we need to manage the
assigned types for ematches in mainline anyway.
> Which means the above should go in its own file; probably create a
> include/linux/tc_ematch directory with a file of sort tcf_em_cmp.h
> which holds all the above.
This one I can agree.
> And all this also goes in that header file as well.
I put it here because it might be very useful for other ematches
or further classifiers, you name it. tcf_get_base_ptr is used by
nbyte ematch for example.
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier
2005-01-05 14:45 ` Thomas Graf
@ 2005-01-05 16:48 ` Thomas Graf
2005-01-06 14:03 ` jamal
2005-01-06 13:47 ` jamal
1 sibling, 1 reply; 44+ messages in thread
From: Thomas Graf @ 2005-01-05 16:48 UTC (permalink / raw)
To: jamal; +Cc: netdev
> Most importantly I don't want to touch any of the hashing code in u32.
> I really like and it should stay as it is. The existing u32 match can
> be easly made an ematch so this would safe us the extra work in u32 to
> implement logic relations again and to fiddle with complicated selector
> TLVs. The only problem with this is the nexthdr bits because it relies
> on the hashing code. So we have to make this data available to the
> ematch which is actually not a bad idea anyway. So I'm thinking about
> introducing a new structure tcf_em_pkt_info or alike which carries
> some additional information found out by the classifiers which can be
> used by ematches. This can be information about the next header,
> already extracted dscp values, etc.
Here's what I mean, it moves the u32 match as-is to an ematch so it
benefits from logic relations, inversion and can be used from other
classifiers as well. All we have to do is set info->ptr and
info->nexthdr to ptr respetively off2 before we evaluate the ematch
tree. The pkt_info struct is then passed to tcf_em_tree_match and
made available to every ematch.
Thoughts?
diff -Nru linux-2.6.10-bk8.orig/include/linux/pkt_cls.h linux-2.6.10-bk8/include/linux/pkt_cls.h
--- linux-2.6.10-bk8.orig/include/linux/pkt_cls.h 2005-01-05 17:40:02.000000000 +0100
+++ linux-2.6.10-bk8/include/linux/pkt_cls.h 2005-01-05 17:38:42.000000000 +0100
@@ -351,6 +351,7 @@
TCF_EM_CONTAINER,
TCF_EM_CMP,
TCF_EM_NBYTE,
+ TCF_EM_U32,
__TCF_EM_MAX
};
diff -Nru linux-2.6.10-bk8.orig/include/net/pkt_cls.h linux-2.6.10-bk8/include/net/pkt_cls.h
--- linux-2.6.10-bk8.orig/include/net/pkt_cls.h 2005-01-05 17:40:02.000000000 +0100
+++ linux-2.6.10-bk8/include/net/pkt_cls.h 2005-01-05 17:39:08.000000000 +0100
@@ -274,6 +274,8 @@
struct tcf_pkt_info
{
+ u8 * ptr;
+ int nexthdr;
};
static inline u32 tcf_read_bucket(u8 *ptr, u8 align)
diff -Nru linux-2.6.10-bk8.orig/net/sched/Kconfig linux-2.6.10-bk8/net/sched/Kconfig
--- linux-2.6.10-bk8.orig/net/sched/Kconfig 2005-01-05 17:38:26.000000000 +0100
+++ linux-2.6.10-bk8/net/sched/Kconfig 2005-01-05 17:41:59.000000000 +0100
@@ -408,6 +408,12 @@
To compile this code as a module, choose M here: the
module will be called em_nbyte.
+config NET_EMATCH_U32
+ tristate "U32 hashing key"
+ depends on NET_EMATCH
+ ---help---
+ TODO
+
config NET_CLS_ACT
bool "Packet ACTION"
depends on EXPERIMENTAL && NET_CLS && NET_QOS
diff -Nru linux-2.6.10-bk8.orig/net/sched/Makefile linux-2.6.10-bk8/net/sched/Makefile
--- linux-2.6.10-bk8.orig/net/sched/Makefile 2005-01-05 17:38:26.000000000 +0100
+++ linux-2.6.10-bk8/net/sched/Makefile 2005-01-05 17:40:15.000000000 +0100
@@ -36,3 +36,4 @@
obj-$(CONFIG_NET_EMATCH) += ematch.o
obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o
obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o
+obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o
diff -Nru linux-2.6.10-bk8.orig/net/sched/em_u32.c linux-2.6.10-bk8/net/sched/em_u32.c
--- linux-2.6.10-bk8.orig/net/sched/em_u32.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.10-bk8/net/sched/em_u32.c 2005-01-05 17:38:42.000000000 +0100
@@ -0,0 +1,53 @@
+/*
+ * net/sched/em_u32.c U32 Ematch
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Based on net/sched/cls_u32.c
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <net/pkt_cls.h>
+
+static int em_u32_match(struct sk_buff *skb, struct tcf_ematch *m,
+ struct tcf_pkt_info *info)
+{
+ struct tc_u32_key *k = (struct tc_u32_key *) m->data;
+ u8 *ptr = info->ptr + k->off + (info->nexthdr & k->offmask);
+
+ return !((*(u32*) ptr ^ k->val) & k->mask);
+}
+
+static struct tcf_ematch_ops em_u32_ops = {
+ .kind = TCF_EM_U32,
+ .datalen = sizeof(struct tc_u32_key),
+ .match = em_u32_match,
+ .owner = THIS_MODULE,
+ .link = LIST_HEAD_INIT(em_u32_ops.link)
+};
+
+static int __init init_em_u32(void)
+{
+ return tcf_em_register(&em_u32_ops);
+}
+
+static void __exit exit_em_u32(void)
+{
+ tcf_em_unregister(&em_u32_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_em_u32);
+module_exit(exit_em_u32);
+
^ permalink raw reply [flat|nested] 44+ messages in thread* Re: [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier
2005-01-05 16:48 ` Thomas Graf
@ 2005-01-06 14:03 ` jamal
0 siblings, 0 replies; 44+ messages in thread
From: jamal @ 2005-01-06 14:03 UTC (permalink / raw)
To: Thomas Graf; +Cc: netdev
On Wed, 2005-01-05 at 11:48, Thomas Graf wrote:
> Here's what I mean, it moves the u32 match as-is to an ematch so it
> benefits from logic relations, inversion and can be used from other
> classifiers as well. All we have to do is set info->ptr and
> info->nexthdr to ptr respetively off2 before we evaluate the ematch
> tree. The pkt_info struct is then passed to tcf_em_tree_match and
> made available to every ematch.
>
> Thoughts?
I think this is fine; getting into complicated-land with off2 etc but
fine and does not preclude (and is lower importance in my opinion) than
having u32 do its own magic.
Note again Thomas: I do realize its more work to do the ematch/match
thing ;->
cheers,
jamal
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier
2005-01-05 14:45 ` Thomas Graf
2005-01-05 16:48 ` Thomas Graf
@ 2005-01-06 13:47 ` jamal
2005-01-06 19:41 ` Thomas Graf
1 sibling, 1 reply; 44+ messages in thread
From: jamal @ 2005-01-06 13:47 UTC (permalink / raw)
To: Thomas Graf; +Cc: netdev
Sorry for the latency - vacation over, i am gonna slow down a little ..
On Wed, 2005-01-05 at 09:45, Thomas Graf wrote:
> * jamal <1104931991.1117.152.camel@jzny.localdomain> 2005-01-05 08:33
[..]
> > Sorry i missed that. Isnt it still unecessary though? You should be able
> > to pass L=4 and not need the speacial treatment, no?
>
> Agreed, but the ematch might expect an allocated block. Assuming the
> data is variable and sometimes is L=4, sometimes L=16 the ematch
> requires special handling because m->data might hold the value directly
> or a pointer depending on datalen.
So the issue is whether its by ref or copy? Maybe thats what the flag is
for then. My view is that _everything_ for ematches should be by copy
for simplicty.
> > > 1) We make u32 hold multiple ematch trees, a u32 key can be of 3
> > > kinds: u32/ematch/container. It's kind of a hack and not very
> > > fast due to a lot of stack movement.
> >
> > Indeed this is what i was thinking of.
> > The only added overhead I can think of is when processing a series of
> > u32 keys _within the same selector_ (not across selectors), you check if
> > its u32 native and execute localy (as is done now) or transfer the check
> > to the ematch defined in that key and continue to next key based on the
> > ematchs return code + the logical operation.
>
> Exactly, does the performance gap come with any advantage? No.
Oh, yes;-> the one check in the datapath comes with a _huge_ advantage
even all that you had was old style matches because now you have embeded
logical operations which are more than just the old AND. But more
importantly you can use ematches as well to influence the existing u32
tree path.
> That's
> why I don't like it.
>
>
> > > 2) We make the existing u32 match be an ematch which I already did
> > > expect for the nexthdr bits. That is the select will simply be
> > > replaced by an ematch tree. I'll take a look into how we could
> > > have the classifier take influence on the ematches config data.
> > > One possibiliy is to have a struct transfered via map which
> > > contains useful data such as offset to next header (u32/rsvp).
> > > I have to think about this a little more though.
> >
> > This could be done in addition to #1. I see #1 as more important for u32
> > but not so for things like fwmark, tcindex which should fizzle away with
> > meta ematch. I think the danger is in trying to replicate u32 as an
> > ematch; if somehow you can loop back and use the real u32 code, then
> > fine. I feel it being non-trivial to do so.
> > Like i said earlier, theres a lot of power in u32 other than in basic
> > matching.
>
> Most importantly I don't want to touch any of the hashing code in u32.
What i am reading is you see more work involved. Is this correct?
> I really like and it should stay as it is. The existing u32 match can
> be easly made an ematch so this would safe us the extra work in u32 to
> implement logic relations again and to fiddle with complicated selector
> TLVs. The only problem with this is the nexthdr bits because it relies
> on the hashing code. So we have to make this data available to the
> ematch which is actually not a bad idea anyway. So I'm thinking about
> introducing a new structure tcf_em_pkt_info or alike which carries
> some additional information found out by the classifiers which can be
> used by ematches. This can be information about the next header,
> already extracted dscp values, etc.
>
> This would give us the chance to add a very small em_u32.c (~40 lines)
> doing exactly the same as the current u32 match and have the u32
> selector replaced with an ematch tree at no additional cost. Backward
> compatibility is as easy as creating a flat ANDed ematch tree.
>
> Note: The u32 ematch I'm talking about is not the cmp ematch, cmp is
> more advanced but also slightly slower.
>
> Thoughts?
I think I understand more after reading the above now.
There is one issue which i think is the big source of our lack of sync:
You conclude people are gonna want to use the logical tree building
scheme you are putting in to put together matches and ematches. U32
_already_ has a tree building scheme which is very very flexible. Now
that the sel2 matches will provide u32 logial operators, it presents a
very interesting fresh outlook on life in a jiffy of a packet. This is
what i dont wanna kill or ignore. fw, tcindex etc donot have this
infrastructure so they dont matter.
> > I think #2 is better for other classifiers which were already doomed
> > anyways. #1 is important for u32 and perhaps other classifiers like rsvp
> > and route. And yes, #1 is more work ;->
>
> Why is it better? What's the advantage?
Refer to what i said above: u32 has built in tree building scheme made
more interesting now that there exist more interesting logical
operators.
> > hang on:-> No dont rewrite u32 please. Your cmp is good for the basic
> > matches that u32 does, but is _nowhere_ close to being able to do what
> > u32 can when used properly.
>
> Right, that's why I now call it cmp and the existing u32 match becomes
> the u32 ematch.
Again, u32 classifier is not just matches; the more interesting thing
is the layout of the rules that it can be taught to do.
I think the ematch which emulates the std u32 match is of course
valuable to have but it _doesnt_ deserve the same name.
> > I Should be able to compile a new ematch as a module in an already
> > running kernel. So, other than generic stuff for ematches, things like
> > TCF_EM_CMP should not be in that enumeration.
>
> It's not a requirement to put it there but we need to manage the
> assigned types for ematches in mainline anyway.
Thinking more about it; not sure why you would even bother managing
them. Everything runs at the same kernel privilege level. I am not sure
you want to have certain things that can only be built when recompiling
the kernel
> > And all this also goes in that header file as well.
>
> I put it here because it might be very useful for other ematches
> or further classifiers, you name it. tcf_get_base_ptr is used by
> nbyte ematch for example.
If its generic then it stays in the main header;
cheers,
jamal
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier
2005-01-06 13:47 ` jamal
@ 2005-01-06 19:41 ` Thomas Graf
2005-01-07 13:45 ` jamal
0 siblings, 1 reply; 44+ messages in thread
From: Thomas Graf @ 2005-01-06 19:41 UTC (permalink / raw)
To: jamal; +Cc: netdev
* jamal <1105019225.2312.7.camel@jzny.localdomain> 2005-01-06 08:47
> So the issue is whether its by ref or copy? Maybe thats what the flag is
> for then. My view is that _everything_ for ematches should be by copy
> for simplicty.
If we do everything as ref we'll be allocating 4 byte chunks or we
introduce a storage u32 which pollutes the structure. I don't
like that given that the transfer of a single u32 is probably the
most common for all those smallish ematches for a specific thing.
For simplicity, you don't even notice if you're not aware of that
it can be of help so I don't think we're losing any simplicity here.
> Oh, yes;-> the one check in the datapath comes with a _huge_ advantage
> even all that you had was old style matches because now you have embeded
> logical operations which are more than just the old AND. But more
> importantly you can use ematches as well to influence the existing u32
> tree path.
Missunderstanding here, I meant is there any advantage in having
multiple ematch trees (interleaved) over just making the existing
u32 key an ematch and have the selector (with the hashing bits
extracted) with one ematch tree.
> What i am reading is you see more work involved. Is this correct?
Well, given we can agree on moving the u32 key to an ematch and
have the selector replaced with an ematch tree the following
modifications would be required in u32:
1) extract hashing bits out of selector and move it into a new
TLV.
2) Replace the foreach key match loop with an ematch_tree match
3) Fill out a pkt_info struct with ptr and off2 so we don't lose
hashing capabilities
4) Add backward compat code. Old selector must be transformed
into a flat ematch tree and the hashing bits must be extracted
and stored in the new struct.
> There is one issue which i think is the big source of our lack of sync:
> You conclude people are gonna want to use the logical tree building
> scheme you are putting in to put together matches and ematches. U32
> _already_ has a tree building scheme which is very very flexible.
I know and I'm not gonna break it but rather replace the ANDed
u32 key chains with an ematch tree. I'm fully aware of what u32
can do and I will in no way remove anything.
To make it clear, I'm only gonna change about 10 lines in classify:
for (i = n->sel.nkeys; i>0; i--, key++) {
if ((*(u32*)(ptr+key->off+(off2&key->offmask))^key->val)&key->mask) {
n = n->next;
goto next_knode;
}
}
will be replaced with:
info.nexthdr = off2;
info.ptr = ptr;
if (!tcf_em_tree_match(..., &info)) {
n = n->next;
goto next_knode;
}
That's all, nothing else is changed. I think this is exactly the part were
we're out of sync.
The most difficult part is to do the Kconfig dependencies in a smart way ;->
> Again, u32 classifier is not just matches; the more interesting thing
> is the layout of the rules that it can be taught to do.
> I think the ematch which emulates the std u32 match is of course
> valuable to have but it _doesnt_ deserve the same name.
Stupid terms, em_u32.c is a replacement for the u32 key and it has exactly
the same behaviour. I'll be happy to rename it but as you know I really
suck at naming things ;->
> Thinking more about it; not sure why you would even bother managing
> them. Everything runs at the same kernel privilege level. I am not sure
> you want to have certain things that can only be built when recompiling
> the kernel
Well, we have exactly the same issues as with TLV types. I don't see
why one would need to recompile things. The enumeration is for ematches
included in the kernel tree.
^ permalink raw reply [flat|nested] 44+ messages in thread* Re: [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier
2005-01-06 19:41 ` Thomas Graf
@ 2005-01-07 13:45 ` jamal
2005-01-08 14:54 ` Thomas Graf
0 siblings, 1 reply; 44+ messages in thread
From: jamal @ 2005-01-07 13:45 UTC (permalink / raw)
To: Thomas Graf; +Cc: netdev
On Thu, 2005-01-06 at 14:41, Thomas Graf wrote:
> * jamal <1105019225.2312.7.camel@jzny.localdomain> 2005-01-06 08:47
[..]
> If we do everything as ref we'll be allocating 4 byte chunks or we
> introduce a storage u32 which pollutes the structure. I don't
> like that given that the transfer of a single u32 is probably the
> most common for all those smallish ematches for a specific thing.
> For simplicity, you don't even notice if you're not aware of that
> it can be of help so I don't think we're losing any simplicity here.
I am not sure the optimization for a single u32 as the ematch data is
valid ;-> But this is not a show stopper, i will wait for the code to
see if its an annoyance or tolerable.
> Missunderstanding here, I meant is there any advantage in having
> multiple ematch trees (interleaved) over just making the existing
> u32 key an ematch and have the selector (with the hashing bits
> extracted) with one ematch tree.
Misunderstanding is the right description.
If you did a s/ematch/match for the u32 part then we'd be shooting for
the same thing;-> So i take back what i said, you are not gonna mess up
the u32 tree logic.
> 1) extract hashing bits out of selector and move it into a new
> TLV.
> 2) Replace the foreach key match loop with an ematch_tree match
This is our contention point.
> 3) Fill out a pkt_info struct with ptr and off2 so we don't lose
> hashing capabilities
And this is why i dont like it.
> 4) Add backward compat code. Old selector must be transformed
> into a flat ematch tree and the hashing bits must be extracted
> and stored in the new struct.
I think the u32 changes are one-shot if you want to avoid lotsa #ifdefs.
Someone sends you a old sel, then convert it to a new one for storage.
Dumping is a little trickier, need some way to recognize old style
request.
> The most difficult part is to do the Kconfig dependencies in a smart way ;->
The trick would be to always use sel2 and present no kconfig options for back
compat. We need to figure out how to recognize an old style dump and we are
set.
> > Again, u32 classifier is not just matches; the more interesting thing
> > is the layout of the rules that it can be taught to do.
> > I think the ematch which emulates the std u32 match is of course
> > valuable to have but it _doesnt_ deserve the same name.
>
> Stupid terms, em_u32.c is a replacement for the u32 key and it has exactly
> the same behaviour. I'll be happy to rename it but as you know I really
> suck at naming things ;->
em_u32 sounds better ;->
Above you are trying to insert off2 into the info (what i said i didnt
like) - how are you going to achieve the same with a standalone en_u32
from say you basic classifier?
> > Thinking more about it; not sure why you would even bother managing
> > them. Everything runs at the same kernel privilege level. I am not sure
> > you want to have certain things that can only be built when recompiling
> > the kernel
>
> Well, we have exactly the same issues as with TLV types. I don't see
> why one would need to recompile things. The enumeration is for ematches
> included in the kernel tree.
Your call. Actions do it the way i described it. It is more flexible in
my opionion, nothing reserved. Good practise is to know who uses what
(not hardcoding in headers) and the register will catch any discrepancy.
cheers,
jamal
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier
2005-01-07 13:45 ` jamal
@ 2005-01-08 14:54 ` Thomas Graf
2005-01-10 13:26 ` jamal
0 siblings, 1 reply; 44+ messages in thread
From: Thomas Graf @ 2005-01-08 14:54 UTC (permalink / raw)
To: jamal; +Cc: netdev
* jamal <1105105511.1046.77.camel@jzny.localdomain> 2005-01-07 08:45
> > 3) Fill out a pkt_info struct with ptr and off2 so we don't lose
> > hashing capabilities
>
> And this is why i dont like it.
What's the reason for not liking it? I know it's not a perfect solution
in terms of layers but having the classifier sharing already gathered
information to an ematch is not a bad thing.
> I think the u32 changes are one-shot if you want to avoid lotsa #ifdefs.
> Someone sends you a old sel, then convert it to a new one for storage.
> Dumping is a little trickier, need some way to recognize old style
> request.
The easiest way is to introduce a new TLV type and regard configuration
requests carrying the old type in compatibility mode.
> The trick would be to always use sel2 and present no kconfig options for back
> compat. We need to figure out how to recognize an old style dump and we are
> set.
Given we always use the new method by converting old style parameters
and we use em_u32 as u32 key we would need to put a dependcy on ematch
&& em_u32 for cls_u32.
> Above you are trying to insert off2 into the info (what i said i didnt
> like) - how are you going to achieve the same with a standalone en_u32
> from say you basic classifier?
I won't and it's not necessary, one can use u32 if he requires the
nexthdr capabilities or otherwise use em_cmp support the skb layers.
(Which i know is not perfect since the pointers to those layers are
not provided all the time).
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier
2005-01-08 14:54 ` Thomas Graf
@ 2005-01-10 13:26 ` jamal
2005-01-10 21:17 ` Thomas Graf
0 siblings, 1 reply; 44+ messages in thread
From: jamal @ 2005-01-10 13:26 UTC (permalink / raw)
To: Thomas Graf; +Cc: netdev
On Sat, 2005-01-08 at 09:54, Thomas Graf wrote:
> * jamal <1105105511.1046.77.camel@jzny.localdomain> 2005-01-07 08:45
> > > 3) Fill out a pkt_info struct with ptr and off2 so we don't lose
> > > hashing capabilities
> >
> > And this is why i dont like it.
>
> What's the reason for not liking it? I know it's not a perfect solution
> in terms of layers but having the classifier sharing already gathered
> information to an ematch is not a bad thing.
I think its _a hack_ Thomas ;-> Mostly because it has dependency on u32.
off2 doesnt exist on any other classifier and the basic ematch should be
usable by any classifier.
> The easiest way is to introduce a new TLV type and regard configuration
> requests carrying the old type in compatibility mode.
>
Sounds reasonable.
> > The trick would be to always use sel2 and present no kconfig options for back
> > compat. We need to figure out how to recognize an old style dump and we are
> > set.
>
> Given we always use the new method by converting old style parameters
> and we use em_u32 as u32 key we would need to put a dependcy on ematch
> && em_u32 for cls_u32.
>
I think that you should kill this em_u32 idea if it works only with u32.
> > Above you are trying to insert off2 into the info (what i said i didnt
> > like) - how are you going to achieve the same with a standalone en_u32
> > from say you basic classifier?
>
> I won't and it's not necessary, one can use u32 if he requires the
> nexthdr capabilities or otherwise use em_cmp support the skb layers.
> (Which i know is not perfect since the pointers to those layers are
> not provided all the time).
Why not just have a check to see if it is native match then not to call
up any ematch executing code. Have the native match maybe be of kind 0.
Have the return code for ematch lookup return something that indicates
that you need to match using local u32 instead of ematch.
cheers,
jamal
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier
2005-01-10 13:26 ` jamal
@ 2005-01-10 21:17 ` Thomas Graf
2005-01-10 22:05 ` jamal
0 siblings, 1 reply; 44+ messages in thread
From: Thomas Graf @ 2005-01-10 21:17 UTC (permalink / raw)
To: jamal; +Cc: netdev
* jamal <1105363582.1041.162.camel@jzny.localdomain> 2005-01-10 08:26
> I think its _a hack_ Thomas ;-> Mostly because it has dependency on u32.
> off2 doesnt exist on any other classifier and the basic ematch should be
> usable by any classifier.
It does not, u32 does have a dependency on em_u32 but not vice versa.
em_u32 is perfectly useful even without nexthdr functionality since
this is the way it is used today in 90% of the cases and it should be
a little bit faster than em_cmp but also a bit less powerful. On top
of that, rsvp could provide this information as well so one could
extend rsvp with em_u32 ematches. I think we should not think of it
as being dependant on off2 but rather as it is able to use information
from a underlying layer.
> Why not just have a check to see if it is native match then not to call
> up any ematch executing code. Have the native match maybe be of kind 0.
> Have the return code for ematch lookup return something that indicates
> that you need to match using local u32 instead of ematch.
This limits the number of native ematches to 1 or any other reserved
number and complicates userspace part for no reason. Is there any
advantage besides that it fits into layerrs more nicely? I can
tell you the disavantages and then we can compare ;->
- additional code is required in the classifier which would not
be required otherwise.
- given we define 0 as native ematch, what happens if we need
another native one? reserve a number in the namespace and
make a comment, "please do not use" or will we just say,
well, we can make it a regular ematch, it's not perfectly clean
but it works perfectly fine?
- making it a little bit generic such as em_u32 makes it useable
by other classifiers. One example is the above mentioned
rsvp which parses headers or there might be other specialized
classifiers having use for it. we can't have this if its put
into the classifier itself.
- userspace needs additional special handling and this will
get ugly once we need more than 1 native ematch, we'd need
some register api so ematch modules could tell which numbers
are native for them.
- did i state it's more work already? ;->
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier
2005-01-10 21:17 ` Thomas Graf
@ 2005-01-10 22:05 ` jamal
2005-01-10 23:30 ` Thomas Graf
2005-01-13 17:41 ` [RFC] meta ematch Thomas Graf
0 siblings, 2 replies; 44+ messages in thread
From: jamal @ 2005-01-10 22:05 UTC (permalink / raw)
To: Thomas Graf; +Cc: netdev
On Mon, 2005-01-10 at 16:17, Thomas Graf wrote:
> * jamal <1105363582.1041.162.camel@jzny.localdomain> 2005-01-10 08:26
> > I think its _a hack_ Thomas ;-> Mostly because it has dependency on u32.
> > off2 doesnt exist on any other classifier and the basic ematch should be
> > usable by any classifier.
>
> It does not, u32 does have a dependency on em_u32 but not vice versa.
> em_u32 is perfectly useful even without nexthdr functionality since
> this is the way it is used today in 90% of the cases and it should be
> a little bit faster than em_cmp but also a bit less powerful. On top
> of that, rsvp could provide this information as well so one could
> extend rsvp with em_u32 ematches. I think we should not think of it
> as being dependant on off2 but rather as it is able to use information
> from a underlying layer.
Ok, you make a convincing arguement ;-> No more concerns from my side.
Churn that code!
cheers,
jamal
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier
2005-01-10 22:05 ` jamal
@ 2005-01-10 23:30 ` Thomas Graf
2005-01-13 17:41 ` [RFC] meta ematch Thomas Graf
1 sibling, 0 replies; 44+ messages in thread
From: Thomas Graf @ 2005-01-10 23:30 UTC (permalink / raw)
To: jamal; +Cc: netdev
* jamal <1105394738.1085.63.camel@jzny.localdomain> 2005-01-10 17:05
> On Mon, 2005-01-10 at 16:17, Thomas Graf wrote:
> > * jamal <1105363582.1041.162.camel@jzny.localdomain> 2005-01-10 08:26
> > > I think its _a hack_ Thomas ;-> Mostly because it has dependency on u32.
> > > off2 doesnt exist on any other classifier and the basic ematch should be
> > > usable by any classifier.
> >
> > It does not, u32 does have a dependency on em_u32 but not vice versa.
> > em_u32 is perfectly useful even without nexthdr functionality since
> > this is the way it is used today in 90% of the cases and it should be
> > a little bit faster than em_cmp but also a bit less powerful. On top
> > of that, rsvp could provide this information as well so one could
> > extend rsvp with em_u32 ematches. I think we should not think of it
> > as being dependant on off2 but rather as it is able to use information
> > from a underlying layer.
>
> Ok, you make a convincing arguement ;-> No more concerns from my side.
> Churn that code!
I started testing via the basic classifier but will do the cls_u32
changes soon and then remerge and do the final tests once all the
other pkt_sched changes have made it into linus's tree and I can
work from a fresh bk tree. I'll also try to find the time this week
to do the iproute2 changes for the tcf_exts changset so iproute2 is
actually capable of configuring actions for all classifiers.
^ permalink raw reply [flat|nested] 44+ messages in thread
* [RFC] meta ematch
2005-01-10 22:05 ` jamal
2005-01-10 23:30 ` Thomas Graf
@ 2005-01-13 17:41 ` Thomas Graf
2005-01-13 18:54 ` Patrick McHardy
1 sibling, 1 reply; 44+ messages in thread
From: Thomas Graf @ 2005-01-13 17:41 UTC (permalink / raw)
To: jamal; +Cc: netdev
* jamal <1105394738.1085.63.camel@jzny.localdomain> 2005-01-10 17:05
>
> Ok, you make a convincing arguement ;-> No more concerns from my side.
> Churn that code!
>
Found some cycles today and wrote the meta ematch. It tried to
find a good compromise between speed and power.
So far I added the following matching possibilies:
- random
- load average (0,1,2)
- dev (numeric and string)
- indev (numeric and string)
- realdev (numeric and string)
- skb priority
- ... protocol
- ... security
- ... pkttype (to easly match on multicast/broadcast)
- ... pktlen
- ... datalen
- ... maclen
- netfilter mark
- ... cache
- ... conntrack info
- ... debug variable
- tc index
- ... verdict
- ... classid
- routing classid
- .... iif
Yet to come are more routing and socket attributes such as queue
sizes, backlog sizes, neighbour attribute of the route found, ...
It is also possible to compare two kernel meta values, e.g
realdev equals dev.
Numeric matches may be modified via shift and mask operators
to for example only consider a part of nfmark.
Binary matches may have a shift modifier to only consider
a certain amount of the data, e.g. "eth1" with shift 1 would
end up with "eth". I added this because I wanted something
like eth% but didn't want to implement expensive string
operations.
If its not obvious, random and loadavg are intended for
load balancing purposes, i.e.
tc filter add ... basic meta random mask 1 eq 1 and \
loadavg_5 lt 10 action redirect ...
diff -Nru linux-2.6.10-bk14.orig/include/linux/pkt_cls.h linux-2.6.10-bk14/include/linux/pkt_cls.h
--- linux-2.6.10-bk14.orig/include/linux/pkt_cls.h 2005-01-13 11:18:05.000000000 +0100
+++ linux-2.6.10-bk14/include/linux/pkt_cls.h 2005-01-13 11:17:52.000000000 +0100
@@ -352,6 +352,7 @@
TCF_EM_CMP,
TCF_EM_NBYTE,
TCF_EM_U32,
+ TCF_EM_META,
__TCF_EM_MAX
};
diff -Nru linux-2.6.10-bk14.orig/include/linux/tc_ematch/tc_em_meta.h linux-2.6.10-bk14/include/linux/tc_ematch/tc_em_meta.h
--- linux-2.6.10-bk14.orig/include/linux/tc_ematch/tc_em_meta.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.10-bk14/include/linux/tc_ematch/tc_em_meta.h 2005-01-13 17:20:59.000000000 +0100
@@ -0,0 +1,71 @@
+#ifndef __LINUX_TC_EM_META_H
+#define __LINUX_TC_EM_META_H
+
+#include <linux/pkt_cls.h>
+
+enum
+{
+ TCA_EM_META_HDR,
+ TCA_EM_META_LVALUE,
+ TCA_EM_META_RVALUE,
+ __TCA_EM_META_MAX
+};
+#define TCA_EM_META_MAX (__TCA_EM_META_MAX - 1)
+
+struct tcf_meta_val
+{
+ __u16 kind;
+ __u8 shift;
+ __u8 op;
+};
+
+#define TCF_META_TYPE_MASK (0xf << 12)
+#define TCF_META_TYPE(kind) (((kind) & TCF_META_TYPE_MASK) >> 12)
+#define TCF_META_ID_MASK 0x7ff
+#define TCF_META_ID(kind) ((kind) & TCF_META_ID_MASK)
+
+enum
+{
+ TCF_META_TYPE_VAR,
+ TCF_META_TYPE_INT,
+ __TCF_META_TYPE_MAX
+};
+#define TCF_META_TYPE_MAX (__TCF_META_TYPE_MAX - 1)
+
+enum
+{
+ TCF_META_ID_VALUE,
+ TCF_META_ID_RANDOM,
+ TCF_META_ID_LOADAVG_0,
+ TCF_META_ID_LOADAVG_1,
+ TCF_META_ID_LOADAVG_2,
+ TCF_META_ID_DEV,
+ TCF_META_ID_INDEV,
+ TCF_META_ID_REALDEV,
+ TCF_META_ID_PRIORITY,
+ TCF_META_ID_PROTOCOL,
+ TCF_META_ID_SECURITY,
+ TCF_META_ID_PKTTYPE,
+ TCF_META_ID_PKTLEN,
+ TCF_META_ID_DATALEN,
+ TCF_META_ID_MACLEN,
+ TCF_META_ID_NFMARK,
+ TCF_META_ID_NFCACHE,
+ TCF_META_ID_NFCTINFO,
+ TCF_META_ID_NFDEBUG,
+ TCF_META_ID_TCINDEX,
+ TCF_META_ID_TCVERDICT,
+ TCF_META_ID_TCCLASSID,
+ TCF_META_ID_RTCLASSID,
+ TCF_META_ID_RTIIF,
+ __TCF_META_ID_MAX
+};
+#define TCF_META_ID_MAX (__TCF_META_ID_MAX - 1)
+
+struct tcf_meta_hdr
+{
+ struct tcf_meta_val left;
+ struct tcf_meta_val right;
+};
+
+#endif
diff -Nru linux-2.6.10-bk14.orig/net/sched/Kconfig linux-2.6.10-bk14/net/sched/Kconfig
--- linux-2.6.10-bk14.orig/net/sched/Kconfig 2005-01-13 11:18:05.000000000 +0100
+++ linux-2.6.10-bk14/net/sched/Kconfig 2005-01-13 11:17:52.000000000 +0100
@@ -428,6 +428,16 @@
To compile this code as a module, choose M here: the
module will be called em_u32.
+config NET_EMATCH_META
+ tristate "Metadata"
+ depends on NET_EMATCH
+ ---help---
+ Say Y here if you want to be ablt to classify packets based on
+ metadata.
+
+ To compile this code as a module, choose M here: the
+ module will be called em_meta.
+
config NET_CLS_ACT
bool "Packet ACTION"
depends on EXPERIMENTAL && NET_CLS && NET_QOS
diff -Nru linux-2.6.10-bk14.orig/net/sched/Makefile linux-2.6.10-bk14/net/sched/Makefile
--- linux-2.6.10-bk14.orig/net/sched/Makefile 2005-01-13 11:18:05.000000000 +0100
+++ linux-2.6.10-bk14/net/sched/Makefile 2005-01-13 11:17:52.000000000 +0100
@@ -37,3 +37,4 @@
obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o
obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o
obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o
+obj-$(CONFIG_NET_EMATCH_META) += em_meta.o
diff -Nru linux-2.6.10-bk14.orig/net/sched/em_meta.c linux-2.6.10-bk14/net/sched/em_meta.c
--- linux-2.6.10-bk14.orig/net/sched/em_meta.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.10-bk14/net/sched/em_meta.c 2005-01-13 18:07:04.000000000 +0100
@@ -0,0 +1,566 @@
+/*
+ * net/sched/em_meta.c Metadata ematch
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/random.h>
+#include <linux/tc_ematch/tc_em_meta.h>
+#include <net/dst.h>
+#include <net/route.h>
+#include <net/pkt_cls.h>
+
+struct meta_value
+{
+ struct tcf_meta_val hdr;
+ unsigned long value;
+ unsigned int len;
+};
+
+struct meta_match
+{
+ struct meta_value lvalue;
+ struct meta_value rvalue;
+};
+
+#define meta_id(value) (TCF_META_ID((value)->hdr.kind))
+#define meta_type(value) (TCF_META_TYPE((value)->hdr.kind))
+
+/**************************************************************************
+ * System status & misc
+ **************************************************************************/
+
+static int meta_int_random(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_value *dst)
+{
+ get_random_bytes(&dst->value, sizeof(dst->value));
+ return 0;
+}
+
+static inline unsigned long fixed_loadavg(unsigned long v)
+{
+ return (v + (FIXED_1/200)) >> FSHIFT;
+}
+
+static int meta_int_loadavg_0(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_value *dst)
+{
+ dst->value = fixed_loadavg(avenrun[0]);
+ return 0;
+}
+
+static int meta_int_loadavg_1(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_value *dst)
+{
+ dst->value = fixed_loadavg(avenrun[1]);
+ return 0;
+}
+
+static int meta_int_loadavg_2(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_value *dst)
+{
+ dst->value = fixed_loadavg(avenrun[2]);
+ return 0;
+}
+
+/**************************************************************************
+ * Device names & indices
+ **************************************************************************/
+
+static inline int int_dev(struct net_device *dev, struct meta_value *dst)
+{
+ if (unlikely(dev == NULL))
+ return -1;
+
+ dst->value = dev->ifindex;
+ return 0;
+}
+
+static inline int var_dev(struct net_device *dev, struct meta_value *dst)
+{
+ if (unlikely(dev == NULL))
+ return -1;
+
+ dst->value = (unsigned long) dev->name;
+ dst->len = strlen(dev->name);
+ return 0;
+}
+
+static int meta_int_dev(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_value *dst)
+{
+ return int_dev(skb->dev, dst);
+}
+
+static int meta_var_dev(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_value *dst)
+{
+ return var_dev(skb->dev, dst);
+}
+
+static int meta_int_indev(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_value *dst)
+{
+ return int_dev(skb->input_dev, dst);
+}
+
+static int meta_var_indev(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_value *dst)
+{
+ return var_dev(skb->input_dev, dst);
+}
+
+static int meta_int_realdev(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_value *dst)
+{
+ return int_dev(skb->real_dev, dst);
+}
+
+static int meta_var_realdev(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_value *dst)
+{
+ return var_dev(skb->real_dev, dst);
+}
+
+/**************************************************************************
+ * skb attributes
+ **************************************************************************/
+
+static int meta_int_priority(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_value *dst)
+{
+ dst->value = skb->priority;
+ return 0;
+}
+
+static int meta_int_protocol(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_value *dst)
+{
+ dst->value = skb->protocol;
+ return 0;
+}
+
+static int meta_int_security(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_value *dst)
+{
+ dst->value = skb->security;
+ return 0;
+}
+
+static int meta_int_pkttype(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_value *dst)
+{
+ dst->value = skb->pkt_type;
+ return 0;
+}
+
+static int meta_int_pktlen(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_value *dst)
+{
+ dst->value = skb->len;
+ return 0;
+}
+
+static int meta_int_datalen(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_value *dst)
+{
+ dst->value = skb->data_len;
+ return 0;
+}
+
+static int meta_int_maclen(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_value *dst)
+{
+ dst->value = skb->mac_len;
+ return 0;
+}
+
+/**************************************************************************
+ * Netfilter
+ **************************************************************************/
+
+#ifdef CONFIG_NETFILTER
+static int meta_int_nfmark(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_value *dst)
+{
+ dst->value = skb->nfmark;
+ return 0;
+}
+
+static int meta_int_nfcache(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_value *dst)
+{
+ dst->value = skb->nfcache;
+ return 0;
+}
+
+static int meta_int_nfctinfo(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_value *dst)
+{
+ dst->value = skb->nfctinfo;
+ return 0;
+}
+
+#ifdef CONFIG_NETFILTER_DEBUG
+static int meta_int_nfdebug(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_value *dst)
+{
+ dst->value = skb->nf_debug;
+ return 0;
+}
+#endif
+#endif
+
+/**************************************************************************
+ * Traffic Control
+ **************************************************************************/
+
+static int meta_int_tcindex(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_value *dst)
+{
+ dst->value = skb->tc_index;
+ return 0;
+}
+
+#ifdef CONFIG_NET_CLS_ACT
+static int meta_int_tcverd(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_value *dst)
+{
+ dst->value = skb->tc_verd;
+ return 0;
+}
+
+static int meta_int_tcclassid(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_value *dst)
+{
+ dst->value = skb->tc_classid;
+ return 0;
+}
+#endif
+
+/**************************************************************************
+ * Routing
+ **************************************************************************/
+
+#ifdef CONFIG_NET_CLS_ROUTE
+static int meta_int_rtclassid(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_value *dst)
+{
+ if (unlikely(skb->dst == NULL))
+ return -1;
+
+ dst->value = skb->dst->tclassid;
+ return 0;
+}
+#endif
+
+static int meta_int_rtiif(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_value *dst)
+{
+ if (unlikely(skb->dst == NULL))
+ return -1;
+
+ dst->value = ((struct rtable*) skb->dst)->fl.iif;
+ return 0;
+}
+
+struct meta_ops
+{
+ int (*get)(struct sk_buff *, struct tcf_pkt_info *,
+ struct meta_value *, struct meta_value *);
+};
+
+static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
+ [TCF_META_TYPE_VAR] = {
+ [TCF_META_ID_DEV] = { .get = meta_var_dev },
+ [TCF_META_ID_INDEV] = { .get = meta_var_indev },
+ [TCF_META_ID_REALDEV] = { .get = meta_var_realdev }
+ },
+ [TCF_META_TYPE_INT] = {
+ [TCF_META_ID_RANDOM] = { .get = meta_int_random },
+ [TCF_META_ID_LOADAVG_0] = { .get = meta_int_loadavg_0 },
+ [TCF_META_ID_LOADAVG_1] = { .get = meta_int_loadavg_1 },
+ [TCF_META_ID_LOADAVG_2] = { .get = meta_int_loadavg_2 },
+ [TCF_META_ID_DEV] = { .get = meta_int_dev },
+ [TCF_META_ID_INDEV] = { .get = meta_int_indev },
+ [TCF_META_ID_REALDEV] = { .get = meta_int_realdev },
+ [TCF_META_ID_PRIORITY] = { .get = meta_int_priority },
+ [TCF_META_ID_PROTOCOL] = { .get = meta_int_protocol },
+ [TCF_META_ID_SECURITY] = { .get = meta_int_security },
+ [TCF_META_ID_PKTTYPE] = { .get = meta_int_pkttype },
+ [TCF_META_ID_PKTLEN] = { .get = meta_int_pktlen },
+ [TCF_META_ID_DATALEN] = { .get = meta_int_datalen },
+ [TCF_META_ID_MACLEN] = { .get = meta_int_maclen },
+#ifdef CONFIG_NETFILTER
+ [TCF_META_ID_NFMARK] = { .get = meta_int_nfmark },
+ [TCF_META_ID_NFCACHE] = { .get = meta_int_nfcache },
+ [TCF_META_ID_NFCTINFO] = { .get = meta_int_nfctinfo },
+#ifdef CONFIG_NETFILTER_DEBUG
+ [TCF_META_ID_NFDEBUG] = { .get = meta_int_nfdebug },
+#endif
+#endif
+ [TCF_META_ID_TCINDEX] = { .get = meta_int_tcindex },
+#ifdef CONFIG_NET_CLS_ACT
+ [TCF_META_ID_TCVERDICT] = { .get = meta_int_tcverd },
+ [TCF_META_ID_TCCLASSID] = { .get = meta_int_tcclassid },
+#endif
+#ifdef CONFIG_NET_CLS_ROUTE
+ [TCF_META_ID_RTCLASSID] = { .get = meta_int_rtclassid },
+#endif
+ [TCF_META_ID_RTIIF] = { .get = meta_int_rtiif }
+ }
+};
+
+static inline struct meta_ops * meta_ops(struct meta_value *v)
+{
+ return &__meta_ops[meta_type(v)][meta_id(v)];
+}
+
+static int meta_var_compare(struct meta_value *a, struct meta_value *b)
+{
+ int r = a->len - b->len;
+
+ if (r == 0)
+ r = memcmp((void *) a->value, (void *) b->value, a->len);
+
+ return r;
+}
+
+static int meta_var_change(struct meta_value *dst, struct rtattr *rta)
+{
+ int len = RTA_PAYLOAD(rta);
+
+ dst->value = (unsigned long) kmalloc(len, GFP_KERNEL);
+ if (dst->value == 0UL)
+ return -ENOMEM;
+ memcpy((void *) dst->value, RTA_DATA(rta), len);
+ dst->len = len;
+ return 0;
+}
+
+static void meta_var_destroy(struct meta_value *v)
+{
+ kfree((void *) v->value);
+}
+
+static void meta_var_apply_extras(struct meta_value *v,
+ struct meta_value *dst)
+{
+ int shift = v->hdr.shift;
+
+ if (shift && shift < dst->len)
+ dst->len -= shift;
+}
+
+static int meta_int_compare(struct meta_value *a, struct meta_value *b)
+{
+ return a->value - b->value;
+}
+
+static int meta_int_change(struct meta_value *dst, struct rtattr *rta)
+{
+ if (RTA_PAYLOAD(rta) < sizeof(u32))
+ return -EINVAL;
+ dst->value = *(u32*) RTA_DATA(rta);
+ dst->len = sizeof(u32);
+ return 0;
+}
+
+static void meta_int_apply_extras(struct meta_value *v,
+ struct meta_value *dst)
+{
+ if (v->hdr.shift)
+ dst->value >>= v->hdr.shift;
+
+ if (v->value)
+ dst->value &= v->value;
+}
+
+struct meta_type_ops
+{
+ void (*destroy)(struct meta_value *);
+ int (*compare)(struct meta_value *, struct meta_value *);
+ int (*change)(struct meta_value *, struct rtattr *);
+ void (*apply_extras)(struct meta_value *, struct meta_value *);
+};
+
+static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX+1] = {
+ [TCF_META_TYPE_VAR] = {
+ .destroy = meta_var_destroy,
+ .compare = meta_var_compare,
+ .change = meta_var_change,
+ .apply_extras = meta_var_apply_extras
+ },
+ [TCF_META_TYPE_INT] = {
+ .compare = meta_int_compare,
+ .change = meta_int_change,
+ .apply_extras = meta_int_apply_extras
+ }
+};
+
+static inline struct meta_type_ops * meta_type_ops(struct meta_value *v)
+{
+ return &__meta_type_ops[meta_type(v)];
+}
+
+static inline int meta_get(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_value *dst)
+{
+ int err;
+
+ if (meta_id(v) == TCF_META_ID_VALUE) {
+ dst->value = v->value;
+ dst->len = v->len;
+ return 0;
+ }
+
+ err = meta_ops(v)->get(skb, info, v, dst);
+ if (err < 0)
+ return err;
+
+ if (meta_type_ops(v)->apply_extras)
+ meta_type_ops(v)->apply_extras(v, dst);
+
+ return 0;
+}
+
+static int em_meta_match(struct sk_buff *skb, struct tcf_ematch *m,
+ struct tcf_pkt_info *info)
+{
+ int r;
+ struct meta_match *meta = (struct meta_match *) m->data;
+ struct meta_value l_value, r_value;
+
+ if (meta_get(skb, info, &meta->lvalue, &l_value) < 0 ||
+ meta_get(skb, info, &meta->rvalue, &r_value) < 0)
+ return 0;
+
+ r = meta_type_ops(&meta->lvalue)->compare(&l_value, &r_value);
+
+ switch (meta->lvalue.hdr.op) {
+ case TCF_EM_OPND_EQ:
+ return !r;
+ case TCF_EM_OPND_LT:
+ return r < 0;
+ case TCF_EM_OPND_GT:
+ return r > 0;
+ }
+
+ return 0;
+}
+
+static inline void meta_delete(struct meta_match *meta)
+{
+ struct meta_type_ops *ops = meta_type_ops(&meta->lvalue);
+
+ if (ops && ops->destroy) {
+ ops->destroy(&meta->lvalue);
+ ops->destroy(&meta->rvalue);
+ }
+
+ kfree(meta);
+}
+
+static inline int meta_change_data(struct meta_value *dst, struct rtattr *rta)
+{
+ if (rta) {
+ if (RTA_PAYLOAD(rta) == 0)
+ return -EINVAL;
+
+ return meta_type_ops(dst)->change(dst, rta);
+ }
+
+ return 0;
+}
+
+static int em_meta_change(struct tcf_proto *tp, void *data, int len,
+ struct tcf_ematch *m)
+{
+ int err = -EINVAL;
+ struct rtattr *tb[TCA_EM_META_MAX];
+ struct tcf_meta_hdr *hdr;
+ struct meta_match *meta = NULL;
+
+ if (rtattr_parse(tb, TCA_EM_META_MAX, data, len) < 0)
+ goto errout;
+
+ if (tb[TCA_EM_META_HDR-1] == NULL ||
+ RTA_PAYLOAD(tb[TCA_EM_META_HDR-1]) < sizeof(*hdr))
+ goto errout;
+ hdr = RTA_DATA(tb[TCA_EM_META_HDR-1]);
+
+ if (TCF_META_TYPE(hdr->left.kind) != TCF_META_TYPE(hdr->right.kind) ||
+ TCF_META_TYPE(hdr->left.kind) > TCF_META_TYPE_MAX ||
+ TCF_META_ID(hdr->left.kind) > TCF_META_ID_MAX ||
+ TCF_META_ID(hdr->right.kind) > TCF_META_ID_MAX)
+ goto errout;
+
+ meta = kmalloc(sizeof(*meta), GFP_KERNEL);
+ if (meta == NULL)
+ goto errout;
+ memset(meta, 0, sizeof(*meta));
+
+ memcpy(&meta->lvalue.hdr, &hdr->left, sizeof(hdr->left));
+ memcpy(&meta->rvalue.hdr, &hdr->right, sizeof(hdr->right));
+
+ if (meta_ops(&meta->lvalue)->get == NULL ||
+ meta_ops(&meta->rvalue)->get == NULL) {
+ err = -EOPNOTSUPP;
+ goto errout;
+ }
+
+ if (meta_change_data(&meta->lvalue, tb[TCA_EM_META_LVALUE-1]) < 0 ||
+ meta_change_data(&meta->rvalue, tb[TCA_EM_META_RVALUE-1]) < 0)
+ goto errout;
+
+ m->datalen = sizeof(*meta);
+ m->data = (unsigned long) meta;
+
+ err = 0;
+errout:
+ if (err && meta)
+ meta_delete(meta);
+ return err;
+}
+
+static void em_meta_destroy(struct tcf_proto *tp, struct tcf_ematch *m)
+{
+ meta_delete((struct meta_match *) m->data);
+}
+
+static struct tcf_ematch_ops em_meta_ops = {
+ .kind = TCF_EM_META,
+ .change = em_meta_change,
+ .match = em_meta_match,
+ .destroy = em_meta_destroy,
+ .owner = THIS_MODULE,
+ .link = LIST_HEAD_INIT(em_meta_ops.link)
+};
+
+static int __init init_em_meta(void)
+{
+ return tcf_em_register(&em_meta_ops);
+}
+
+static void __exit exit_em_meta(void)
+{
+ tcf_em_unregister(&em_meta_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_em_meta);
+module_exit(exit_em_meta);
+
^ permalink raw reply [flat|nested] 44+ messages in thread* Re: [RFC] meta ematch
2005-01-13 17:41 ` [RFC] meta ematch Thomas Graf
@ 2005-01-13 18:54 ` Patrick McHardy
2005-01-13 19:20 ` Thomas Graf
0 siblings, 1 reply; 44+ messages in thread
From: Patrick McHardy @ 2005-01-13 18:54 UTC (permalink / raw)
To: Thomas Graf; +Cc: jamal, netdev
Thomas Graf wrote:
>Found some cycles today and wrote the meta ematch. It tried to
>find a good compromise between speed and power.
>
>So far I added the following matching possibilies:
> - random
> - load average (0,1,2)
> - dev (numeric and string)
> - indev (numeric and string)
> - realdev (numeric and string)
> - skb priority
> - ... protocol
> - ... security
> - ... pkttype (to easly match on multicast/broadcast)
> - ... pktlen
> - ... datalen
> - ... maclen
> - netfilter mark
> - ... cache
> - ... conntrack info
> - ... debug variable
> - tc index
> - ... verdict
> - ... classid
> - routing classid
> - .... iif
>
>Yet to come are more routing and socket attributes such as queue
>sizes, backlog sizes, neighbour attribute of the route found, ...
>
>It is also possible to compare two kernel meta values, e.g
>realdev equals dev.
>
>Numeric matches may be modified via shift and mask operators
>to for example only consider a part of nfmark.
>
>Binary matches may have a shift modifier to only consider
>a certain amount of the data, e.g. "eth1" with shift 1 would
>end up with "eth". I added this because I wanted something
>like eth% but didn't want to implement expensive string
>operations.
>
>
Looks great. I have a few doubts about about the set of chosen values
though. Things like nf_debug and nf_cache were never meant to be
userspace-visible. What about backwards compatibility if we want to
remove it, or some other more meaningful value where just returning 0
wouldn't be the same ?
A couple of minor things:
- var_dev sets dst->value to dev->name, meta_var_destroy will try to
free dev->name.
- meta_int_change only uses 32 bit, but dst->value is unsigned long
(64 bit on 64-bit arches). nfmark for example is unsigned long, so
you should also use *(unsigned long *).
- for the same reason meta_int_compare should return long not int
>If its not obvious, random and loadavg are intended for
>load balancing purposes, i.e.
>
>
I have my doubts about the usefullness of load balancing traffic based
on CPU load, but I guess it doesn't hurt.
Regards
Patrick
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [RFC] meta ematch
2005-01-13 18:54 ` Patrick McHardy
@ 2005-01-13 19:20 ` Thomas Graf
2005-01-14 1:13 ` Patrick McHardy
0 siblings, 1 reply; 44+ messages in thread
From: Thomas Graf @ 2005-01-13 19:20 UTC (permalink / raw)
To: Patrick McHardy; +Cc: jamal, netdev
* Patrick McHardy <41E6C3E5.2020908@trash.net> 2005-01-13 19:54
> Looks great. I have a few doubts about about the set of chosen values
> though. Things like nf_debug and nf_cache were never meant to be
> userspace-visible. What about backwards compatibility if we want to
> remove it, or some other more meaningful value where just returning 0
> wouldn't be the same ?
It is indeed problematic and they should be marked as "for debugging
purposes (unreliable)" but at least nf_debug and nfctinfo are
very useful for debugging.
> - var_dev sets dst->value to dev->name, meta_var_destroy will try to
> free dev->name.
The `dst` meta_value is the l_value/r_lvalue from em_meta_match and
never gets destroyed. I reused meta_data to store address & length.
It might be a good idea to make a new struct for this to make it
more readable though.
> - meta_int_change only uses 32 bit, but dst->value is unsigned long
> (64 bit on 64-bit arches). nfmark for example is unsigned long, so
> you should also use *(unsigned long *).
Doesn't work when size of long differs between kernel and userspace.
I'm aware of this but it seems everyone is using int anyway for nfmark,
so yes this indeed limits the use of nfmark match to only 32 bits
on 64bit machines. The proper way is to introduce a new type
TCF_EM_TYPE_INT64 and access nfmark over it but I didn't want to
create a new type just because of this special case. We can always
add it later as addition to the 32bit version.
> - for the same reason meta_int_compare should return long not int
Agreed.
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [RFC] meta ematch
2005-01-13 19:20 ` Thomas Graf
@ 2005-01-14 1:13 ` Patrick McHardy
2005-01-14 15:14 ` Thomas Graf
0 siblings, 1 reply; 44+ messages in thread
From: Patrick McHardy @ 2005-01-14 1:13 UTC (permalink / raw)
To: Thomas Graf; +Cc: jamal, netdev
Thomas Graf wrote:
>* Patrick McHardy <41E6C3E5.2020908@trash.net> 2005-01-13 19:54
>
>
>
>>Looks great. I have a few doubts about about the set of chosen values
>>though. Things like nf_debug and nf_cache were never meant to be
>>userspace-visible. What about backwards compatibility if we want to
>>remove it, or some other more meaningful value where just returning 0
>>wouldn't be the same ?
>>
>>
>
>It is indeed problematic and they should be marked as "for debugging
>purposes (unreliable)" but at least nf_debug and nfctinfo are
>very useful for debugging.
>
True. nfctinfo is even useful for more, the direction of a connection
might be interesting. connmark, conntrack counters, src-ip before SNAT
etc. might also be interesting, but they are horrible to implement cleanly
because any dependency on ip_conntrack_lock will automatically load
ip_conntrack. Perhaps we should add something like nf_ct_get_afinfo() to
return a set of conntrack operations to nf_conntrack.
For things beside the nf* fields: I think we should make it very clear
that everything that isn't already visible to userspace in some way, and
thus won't disappear (like priority, nfmark, load average ...), can get
changed/removed any time.
>>- var_dev sets dst->value to dev->name, meta_var_destroy will try to
>> free dev->name.
>>
>>
>
>The `dst` meta_value is the l_value/r_lvalue from em_meta_match and
>never gets destroyed. I reused meta_data to store address & length.
>It might be a good idea to make a new struct for this to make it
>more readable though.
>
Looks good to me already. I only looked at the diff, so I didn't really
follow the codepath.
>>- meta_int_change only uses 32 bit, but dst->value is unsigned long
>> (64 bit on 64-bit arches). nfmark for example is unsigned long, so
>> you should also use *(unsigned long *).
>>
>>
>
>Doesn't work when size of long differs between kernel and userspace.
>I'm aware of this but it seems everyone is using int anyway for nfmark,
>so yes this indeed limits the use of nfmark match to only 32 bits
>on 64bit machines. The proper way is to introduce a new type
>TCF_EM_TYPE_INT64 and access nfmark over it but I didn't want to
>create a new type just because of this special case. We can always
>add it later as addition to the 32bit version.
>
>
Shouldn't be too hard to get right. In the kernel you can decide based
on RTA_PAYLOAD. Userspace needs some other way to notice it is running
as a 32-bit binary on a 64-bit kernel, but that's something you can't
solve in the kernel anyway.
Regards
Patrick
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [RFC] meta ematch
2005-01-14 1:13 ` Patrick McHardy
@ 2005-01-14 15:14 ` Thomas Graf
2005-01-16 14:58 ` jamal
2005-01-16 16:11 ` jamal
0 siblings, 2 replies; 44+ messages in thread
From: Thomas Graf @ 2005-01-14 15:14 UTC (permalink / raw)
To: Patrick McHardy; +Cc: jamal, netdev
* Patrick McHardy <41E71CC4.3020102@trash.net> 2005-01-14 02:13
> True. nfctinfo is even useful for more, the direction of a connection
> might be interesting. connmark, conntrack counters, src-ip before SNAT
> etc. might also be interesting, but they are horrible to implement cleanly
> because any dependency on ip_conntrack_lock will automatically load
> ip_conntrack. Perhaps we should add something like nf_ct_get_afinfo() to
> return a set of conntrack operations to nf_conntrack.
Yes, I'm kind of afraid to create a too big dependency on netfilter,
so I sticked to the easly reachable values.
I think I'll remove the netfilter attributes again except for nfmark
and readd them together with the more complicated routing information
and the socket attributes to not delay the whole ematch patch for too long.
> For things beside the nf* fields: I think we should make it very clear
> that everything that isn't already visible to userspace in some way, and
> thus won't disappear (like priority, nfmark, load average ...), can get
> changed/removed any time.
Agreed.
> >The `dst` meta_value is the l_value/r_lvalue from em_meta_match and
> >never gets destroyed. I reused meta_data to store address & length.
> >It might be a good idea to make a new struct for this to make it
> >more readable though.
> >
> Looks good to me already. I only looked at the diff, so I didn't really
> follow the codepath.
I changed it anyway, it looks a lot cleaner now.
> Shouldn't be too hard to get right. In the kernel you can decide based
> on RTA_PAYLOAD. Userspace needs some other way to notice it is running
> as a 32-bit binary on a 64-bit kernel, but that's something you can't
> solve in the kernel anyway.
Both have their drawbacks but I think yours is a bit simpler. I changed
it to the code below and gave responsibility to userspace.
+ if (RTA_PAYLOAD(rta) >= sizeof(unsigned long)) {
+ dst->val = *(unsigned long *) RTA_DATA(rta);
+ dst->len = sizeof(unsigned long);
+ } else if (RTA_PAYLOAD(rta) == sizeof(u32)) {
+ dst->val = *(u32 *) RTA_DATA(rta);
+ dst->len = sizeof(u32);
+ } else
+ return -EINVAL;
Here's a revised patch. I fixed the numeric comparison issues and
added meta_obj instead of using meta_data to give a better impression
on the difference of a comparable object and meta data definitions.
diff -Nru linux-2.6.10-bk14.orig/include/linux/pkt_cls.h linux-2.6.10-bk14/include/linux/pkt_cls.h
--- linux-2.6.10-bk14.orig/include/linux/pkt_cls.h 2005-01-13 11:18:05.000000000 +0100
+++ linux-2.6.10-bk14/include/linux/pkt_cls.h 2005-01-13 11:17:52.000000000 +0100
@@ -352,6 +352,7 @@
TCF_EM_CMP,
TCF_EM_NBYTE,
TCF_EM_U32,
+ TCF_EM_META,
__TCF_EM_MAX
};
diff -Nru linux-2.6.10-bk14.orig/include/linux/tc_ematch/tc_em_meta.h linux-2.6.10-bk14/include/linux/tc_ematch/tc_em_meta.h
--- linux-2.6.10-bk14.orig/include/linux/tc_ematch/tc_em_meta.h 1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.10-bk14/include/linux/tc_ematch/tc_em_meta.h 2005-01-13 22:16:40.000000000 +0100
@@ -0,0 +1,72 @@
+#ifndef __LINUX_TC_EM_META_H
+#define __LINUX_TC_EM_META_H
+
+#include <linux/pkt_cls.h>
+
+enum
+{
+ TCA_EM_META_UNSPEC,
+ TCA_EM_META_HDR,
+ TCA_EM_META_LVALUE,
+ TCA_EM_META_RVALUE,
+ __TCA_EM_META_MAX
+};
+#define TCA_EM_META_MAX (__TCA_EM_META_MAX - 1)
+
+struct tcf_meta_val
+{
+ __u16 kind;
+ __u8 shift;
+ __u8 op;
+};
+
+#define TCF_META_TYPE_MASK (0xf << 12)
+#define TCF_META_TYPE(kind) (((kind) & TCF_META_TYPE_MASK) >> 12)
+#define TCF_META_ID_MASK 0x7ff
+#define TCF_META_ID(kind) ((kind) & TCF_META_ID_MASK)
+
+enum
+{
+ TCF_META_TYPE_VAR,
+ TCF_META_TYPE_INT,
+ __TCF_META_TYPE_MAX
+};
+#define TCF_META_TYPE_MAX (__TCF_META_TYPE_MAX - 1)
+
+enum
+{
+ TCF_META_ID_VALUE,
+ TCF_META_ID_RANDOM,
+ TCF_META_ID_LOADAVG_0,
+ TCF_META_ID_LOADAVG_1,
+ TCF_META_ID_LOADAVG_2,
+ TCF_META_ID_DEV,
+ TCF_META_ID_INDEV,
+ TCF_META_ID_REALDEV,
+ TCF_META_ID_PRIORITY,
+ TCF_META_ID_PROTOCOL,
+ TCF_META_ID_SECURITY,
+ TCF_META_ID_PKTTYPE,
+ TCF_META_ID_PKTLEN,
+ TCF_META_ID_DATALEN,
+ TCF_META_ID_MACLEN,
+ TCF_META_ID_NFMARK,
+ TCF_META_ID_NFCACHE,
+ TCF_META_ID_NFCTINFO,
+ TCF_META_ID_NFDEBUG,
+ TCF_META_ID_TCINDEX,
+ TCF_META_ID_TCVERDICT,
+ TCF_META_ID_TCCLASSID,
+ TCF_META_ID_RTCLASSID,
+ TCF_META_ID_RTIIF,
+ __TCF_META_ID_MAX
+};
+#define TCF_META_ID_MAX (__TCF_META_ID_MAX - 1)
+
+struct tcf_meta_hdr
+{
+ struct tcf_meta_val left;
+ struct tcf_meta_val right;
+};
+
+#endif
diff -Nru linux-2.6.10-bk14.orig/net/sched/Kconfig linux-2.6.10-bk14/net/sched/Kconfig
--- linux-2.6.10-bk14.orig/net/sched/Kconfig 2005-01-13 11:18:05.000000000 +0100
+++ linux-2.6.10-bk14/net/sched/Kconfig 2005-01-14 14:09:41.000000000 +0100
@@ -428,6 +428,17 @@
To compile this code as a module, choose M here: the
module will be called em_u32.
+config NET_EMATCH_META
+ tristate "Metadata"
+ depends on NET_EMATCH
+ ---help---
+ Say Y here if you want to be ablt to classify packets based on
+ metadata such as load average, netfilter attributes, socket
+ attributes and routing decisions.
+
+ To compile this code as a module, choose M here: the
+ module will be called em_meta.
+
config NET_CLS_ACT
bool "Packet ACTION"
depends on EXPERIMENTAL && NET_CLS && NET_QOS
diff -Nru linux-2.6.10-bk14.orig/net/sched/Makefile linux-2.6.10-bk14/net/sched/Makefile
--- linux-2.6.10-bk14.orig/net/sched/Makefile 2005-01-13 11:18:05.000000000 +0100
+++ linux-2.6.10-bk14/net/sched/Makefile 2005-01-13 11:17:52.000000000 +0100
@@ -37,3 +37,4 @@
obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o
obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o
obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o
+obj-$(CONFIG_NET_EMATCH_META) += em_meta.o
diff -Nru linux-2.6.10-bk14.orig/net/sched/em_meta.c linux-2.6.10-bk14/net/sched/em_meta.c
--- linux-2.6.10-bk14.orig/net/sched/em_meta.c 1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.10-bk14/net/sched/em_meta.c 2005-01-14 14:08:41.000000000 +0100
@@ -0,0 +1,587 @@
+/*
+ * net/sched/em_meta.c Metadata ematch
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Thomas Graf <tgraf@suug.ch>
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/skbuff.h>
+#include <linux/random.h>
+#include <linux/tc_ematch/tc_em_meta.h>
+#include <net/dst.h>
+#include <net/route.h>
+#include <net/pkt_cls.h>
+
+struct meta_obj
+{
+ unsigned long value;
+ unsigned int len;
+};
+
+struct meta_value
+{
+ struct tcf_meta_val hdr;
+ unsigned long val;
+ unsigned int len;
+};
+
+struct meta_match
+{
+ struct meta_value lvalue;
+ struct meta_value rvalue;
+};
+
+#define meta_id(value) (TCF_META_ID((value)->hdr.kind))
+#define meta_type(value) (TCF_META_TYPE((value)->hdr.kind))
+
+/**************************************************************************
+ * System status & misc
+ **************************************************************************/
+
+static int meta_int_random(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_obj *dst)
+{
+ get_random_bytes(&dst->value, sizeof(dst->value));
+ return 0;
+}
+
+static inline unsigned long fixed_loadavg(unsigned long v)
+{
+ return (v + (FIXED_1/200)) >> FSHIFT;
+}
+
+static int meta_int_loadavg_0(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_obj *dst)
+{
+ dst->value = fixed_loadavg(avenrun[0]);
+ return 0;
+}
+
+static int meta_int_loadavg_1(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_obj *dst)
+{
+ dst->value = fixed_loadavg(avenrun[1]);
+ return 0;
+}
+
+static int meta_int_loadavg_2(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_obj *dst)
+{
+ dst->value = fixed_loadavg(avenrun[2]);
+ return 0;
+}
+
+/**************************************************************************
+ * Device names & indices
+ **************************************************************************/
+
+static inline int int_dev(struct net_device *dev, struct meta_obj *dst)
+{
+ if (unlikely(dev == NULL))
+ return -1;
+
+ dst->value = dev->ifindex;
+ return 0;
+}
+
+static inline int var_dev(struct net_device *dev, struct meta_obj *dst)
+{
+ if (unlikely(dev == NULL))
+ return -1;
+
+ dst->value = (unsigned long) dev->name;
+ dst->len = strlen(dev->name);
+ return 0;
+}
+
+static int meta_int_dev(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_obj *dst)
+{
+ return int_dev(skb->dev, dst);
+}
+
+static int meta_var_dev(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_obj *dst)
+{
+ return var_dev(skb->dev, dst);
+}
+
+static int meta_int_indev(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_obj *dst)
+{
+ return int_dev(skb->input_dev, dst);
+}
+
+static int meta_var_indev(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_obj *dst)
+{
+ return var_dev(skb->input_dev, dst);
+}
+
+static int meta_int_realdev(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_obj *dst)
+{
+ return int_dev(skb->real_dev, dst);
+}
+
+static int meta_var_realdev(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_obj *dst)
+{
+ return var_dev(skb->real_dev, dst);
+}
+
+/**************************************************************************
+ * skb attributes
+ **************************************************************************/
+
+static int meta_int_priority(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_obj *dst)
+{
+ dst->value = skb->priority;
+ return 0;
+}
+
+static int meta_int_protocol(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_obj *dst)
+{
+ /* Let userspace take care of the byte ordering */
+ dst->value = skb->protocol;
+ return 0;
+}
+
+static int meta_int_security(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_obj *dst)
+{
+ dst->value = skb->security;
+ return 0;
+}
+
+static int meta_int_pkttype(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_obj *dst)
+{
+ dst->value = skb->pkt_type;
+ return 0;
+}
+
+static int meta_int_pktlen(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_obj *dst)
+{
+ dst->value = skb->len;
+ return 0;
+}
+
+static int meta_int_datalen(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_obj *dst)
+{
+ dst->value = skb->data_len;
+ return 0;
+}
+
+static int meta_int_maclen(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_obj *dst)
+{
+ dst->value = skb->mac_len;
+ return 0;
+}
+
+/**************************************************************************
+ * Netfilter
+ **************************************************************************/
+
+#ifdef CONFIG_NETFILTER
+static int meta_int_nfmark(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_obj *dst)
+{
+ dst->value = skb->nfmark;
+ return 0;
+}
+
+static int meta_int_nfcache(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_obj *dst)
+{
+ dst->value = skb->nfcache;
+ return 0;
+}
+
+static int meta_int_nfctinfo(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_obj *dst)
+{
+ dst->value = skb->nfctinfo;
+ return 0;
+}
+
+#ifdef CONFIG_NETFILTER_DEBUG
+static int meta_int_nfdebug(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_obj *dst)
+{
+ dst->value = skb->nf_debug;
+ return 0;
+}
+#endif
+#endif
+
+/**************************************************************************
+ * Traffic Control
+ **************************************************************************/
+
+static int meta_int_tcindex(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_obj *dst)
+{
+ dst->value = skb->tc_index;
+ return 0;
+}
+
+#ifdef CONFIG_NET_CLS_ACT
+static int meta_int_tcverd(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_obj *dst)
+{
+ dst->value = skb->tc_verd;
+ return 0;
+}
+
+static int meta_int_tcclassid(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_obj *dst)
+{
+ dst->value = skb->tc_classid;
+ return 0;
+}
+#endif
+
+/**************************************************************************
+ * Routing
+ **************************************************************************/
+
+#ifdef CONFIG_NET_CLS_ROUTE
+static int meta_int_rtclassid(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_obj *dst)
+{
+ if (unlikely(skb->dst == NULL))
+ return -1;
+
+ dst->value = skb->dst->tclassid;
+ return 0;
+}
+#endif
+
+static int meta_int_rtiif(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_obj *dst)
+{
+ if (unlikely(skb->dst == NULL))
+ return -1;
+
+ dst->value = ((struct rtable*) skb->dst)->fl.iif;
+ return 0;
+}
+
+struct meta_ops
+{
+ int (*get)(struct sk_buff *, struct tcf_pkt_info *,
+ struct meta_value *, struct meta_obj *);
+};
+
+static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
+ [TCF_META_TYPE_VAR] = {
+ [TCF_META_ID_DEV] = { .get = meta_var_dev },
+ [TCF_META_ID_INDEV] = { .get = meta_var_indev },
+ [TCF_META_ID_REALDEV] = { .get = meta_var_realdev }
+ },
+ [TCF_META_TYPE_INT] = {
+ [TCF_META_ID_RANDOM] = { .get = meta_int_random },
+ [TCF_META_ID_LOADAVG_0] = { .get = meta_int_loadavg_0 },
+ [TCF_META_ID_LOADAVG_1] = { .get = meta_int_loadavg_1 },
+ [TCF_META_ID_LOADAVG_2] = { .get = meta_int_loadavg_2 },
+ [TCF_META_ID_DEV] = { .get = meta_int_dev },
+ [TCF_META_ID_INDEV] = { .get = meta_int_indev },
+ [TCF_META_ID_REALDEV] = { .get = meta_int_realdev },
+ [TCF_META_ID_PRIORITY] = { .get = meta_int_priority },
+ [TCF_META_ID_PROTOCOL] = { .get = meta_int_protocol },
+ [TCF_META_ID_SECURITY] = { .get = meta_int_security },
+ [TCF_META_ID_PKTTYPE] = { .get = meta_int_pkttype },
+ [TCF_META_ID_PKTLEN] = { .get = meta_int_pktlen },
+ [TCF_META_ID_DATALEN] = { .get = meta_int_datalen },
+ [TCF_META_ID_MACLEN] = { .get = meta_int_maclen },
+#ifdef CONFIG_NETFILTER
+ [TCF_META_ID_NFMARK] = { .get = meta_int_nfmark },
+ [TCF_META_ID_NFCACHE] = { .get = meta_int_nfcache },
+ [TCF_META_ID_NFCTINFO] = { .get = meta_int_nfctinfo },
+#ifdef CONFIG_NETFILTER_DEBUG
+ [TCF_META_ID_NFDEBUG] = { .get = meta_int_nfdebug },
+#endif
+#endif
+ [TCF_META_ID_TCINDEX] = { .get = meta_int_tcindex },
+#ifdef CONFIG_NET_CLS_ACT
+ [TCF_META_ID_TCVERDICT] = { .get = meta_int_tcverd },
+ [TCF_META_ID_TCCLASSID] = { .get = meta_int_tcclassid },
+#endif
+#ifdef CONFIG_NET_CLS_ROUTE
+ [TCF_META_ID_RTCLASSID] = { .get = meta_int_rtclassid },
+#endif
+ [TCF_META_ID_RTIIF] = { .get = meta_int_rtiif }
+ }
+};
+
+static inline struct meta_ops * meta_ops(struct meta_value *v)
+{
+ return &__meta_ops[meta_type(v)][meta_id(v)];
+}
+
+static int meta_var_compare(struct meta_obj *a, struct meta_obj *b)
+{
+ int r = a->len - b->len;
+
+ if (r == 0)
+ r = memcmp((void *) a->value, (void *) b->value, a->len);
+
+ return r;
+}
+
+static int meta_var_change(struct meta_value *dst, struct rtattr *rta)
+{
+ int len = RTA_PAYLOAD(rta);
+
+ dst->val = (unsigned long) kmalloc(len, GFP_KERNEL);
+ if (dst->val == 0UL)
+ return -ENOMEM;
+ memcpy((void *) dst->val, RTA_DATA(rta), len);
+ dst->len = len;
+ return 0;
+}
+
+static void meta_var_destroy(struct meta_value *v)
+{
+ kfree((void *) v->val);
+}
+
+static void meta_var_apply_extras(struct meta_value *v,
+ struct meta_obj *dst)
+{
+ int shift = v->hdr.shift;
+
+ if (shift && shift < dst->len)
+ dst->len -= shift;
+}
+
+static int meta_int_compare(struct meta_obj *a, struct meta_obj *b)
+{
+ /* Let gcc optimize it, the unlikely is not really based on
+ * some numbers but jump free code for missmatches seems
+ * more logical.
+ */
+ if (unlikely(a == b))
+ return 0;
+ else if (a < b)
+ return -1;
+ else
+ return 1;
+}
+
+static int meta_int_change(struct meta_value *dst, struct rtattr *rta)
+{
+ if (RTA_PAYLOAD(rta) >= sizeof(unsigned long)) {
+ dst->val = *(unsigned long *) RTA_DATA(rta);
+ dst->len = sizeof(unsigned long);
+ } else if (RTA_PAYLOAD(rta) == sizeof(u32)) {
+ dst->val = *(u32 *) RTA_DATA(rta);
+ dst->len = sizeof(u32);
+ } else
+ return -EINVAL;
+
+ return 0;
+}
+
+static void meta_int_apply_extras(struct meta_value *v,
+ struct meta_obj *dst)
+{
+ if (v->hdr.shift)
+ dst->value >>= v->hdr.shift;
+
+ if (v->val)
+ dst->value &= v->val;
+}
+
+struct meta_type_ops
+{
+ void (*destroy)(struct meta_value *);
+ int (*compare)(struct meta_obj *, struct meta_obj *);
+ int (*change)(struct meta_value *, struct rtattr *);
+ void (*apply_extras)(struct meta_value *, struct meta_obj *);
+};
+
+static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX+1] = {
+ [TCF_META_TYPE_VAR] = {
+ .destroy = meta_var_destroy,
+ .compare = meta_var_compare,
+ .change = meta_var_change,
+ .apply_extras = meta_var_apply_extras
+ },
+ [TCF_META_TYPE_INT] = {
+ .compare = meta_int_compare,
+ .change = meta_int_change,
+ .apply_extras = meta_int_apply_extras
+ }
+};
+
+static inline struct meta_type_ops * meta_type_ops(struct meta_value *v)
+{
+ return &__meta_type_ops[meta_type(v)];
+}
+
+static inline int meta_get(struct sk_buff *skb, struct tcf_pkt_info *info,
+ struct meta_value *v, struct meta_obj *dst)
+{
+ int err;
+
+ if (meta_id(v) == TCF_META_ID_VALUE) {
+ dst->value = v->val;
+ dst->len = v->len;
+ return 0;
+ }
+
+ err = meta_ops(v)->get(skb, info, v, dst);
+ if (err < 0)
+ return err;
+
+ if (meta_type_ops(v)->apply_extras)
+ meta_type_ops(v)->apply_extras(v, dst);
+
+ return 0;
+}
+
+static int em_meta_match(struct sk_buff *skb, struct tcf_ematch *m,
+ struct tcf_pkt_info *info)
+{
+ int r;
+ struct meta_match *meta = (struct meta_match *) m->data;
+ struct meta_obj l_value, r_value;
+
+ if (meta_get(skb, info, &meta->lvalue, &l_value) < 0 ||
+ meta_get(skb, info, &meta->rvalue, &r_value) < 0)
+ return 0;
+
+ r = meta_type_ops(&meta->lvalue)->compare(&l_value, &r_value);
+
+ switch (meta->lvalue.hdr.op) {
+ case TCF_EM_OPND_EQ:
+ return !r;
+ case TCF_EM_OPND_LT:
+ return r < 0;
+ case TCF_EM_OPND_GT:
+ return r > 0;
+ }
+
+ return 0;
+}
+
+static inline void meta_delete(struct meta_match *meta)
+{
+ struct meta_type_ops *ops = meta_type_ops(&meta->lvalue);
+
+ if (ops && ops->destroy) {
+ ops->destroy(&meta->lvalue);
+ ops->destroy(&meta->rvalue);
+ }
+
+ kfree(meta);
+}
+
+static inline int meta_change_data(struct meta_value *dst, struct rtattr *rta)
+{
+ if (rta) {
+ if (RTA_PAYLOAD(rta) == 0)
+ return -EINVAL;
+
+ return meta_type_ops(dst)->change(dst, rta);
+ }
+
+ return 0;
+}
+
+static int em_meta_change(struct tcf_proto *tp, void *data, int len,
+ struct tcf_ematch *m)
+{
+ int err = -EINVAL;
+ struct rtattr *tb[TCA_EM_META_MAX];
+ struct tcf_meta_hdr *hdr;
+ struct meta_match *meta = NULL;
+
+ if (rtattr_parse(tb, TCA_EM_META_MAX, data, len) < 0)
+ goto errout;
+
+ if (tb[TCA_EM_META_HDR-1] == NULL ||
+ RTA_PAYLOAD(tb[TCA_EM_META_HDR-1]) < sizeof(*hdr))
+ goto errout;
+ hdr = RTA_DATA(tb[TCA_EM_META_HDR-1]);
+
+ if (TCF_META_TYPE(hdr->left.kind) != TCF_META_TYPE(hdr->right.kind) ||
+ TCF_META_TYPE(hdr->left.kind) > TCF_META_TYPE_MAX ||
+ TCF_META_ID(hdr->left.kind) > TCF_META_ID_MAX ||
+ TCF_META_ID(hdr->right.kind) > TCF_META_ID_MAX)
+ goto errout;
+
+ meta = kmalloc(sizeof(*meta), GFP_KERNEL);
+ if (meta == NULL)
+ goto errout;
+ memset(meta, 0, sizeof(*meta));
+
+ memcpy(&meta->lvalue.hdr, &hdr->left, sizeof(hdr->left));
+ memcpy(&meta->rvalue.hdr, &hdr->right, sizeof(hdr->right));
+
+ if (meta_ops(&meta->lvalue)->get == NULL ||
+ meta_ops(&meta->rvalue)->get == NULL) {
+ err = -EOPNOTSUPP;
+ goto errout;
+ }
+
+ if (meta_change_data(&meta->lvalue, tb[TCA_EM_META_LVALUE-1]) < 0 ||
+ meta_change_data(&meta->rvalue, tb[TCA_EM_META_RVALUE-1]) < 0)
+ goto errout;
+
+ m->datalen = sizeof(*meta);
+ m->data = (unsigned long) meta;
+
+ err = 0;
+errout:
+ if (err && meta)
+ meta_delete(meta);
+ return err;
+}
+
+static void em_meta_destroy(struct tcf_proto *tp, struct tcf_ematch *m)
+{
+ meta_delete((struct meta_match *) m->data);
+}
+
+static struct tcf_ematch_ops em_meta_ops = {
+ .kind = TCF_EM_META,
+ .change = em_meta_change,
+ .match = em_meta_match,
+ .destroy = em_meta_destroy,
+ .owner = THIS_MODULE,
+ .link = LIST_HEAD_INIT(em_meta_ops.link)
+};
+
+static int __init init_em_meta(void)
+{
+ return tcf_em_register(&em_meta_ops);
+}
+
+static void __exit exit_em_meta(void)
+{
+ tcf_em_unregister(&em_meta_ops);
+}
+
+MODULE_LICENSE("GPL");
+
+module_init(init_em_meta);
+module_exit(exit_em_meta);
+
^ permalink raw reply [flat|nested] 44+ messages in thread* Re: [RFC] meta ematch
2005-01-14 15:14 ` Thomas Graf
@ 2005-01-16 14:58 ` jamal
2005-01-16 15:09 ` Thomas Graf
2005-01-16 16:11 ` jamal
1 sibling, 1 reply; 44+ messages in thread
From: jamal @ 2005-01-16 14:58 UTC (permalink / raw)
To: Thomas Graf; +Cc: Patrick McHardy, netdev
On Fri, 2005-01-14 at 10:14, Thomas Graf wrote:
>
> Here's a revised patch. I fixed the numeric comparison issues and
> added meta_obj instead of using meta_data to give a better impression
> on the difference of a comparable object and meta data definitions.
I scanned the code very quickly; lets start with the big picture then i
will send some more comments:
Did i understand this correctly that a metamatch MUST have a lvalue +
rvalue pair?
What if all i wanted to say was
..
ematch indev eth0
I only see one value there - does that become l or r?
more comments coming - just ned some caffeine then i will stare.
cheers,
jamal
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [RFC] meta ematch
2005-01-16 14:58 ` jamal
@ 2005-01-16 15:09 ` Thomas Graf
2005-01-16 15:37 ` jamal
0 siblings, 1 reply; 44+ messages in thread
From: Thomas Graf @ 2005-01-16 15:09 UTC (permalink / raw)
To: jamal; +Cc: Patrick McHardy, netdev
* jamal <1105887519.1097.597.camel@jzny.localdomain> 2005-01-16 09:58
> On Fri, 2005-01-14 at 10:14, Thomas Graf wrote:
> >
> > Here's a revised patch. I fixed the numeric comparison issues and
> > added meta_obj instead of using meta_data to give a better impression
> > on the difference of a comparable object and meta data definitions.
>
> I scanned the code very quickly; lets start with the big picture then i
> will send some more comments:
> Did i understand this correctly that a metamatch MUST have a lvalue +
> rvalue pair?
They MAY have bove.
> What if all i wanted to say was
> ..
> ematch indev eth0
>
The lvalue will be TCF_META_ID_INDEV and your rvalue will be
TCF_META_TYPE_VAR with "eth0" as payload(TCF_EM_META_RVALUE).
TCF_EM_META_LVALUE will be unused in this case.
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [RFC] meta ematch
2005-01-16 15:09 ` Thomas Graf
@ 2005-01-16 15:37 ` jamal
2005-01-16 15:57 ` Thomas Graf
0 siblings, 1 reply; 44+ messages in thread
From: jamal @ 2005-01-16 15:37 UTC (permalink / raw)
To: Thomas Graf; +Cc: Patrick McHardy, netdev
On Sun, 2005-01-16 at 10:09, Thomas Graf wrote:
> The lvalue will be TCF_META_ID_INDEV and your rvalue will be
> TCF_META_TYPE_VAR with "eth0" as payload(TCF_EM_META_RVALUE).
> TCF_EM_META_LVALUE will be unused in this case.
ok - i get it. So the rvalue is basically just the data that needs to be
compared against. rvalue confused me a little. If you had called it
meta_data i would have got it right away. But now that you explain it,
makes sense.
I am not sure iam following yet:
So in the case of indev, you would need to
- get indev ifindex from skb
- get indev name from skb
- compare the two??
Actually it may be a little overkill to have those two as separate
entities with their own headers etc, no? Why not just store it in the
same fashion you transported it from/to user space?
I will start looking at the code
cheers,
jamal
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [RFC] meta ematch
2005-01-16 15:37 ` jamal
@ 2005-01-16 15:57 ` Thomas Graf
2005-01-16 16:19 ` jamal
0 siblings, 1 reply; 44+ messages in thread
From: Thomas Graf @ 2005-01-16 15:57 UTC (permalink / raw)
To: jamal; +Cc: Patrick McHardy, netdev
* jamal <1105889874.1090.613.camel@jzny.localdomain> 2005-01-16 10:37
> On Sun, 2005-01-16 at 10:09, Thomas Graf wrote:
>
> > The lvalue will be TCF_META_ID_INDEV and your rvalue will be
> > TCF_META_TYPE_VAR with "eth0" as payload(TCF_EM_META_RVALUE).
> > TCF_EM_META_LVALUE will be unused in this case.
>
> ok - i get it. So the rvalue is basically just the data that needs to be
> compared against. rvalue confused me a little. If you had called it
> meta_data i would have got it right away. But now that you explain it,
> makes sense.
The rvalue may also point to a metadata in the kernel. This gets
useful when comparing dev against real dev or if nfmark, tcindex,
you name it carries a ifindex for example. It would even be possible
to compare two strings from userspace but that wouldn't make sense.
The only difference between lvalue and rvalue is that the lvalue
carries the operand.
> I am not sure iam following yet:
>
> So in the case of indev, you would need to
> - get indev ifindex from skb
> - get indev name from skb
> - compare the two??
>
> Actually it may be a little overkill to have those two as separate
> entities with their own headers etc, no? Why not just store it in the
> same fashion you transported it from/to user space?
For devices, userspace can choose between comparing indices or
device names. The variable type will get more use once I add
the IPv6 routing meta matches.
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [RFC] meta ematch
2005-01-16 15:57 ` Thomas Graf
@ 2005-01-16 16:19 ` jamal
2005-01-16 16:49 ` Thomas Graf
0 siblings, 1 reply; 44+ messages in thread
From: jamal @ 2005-01-16 16:19 UTC (permalink / raw)
To: Thomas Graf; +Cc: Patrick McHardy, netdev
On Sun, 2005-01-16 at 10:57, Thomas Graf wrote:
> The rvalue may also point to a metadata in the kernel. This gets
> useful when comparing dev against real dev or if nfmark, tcindex,
> you name it carries a ifindex for example. It would even be possible
> to compare two strings from userspace but that wouldn't make sense.
> The only difference between lvalue and rvalue is that the lvalue
> carries the operand.
>
ok, more clarity.
>
> > I am not sure iam following yet:
> >
> > So in the case of indev, you would need to
> > - get indev ifindex from skb
> > - get indev name from skb
> > - compare the two??
> >
Can you explain the above in context of indev = "eth0"? I am still not
sure i get it:
+ if (meta_get(skb, info, &meta->lvalue, &l_value) < 0 ||
+ meta_get(skb, info, &meta->rvalue, &r_value) < 0)
+ return 0;
+ r = meta_type_ops(&meta->lvalue)->compare(&l_value, &r_value);
cheers,
jamal
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [RFC] meta ematch
2005-01-16 16:19 ` jamal
@ 2005-01-16 16:49 ` Thomas Graf
0 siblings, 0 replies; 44+ messages in thread
From: Thomas Graf @ 2005-01-16 16:49 UTC (permalink / raw)
To: jamal; +Cc: Patrick McHardy, netdev
* jamal <1105892360.1091.655.camel@jzny.localdomain> 2005-01-16 11:19
> > > I am not sure iam following yet:
> > >
> > > So in the case of indev, you would need to
> > > - get indev ifindex from skb
> > > - get indev name from skb
> > > - compare the two??
> > >
>
> Can you explain the above in context of indev = "eth0"? I am still not
> sure i get it:
>
> + if (meta_get(skb, info, &meta->lvalue, &l_value) < 0 ||
> + meta_get(skb, info, &meta->rvalue, &r_value) < 0)
> + return 0;
> + r = meta_type_ops(&meta->lvalue)->compare(&l_value, &r_value);
Sure, indev = "eth0"
TCA_EM_META_HDR = {
.left = {
.kind = TCF_META_TYPE_VAR << 12 | TCF_META_ID_INDEV,
.op = TCF_EM_EQ,
},
.right = {
.kind = TCF_META_TYPE_VAR << 12 | TCF_META_ID_VALUE,
},
};
TCA_EM_META_LVALUE = EMPTY; /* unused */
TCA_EM_META_RVALUE = "eth0"
The configuration part will transform this into:
struct meta_match = {
.lvalue = {
.hdr = {
.kind = TCF_META_TYPE_VAR << 12 | TCF_META_ID_INDEV,
.op = TCF_EM_EQ,
},
},
.rvalue = {
.hdr = {
.kind = TCF_META_TYPE_VAR << 12 | TCF_META_ID_VALUE,
}
.val = strdup("eth0"),
.len = strlen("eth0"),
},
}
meta_get will make meta_obj's out of it:
struct meta_obj l_value = {
.value = dev->name,
.len = strlen(dev->name),
},
struct meta_obj r_value = {
.value = "eth0",
.len = strlen("eth0"),
}
the type specific compare will compare these.
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [RFC] meta ematch
2005-01-14 15:14 ` Thomas Graf
2005-01-16 14:58 ` jamal
@ 2005-01-16 16:11 ` jamal
2005-01-16 16:32 ` Thomas Graf
2005-01-16 16:32 ` Patrick McHardy
1 sibling, 2 replies; 44+ messages in thread
From: jamal @ 2005-01-16 16:11 UTC (permalink / raw)
To: Thomas Graf; +Cc: Patrick McHardy, netdev
On Fri, 2005-01-14 at 10:14, Thomas Graf wrote:
> diff -Nru linux-2.6.10-bk14.orig/include/linux/tc_ematch/tc_em_meta.h linux-2.6.10-bk14/include/linux/tc_ematch/tc_em_meta.h
[..]
> +struct tcf_meta_val
> +{
> + __u16 kind;
> + __u8 shift;
> + __u8 op;
> +};
> +
> +#define TCF_META_TYPE_MASK (0xf << 12)
> +#define TCF_META_TYPE(kind) (((kind) & TCF_META_TYPE_MASK) >> 12)
> +#define TCF_META_ID_MASK 0x7ff
> +#define TCF_META_ID(kind) ((kind) & TCF_META_ID_MASK)
Does it smell like there may be endianess issues? Probably not.
> +enum
> +{
> + TCF_META_TYPE_VAR,
> + TCF_META_TYPE_INT,
> + __TCF_META_TYPE_MAX
> +};
> +#define TCF_META_TYPE_MAX (__TCF_META_TYPE_MAX - 1)
> +
> +enum
> +{
> + TCF_META_ID_VALUE,
> + TCF_META_ID_RANDOM,
> + TCF_META_ID_LOADAVG_0,
> + TCF_META_ID_LOADAVG_1,
> + TCF_META_ID_LOADAVG_2,
> + TCF_META_ID_DEV,
Since filters are attached to devices - is TCF_META_ID_DEV of any value?
> diff -Nru linux-2.6.10-bk14.orig/net/sched/em_meta.c linux-2.6.10-bk14/net/sched/em_meta.c
> +
> +struct meta_obj
> +{
> + unsigned long value;
> + unsigned int len;
> +};
> +
> +struct meta_value
> +{
> + struct tcf_meta_val hdr;
> + unsigned long val;
> + unsigned int len;
Those last two look like meta_obj you defined above
> +/**************************************************************************
> + * System status & misc
> + **************************************************************************/
> +
> +static int meta_int_random(struct sk_buff *skb, struct tcf_pkt_info *info,
> + struct meta_value *v, struct meta_obj *dst)
> +{
> + get_random_bytes(&dst->value, sizeof(dst->value));
> + return 0;
> +}
> +
> +static inline unsigned long fixed_loadavg(unsigned long v)
> +{
> + return (v + (FIXED_1/200)) >> FSHIFT;
200 has some magic connotation to it - a define somewhere perhaps?
> +}
> +
> +static int meta_int_loadavg_0(struct sk_buff *skb, struct tcf_pkt_info *info,
> + struct meta_value *v, struct meta_obj *dst)
> +{
> + dst->value = fixed_loadavg(avenrun[0]);
> + return 0;
> +}
> +
> +static int meta_int_loadavg_1(struct sk_buff *skb, struct tcf_pkt_info *info,
> + struct meta_value *v, struct meta_obj *dst)
> +{
> + dst->value = fixed_loadavg(avenrun[1]);
> + return 0;
> +}
> +
> +static int meta_int_loadavg_2(struct sk_buff *skb, struct tcf_pkt_info *info,
> + struct meta_value *v, struct meta_obj *dst)
> +{
> + dst->value = fixed_loadavg(avenrun[2]);
> + return 0;
> +}
Theres a lot of parameters not used at all in these calls .get calls. So
far i have seen dst->value and some of the skb fields used. I apologize,
I normally dont pick on these things - so if you have future plans for
why you are passing those, keep them and ignore the comment.
BTW, it would probably be useful to return some mnemonic instead of 0.
> +/**************************************************************************
> + * Device names & indices
> + **************************************************************************/
> +
> +static inline int int_dev(struct net_device *dev, struct meta_obj *dst)
> +{
> + if (unlikely(dev == NULL))
> + return -1;
> +
> + dst->value = dev->ifindex;
> + return 0;
> +}
> +
> +static inline int var_dev(struct net_device *dev, struct meta_obj *dst)
> +{
> + if (unlikely(dev == NULL))
> + return -1;
> +
> + dst->value = (unsigned long) dev->name;
> + dst->len = strlen(dev->name);
So if device dissapears ... what happens to the pointer?
[..]
> +static inline struct meta_ops * meta_ops(struct meta_value *v)
> +{
> + return &__meta_ops[meta_type(v)][meta_id(v)];
> +}
> +
> +static int meta_var_compare(struct meta_obj *a, struct meta_obj *b)
> +{
> + int r = a->len - b->len;
> +
> + if (r == 0)
> + r = memcmp((void *) a->value, (void *) b->value, a->len);
> +
> + return r;
> +}
clever
> +static void meta_var_apply_extras(struct meta_value *v,
> + struct meta_obj *dst)
> +{
> + int shift = v->hdr.shift;
> +
> + if (shift && shift < dst->len)
> + dst->len -= shift;
> +}
whats the purpose to the extras?
> +static int meta_int_compare(struct meta_obj *a, struct meta_obj *b)
> +{
> + /* Let gcc optimize it, the unlikely is not really based on
> + * some numbers but jump free code for missmatches seems
> + * more logical.
> + */
> + if (unlikely(a == b))
> + return 0;
> + else if (a < b)
> + return -1;
> + else
> + return 1;
> +}
Would be very useful to return mnemonics for readability.
> +static int em_meta_match(struct sk_buff *skb, struct tcf_ematch *m,
> + struct tcf_pkt_info *info)
> +{
> + int r;
> + struct meta_match *meta = (struct meta_match *) m->data;
> + struct meta_obj l_value, r_value;
> +
> + if (meta_get(skb, info, &meta->lvalue, &l_value) < 0 ||
> + meta_get(skb, info, &meta->rvalue, &r_value) < 0)
> + return 0;
This is one part that confused me in my earlier email
> + r = meta_type_ops(&meta->lvalue)->compare(&l_value, &r_value);
And this is where it started
Overall comment: Well done
usability comment:
Ok, I have to admit I am not a friend of too-friendly, which is one of
the faults IMO with netfilter; however, this would have
joenetfilterfireman sweat a little too profusely.
I think you could add a new metafield in 31 seconds. I could probably do
it in 95 seconds. Would be ideal to get average
janeorjoenetfilterfireman to do it in 30 minutes - I dont think you are
there.
cheers,
jamal
^ permalink raw reply [flat|nested] 44+ messages in thread* Re: [RFC] meta ematch
2005-01-16 16:11 ` jamal
@ 2005-01-16 16:32 ` Thomas Graf
2005-01-16 17:18 ` jamal
2005-01-16 16:32 ` Patrick McHardy
1 sibling, 1 reply; 44+ messages in thread
From: Thomas Graf @ 2005-01-16 16:32 UTC (permalink / raw)
To: jamal; +Cc: Patrick McHardy, netdev
* jamal <1105891871.1097.647.camel@jzny.localdomain> 2005-01-16 11:11
> On Fri, 2005-01-14 at 10:14, Thomas Graf wrote:
>
> > diff -Nru linux-2.6.10-bk14.orig/include/linux/tc_ematch/tc_em_meta.h linux-2.6.10-bk14/include/linux/tc_ematch/tc_em_meta.h
> [..]
> > +struct tcf_meta_val
> > +{
> > + __u16 kind;
> > + __u8 shift;
> > + __u8 op;
> > +};
> > +
> > +#define TCF_META_TYPE_MASK (0xf << 12)
> > +#define TCF_META_TYPE(kind) (((kind) & TCF_META_TYPE_MASK) >> 12)
> > +#define TCF_META_ID_MASK 0x7ff
> > +#define TCF_META_ID(kind) ((kind) & TCF_META_ID_MASK)
>
> Does it smell like there may be endianess issues? Probably not.
Not really as long as iproute2 uses the same byte ordering. It has the
same issues as all other rtnetlink users.
> > + TCF_META_ID_DEV,
>
> Since filters are attached to devices - is TCF_META_ID_DEV of any value?
Yes, to compare against realdev and indev.
> > +struct meta_value
> > +{
> > + struct tcf_meta_val hdr;
> > + unsigned long val;
> > + unsigned int len;
>
> Those last two look like meta_obj you defined above
Yes, they once were but the code is more readable this way
because one cannot mistake meta_obj (temporary data) with
the data/len in meta_value (persistent).
> > + return (v + (FIXED_1/200)) >> FSHIFT;
>
> 200 has some magic connotation to it - a define somewhere perhaps?
I coped this from the code for procfs ;->
> Theres a lot of parameters not used at all in these calls .get calls. So
> far i have seen dst->value and some of the skb fields used. I apologize,
> I normally dont pick on these things - so if you have future plans for
> why you are passing those, keep them and ignore the comment.
I do have plans but for more complicated meta matches and they will
take some time and will get pushed in a second iteration.
> > +static inline int var_dev(struct net_device *dev, struct meta_obj *dst)
> > +{
> > + if (unlikely(dev == NULL))
> > + return -1;
> > +
> > + dst->value = (unsigned long) dev->name;
> > + dst->len = strlen(dev->name);
>
> So if device dissapears ... what happens to the pointer?
Not possible, the pointer is only legal to use while classifying,
dst is on the stack of meta_match and the device cannot
disappear while classyfing.
> > +static void meta_var_apply_extras(struct meta_value *v,
> > + struct meta_obj *dst)
> > +{
> > + int shift = v->hdr.shift;
> > +
> > + if (shift && shift < dst->len)
> > + dst->len -= shift;
> > +}
>
> whats the purpose to the extras?
"eth0" shift 1 gets "eth" so we can emulate eth%. It will
also get useful for ipv6 routing bits to implement the
prefix.
> > +static int em_meta_match(struct sk_buff *skb, struct tcf_ematch *m,
> > + struct tcf_pkt_info *info)
> > +{
> > + int r;
> > + struct meta_match *meta = (struct meta_match *) m->data;
> > + struct meta_obj l_value, r_value;
> > +
> > + if (meta_get(skb, info, &meta->lvalue, &l_value) < 0 ||
> > + meta_get(skb, info, &meta->rvalue, &r_value) < 0)
> > + return 0;
>
> This is one part that confused me in my earlier email
It transforms the meta data info in (l|r)value into 2 temporary
meta objects for comparing. mgiht help if i find a better
name.
> Ok, I have to admit I am not a friend of too-friendly, which is one of
> the faults IMO with netfilter; however, this would have
> joenetfilterfireman sweat a little too profusely.
> I think you could add a new metafield in 31 seconds. I could probably do
> it in 95 seconds. Would be ideal to get average
> janeorjoenetfilterfireman to do it in 30 minutes - I dont think you are
> there.
ideas? a step-by-step guide in Documentation/? ;->
^ permalink raw reply [flat|nested] 44+ messages in thread* Re: [RFC] meta ematch
2005-01-16 16:32 ` Thomas Graf
@ 2005-01-16 17:18 ` jamal
2005-01-16 18:47 ` Thomas Graf
0 siblings, 1 reply; 44+ messages in thread
From: jamal @ 2005-01-16 17:18 UTC (permalink / raw)
To: Thomas Graf; +Cc: Patrick McHardy, netdev
On Sun, 2005-01-16 at 11:32, Thomas Graf wrote:
> * jamal <1105891871.1097.647.camel@jzny.localdomain> 2005-01-16 11:11
> > On Fri, 2005-01-14 at 10:14, Thomas Graf wrote:
> >
> > > diff -Nru linux-2.6.10-bk14.orig/include/linux/tc_ematch/tc_em_meta.h linux-2.6.10-bk14/include/linux/tc_ematch/tc_em_meta.h
> > [..]
> > > +struct tcf_meta_val
> > > +{
> > > + __u16 kind;
> > > + __u8 shift;
> > > + __u8 op;
> > > +};
> > > +
> > > +#define TCF_META_TYPE_MASK (0xf << 12)
> > > +#define TCF_META_TYPE(kind) (((kind) & TCF_META_TYPE_MASK) >> 12)
> > > +#define TCF_META_ID_MASK 0x7ff
> > > +#define TCF_META_ID(kind) ((kind) & TCF_META_ID_MASK)
> >
> > Does it smell like there may be endianess issues? Probably not.
>
> Not really as long as iproute2 uses the same byte ordering. It has the
> same issues as all other rtnetlink users.
wont harm to do a quick test if you have hardware. pedit for example
still has some occasional issues some issues with big endian which i
havent had time to chase.
> > > + TCF_META_ID_DEV,
> >
> > Since filters are attached to devices - is TCF_META_ID_DEV of any value?
>
> Yes, to compare against realdev and indev.
makes sense
> > > +struct meta_value
> > > +{
> > > + struct tcf_meta_val hdr;
> > > + unsigned long val;
> > > + unsigned int len;
> >
> > Those last two look like meta_obj you defined above
>
> Yes, they once were but the code is more readable this way
> because one cannot mistake meta_obj (temporary data) with
> the data/len in meta_value (persistent).
>
fine
> > > + return (v + (FIXED_1/200)) >> FSHIFT;
> >
> > 200 has some magic connotation to it - a define somewhere perhaps?
>
> I coped this from the code for procfs ;->
know why they have that number? It must have some significance - or
maybe someone just stuck their hand in the air and measured 200? ;->
> > Theres a lot of parameters not used at all in these calls .get calls. So
> > far i have seen dst->value and some of the skb fields used. I apologize,
> > I normally dont pick on these things - so if you have future plans for
> > why you are passing those, keep them and ignore the comment.
>
> I do have plans but for more complicated meta matches and they will
> take some time and will get pushed in a second iteration.
>
np
> > > +static inline int var_dev(struct net_device *dev, struct meta_obj *dst)
> > > +{
> > > + if (unlikely(dev == NULL))
> > > + return -1;
> > > +
> > > + dst->value = (unsigned long) dev->name;
> > > + dst->len = strlen(dev->name);
> >
> > So if device dissapears ... what happens to the pointer?
>
> Not possible, the pointer is only legal to use while classifying,
> dst is on the stack of meta_match and the device cannot
> disappear while classyfing.
makes sense then
> > > +static void meta_var_apply_extras(struct meta_value *v,
> > > + struct meta_obj *dst)
> > > +{
> > > + int shift = v->hdr.shift;
> > > +
> > > + if (shift && shift < dst->len)
> > > + dst->len -= shift;
> > > +}
> >
> > whats the purpose to the extras?
>
> "eth0" shift 1 gets "eth" so we can emulate eth%. It will
> also get useful for ipv6 routing bits to implement the
> prefix.
>
makes sense - ala ppp*
> > > +static int em_meta_match(struct sk_buff *skb, struct tcf_ematch *m,
> > > + struct tcf_pkt_info *info)
> > > +{
> > > + int r;
> > > + struct meta_match *meta = (struct meta_match *) m->data;
> > > + struct meta_obj l_value, r_value;
> > > +
> > > + if (meta_get(skb, info, &meta->lvalue, &l_value) < 0 ||
> > > + meta_get(skb, info, &meta->rvalue, &r_value) < 0)
> > > + return 0;
> >
> > This is one part that confused me in my earlier email
>
> It transforms the meta data info in (l|r)value into 2 temporary
> meta objects for comparing. mgiht help if i find a better
> name.
>
> > Ok, I have to admit I am not a friend of too-friendly, which is one of
> > the faults IMO with netfilter; however, this would have
> > joenetfilterfireman sweat a little too profusely.
> > I think you could add a new metafield in 31 seconds. I could probably do
> > it in 95 seconds. Would be ideal to get average
> > janeorjoenetfilterfireman to do it in 30 minutes - I dont think you are
> > there.
>
> ideas? a step-by-step guide in Documentation/? ;->
I am not sure - probably more inlined commenting as you normally do.
cheers,
jamal
^ permalink raw reply [flat|nested] 44+ messages in thread* Re: [RFC] meta ematch
2005-01-16 17:18 ` jamal
@ 2005-01-16 18:47 ` Thomas Graf
0 siblings, 0 replies; 44+ messages in thread
From: Thomas Graf @ 2005-01-16 18:47 UTC (permalink / raw)
To: jamal; +Cc: Patrick McHardy, netdev
* jamal <1105895936.1090.717.camel@jzny.localdomain> 2005-01-16 12:18
> On Sun, 2005-01-16 at 11:32, Thomas Graf wrote:
> > Not really as long as iproute2 uses the same byte ordering. It has the
> > same issues as all other rtnetlink users.
>
> wont harm to do a quick test if you have hardware. pedit for example
> still has some occasional issues some issues with big endian which i
> havent had time to chase.
Uhmm.. yes. The endianess comes in at sutff like skb->protocol. Leaving
it to userspace makes comparison beyond simple equals quite difficult.
Providing a method to transform in kernel space adds more complexity.
> > > > + return (v + (FIXED_1/200)) >> FSHIFT;
> > >
> > > 200 has some magic connotation to it - a define somewhere perhaps?
> >
> > I coped this from the code for procfs ;->
>
> know why they have that number? It must have some significance - or
> maybe someone just stuck their hand in the air and measured 200? ;->
It is some kind of factor and has almost no impact in our case because
it only changes the first 4 bits in the exp part and I'm only interested
in the integer part. It might be a good idea to take a few bits in from
the exp part and provide the load as *10^n where n is either 2 or 3,
i.e. a load of 1.9 would be 190. I have to think a little more about
this.
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [RFC] meta ematch
2005-01-16 16:11 ` jamal
2005-01-16 16:32 ` Thomas Graf
@ 2005-01-16 16:32 ` Patrick McHardy
2005-01-16 17:24 ` jamal
1 sibling, 1 reply; 44+ messages in thread
From: Patrick McHardy @ 2005-01-16 16:32 UTC (permalink / raw)
To: hadi; +Cc: Thomas Graf, netdev
jamal wrote:
>>+static int meta_int_loadavg_2(struct sk_buff *skb, struct tcf_pkt_info *info,
>>+ struct meta_value *v, struct meta_obj *dst)
>>+{
>>+ dst->value = fixed_loadavg(avenrun[2]);
>>+ return 0;
>>+}
>>
>
>Theres a lot of parameters not used at all in these calls .get calls. So
>far i have seen dst->value and some of the skb fields used. I apologize,
>I normally dont pick on these things - so if you have future plans for
>why you are passing those, keep them and ignore the comment.
>BTW, it would probably be useful to return some mnemonic instead of 0.
>
Returning 0 for success and negative error codes is perfectly fine as long
as you don't need any magic numbers (1, 2, ..).
>>+static inline int var_dev(struct net_device *dev, struct meta_obj *dst)
>>+{
>>+ if (unlikely(dev == NULL))
>>+ return -1;
>>+
>>+ dst->value = (unsigned long) dev->name;
>>+ dst->len = strlen(dev->name);
>>
>>
>
>So if device dissapears ... what happens to the pointer?
>
>
Devices don't disappear during packet processing.
>>+static int meta_int_compare(struct meta_obj *a, struct meta_obj *b)
>>+{
>>+ /* Let gcc optimize it, the unlikely is not really based on
>>+ * some numbers but jump free code for missmatches seems
>>+ * more logical.
>>+ */
>>+ if (unlikely(a == b))
>>+ return 0;
>>+ else if (a < b)
>>+ return -1;
>>+ else
>>+ return 1;
>>+}
>>
>>
>
>Would be very useful to return mnemonics for readability.
>
>
Same as for above, everyone knows what to expect from a *_compare function.
Returning stuff like CMP_LT, CMP_BT, .. is just ugly.
Regards
Patrick
^ permalink raw reply [flat|nested] 44+ messages in thread* Re: [RFC] meta ematch
2005-01-16 16:32 ` Patrick McHardy
@ 2005-01-16 17:24 ` jamal
0 siblings, 0 replies; 44+ messages in thread
From: jamal @ 2005-01-16 17:24 UTC (permalink / raw)
To: Patrick McHardy; +Cc: Thomas Graf, netdev
On Sun, 2005-01-16 at 11:32, Patrick McHardy wrote:
> jamal wrote:
>
[..]
> >BTW, it would probably be useful to return some mnemonic instead of 0.
> Returning 0 for success and negative error codes is perfectly fine as long
> as you don't need any magic numbers (1, 2, ..).
[..]
> >>+static int meta_int_compare(struct meta_obj *a, struct meta_obj *b)
> >>+{
> >>+ /* Let gcc optimize it, the unlikely is not really based on
> >>+ * some numbers but jump free code for missmatches seems
> >>+ * more logical.
> >>+ */
> >>+ if (unlikely(a == b))
> >>+ return 0;
> >>+ else if (a < b)
> >>+ return -1;
> >>+ else
> >>+ return 1;
> >>+}
> >>
> >>
> >
> >Would be very useful to return mnemonics for readability.
> >
> >
> Same as for above, everyone knows what to expect from a *_compare function.
> Returning stuff like CMP_LT, CMP_BT, .. is just ugly.
I am not sure i remember whether -1 or 1 is the LT even though i have
used strcmp for years ;-> Actually i try hard not to have my brain
remember. In the case of the .get function above, i may agree with you
that returning MATCH_SUCEEDED may be a little overkill.
cheers,
jamal
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier
2005-01-03 12:56 [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier Thomas Graf
2005-01-04 4:13 ` jamal
2005-01-04 22:36 ` Thomas Graf
@ 2005-01-05 13:32 ` Florian Weimer
2005-01-05 13:45 ` jamal
2 siblings, 1 reply; 44+ messages in thread
From: Florian Weimer @ 2005-01-05 13:32 UTC (permalink / raw)
To: Thomas Graf; +Cc: netdev
* Thomas Graf:
> I attached 4 patches of a first ematch implementation. Comments
> and suggestions very much appreciated. Compiles but untested.
This might infringe US patent 5,793,954 and the patents that reference
it. 8-(
^ permalink raw reply [flat|nested] 44+ messages in thread
* Re: [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier
2005-01-05 13:32 ` [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier Florian Weimer
@ 2005-01-05 13:45 ` jamal
0 siblings, 0 replies; 44+ messages in thread
From: jamal @ 2005-01-05 13:45 UTC (permalink / raw)
To: Florian Weimer; +Cc: Thomas Graf, netdev
On Wed, 2005-01-05 at 08:32, Florian Weimer wrote:
> * Thomas Graf:
>
> > I attached 4 patches of a first ematch implementation. Comments
> > and suggestions very much appreciated. Compiles but untested.
>
> This might infringe US patent 5,793,954 and the patents that reference
> it. 8-(
hehe. We can sleep better knowing we dont run Linux using C++ ;->
That patents reads like it owns every classifier ever written in C++
that runs on a processor.
cheers,
jamal
^ permalink raw reply [flat|nested] 44+ messages in thread
end of thread, other threads:[~2005-01-16 18:47 UTC | newest]
Thread overview: 44+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-01-03 12:56 [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier Thomas Graf
2005-01-04 4:13 ` jamal
2005-01-04 12:03 ` Thomas Graf
2005-01-04 13:19 ` jamal
2005-01-04 13:46 ` Thomas Graf
2005-01-04 12:27 ` Thomas Graf
2005-01-04 13:22 ` jamal
2005-01-04 13:41 ` Thomas Graf
2005-01-05 2:54 ` jamal
2005-01-05 11:09 ` Thomas Graf
2005-01-04 22:36 ` Thomas Graf
2005-01-05 3:12 ` jamal
2005-01-05 11:00 ` Thomas Graf
2005-01-05 13:33 ` jamal
2005-01-05 14:45 ` Thomas Graf
2005-01-05 16:48 ` Thomas Graf
2005-01-06 14:03 ` jamal
2005-01-06 13:47 ` jamal
2005-01-06 19:41 ` Thomas Graf
2005-01-07 13:45 ` jamal
2005-01-08 14:54 ` Thomas Graf
2005-01-10 13:26 ` jamal
2005-01-10 21:17 ` Thomas Graf
2005-01-10 22:05 ` jamal
2005-01-10 23:30 ` Thomas Graf
2005-01-13 17:41 ` [RFC] meta ematch Thomas Graf
2005-01-13 18:54 ` Patrick McHardy
2005-01-13 19:20 ` Thomas Graf
2005-01-14 1:13 ` Patrick McHardy
2005-01-14 15:14 ` Thomas Graf
2005-01-16 14:58 ` jamal
2005-01-16 15:09 ` Thomas Graf
2005-01-16 15:37 ` jamal
2005-01-16 15:57 ` Thomas Graf
2005-01-16 16:19 ` jamal
2005-01-16 16:49 ` Thomas Graf
2005-01-16 16:11 ` jamal
2005-01-16 16:32 ` Thomas Graf
2005-01-16 17:18 ` jamal
2005-01-16 18:47 ` Thomas Graf
2005-01-16 16:32 ` Patrick McHardy
2005-01-16 17:24 ` jamal
2005-01-05 13:32 ` [RFC] ematch API, u32 ematch, nbyte ematch, basic classifier Florian Weimer
2005-01-05 13:45 ` jamal
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).