* [PATCH 25/79] IPVS: Split ports[2] into src_port and dst_port
From: kaber @ 2011-01-19 19:14 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>
From: Hans Schillstrom <hans.schillstrom@ericsson.com>
Avoid sending invalid pointer due to skb_linearize() call.
This patch prepares for next patch where skb_linearize is a part.
In ip_vs_sched_persist() params the ports ptr will be replaced by
src and dst port.
Signed-off-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
net/netfilter/ipvs/ip_vs_core.c | 21 +++++++++++----------
1 files changed, 11 insertions(+), 10 deletions(-)
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index e2bb3cd..9acdd79 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -200,7 +200,7 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
static struct ip_vs_conn *
ip_vs_sched_persist(struct ip_vs_service *svc,
struct sk_buff *skb,
- __be16 ports[2])
+ __be16 src_port, __be16 dst_port)
{
struct ip_vs_conn *cp = NULL;
struct ip_vs_iphdr iph;
@@ -224,8 +224,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
"mnet %s\n",
- IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]),
- IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]),
+ IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(src_port),
+ IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(dst_port),
IP_VS_DBG_ADDR(svc->af, &snet));
/*
@@ -247,14 +247,14 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
__be16 vport = 0;
- if (ports[1] == svc->port) {
+ if (dst_port == svc->port) {
/* non-FTP template:
* <protocol, caddr, 0, vaddr, vport, daddr, dport>
* FTP template:
* <protocol, caddr, 0, vaddr, 0, daddr, 0>
*/
if (svc->port != FTPPORT)
- vport = ports[1];
+ vport = dst_port;
} else {
/* Note: persistent fwmark-based services and
* persistent port zero service are handled here.
@@ -285,7 +285,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
return NULL;
}
- if (ports[1] == svc->port && svc->port != FTPPORT)
+ if (dst_port == svc->port && svc->port != FTPPORT)
dport = dest->port;
/* Create a template
@@ -306,7 +306,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
kfree(param.pe_data);
}
- dport = ports[1];
+ dport = dst_port;
if (dport == svc->port && dest->port)
dport = dest->port;
@@ -317,8 +317,9 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
/*
* Create a new connection according to the template
*/
- ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, ports[0],
- &iph.daddr, ports[1], ¶m);
+ ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, src_port,
+ &iph.daddr, dst_port, ¶m);
+
cp = ip_vs_conn_new(¶m, &dest->addr, dport, flags, dest, skb->mark);
if (cp == NULL) {
ip_vs_conn_put(ct);
@@ -388,7 +389,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
*/
if (svc->flags & IP_VS_SVC_F_PERSISTENT) {
*ignored = 0;
- return ip_vs_sched_persist(svc, skb, pptr);
+ return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1]);
}
/*
--
1.7.2.3
^ permalink raw reply related
* [PATCH 20/79] ipvs: add static and read_mostly attributes
From: kaber @ 2011-01-19 19:14 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>
From: Eric Dumazet <eric.dumazet@gmail.com>
ip_vs_conn_tab_bits & ip_vs_conn_tab_mask are static to
ipvs/ip_vs_conn.c
ip_vs_conn_tab_size, ip_vs_conn_tab_mask, ip_vs_conn_tab [the pointer],
ip_vs_conn_rnd are mostly read.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
net/netfilter/ipvs/ip_vs_conn.c | 10 +++++-----
1 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 261db1a..7615f9e 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -48,18 +48,18 @@
/*
* Connection hash size. Default is what was selected at compile time.
*/
-int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
+static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444);
MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size");
/* size and mask values */
-int ip_vs_conn_tab_size;
-int ip_vs_conn_tab_mask;
+int ip_vs_conn_tab_size __read_mostly;
+static int ip_vs_conn_tab_mask __read_mostly;
/*
* Connection hash table: for input and output packets lookups of IPVS
*/
-static struct list_head *ip_vs_conn_tab;
+static struct list_head *ip_vs_conn_tab __read_mostly;
/* SLAB cache for IPVS connections */
static struct kmem_cache *ip_vs_conn_cachep __read_mostly;
@@ -71,7 +71,7 @@ static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
/* random value for IPVS connection hash */
-static unsigned int ip_vs_conn_rnd;
+static unsigned int ip_vs_conn_rnd __read_mostly;
/*
* Fine locking granularity for big connection hash table
--
1.7.2.3
^ permalink raw reply related
* [PATCH 17/79] IPVS: Make the cp argument to ip_vs_sync_conn() static
From: kaber @ 2011-01-19 19:14 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>
From: Simon Horman <horms@verge.net.au>
Acked-by: Hans Schillstrom <hans.schillstrom@ericsson.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
include/net/ip_vs.h | 2 +-
net/netfilter/ipvs/ip_vs_sync.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index be2b569..d5a32e4 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -916,7 +916,7 @@ extern char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
extern char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
extern int start_sync_thread(int state, char *mcast_ifn, __u8 syncid);
extern int stop_sync_thread(int state);
-extern void ip_vs_sync_conn(struct ip_vs_conn *cp);
+extern void ip_vs_sync_conn(const struct ip_vs_conn *cp);
/*
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index ab85aed..a4dccbc 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -236,7 +236,7 @@ get_curr_sync_buff(unsigned long time)
* Add an ip_vs_conn information into the current sync_buff.
* Called by ip_vs_in.
*/
-void ip_vs_sync_conn(struct ip_vs_conn *cp)
+void ip_vs_sync_conn(const struct ip_vs_conn *cp)
{
struct ip_vs_sync_mesg *m;
struct ip_vs_sync_conn *s;
--
1.7.2.3
^ permalink raw reply related
* [PATCH 16/79] IPVS: Only match pe_data created by the same pe
From: kaber @ 2011-01-19 19:14 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>
From: Simon Horman <horms@verge.net.au>
Only match persistence engine data if it was
created by the same persistence engine.
Reported-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
net/netfilter/ipvs/ip_vs_conn.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 64a9ca3..261db1a 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -354,7 +354,7 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
if (p->pe_data && p->pe->ct_match) {
- if (p->pe->ct_match(p, cp))
+ if (p->pe == cp->pe && p->pe->ct_match(p, cp))
goto out;
continue;
}
--
1.7.2.3
^ permalink raw reply related
* [PATCH 15/79] IPVS: Add persistence engine to connection entry
From: kaber @ 2011-01-19 19:14 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>
From: Simon Horman <horms@verge.net.au>
The dest of a connection may not exist if it has been created as the result
of connection synchronisation. But in order for connection entries for
templates with persistence engine data created through connection
synchronisation to be valid access to the persistence engine pointer is
required. So add the persistence engine to the connection itself.
Signed-off-by: Simon Horman <horms@verge.net.au>
---
include/net/ip_vs.h | 16 ++++++++++++++--
net/netfilter/ipvs/ip_vs_conn.c | 19 ++++++++++---------
net/netfilter/ipvs/ip_vs_ctl.c | 4 ++--
net/netfilter/ipvs/ip_vs_pe.c | 14 ++++----------
4 files changed, 30 insertions(+), 23 deletions(-)
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index b7bbd6c..be2b569 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -422,6 +422,7 @@ struct ip_vs_conn {
struct ip_vs_seq in_seq; /* incoming seq. struct */
struct ip_vs_seq out_seq; /* outgoing seq. struct */
+ const struct ip_vs_pe *pe;
char *pe_data;
__u8 pe_data_len;
};
@@ -814,8 +815,19 @@ void ip_vs_bind_pe(struct ip_vs_service *svc, struct ip_vs_pe *pe);
void ip_vs_unbind_pe(struct ip_vs_service *svc);
int register_ip_vs_pe(struct ip_vs_pe *pe);
int unregister_ip_vs_pe(struct ip_vs_pe *pe);
-extern struct ip_vs_pe *ip_vs_pe_get(const char *name);
-extern void ip_vs_pe_put(struct ip_vs_pe *pe);
+struct ip_vs_pe *ip_vs_pe_getbyname(const char *name);
+
+static inline void ip_vs_pe_get(const struct ip_vs_pe *pe)
+{
+ if (pe && pe->module)
+ __module_get(pe->module);
+}
+
+static inline void ip_vs_pe_put(const struct ip_vs_pe *pe)
+{
+ if (pe && pe->module)
+ module_put(pe->module);
+}
/*
* IPVS protocol functions (from ip_vs_proto.c)
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index e9adecd..64a9ca3 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -176,8 +176,8 @@ static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
ip_vs_conn_fill_param(cp->af, cp->protocol, &cp->caddr, cp->cport,
NULL, 0, &p);
- if (cp->dest && cp->dest->svc->pe) {
- p.pe = cp->dest->svc->pe;
+ if (cp->pe) {
+ p.pe = cp->pe;
p.pe_data = cp->pe_data;
p.pe_data_len = cp->pe_data_len;
}
@@ -765,6 +765,7 @@ static void ip_vs_conn_expire(unsigned long data)
if (cp->flags & IP_VS_CONN_F_NFCT)
ip_vs_conn_drop_conntrack(cp);
+ ip_vs_pe_put(cp->pe);
kfree(cp->pe_data);
if (unlikely(cp->app != NULL))
ip_vs_unbind_app(cp);
@@ -826,7 +827,9 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
&cp->daddr, daddr);
cp->dport = dport;
cp->flags = flags;
- if (flags & IP_VS_CONN_F_TEMPLATE && p->pe_data) {
+ if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) {
+ ip_vs_pe_get(p->pe);
+ cp->pe = p->pe;
cp->pe_data = p->pe_data;
cp->pe_data_len = p->pe_data_len;
}
@@ -958,15 +961,13 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3];
size_t len = 0;
- if (cp->dest && cp->pe_data &&
- cp->dest->svc->pe->show_pe_data) {
+ if (cp->pe_data) {
pe_data[0] = ' ';
- len = strlen(cp->dest->svc->pe->name);
- memcpy(pe_data + 1, cp->dest->svc->pe->name, len);
+ len = strlen(cp->pe->name);
+ memcpy(pe_data + 1, cp->pe->name, len);
pe_data[len + 1] = ' ';
len += 2;
- len += cp->dest->svc->pe->show_pe_data(cp,
- pe_data + len);
+ len += cp->pe->show_pe_data(cp, pe_data + len);
}
pe_data[len] = '\0';
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 5f5daa3..3e92558 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1139,7 +1139,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
}
if (u->pe_name && *u->pe_name) {
- pe = ip_vs_pe_get(u->pe_name);
+ pe = ip_vs_pe_getbyname(u->pe_name);
if (pe == NULL) {
pr_info("persistence engine module ip_vs_pe_%s "
"not found\n", u->pe_name);
@@ -1250,7 +1250,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
old_sched = sched;
if (u->pe_name && *u->pe_name) {
- pe = ip_vs_pe_get(u->pe_name);
+ pe = ip_vs_pe_getbyname(u->pe_name);
if (pe == NULL) {
pr_info("persistence engine module ip_vs_pe_%s "
"not found\n", u->pe_name);
diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c
index 3414af7..e99f920 100644
--- a/net/netfilter/ipvs/ip_vs_pe.c
+++ b/net/netfilter/ipvs/ip_vs_pe.c
@@ -30,7 +30,7 @@ void ip_vs_unbind_pe(struct ip_vs_service *svc)
/* Get pe in the pe list by name */
static struct ip_vs_pe *
-ip_vs_pe_getbyname(const char *pe_name)
+__ip_vs_pe_getbyname(const char *pe_name)
{
struct ip_vs_pe *pe;
@@ -60,28 +60,22 @@ ip_vs_pe_getbyname(const char *pe_name)
}
/* Lookup pe and try to load it if it doesn't exist */
-struct ip_vs_pe *ip_vs_pe_get(const char *name)
+struct ip_vs_pe *ip_vs_pe_getbyname(const char *name)
{
struct ip_vs_pe *pe;
/* Search for the pe by name */
- pe = ip_vs_pe_getbyname(name);
+ pe = __ip_vs_pe_getbyname(name);
/* If pe not found, load the module and search again */
if (!pe) {
request_module("ip_vs_pe_%s", name);
- pe = ip_vs_pe_getbyname(name);
+ pe = __ip_vs_pe_getbyname(name);
}
return pe;
}
-void ip_vs_pe_put(struct ip_vs_pe *pe)
-{
- if (pe && pe->module)
- module_put(pe->module);
-}
-
/* Register a pe in the pe list */
int register_ip_vs_pe(struct ip_vs_pe *pe)
{
--
1.7.2.3
^ permalink raw reply related
* [PATCH 14/79] netfilter: rcu sparse cleanups
From: kaber @ 2011-01-19 19:14 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>
From: Eric Dumazet <eric.dumazet@gmail.com>
Use RCU helpers to reduce number of sparse warnings
(CONFIG_SPARSE_RCU_POINTER=y), and adds lockdep checks.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
net/netfilter/nf_conntrack_expect.c | 15 ++++++++++++---
net/netfilter/nf_conntrack_extend.c | 6 ++++--
net/netfilter/nf_conntrack_helper.c | 10 ++++++++--
net/netfilter/nf_conntrack_proto.c | 4 ++--
4 files changed, 26 insertions(+), 9 deletions(-)
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index cab196c..bbb2140 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -337,7 +337,10 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
setup_timer(&exp->timeout, nf_ct_expectation_timed_out,
(unsigned long)exp);
if (master_help) {
- p = &master_help->helper->expect_policy[exp->class];
+ p = &rcu_dereference_protected(
+ master_help->helper,
+ lockdep_is_held(&nf_conntrack_lock)
+ )->expect_policy[exp->class];
exp->timeout.expires = jiffies + p->timeout * HZ;
}
add_timer(&exp->timeout);
@@ -373,7 +376,10 @@ static inline int refresh_timer(struct nf_conntrack_expect *i)
if (!del_timer(&i->timeout))
return 0;
- p = &master_help->helper->expect_policy[i->class];
+ p = &rcu_dereference_protected(
+ master_help->helper,
+ lockdep_is_held(&nf_conntrack_lock)
+ )->expect_policy[i->class];
i->timeout.expires = jiffies + p->timeout * HZ;
add_timer(&i->timeout);
return 1;
@@ -411,7 +417,10 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
}
/* Will be over limit? */
if (master_help) {
- p = &master_help->helper->expect_policy[expect->class];
+ p = &rcu_dereference_protected(
+ master_help->helper,
+ lockdep_is_held(&nf_conntrack_lock)
+ )->expect_policy[expect->class];
if (p->max_expected &&
master_help->expecting[expect->class] >= p->max_expected) {
evict_oldest_expect(master, expect);
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
index 920f924..80a23ed 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -140,14 +140,16 @@ static void update_alloc_size(struct nf_ct_ext_type *type)
/* This assumes that extended areas in conntrack for the types
whose NF_CT_EXT_F_PREALLOC bit set are allocated in order */
for (i = min; i <= max; i++) {
- t1 = nf_ct_ext_types[i];
+ t1 = rcu_dereference_protected(nf_ct_ext_types[i],
+ lockdep_is_held(&nf_ct_ext_type_mutex));
if (!t1)
continue;
t1->alloc_size = ALIGN(sizeof(struct nf_ct_ext), t1->align) +
t1->len;
for (j = 0; j < NF_CT_EXT_NUM; j++) {
- t2 = nf_ct_ext_types[j];
+ t2 = rcu_dereference_protected(nf_ct_ext_types[j],
+ lockdep_is_held(&nf_ct_ext_type_mutex));
if (t2 == NULL || t2 == t1 ||
(t2->flags & NF_CT_EXT_F_PREALLOC) == 0)
continue;
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 59e1a4c..767bbe9 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -158,7 +158,10 @@ static inline int unhelp(struct nf_conntrack_tuple_hash *i,
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i);
struct nf_conn_help *help = nfct_help(ct);
- if (help && help->helper == me) {
+ if (help && rcu_dereference_protected(
+ help->helper,
+ lockdep_is_held(&nf_conntrack_lock)
+ ) == me) {
nf_conntrack_event(IPCT_HELPER, ct);
rcu_assign_pointer(help->helper, NULL);
}
@@ -210,7 +213,10 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
hlist_for_each_entry_safe(exp, n, next,
&net->ct.expect_hash[i], hnode) {
struct nf_conn_help *help = nfct_help(exp->master);
- if ((help->helper == me || exp->helper == me) &&
+ if ((rcu_dereference_protected(
+ help->helper,
+ lockdep_is_held(&nf_conntrack_lock)
+ ) == me || exp->helper == me) &&
del_timer(&exp->timeout)) {
nf_ct_unlink_expect(exp);
nf_ct_expect_put(exp);
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 03b56a0..5701c8d 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -284,7 +284,7 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto)
mutex_lock(&nf_ct_proto_mutex);
if (!nf_ct_protos[l4proto->l3proto]) {
/* l3proto may be loaded latter. */
- struct nf_conntrack_l4proto **proto_array;
+ struct nf_conntrack_l4proto __rcu **proto_array;
int i;
proto_array = kmalloc(MAX_NF_CT_PROTO *
@@ -296,7 +296,7 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto)
}
for (i = 0; i < MAX_NF_CT_PROTO; i++)
- proto_array[i] = &nf_conntrack_l4proto_generic;
+ RCU_INIT_POINTER(proto_array[i], &nf_conntrack_l4proto_generic);
/* Before making proto_array visible to lockless readers,
* we must make sure its content is committed to memory.
--
1.7.2.3
^ permalink raw reply related
* [PATCH 10/79] netfilter: add __rcu annotations
From: kaber @ 2011-01-19 19:14 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>
From: Eric Dumazet <eric.dumazet@gmail.com>
Add some __rcu annotations and use helpers to reduce number of sparse
warnings (CONFIG_SPARSE_RCU_POINTER=y)
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
include/linux/netfilter.h | 6 +++---
include/net/netfilter/nf_conntrack_ecache.h | 4 ++--
include/net/netfilter/nf_conntrack_l3proto.h | 2 +-
net/netfilter/core.c | 4 ++--
net/netfilter/nf_conntrack_expect.c | 6 +++---
net/netfilter/nf_conntrack_proto.c | 20 +++++++++++++++-----
net/netfilter/nf_conntrack_standalone.c | 9 ++++++---
net/netfilter/nf_log.c | 6 ++++--
net/netfilter/nf_queue.c | 18 ++++++++++++++----
net/netfilter/nfnetlink_log.c | 6 +++---
10 files changed, 53 insertions(+), 28 deletions(-)
diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 89341c3..928a35e 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -265,7 +265,7 @@ struct nf_afinfo {
int route_key_size;
};
-extern const struct nf_afinfo *nf_afinfo[NFPROTO_NUMPROTO];
+extern const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO];
static inline const struct nf_afinfo *nf_get_afinfo(unsigned short family)
{
return rcu_dereference(nf_afinfo[family]);
@@ -355,9 +355,9 @@ nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family)
#endif /*CONFIG_NETFILTER*/
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
-extern void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *);
+extern void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *) __rcu;
extern void nf_ct_attach(struct sk_buff *, struct sk_buff *);
-extern void (*nf_ct_destroy)(struct nf_conntrack *);
+extern void (*nf_ct_destroy)(struct nf_conntrack *) __rcu;
#else
static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {}
#endif
diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h
index f596b60..8fdb04b 100644
--- a/include/net/netfilter/nf_conntrack_ecache.h
+++ b/include/net/netfilter/nf_conntrack_ecache.h
@@ -67,7 +67,7 @@ struct nf_ct_event_notifier {
int (*fcn)(unsigned int events, struct nf_ct_event *item);
};
-extern struct nf_ct_event_notifier *nf_conntrack_event_cb;
+extern struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
extern int nf_conntrack_register_notifier(struct nf_ct_event_notifier *nb);
extern void nf_conntrack_unregister_notifier(struct nf_ct_event_notifier *nb);
@@ -167,7 +167,7 @@ struct nf_exp_event_notifier {
int (*fcn)(unsigned int events, struct nf_exp_event *item);
};
-extern struct nf_exp_event_notifier *nf_expect_event_cb;
+extern struct nf_exp_event_notifier __rcu *nf_expect_event_cb;
extern int nf_ct_expect_register_notifier(struct nf_exp_event_notifier *nb);
extern void nf_ct_expect_unregister_notifier(struct nf_exp_event_notifier *nb);
diff --git a/include/net/netfilter/nf_conntrack_l3proto.h b/include/net/netfilter/nf_conntrack_l3proto.h
index a754761..e8010f4 100644
--- a/include/net/netfilter/nf_conntrack_l3proto.h
+++ b/include/net/netfilter/nf_conntrack_l3proto.h
@@ -73,7 +73,7 @@ struct nf_conntrack_l3proto {
struct module *me;
};
-extern struct nf_conntrack_l3proto *nf_ct_l3protos[AF_MAX];
+extern struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[AF_MAX];
/* Protocol registration. */
extern int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto);
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 85dabb8..5faec4f 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -212,7 +212,7 @@ EXPORT_SYMBOL(skb_make_writable);
/* This does not belong here, but locally generated errors need it if connection
tracking in use: without this, connection may not be in hash table, and hence
manufactured ICMP or RST packets will not be associated with it. */
-void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *);
+void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *) __rcu __read_mostly;
EXPORT_SYMBOL(ip_ct_attach);
void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb)
@@ -229,7 +229,7 @@ void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb)
}
EXPORT_SYMBOL(nf_ct_attach);
-void (*nf_ct_destroy)(struct nf_conntrack *);
+void (*nf_ct_destroy)(struct nf_conntrack *) __rcu __read_mostly;
EXPORT_SYMBOL(nf_ct_destroy);
void nf_conntrack_destroy(struct nf_conntrack *nfct)
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 46e8966..cab196c 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -482,7 +482,7 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
struct hlist_node *n;
for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
- n = rcu_dereference(net->ct.expect_hash[st->bucket].first);
+ n = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
if (n)
return n;
}
@@ -495,11 +495,11 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
struct net *net = seq_file_net(seq);
struct ct_expect_iter_state *st = seq->private;
- head = rcu_dereference(head->next);
+ head = rcu_dereference(hlist_next_rcu(head));
while (head == NULL) {
if (++st->bucket >= nf_ct_expect_hsize)
return NULL;
- head = rcu_dereference(net->ct.expect_hash[st->bucket].first);
+ head = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
}
return head;
}
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index dc7bb74..03b56a0 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -166,6 +166,7 @@ static void nf_ct_l3proto_unregister_sysctl(struct nf_conntrack_l3proto *l3proto
int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto)
{
int ret = 0;
+ struct nf_conntrack_l3proto *old;
if (proto->l3proto >= AF_MAX)
return -EBUSY;
@@ -174,7 +175,9 @@ int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto)
return -EINVAL;
mutex_lock(&nf_ct_proto_mutex);
- if (nf_ct_l3protos[proto->l3proto] != &nf_conntrack_l3proto_generic) {
+ old = rcu_dereference_protected(nf_ct_l3protos[proto->l3proto],
+ lockdep_is_held(&nf_ct_proto_mutex));
+ if (old != &nf_conntrack_l3proto_generic) {
ret = -EBUSY;
goto out_unlock;
}
@@ -201,7 +204,9 @@ void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto)
BUG_ON(proto->l3proto >= AF_MAX);
mutex_lock(&nf_ct_proto_mutex);
- BUG_ON(nf_ct_l3protos[proto->l3proto] != proto);
+ BUG_ON(rcu_dereference_protected(nf_ct_l3protos[proto->l3proto],
+ lockdep_is_held(&nf_ct_proto_mutex)
+ ) != proto);
rcu_assign_pointer(nf_ct_l3protos[proto->l3proto],
&nf_conntrack_l3proto_generic);
nf_ct_l3proto_unregister_sysctl(proto);
@@ -299,8 +304,10 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto)
smp_wmb();
nf_ct_protos[l4proto->l3proto] = proto_array;
- } else if (nf_ct_protos[l4proto->l3proto][l4proto->l4proto] !=
- &nf_conntrack_l4proto_generic) {
+ } else if (rcu_dereference_protected(
+ nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
+ lockdep_is_held(&nf_ct_proto_mutex)
+ ) != &nf_conntrack_l4proto_generic) {
ret = -EBUSY;
goto out_unlock;
}
@@ -331,7 +338,10 @@ void nf_conntrack_l4proto_unregister(struct nf_conntrack_l4proto *l4proto)
BUG_ON(l4proto->l3proto >= PF_MAX);
mutex_lock(&nf_ct_proto_mutex);
- BUG_ON(nf_ct_protos[l4proto->l3proto][l4proto->l4proto] != l4proto);
+ BUG_ON(rcu_dereference_protected(
+ nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
+ lockdep_is_held(&nf_ct_proto_mutex)
+ ) != l4proto);
rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
&nf_conntrack_l4proto_generic);
nf_ct_l4proto_unregister_sysctl(l4proto);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 0fb6570..328f1d2 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -29,6 +29,7 @@
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_zones.h>
+#include <linux/rculist_nulls.h>
MODULE_LICENSE("GPL");
@@ -56,7 +57,7 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
for (st->bucket = 0;
st->bucket < net->ct.htable_size;
st->bucket++) {
- n = rcu_dereference(net->ct.hash[st->bucket].first);
+ n = rcu_dereference(hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
if (!is_a_nulls(n))
return n;
}
@@ -69,13 +70,15 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
struct net *net = seq_file_net(seq);
struct ct_iter_state *st = seq->private;
- head = rcu_dereference(head->next);
+ head = rcu_dereference(hlist_nulls_next_rcu(head));
while (is_a_nulls(head)) {
if (likely(get_nulls_value(head) == st->bucket)) {
if (++st->bucket >= net->ct.htable_size)
return NULL;
}
- head = rcu_dereference(net->ct.hash[st->bucket].first);
+ head = rcu_dereference(
+ hlist_nulls_first_rcu(
+ &net->ct.hash[st->bucket]));
}
return head;
}
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index b07393e..20c775c 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -161,7 +161,8 @@ static int seq_show(struct seq_file *s, void *v)
struct nf_logger *t;
int ret;
- logger = nf_loggers[*pos];
+ logger = rcu_dereference_protected(nf_loggers[*pos],
+ lockdep_is_held(&nf_log_mutex));
if (!logger)
ret = seq_printf(s, "%2lld NONE (", *pos);
@@ -249,7 +250,8 @@ static int nf_log_proc_dostring(ctl_table *table, int write,
mutex_unlock(&nf_log_mutex);
} else {
mutex_lock(&nf_log_mutex);
- logger = nf_loggers[tindex];
+ logger = rcu_dereference_protected(nf_loggers[tindex],
+ lockdep_is_held(&nf_log_mutex));
if (!logger)
table->data = "NONE";
else
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 74aebed..1876f74 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -27,14 +27,17 @@ static DEFINE_MUTEX(queue_handler_mutex);
int nf_register_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh)
{
int ret;
+ const struct nf_queue_handler *old;
if (pf >= ARRAY_SIZE(queue_handler))
return -EINVAL;
mutex_lock(&queue_handler_mutex);
- if (queue_handler[pf] == qh)
+ old = rcu_dereference_protected(queue_handler[pf],
+ lockdep_is_held(&queue_handler_mutex));
+ if (old == qh)
ret = -EEXIST;
- else if (queue_handler[pf])
+ else if (old)
ret = -EBUSY;
else {
rcu_assign_pointer(queue_handler[pf], qh);
@@ -49,11 +52,15 @@ EXPORT_SYMBOL(nf_register_queue_handler);
/* The caller must flush their queue before this */
int nf_unregister_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh)
{
+ const struct nf_queue_handler *old;
+
if (pf >= ARRAY_SIZE(queue_handler))
return -EINVAL;
mutex_lock(&queue_handler_mutex);
- if (queue_handler[pf] && queue_handler[pf] != qh) {
+ old = rcu_dereference_protected(queue_handler[pf],
+ lockdep_is_held(&queue_handler_mutex));
+ if (old && old != qh) {
mutex_unlock(&queue_handler_mutex);
return -EINVAL;
}
@@ -73,7 +80,10 @@ void nf_unregister_queue_handlers(const struct nf_queue_handler *qh)
mutex_lock(&queue_handler_mutex);
for (pf = 0; pf < ARRAY_SIZE(queue_handler); pf++) {
- if (queue_handler[pf] == qh)
+ if (rcu_dereference_protected(
+ queue_handler[pf],
+ lockdep_is_held(&queue_handler_mutex)
+ ) == qh)
rcu_assign_pointer(queue_handler[pf], NULL);
}
mutex_unlock(&queue_handler_mutex);
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 6a1572b..91592da 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -874,19 +874,19 @@ static struct hlist_node *get_first(struct iter_state *st)
for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) {
if (!hlist_empty(&instance_table[st->bucket]))
- return rcu_dereference_bh(instance_table[st->bucket].first);
+ return rcu_dereference_bh(hlist_first_rcu(&instance_table[st->bucket]));
}
return NULL;
}
static struct hlist_node *get_next(struct iter_state *st, struct hlist_node *h)
{
- h = rcu_dereference_bh(h->next);
+ h = rcu_dereference_bh(hlist_next_rcu(h));
while (!h) {
if (++st->bucket >= INSTANCE_BUCKETS)
return NULL;
- h = rcu_dereference_bh(instance_table[st->bucket].first);
+ h = rcu_dereference_bh(hlist_first_rcu(&instance_table[st->bucket]));
}
return h;
}
--
1.7.2.3
^ permalink raw reply related
* [PATCH 05/79] netfilter: nf_conntrack: define ct_*_info as needed
From: kaber @ 2011-01-19 19:14 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>
From: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
include/net/netfilter/nf_conntrack.h | 13 +++++++++++++
1 files changed, 13 insertions(+), 0 deletions(-)
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index abfff1e..8a58901 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -50,11 +50,24 @@ union nf_conntrack_expect_proto {
/* per conntrack: application helper private data */
union nf_conntrack_help {
/* insert conntrack helper private data (master) here */
+#if defined(CONFIG_NF_CONNTRACK_FTP) || defined(CONFIG_NF_CONNTRACK_FTP_MODULE)
struct nf_ct_ftp_master ct_ftp_info;
+#endif
+#if defined(CONFIG_NF_CONNTRACK_PPTP) || \
+ defined(CONFIG_NF_CONNTRACK_PPTP_MODULE)
struct nf_ct_pptp_master ct_pptp_info;
+#endif
+#if defined(CONFIG_NF_CONNTRACK_H323) || \
+ defined(CONFIG_NF_CONNTRACK_H323_MODULE)
struct nf_ct_h323_master ct_h323_info;
+#endif
+#if defined(CONFIG_NF_CONNTRACK_SANE) || \
+ defined(CONFIG_NF_CONNTRACK_SANE_MODULE)
struct nf_ct_sane_master ct_sane_info;
+#endif
+#if defined(CONFIG_NF_CONNTRACK_SIP) || defined(CONFIG_NF_CONNTRACK_SIP_MODULE)
struct nf_ct_sip_master ct_sip_info;
+#endif
};
#include <linux/types.h>
--
1.7.2.3
^ permalink raw reply related
* [PATCH 03/79] netfilter: xt_LOG: do print MAC header on FORWARD
From: kaber @ 2011-01-19 19:14 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>
From: Jan Engelhardt <jengelh@medozas.de>
I am observing consistent behavior even with bridges, so let's unlock
this. xt_mac is already usable in FORWARD, too. Section 9 of
http://ebtables.sourceforge.net/br_fw_ia/br_fw_ia.html#section9 says
the MAC source address is changed, but my observation does not match
that claim -- the MAC header is retained.
Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
[Patrick; code inspection seems to confirm this]
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
net/ipv4/netfilter/ipt_LOG.c | 3 +--
net/ipv6/netfilter/ip6t_LOG.c | 3 +--
2 files changed, 2 insertions(+), 4 deletions(-)
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index 72ffc8f..d76d6c9 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -442,8 +442,7 @@ ipt_log_packet(u_int8_t pf,
}
#endif
- /* MAC logging for input path only. */
- if (in && !out)
+ if (in != NULL)
dump_mac_header(m, loginfo, skb);
dump_packet(m, loginfo, skb, 0);
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index 09c8889..05027b7 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -452,8 +452,7 @@ ip6t_log_packet(u_int8_t pf,
in ? in->name : "",
out ? out->name : "");
- /* MAC logging for input path only. */
- if (in && !out)
+ if (in != NULL)
dump_mac_header(m, loginfo, skb);
dump_packet(m, loginfo, skb, skb_network_offset(skb), 1);
--
1.7.2.3
^ permalink raw reply related
* [PATCH 02/79] netfilter: xt_NFQUEUE: remove modulo operations
From: kaber @ 2011-01-19 19:14 UTC (permalink / raw)
To: davem; +Cc: netfilter-devel, netdev
In-Reply-To: <1295464519-21763-1-git-send-email-kaber@trash.net>
From: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
net/netfilter/xt_NFQUEUE.c | 6 ++++--
1 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c
index 039cce1..3962770 100644
--- a/net/netfilter/xt_NFQUEUE.c
+++ b/net/netfilter/xt_NFQUEUE.c
@@ -72,10 +72,12 @@ nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par)
if (info->queues_total > 1) {
if (par->family == NFPROTO_IPV4)
- queue = hash_v4(skb) % info->queues_total + queue;
+ queue = (((u64) hash_v4(skb) * info->queues_total) >>
+ 32) + queue;
#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
else if (par->family == NFPROTO_IPV6)
- queue = hash_v6(skb) % info->queues_total + queue;
+ queue = (((u64) hash_v6(skb) * info->queues_total) >>
+ 32) + queue;
#endif
}
return NF_QUEUE_NR(queue);
--
1.7.2.3
^ permalink raw reply related
* Re: [PATCH] xen network backend driver
From: Ben Hutchings @ 2011-01-19 18:05 UTC (permalink / raw)
To: Ian Campbell
Cc: netdev@vger.kernel.org, xen-devel, Jeremy Fitzhardinge,
Konrad Rzeszutek Wilk
In-Reply-To: <1295459316.14981.3727.camel@zakaz.uk.xensource.com>
On Wed, 2011-01-19 at 17:48 +0000, Ian Campbell wrote:
> Hi Ben,
>
> Thanks for the very speedy review!
>
> I don't have many comments other than "yes, you are right".
>
> There are a couple of things inline below.
>
> On Wed, 2011-01-19 at 16:40 +0000, Ben Hutchings wrote:
> > On Wed, 2011-01-19 at 15:01 +0000, Ian Campbell wrote:
> > [...]
> > > + /*
> > > + * Initialise a dummy MAC address. We choose the numerically
> > > + * largest non-broadcast address to prevent the address getting
> > > + * stolen by an Ethernet bridge for STP purposes.
> > > + * (FE:FF:FF:FF:FF:FF)
> > > + */
> > > + memset(dev->dev_addr, 0xFF, ETH_ALEN);
> > > + dev->dev_addr[0] &= ~0x01;
> >
> > I'm a bit dubious about this.
>
> Which reminds me that I need to add the hook so that the Xen userspace
> stuff can actually do the right thing and set the MAC address to
> FE:FF:FF:FF:FF:FF itself as it puts the device on the bridge.
>
> The toolstack has only recently been fixed to even try that though.
>
> In use these devices aren't typically endpoints which generate or
> receive any actual traffic so letting it pick up a random MAC address by
> default isn't terribly useful. The actual useful MAC address is the one
> which is configured in the frontend.
Right, I understand that.
> > [...]
> > > +static int MODPARM_netback_kthread;
> > > +module_param_named(netback_kthread, MODPARM_netback_kthread, bool, 0);
> > > +MODULE_PARM_DESC(netback_kthread, "Use kernel thread to replace tasklet");
> > > +
> > > +/*
> > > + * Netback bottom half handler.
> > > + * dir indicates the data direction.
> > > + * rx: 1, tx: 0.
> > > + */
> > > +static inline void xen_netbk_bh_handler(struct xen_netbk *netbk, int dir)
> > > +{
> > > + if (MODPARM_netback_kthread)
> > > + wake_up(&netbk->kthread.netbk_action_wq);
> > > + else if (dir)
> > > + tasklet_schedule(&netbk->tasklet.net_rx_tasklet);
> > > + else
> > > + tasklet_schedule(&netbk->tasklet.net_tx_tasklet);
> > > +}
> >
> > Ugh, please just use NAPI.
>
> Although I only have a vague concept of what NAPI actually entails in
> practice I think it most likely makes sense.
>
> Am I right that NAPI only covers the RX case?
All completions should be processed via NAPI, if possible. The poll
function is given a work budget and each RX completion is assigned a
cost of 1. TX completions are cheap enough that they aren't budgetted
individually, but they must be limited somehow. The standard practice
is to consider the budget exhausted after processing an entire TX ring
once.
> Does NAPI processing happen in softirq context?
Yes.
> The reason for the
> existing option to use a kthread was that the tasklets would completely
> swamp the domain 0 CPU under load and prevent anything else from running
> (including e.g. ssh or the toolstack allowing you to fix the
> problem...).
I can see that that could be a problem if dom0's processing power is low
compared to the other domains.
> I guess this is just a case of setting the NAPI weight
> correctly (i.e. appropriately high in this case)?
Sorry, I have not looked at adjusting NAPI weights before.
> Last question before I go an actually investigate NAPI properly: Does
> NAPI also scale out across CPUs? Currently the threads/tasklets are per
> CPU and this is a significant scalability win.
[...]
Not in itself. NAPI polling will run on the same CPU which scheduled it
(so wherever the IRQ was initially handled). If the protocol used
between netfront and netback doesn't support RSS then RPS
<http://lwn.net/Articles/362339/> can be used to spread the RX work
across CPUs.
Ben.
--
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.
^ permalink raw reply
* Re: inbound connection problems when "netlink: test for all flags of the NLM_F_DUMP composite" commit applied
From: Jan Engelhardt @ 2011-01-19 18:04 UTC (permalink / raw)
To: Jarek Poplawski
Cc: jamal, Pablo Neira Ayuso, David Miller, arthur.marsh,
eric.dumazet, netdev
In-Reply-To: <20110119165413.GB1845@del.dom.local>
On Wednesday 2011-01-19 17:54, Jarek Poplawski wrote:
>
>I still don't understand why you call this the nonsense. There are
>two dump flags NLM_F_ROOT and NLM_F_MATCH plus for convenience
>NLM_F_DUMP as 2 in 1. Avahi uses these specific flags. Why would
>anybody have added these specific flags if they can never be used
>separately?
It looks like the authors' intentinos were to make NLM_F_MATCH not
stop after a single entry has been found. So that sounds like dump,
ok.
But NLM_F_ROOT does not quite strike me as a dump request. What if I
wanted just a single item returned but still start at the root?
Or asking from a different direction, what's NLM_F_ROOT good for
when, say, struct rtmsg->rtm_table specifies (in rtnetlink) where to
start? (Particularly, 0 for an "invisible root" that contains all
tables.)
^ permalink raw reply
* Re: [PATCH] xen network backend driver
From: Ian Campbell @ 2011-01-19 17:48 UTC (permalink / raw)
To: Ben Hutchings
Cc: netdev@vger.kernel.org, xen-devel, Jeremy Fitzhardinge,
Konrad Rzeszutek Wilk
In-Reply-To: <1295455216.11126.39.camel@bwh-desktop>
Hi Ben,
Thanks for the very speedy review!
I don't have many comments other than "yes, you are right".
There are a couple of things inline below.
On Wed, 2011-01-19 at 16:40 +0000, Ben Hutchings wrote:
> On Wed, 2011-01-19 at 15:01 +0000, Ian Campbell wrote:
> [...]
> > + /*
> > + * Initialise a dummy MAC address. We choose the numerically
> > + * largest non-broadcast address to prevent the address getting
> > + * stolen by an Ethernet bridge for STP purposes.
> > + * (FE:FF:FF:FF:FF:FF)
> > + */
> > + memset(dev->dev_addr, 0xFF, ETH_ALEN);
> > + dev->dev_addr[0] &= ~0x01;
>
> I'm a bit dubious about this.
Which reminds me that I need to add the hook so that the Xen userspace
stuff can actually do the right thing and set the MAC address to
FE:FF:FF:FF:FF:FF itself as it puts the device on the bridge.
The toolstack has only recently been fixed to even try that though.
In use these devices aren't typically endpoints which generate or
receive any actual traffic so letting it pick up a random MAC address by
default isn't terribly useful. The actual useful MAC address is the one
which is configured in the frontend.
> [...]
> > +static int MODPARM_netback_kthread;
> > +module_param_named(netback_kthread, MODPARM_netback_kthread, bool, 0);
> > +MODULE_PARM_DESC(netback_kthread, "Use kernel thread to replace tasklet");
> > +
> > +/*
> > + * Netback bottom half handler.
> > + * dir indicates the data direction.
> > + * rx: 1, tx: 0.
> > + */
> > +static inline void xen_netbk_bh_handler(struct xen_netbk *netbk, int dir)
> > +{
> > + if (MODPARM_netback_kthread)
> > + wake_up(&netbk->kthread.netbk_action_wq);
> > + else if (dir)
> > + tasklet_schedule(&netbk->tasklet.net_rx_tasklet);
> > + else
> > + tasklet_schedule(&netbk->tasklet.net_tx_tasklet);
> > +}
>
> Ugh, please just use NAPI.
Although I only have a vague concept of what NAPI actually entails in
practice I think it most likely makes sense.
Am I right that NAPI only covers the RX case?
Does NAPI processing happen in softirq context? The reason for the
existing option to use a kthread was that the tasklets would completely
swamp the domain 0 CPU under load and prevent anything else from running
(including e.g. ssh or the toolstack allowing you to fix the
problem...). I guess this is just a case of setting the NAPI weight
correctly (i.e. appropriately high in this case)?
Last question before I go an actually investigate NAPI properly: Does
NAPI also scale out across CPUs? Currently the threads/tasklets are per
CPU and this is a significant scalability win.
> [...]
> > +#ifdef NETBE_DEBUG_INTERRUPT
> > +static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
>
> This wouldn't compile on anything after 2.6.18! Clearly no-one defines
> NETBE_DEBUG_INTERRUPT, and you can remove this code entirely.
Heh, I actually enabled this and fixed it up as I was debugging this
stuff and then accidentally threw away the fixup hunk when I turned it
off again...
I think you are right to suggest removing the code though, it's not
actually all that helpful in practice and it is easy enough to hack
similar things in for local debugging as necessary.
> [...]
> > +module_init(netback_init);
> [...]
>
> No module_fini?
Not at the moment.
Ian.
^ permalink raw reply
* Re: inbound connection problems when "netlink: test for all flags of the NLM_F_DUMP composite" commit applied
From: Pablo Neira Ayuso @ 2011-01-19 17:42 UTC (permalink / raw)
To: David Miller; +Cc: jarkao2, arthur.marsh, jengelh, eric.dumazet, netdev, hadi
In-Reply-To: <20110118.125051.260099262.davem@davemloft.net>
On 18/01/11 21:50, David Miller wrote:
> From: Pablo Neira Ayuso <pablo@netfilter.org>
> Date: Tue, 18 Jan 2011 21:31:31 +0100
>
>> The combination that avahi uses makes no sense.
>>
>> I've been auditing user-space tools that may have problems with this change:
>>
>> * iw (it uses libnl)
>> * acpid (it uses a mangled version of libnetlink shipped in iproute)
>> * tstime, for taskstats, it uses libnl
>> * wimax-tools, it uses libnl
>> * quota-tools, it uses libnl
>> * keepalived, no libs used
>>
>> Well, I can keep looking for more, but I think that avahi is the only
>> one doing this incorrectly.
>>
>> Please, fix avahi instead.
>
> That's a pretty compelling argument, so I'll hold off on the revert
> for now.
I've been reviewing user-space applications for a couple of hours (I've
got a big list here with no problems), unfortunately I found that:
ip route show cache
hangs after displaying the first line with the patch applied because it
uses:
req.nlh.nlmsg_type = RTM_GETROUTE;
req.nlh.nlmsg_flags = NLM_F_ROOT|NLM_F_REQUEST;
to dump the routing cache.
We need something less agressive, some warning to be printed and accept
this flag combination for quite some time until it's removed as jamal
and jarek suggested.
Please, revert this patch until we find a better solution.
^ permalink raw reply
* Re: tool to send unsolicited neighbour advertisements?
From: Chris Friesen @ 2011-01-19 17:33 UTC (permalink / raw)
To: Martin Volf; +Cc: netdev, Linux Kernel Mailing List
In-Reply-To: <AANLkTi=g-tSrLMmu4JuuKrze4SJD67CSmbS3aONBw7wV@mail.gmail.com>
On 01/19/2011 03:42 AM, Martin Volf wrote:
> On 18 January 2011 21:42, Chris Friesen <chris.friesen@genband.com> wrote:
>> We're transitioning stuff to IPv6 and I've been trying (without much
>> luck) to find a standard tool for sending out unsolicited neighbour
>> advertisements for failover purposes.
>> Is there such a thing? In ipv4 arping works fine.
>
> Hello,
>
> probably http://www.remlab.net/ndisc6/
The ndisc6 tool appears to do neighbour discovery, but doesn't have any
options that I could see to send unsolicited neighbour advertisements.
There are various blackhat-type tools to do this, but I haven't been
able to find a real ipv6 replacement for arping.
Chris
--
Chris Friesen
Software Developer
GENBAND
chris.friesen@genband.com
www.genband.com
^ permalink raw reply
* Re: inbound connection problems when "netlink: test for all flags of the NLM_F_DUMP composite" commit applied
From: Jarek Poplawski @ 2011-01-19 17:33 UTC (permalink / raw)
To: jamal
Cc: Pablo Neira Ayuso, David Miller, arthur.marsh, jengelh,
eric.dumazet, netdev
In-Reply-To: <1295456377.2184.2.camel@mojatatu>
On Wed, Jan 19, 2011 at 11:59:37AM -0500, jamal wrote:
> On Wed, 2011-01-19 at 17:54 +0100, Jarek Poplawski wrote:
...
> > Aside from this question, if we still think it's the nonsense, a
> > warning would be nicer.
>
> That is what i was suggesting as well..
Except, I still don't think it's the nonsese, and suggest something
even nicer ;-)
Cheers,
Jarek P.
^ permalink raw reply
* Re: inbound connection problems when "netlink: test for all flags of the NLM_F_DUMP composite" commit applied
From: Jarek Poplawski @ 2011-01-19 17:19 UTC (permalink / raw)
To: jamal
Cc: Pablo Neira Ayuso, David Miller, arthur.marsh, jengelh,
eric.dumazet, netdev
In-Reply-To: <1295456377.2184.2.camel@mojatatu>
On Wed, Jan 19, 2011 at 11:59:37AM -0500, jamal wrote:
> On Wed, 2011-01-19 at 17:54 +0100, Jarek Poplawski wrote:
...
> gah! I already had plenty of caffeine when i typed that.
> I meant to say "If Avahi is popular and widely deployed,
> it makes sense to revert"
AFAIK, the most popular distros (XP, Vista, W7) don't use Avahi! ;-)
Other similar (desktop & userfriendly) should be affected.
Cheers,
Jarek P.
^ permalink raw reply
* Re: inbound connection problems when "netlink: test for all flags of the NLM_F_DUMP composite" commit applied
From: jamal @ 2011-01-19 16:59 UTC (permalink / raw)
To: Jarek Poplawski
Cc: Pablo Neira Ayuso, David Miller, arthur.marsh, jengelh,
eric.dumazet, netdev
In-Reply-To: <20110119165413.GB1845@del.dom.local>
On Wed, 2011-01-19 at 17:54 +0100, Jarek Poplawski wrote:
> On Wed, Jan 19, 2011 at 09:28:06AM -0500, jamal wrote:
> > So here is what i think the criteria should be:
> >
> > If Avahi is popular and widely deployed (I dont use it anywhere), it
> > makes no sense to revert.
> > A middle ground is: instead of rejecting the nonsense passed, maybe a
> > sane thing to do is a kernel warning for a period of time (sort of like
> > feature removal warnings).
>
> I still don't understand why you call this the nonsense.
gah! I already had plenty of caffeine when i typed that.
I meant to say "If Avahi is popular and widely deployed,
it makes sense to revert"
> There are
> two dump flags NLM_F_ROOT and NLM_F_MATCH plus for convenience
> NLM_F_DUMP as 2 in 1. Avahi uses these specific flags. Why would
> anybody have added these specific flags if they can never be used
> separately?
>
> Aside from this question, if we still think it's the nonsense, a
> warning would be nicer.
That is what i was suggesting as well..
cheers,
jamal
^ permalink raw reply
* Re: inbound connection problems when "netlink: test for all flags of the NLM_F_DUMP composite" commit applied
From: Jarek Poplawski @ 2011-01-19 16:54 UTC (permalink / raw)
To: jamal
Cc: Pablo Neira Ayuso, David Miller, arthur.marsh, jengelh,
eric.dumazet, netdev
In-Reply-To: <1295447286.2008.850.camel@mojatatu>
On Wed, Jan 19, 2011 at 09:28:06AM -0500, jamal wrote:
> On Tue, 2011-01-18 at 21:55 +0100, Jarek Poplawski wrote:
> > On Tue, Jan 18, 2011 at 09:31:31PM +0100, Pablo Neira Ayuso wrote:
>
> > > The combination that avahi uses makes no sense.
> >
> > I don't agree as explained in the reverting patch. Anyway, again,
> > this is an old problem, so no reason to force "fixing" it just now
> > at the expense of the obvious regression especially in stable kernels
> > Anyway, I'll accept any David's decision wrt this problem.
> >
>
> So here is what i think the criteria should be:
>
> If Avahi is popular and widely deployed (I dont use it anywhere), it
> makes no sense to revert.
> A middle ground is: instead of rejecting the nonsense passed, maybe a
> sane thing to do is a kernel warning for a period of time (sort of like
> feature removal warnings).
I still don't understand why you call this the nonsense. There are
two dump flags NLM_F_ROOT and NLM_F_MATCH plus for convenience
NLM_F_DUMP as 2 in 1. Avahi uses these specific flags. Why would
anybody have added these specific flags if they can never be used
separately?
Aside from this question, if we still think it's the nonsense, a
warning would be nicer.
Cheers,
Jarek P.
^ permalink raw reply
* Re: [PATCH v2] net: add Faraday FTMAC100 10/100 Ethernet driver
From: Joe Perches @ 2011-01-19 16:41 UTC (permalink / raw)
To: Po-Yu Chuang
Cc: netdev, linux-kernel, ratbert, bhutchings, eric.dumazet, dilinger
In-Reply-To: <AANLkTi=nJxwR89CF3i0+L6+yM4Bg_PN8PQByTAHP2nPM@mail.gmail.com>
On Wed, 2011-01-19 at 17:40 +0800, Po-Yu Chuang wrote:
> On Tue, Jan 18, 2011 at 1:19 AM, Joe Perches <joe@perches.com> wrote:
> > on split long line indentation style
> > and long function declarations.
[]
> > Most of drivers/net uses an alignment to open parenthesis
> > using maximal tabs and minimal necessary spaces instead of
> > an extra tabstop.
[]
> Well, TBH, I don't like this style because if I changed the
> function name, the indentation might need to be adjusted.
No worries. That could happen using either style.
There's no required style so you can use what you are
most comfortable doing. It's not a big deal at all.
> Even worse, I got an infeasible case :-(
>
> static struct ftmac100_rxdes *ftmac100_rx_locate_first_segment(
> struct ftmac100 *priv)
>
> I know my function names are quite long, but I like them to be descriptive.
> Do you really insist on it?
Here's a common alternative style for this case:
static struct ftmac100_rxdes *
ftmac100_rx_locate_first_segment(struct ftmac100 *priv)
^ permalink raw reply
* Re: [PATCH] xen network backend driver
From: Ben Hutchings @ 2011-01-19 16:40 UTC (permalink / raw)
To: Ian Campbell
Cc: netdev, xen-devel, Jeremy Fitzhardinge, Konrad Rzeszutek Wilk
In-Reply-To: <1295449318.14981.3484.camel@zakaz.uk.xensource.com>
On Wed, 2011-01-19 at 15:01 +0000, Ian Campbell wrote:
[...]
> diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
> index cbf0635..5b088f5 100644
> --- a/drivers/net/Kconfig
> +++ b/drivers/net/Kconfig
> @@ -2970,6 +2970,13 @@ config XEN_NETDEV_FRONTEND
> if you are compiling a kernel for a Xen guest, you almost
> certainly want to enable this.
>
> +config XEN_NETDEV_BACKEND
> + tristate "Xen backend network device"
> + depends on XEN_BACKEND
> + help
> + Implement the network backend driver, which passes packets
> + from the guest domain's frontend drivers to the network.
This is not an accurate description. The backend driver doesn't pass
packets to 'the network' (I assume that means physical network); that's
done by a bridge or by routing.
[...]
> diff --git a/drivers/net/xen-netback/Makefile b/drivers/net/xen-netback/Makefile
> new file mode 100644
> index 0000000..e346e81
> --- /dev/null
> +++ b/drivers/net/xen-netback/Makefile
> @@ -0,0 +1,3 @@
> +obj-$(CONFIG_XEN_NETDEV_BACKEND) := xen-netback.o
> +
> +xen-netback-y := netback.o xenbus.o interface.o
> diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
> new file mode 100644
> index 0000000..2d55ed6
> --- /dev/null
> +++ b/drivers/net/xen-netback/common.h
> @@ -0,0 +1,250 @@
> +/******************************************************************************
> + * arch/xen/drivers/netif/backend/common.h
Doesn't match the actual filename, but then why include the filename at
all?
[...]
> +#ifndef __NETIF__BACKEND__COMMON_H__
> +#define __NETIF__BACKEND__COMMON_H__
Also doesn't match the filename.
> +
> +#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__
> +
> +#include <linux/version.h>
Don't include <linux/version.h> in an in-tree driver.
[...]
> +void netif_disconnect(struct xen_netif *netif);
> +
> +void netif_set_features(struct xen_netif *netif);
> +struct xen_netif *netif_alloc(struct device *parent, domid_t domid,
> + unsigned int handle);
[...]
The 'netif_' prefix belongs to the networking core; you should use some
other prefix for these.
[...]
> --- /dev/null
> +++ b/drivers/net/xen-netback/interface.c
[...]
> +/*
> + * Module parameter 'queue_length':
> + *
> + * Enables queuing in the network stack when a client has run out of receive
> + * descriptors.
> + */
> +static unsigned long netbk_queue_length = 32;
> +module_param_named(queue_length, netbk_queue_length, ulong, 0644);
This can be controlled through sysfs later, so it doesn't need to be a
module parameter.
[...]
> +static int netbk_set_tx_csum(struct net_device *dev, u32 data)
> +{
> + struct xen_netif *netif = netdev_priv(dev);
> + if (data) {
> + if (!netif->csum)
> + return -ENOSYS;
Should be -EOPNOTSUPP. Same in netbk_set_sg(), netbk_set_tso().
[...]
> +struct xen_netif *netif_alloc(struct device *parent, domid_t domid,
> + unsigned int handle)
> +{
[...]
> + /*
> + * Initialise a dummy MAC address. We choose the numerically
> + * largest non-broadcast address to prevent the address getting
> + * stolen by an Ethernet bridge for STP purposes.
> + * (FE:FF:FF:FF:FF:FF)
> + */
> + memset(dev->dev_addr, 0xFF, ETH_ALEN);
> + dev->dev_addr[0] &= ~0x01;
I'm a bit dubious about this.
[...]
> --- /dev/null
> +++ b/drivers/net/xen-netback/netback.c
[...]
> +static void net_tx_action(unsigned long data);
> +
> +static void net_rx_action(unsigned long data);
The 'net_' prefix belongs to the networking core.
[...]
> +static int netif_get_page_ext(struct page *pg,
> + unsigned int *_group, unsigned int *_idx)
The '_' prefix is usually meant to distinguish lower-level functions or
to avoid naming conflicts in macros. I don't see any justification for
using it here.
[...]
> +static int MODPARM_netback_kthread;
> +module_param_named(netback_kthread, MODPARM_netback_kthread, bool, 0);
> +MODULE_PARM_DESC(netback_kthread, "Use kernel thread to replace tasklet");
> +
> +/*
> + * Netback bottom half handler.
> + * dir indicates the data direction.
> + * rx: 1, tx: 0.
> + */
> +static inline void xen_netbk_bh_handler(struct xen_netbk *netbk, int dir)
> +{
> + if (MODPARM_netback_kthread)
> + wake_up(&netbk->kthread.netbk_action_wq);
> + else if (dir)
> + tasklet_schedule(&netbk->tasklet.net_rx_tasklet);
> + else
> + tasklet_schedule(&netbk->tasklet.net_tx_tasklet);
> +}
Ugh, please just use NAPI.
[...]
> +static struct sk_buff *netbk_copy_skb(struct sk_buff *skb)
> +{
This should be implemented in net/core/skbuff.c as a variant of
skb_copy() and pskb_copy(), sharing as much code as possible.
[...]
> +static int skb_checksum_setup(struct sk_buff *skb)
The 'skb_' prefix belongs to the networking core.
> +{
> + struct iphdr *iph;
> + unsigned char *th;
> + int err = -EPROTO;
> +
> + if (skb->protocol != htons(ETH_P_IP))
> + goto out;
> +
> + iph = (void *)skb->data;
> + th = skb->data + 4 * iph->ihl;
> + if (th >= skb_tail_pointer(skb))
> + goto out;
> +
> + skb->csum_start = th - skb->head;
> + switch (iph->protocol) {
> + case IPPROTO_TCP:
> + skb->csum_offset = offsetof(struct tcphdr, check);
> + break;
> + case IPPROTO_UDP:
> + skb->csum_offset = offsetof(struct udphdr, check);
> + break;
> + default:
> + if (net_ratelimit())
> + printk(KERN_ERR "Attempting to checksum a non-"
> + "TCP/UDP packet, dropping a protocol"
> + " %d packet", iph->protocol);
This is missing a newline, and missing any indicator of what generated
it. Use pr_err() to cover the latter.
[...]
> +#ifdef NETBE_DEBUG_INTERRUPT
> +static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs)
This wouldn't compile on anything after 2.6.18! Clearly no-one defines
NETBE_DEBUG_INTERRUPT, and you can remove this code entirely.
[...]
> +module_init(netback_init);
[...]
No module_fini?
Ben.
--
Ben Hutchings, Senior Software Engineer, Solarflare Communications
Not speaking for my employer; that's the marketing department's job.
They asked us to note that Solarflare product names are trademarked.
^ permalink raw reply
* Re: 2.6.37 regression: adding main interface to a bridge breaks vlan interface RX
From: Jesse Gross @ 2011-01-19 16:26 UTC (permalink / raw)
To: Simon Arlott; +Cc: Ben Hutchings, netdev, Linux Kernel Mailing List, Herbert Xu
In-Reply-To: <4D3487C2.7060506@simon.arlott.org.uk>
On Mon, Jan 17, 2011 at 10:17 AM, Simon Arlott <simon@fire.lp0.eu> wrote:
> On 17/01/11 16:00, Ben Hutchings wrote:
>> On Sun, 2011-01-16 at 14:09 +0000, Simon Arlott wrote:
>>> [ 1.666706] forcedeth 0000:00:08.0: ifname eth0, PHY OUI 0x5043 @ 16, addr 00:e0:81:4d:2b:ec
>>> [ 1.666767] forcedeth 0000:00:08.0: highdma csum vlan pwrctl mgmt gbit lnktim msi desc-v3
>>>
>>> I have eth0 and eth0.3840 which works until I add eth0 to a bridge.
>>> While eth0 is in a bridge (the bridge device is up), eth0.3840 is unable
>>> to receive packets. Using tcpdump on eth0 shows the packets being
>>> received with a VLAN tag but they don't appear on eth0.3840. They appear
>>> with the VLAN tag on the bridge interface.
>> [...]
>>
>> This means the behaviour is now consistent, whether or not hardware VLAN
>> tag stripping is enabled. (I previously pointed out the inconsistent
>> behaviour in <http://thread.gmane.org/gmane.linux.network/149864>.) I
>> would consider this an improvement.
>
> Shouldn't the kernel also prevent a device from being both part of a
> bridge and having VLANs? Instead everything appears to work except
> incoming traffic.
It might make sense, although you have similar effects with non-vlan
traffic to the device as well. You could make the same argument that
we shouldn't allow IP addresses, etc. to be assigned to bridged
devices. There are also other components that use the bridge hooks
that would need to be checked. All this starts to get a bit messy.
^ permalink raw reply
* Re: inbound connection problems when "netlink: test for all flags of the NLM_F_DUMP composite" commit applied
From: Jarek Poplawski @ 2011-01-19 16:18 UTC (permalink / raw)
To: Pablo Neira Ayuso
Cc: David Miller, arthur.marsh, jengelh, eric.dumazet, netdev, hadi
In-Reply-To: <4D36FADD.8000405@netfilter.org>
On Wed, Jan 19, 2011 at 03:53:17PM +0100, Pablo Neira Ayuso wrote:
> On 18/01/11 22:14, Jarek Poplawski wrote:
...
> > BTW, could you answer my earlier question, why NLM_F_ATOMIC flag isn't
> > handled now with dumps?
>
> The NLM_F_ATOMIC flag is not affected, the netlink header is still
> passed to netlink_dump_start() so you can check for it in the callback
> to start an atomic dump.
Right! I had some blackout, sorry.
Jarek P.
^ permalink raw reply
* Re: [PATCH] scm: provide full privilege set via SCM_PRIVILEGE
From: Eric W. Biederman @ 2011-01-19 16:18 UTC (permalink / raw)
To: Casey Schaufler
Cc: LKLM, xemul, David Miller, Sakkinen Jarkko.2 (EXT-Tieto/Tampere),
Janne Karhunen, Reshetova Elena (Nokia-D/Helsinki), netdev,
Serge E. Hallyn
In-Reply-To: <4D36FBDB.5050401@schaufler-ca.com>
Casey Schaufler <casey@schaufler-ca.com> writes:
> On 1/18/2011 9:45 PM, Eric W. Biederman wrote:
>> Casey Schaufler <casey@schaufler-ca.com> writes:
>>
>>> Subject: [PATCH] scm: provide full privilege set via SCM_PRIVILEGE
>>>
>>> The SCM mechanism currently provides interfaces for delivering
>>> the uid/gid and the "security context" (LSM information) of the
>>> peer on a UDS socket. All of the security credential information
>>> is available, but there is no interface available to obtain it.
>>> Further, the existing interfaces require that the user chose
>>> between the uid/gid and the context as the existing interfaces
>>> are exclusive.
>>>
>>> This patch introduces an additional interface that provides
>>> a complete set of security information from the peer credential.
>>> No additional work is required to provide the information
>>> internally, it is all being passed, just not exposed.
>> In ascii text?
>
> As is commonly done in /proc interfaces.
>
>> A bitmap in hex?
>
> As is done in /proc/<pid>/status. I seriously doubt
> that anyone would want the kernel doing the capability
> set to text conversion.
But when you have a perfectly good binary interface when reducing the
encoding efficiency for effectively no gain.
>> Maybe it is just me, but this seems harder to deal with than
>> if the data had been transferred in binary.
>
> There are a couple of issues with passing a binary structure
> in the modern cred case. First is the capability set, which
> has been proven to grow over time. Sure, it took a while to
> get past 32 bits, and hopefully will never go beyond 64, but
> given the long term problems caused by 16 bit uids (some of
> us still remember) I would hate to get bitten by this in my
> old age. Second is the LSM specific security context, which
> may not be there at all and if it is the size will depend on
> the LSM in use.
Sure but you have to use an interface that properly handles variable
length binary data, to get to the string. It feels like you are
violating the even more classic one value per file rule.
Maybe I am missing something but is there any reason you can't have
multiple cmsg types?
> There are classic C language techniques for dealing with
> both of these issues, and I've used them enough times to
> want to avoid them where possible. This is the same logic
> that the aforementioned /proc interface implementers have
> been using for some time. And while there are problems
> with formatting, passing and parsing a string they pale
> in comparison to maintaining multiple versions of kernel
> interface structures that are themselves variable depending
> on the kernel configuration.
If you are worrying about variable size structures that vary
depending on kernel configuration I pretty certain you are doing
it wrong.
The use of sprintf (not snprintf) and the crazy size computation needed
for your string also worries me. That part of the implementation
appears to be just asking for trouble.
Having a giant function like your scm_passpriv inline in
include/net/scm.h also seems very questionable.
I think there is a real impedance mismatch here between the interface
you are using and the way you are returning the data.
There is a show stopper bug. You don't translate uid/gid in the
receivers user namespace so passing a message between two processes in
different user namespaces can pass deceptive credentials. Given that the
reason we have the struct cred on unix domain sockets in the first place
is that we handled the user namespace conversion issues so we could
cross namespaces without security issues not handling that case in a new
scm cred is just inexcusable.
On that note Serge and I are slowly working to get credentials to be
namespace local as well, so shortly the credentials will also need to be
translated as well as just uid's and gid's.
The scm->secid is if I understand correctly a per packet label. Is that
what you want here? I thought you were interested in the information
off of struct cred.
In principle returning all of the credential should be fine. In
practice I think this patch has a lot of poorly thought through details
that will be a nightmare to maintain in practice.
Eric
>>> Signed-off-by: Casey Schaufler <casey@schaufler-ca.com>
>>> ---
>>>
>>> include/asm-generic/socket.h | 1 +
>>> include/linux/net.h | 1 +
>>> include/linux/socket.h | 1 +
>>> include/net/scm.h | 80 +++++++++++++++++++++++++++++++++++++++++-
>>> net/core/sock.c | 11 ++++++
>>> 5 files changed, 93 insertions(+), 1 deletions(-)
>>> diff --git a/include/asm-generic/socket.h b/include/asm-generic/socket.h
>>> index 9a6115e..7aa8e84 100644
>>> --- a/include/asm-generic/socket.h
>>> +++ b/include/asm-generic/socket.h
>>> @@ -64,4 +64,5 @@
>>> #define SO_DOMAIN 39
>>>
>>> #define SO_RXQ_OVFL 40
>>> +#define SO_PASSPRIV 41
>>> #endif /* __ASM_GENERIC_SOCKET_H */
>>> diff --git a/include/linux/net.h b/include/linux/net.h
>>> index 16faa13..159a929 100644
>>> --- a/include/linux/net.h
>>> +++ b/include/linux/net.h
>>> @@ -71,6 +71,7 @@ struct net;
>>> #define SOCK_NOSPACE 2
>>> #define SOCK_PASSCRED 3
>>> #define SOCK_PASSSEC 4
>>> +#define SOCK_PASSPRIV 5
>>>
>>> #ifndef ARCH_HAS_SOCKET_TYPES
>>> /**
>>> diff --git a/include/linux/socket.h b/include/linux/socket.h
>>> index 86b652f..e9cfd68 100644
>>> --- a/include/linux/socket.h
>>> +++ b/include/linux/socket.h
>>> @@ -147,6 +147,7 @@ static inline struct cmsghdr * cmsg_nxthdr (struct msghdr *__msg, struct cmsghdr
>>> #define SCM_RIGHTS 0x01 /* rw: access rights (array of int) */
>>> #define SCM_CREDENTIALS 0x02 /* rw: struct ucred */
>>> #define SCM_SECURITY 0x03 /* rw: security label */
>>> +#define SCM_PRIVILEGES 0x04 /* rw: privilege set */
>>>
>>> struct ucred {
>>> __u32 pid;
>>> diff --git a/include/net/scm.h b/include/net/scm.h
>>> index 3165650..4b8db21 100644
>>> --- a/include/net/scm.h
>>> +++ b/include/net/scm.h
>>> @@ -101,6 +101,83 @@ static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct sc
>>> { }
>>> #endif /* CONFIG_SECURITY_NETWORK */
>>>
>>> +static __inline__ void scm_passpriv(struct socket *sock, struct msghdr *msg,
>>> + struct scm_cookie *scm)
>>> +{
>>> + const struct cred *credp = scm->cred;
>>> + const struct group_info *gip;
>>> + char *result;
>>> + char *cp;
>>> + int i;
>>> +#ifdef CONFIG_SECURITY_NETWORK
>>> + char *secdata;
>>> + u32 seclen;
>>> + int err;
>>> +#endif /* CONFIG_SECURITY_NETWORK */
>>> +
>>> + if (!test_bit(SOCK_PASSPRIV, &sock->flags))
>>> + return;
>>> +
>>> + gip = credp->group_info;
>>> +
>>> + /*
>>> + * uid + euid + gid + egid + group-list + capabilities
>>> + * + "uid=" + "euid=" + "gid=" + "egid=" + "grps="
>>> + * + "cap-e=" + "cap-p=" + "cap-i="
>>> + * 10 + 10 + 10 + 10 + (ngrps * 10) + ecap + pcap + icap
>>> + * + 4 + 5 + 4 + 5 + 5 + 6 + 6 + 6
>>> + */
>>> + i = ((4 + gip->ngroups) * 11) + (3 * (_KERNEL_CAPABILITY_U32S * 8 + 1))
>>> + + 41;
>>> +
>>> +#ifdef CONFIG_SECURITY_NETWORK
>>> + err = security_secid_to_secctx(scm->secid, &secdata, &seclen);
>>> + if (!err)
>>> + /*
>>> + * " context="
>>> + */
>>> + i += seclen + 10;
>>> +#endif /* CONFIG_SECURITY_NETWORK */
>>> +
>>> + result = kzalloc(i, GFP_KERNEL);
>>> + if (result == NULL)
>>> + return;
>>> +
>>> + cp = result + sprintf(result, "euid=%d uid=%d egid=%d gid=%d",
>>> + credp->euid, credp->uid,
>>> + credp->egid, credp->gid);
>>> +
>>> + if (gip != NULL && gip->ngroups > 0) {
>>> + cp += sprintf(cp, " grps=%d", GROUP_AT(gip, 0));
>>> + for (i = 1 ; i < gip->ngroups; i++)
>>> + cp += sprintf(cp, ",%d", GROUP_AT(gip, i));
>>> + }
>>> +
>>> + cp += sprintf(cp, " cap-e=");
>>> + CAP_FOR_EACH_U32(i)
>>> + cp += sprintf(cp, "%08x", credp->cap_effective.cap[i]);
>>> + cp += sprintf(cp, " cap-p=");
>>> + CAP_FOR_EACH_U32(i)
>>> + cp += sprintf(cp, "%08x", credp->cap_permitted.cap[i]);
>>> + cp += sprintf(cp, " cap-i=");
>>> + CAP_FOR_EACH_U32(i)
>>> + cp += sprintf(cp, "%08x", credp->cap_inheritable.cap[i]);
>>> +
>>> +#ifdef CONFIG_SECURITY_NETWORK
>>> + cp += sprintf(cp, " context=");
>>> + strncpy(cp, secdata, seclen);
>>> + cp += seclen;
>>> + *cp = '\0';
>>> +
>>> + security_release_secctx(secdata, seclen);
>>> +#endif /* CONFIG_SECURITY_NETWORK */
>>> +
>>> + put_cmsg(msg, SOL_SOCKET, SCM_PRIVILEGES, strlen(result)+1, result);
>>> +
>>> + kfree(result);
>>> +}
>>> +
>>> +
>>> static __inline__ void scm_recv(struct socket *sock, struct msghdr *msg,
>>> struct scm_cookie *scm, int flags)
>>> {
>>> @@ -114,6 +191,8 @@ static __inline__ void scm_recv(struct socket *sock, struct msghdr *msg,
>>> if (test_bit(SOCK_PASSCRED, &sock->flags))
>>> put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(scm->creds), &scm->creds);
>>>
>>> + scm_passpriv(sock, msg, scm);
>>> +
>>> scm_destroy_cred(scm);
>>>
>>> scm_passec(sock, msg, scm);
>>> @@ -124,6 +203,5 @@ static __inline__ void scm_recv(struct socket *sock, struct msghdr *msg,
>>> scm_detach_fds(msg, scm);
>>> }
>>>
>>> -
>>> #endif /* __LINUX_NET_SCM_H */
>>>
>>> diff --git a/net/core/sock.c b/net/core/sock.c
>>> index fb60801..f134126 100644
>>> --- a/net/core/sock.c
>>> +++ b/net/core/sock.c
>>> @@ -725,6 +725,13 @@ set_rcvbuf:
>>> else
>>> clear_bit(SOCK_PASSSEC, &sock->flags);
>>> break;
>>> +
>>> + case SO_PASSPRIV:
>>> + if (valbool)
>>> + set_bit(SOCK_PASSPRIV, &sock->flags);
>>> + else
>>> + clear_bit(SOCK_PASSPRIV, &sock->flags);
>>> + break;
>>> case SO_MARK:
>>> if (!capable(CAP_NET_ADMIN))
>>> ret = -EPERM;
>>> @@ -950,6 +957,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
>>> v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
>>> break;
>>>
>>> + case SO_PASSPRIV:
>>> + v.val = test_bit(SOCK_PASSPRIV, &sock->flags) ? 1 : 0;
>>> + break;
>>> +
>>> case SO_PEERSEC:
>>> return security_socket_getpeersec_stream(sock, optval, optlen, len);
>>>
>>
^ permalink raw reply
* Re: [PATCH 2.6.36] vlan: Avoid hwaccel vlan packets when vid not used
From: Jesse Gross @ 2011-01-19 16:15 UTC (permalink / raw)
To: Matt Carlson
Cc: Michael Leun, Michael Chan, Eric Dumazet, David Miller,
Ben Greear, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
In-Reply-To: <20110114183816.GB1288@mcarlson.broadcom.com>
On Fri, Jan 14, 2011 at 10:38 AM, Matt Carlson <mcarlson@broadcom.com> wrote:
>> >> > In the meantime, I think what we have should go upstream. ?Just to be
>> >> > absolutely clear though, your position is that VLAN tags should always
>> >> > be stripped?
>> >>
>> >> That's what the other converted drivers do by default (though most of
>> >> them also provide an ethtool set_flags() method to change this). ?It's
>> >> generally the most efficient and is now safe to do in all cases. ?It's
>> >> also the consistent with what was happening before, since stripping
>> >> was enabled when a vlan device was configured. ?So, yes, normally I
>> >> think stripping should be enabled.
>> >>
>> >> I assumed that disabling stripping in most situations was just an
>> >> oversight. ?Was there a reason why you feel it is better not to use
>> >> it?
>> >
>> > Actually, the tg3 driver was trying to disable VLAN tag stripping
>> > when possible. ?I believe this was primarily to support the raw packet
>> > interface.
>>
>> Hmm, is this really true?
>>
>> if (!tp->vlgrp &&
>> !(tp->tg3_flags & TG3_FLAG_ENABLE_ASF))
>> rx_mode |= RX_MODE_KEEP_VLAN_TAG;
>>
>> So if a vlan device is registered or ASF is enabled we will use
>> stripping. That seems like it is using stripping in the common case
>> when vlans are used.
>
> Right.
>
>> Before 2.6.37 it was only possible to deliver stripped tags if a vlan
>> group was configured. This means that if ASF was enabled and forced
>> stripping but no group was configured we would lose the tag. In this
>> situation tg3 manually reinserts the tags so raw packet capture will
>> see them, as you say.
>
> Right. VLAN tagged frames can still arrive if CONFIG_VLAN_8021Q or
> CONFIG_VLAN_8021Q_MODULE is not set. That's why the driver was trying
> to keep them inline. To eliminate the unnecessary overhead of
> reinjecting the VLAN tag.
OK, I understand now why you are saying that the driver was trying to
keep stripping disabled. I was thinking about having vlan groups
configured as the common case for receiving vlans and therefore the
case to optimize.
>
>> However, in the current tree this limitation no
>> longer exists and it is always possible to deliver stripped tags to
>> the networking core, which should do the right thing in all
>> situations.
>
> Yes. Even though the stack is capable of reinjecting the VLAN tag,
> doesn't it make sense to avoid that if we knew they would never be
> needed out-of-packet?
Maybe in theory but a relatively small optimization to the raw packet
interface doesn't seem terribly important to me. After all of the
drivers have been converted to the new vlan interfaces I'll be
removing the ndo_vlan_rx_register() function from net_device_ops, so
the driver won't actually know how the vlan tag will be used. The
impetus for this is that there were quite a few bugs and inconsistent
behavior in drivers around stripping logic. Therefore, by moving all
of this into the networking core we can reduce code and get consistent
behavior, which is more important than a corner case optimization.
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox