* [PATCH][XFRM] export SAD info
@ 2007-04-25 15:42 jamal
2007-04-25 15:54 ` jamal
2007-04-26 7:10 ` David Miller
0 siblings, 2 replies; 9+ messages in thread
From: jamal @ 2007-04-25 15:42 UTC (permalink / raw)
To: davem; +Cc: netdev
[-- Attachment #1: Type: text/plain, Size: 210 bytes --]
Dave,
Something ive been meaning to do since you made the hash changes. I will
be doing one also for policy. Against latest Linus tree because i am
having strange challenges syncing net-2.6.22.
cheers,
jamal
[-- Attachment #2: xfrm_sadinfo --]
[-- Type: text/plain, Size: 6441 bytes --]
[XFRM] export SAD info
On a system with a lot of SAs, counting SAD entries chews useful
CPU time since you need to dump the whole SAD to user space;
i.e something like ip xfrm state ls | grep -i src | wc -l
I have seen taking literally minutes on a 40K SAs when the system
is swapping.
With this patch, some of the SAD info (that was already being tracked)
is exposed to user space. i.e you do:
ip xfrm state count
And you get the count; you can also pass -s to the command line and
get the hash info.
Signed-off-by: Jamal Hadi Salim <hadi@cyberus.ca>
---
commit 1fb99604e38f27c1ad4cb74b11f148c34d0d3be6
tree 1bb35db627ac5d3d2f370d0fc993ba6b80392696
parent 146d97b89c83c9460012185bfd584d21a3b5fe19
author Jamal Hadi Salim <hadi@cyberus.ca> Wed, 25 Apr 2007 11:30:21 -0400
committer Jamal Hadi Salim <hadi@cyberus.ca> Wed, 25 Apr 2007 11:30:21 -0400
include/linux/xfrm.h | 25 ++++++++++++++++++++++
include/net/xfrm.h | 8 +++++++
net/xfrm/xfrm_state.c | 12 ++++++++++-
net/xfrm/xfrm_user.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 100 insertions(+), 1 deletions(-)
diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
index 15ca89e..9c656a5 100644
--- a/include/linux/xfrm.h
+++ b/include/linux/xfrm.h
@@ -181,6 +181,10 @@ enum {
XFRM_MSG_MIGRATE,
#define XFRM_MSG_MIGRATE XFRM_MSG_MIGRATE
+ XFRM_MSG_NEWSADINFO,
+#define XFRM_MSG_NEWSADINFO XFRM_MSG_NEWSADINFO
+ XFRM_MSG_GETSADINFO,
+#define XFRM_MSG_GETSADINFO XFRM_MSG_GETSADINFO
__XFRM_MSG_MAX
};
#define XFRM_MSG_MAX (__XFRM_MSG_MAX - 1)
@@ -234,6 +238,17 @@ enum xfrm_ae_ftype_t {
#define XFRM_AE_MAX (__XFRM_AE_MAX - 1)
};
+/* SAD Table filter flags */
+enum xfrm_sad_ftype_t {
+ XFRM_SAD_UNSPEC,
+ XFRM_SAD_HMASK=1,
+ XFRM_SAD_HMAX=2,
+ XFRM_SAD_CNT=4,
+ __XFRM_SAD_MAX
+
+#define XFRM_SAD_MAX (__XFRM_SAD_MAX - 1)
+};
+
struct xfrm_userpolicy_type {
__u8 type;
__u16 reserved1;
@@ -265,6 +280,16 @@ enum xfrm_attr_type_t {
#define XFRMA_MAX (__XFRMA_MAX - 1)
};
+enum xfrm_sadattr_type_t {
+ XFRMA_SAD_UNSPEC,
+ XFRMA_SADHMASK,
+ XFRMA_SADHMAX,
+ XFRMA_SADCNT,
+ __XFRMA_SAD_MAX
+
+#define XFRMA_SAD_MAX (__XFRMA_SAD_MAX - 1)
+};
+
struct xfrm_usersa_info {
struct xfrm_selector sel;
struct xfrm_id id;
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 5a00aa8..4922e9f 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -416,6 +416,13 @@ struct xfrm_audit
u32 secid;
};
+/* SAD metadata, add more later */
+struct xfrm_sadinfo
+{
+ u32 sadhcnt; /* current hash bkts */
+ u32 sadhmcnt; /* max allowed hash bkts */
+ u32 sadcnt; /* current running count */
+};
#ifdef CONFIG_AUDITSYSCALL
extern void xfrm_audit_log(uid_t auid, u32 secid, int type, int result,
struct xfrm_policy *xp, struct xfrm_state *x);
@@ -938,6 +945,7 @@ static inline int xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **s
extern struct xfrm_state *xfrm_find_acq_byseq(u32 seq);
extern int xfrm_state_delete(struct xfrm_state *x);
extern void xfrm_state_flush(u8 proto, struct xfrm_audit *audit_info);
+extern void xfrm_sad_getinfo(struct xfrm_sadinfo *si);
extern int xfrm_replay_check(struct xfrm_state *x, __be32 seq);
extern void xfrm_replay_advance(struct xfrm_state *x, __be32 seq);
extern void xfrm_replay_notify(struct xfrm_state *x, int event);
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index c1581fb..98e5ce3 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -53,7 +53,7 @@ static struct hlist_head *xfrm_state_bysrc __read_mostly;
static struct hlist_head *xfrm_state_byspi __read_mostly;
static unsigned int xfrm_state_hmask __read_mostly;
static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024;
-static u32 xfrm_state_num;
+static unsigned int xfrm_state_num;
static unsigned int xfrm_state_genid;
static inline unsigned int xfrm_dst_hash(xfrm_address_t *daddr,
@@ -421,6 +421,16 @@ restart:
}
EXPORT_SYMBOL(xfrm_state_flush);
+void xfrm_sad_getinfo(struct xfrm_sadinfo *si)
+{
+ spin_lock_bh(&xfrm_state_lock);
+ si->sadcnt = xfrm_state_num;
+ si->sadhcnt = xfrm_state_hmask;
+ si->sadhmcnt = xfrm_state_hashmax;
+ spin_unlock_bh(&xfrm_state_lock);
+}
+EXPORT_SYMBOL(xfrm_sad_getinfo);
+
static int
xfrm_init_tempsel(struct xfrm_state *x, struct flowi *fl,
struct xfrm_tmpl *tmpl,
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 816e369..089159a 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -672,6 +672,61 @@ static struct sk_buff *xfrm_state_netlink(struct sk_buff *in_skb,
return skb;
}
+static int build_sadinfo(struct sk_buff *skb, u32 pid, u32 seq, u32 flags)
+{
+ struct xfrm_sadinfo si;
+ struct nlmsghdr *nlh;
+ u32 *f;
+
+ nlh = nlmsg_put(skb, pid, seq, XFRM_MSG_NEWSADINFO, sizeof(u32), 0);
+ if (nlh == NULL) /* shouldnt really happen ... */
+ return -EMSGSIZE;
+
+ f = nlmsg_data(nlh);
+ *f = flags;
+ xfrm_sad_getinfo(&si);
+
+ if (flags & XFRM_SAD_HMASK)
+ NLA_PUT_U32(skb, XFRMA_SADHMASK, si.sadhcnt);
+ if (flags & XFRM_SAD_HMAX)
+ NLA_PUT_U32(skb, XFRMA_SADHMAX, si.sadhmcnt);
+ if (flags & XFRM_SAD_CNT)
+ NLA_PUT_U32(skb, XFRMA_SADCNT, si.sadcnt);
+
+ return nlmsg_end(skb, nlh);
+
+nla_put_failure:
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+}
+
+static int xfrm_get_sadinfo(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct rtattr **xfrma)
+{
+ struct sk_buff *r_skb;
+ u32 *flags = NLMSG_DATA(nlh);
+ u32 spid = NETLINK_CB(skb).pid;
+ u32 seq = nlh->nlmsg_seq;
+ int len = NLMSG_LENGTH(sizeof(u32));
+
+ if (*flags & XFRM_SAD_HMASK)
+ len += RTA_SPACE(sizeof(u32));
+ if (*flags & XFRM_SAD_HMAX)
+ len += RTA_SPACE(sizeof(u32));
+ if (*flags & XFRM_SAD_CNT)
+ len += RTA_SPACE(sizeof(u32));
+
+ r_skb = alloc_skb(len, GFP_ATOMIC);
+
+ if (r_skb == NULL)
+ return -ENOMEM;
+
+ if (build_sadinfo(r_skb, spid, seq, *flags) < 0)
+ BUG();
+
+ return nlmsg_unicast(xfrm_nl, r_skb, spid);
+}
+
static int xfrm_get_sa(struct sk_buff *skb, struct nlmsghdr *nlh,
struct rtattr **xfrma)
{
@@ -1850,6 +1905,7 @@ static struct xfrm_link {
[XFRM_MSG_NEWAE - XFRM_MSG_BASE] = { .doit = xfrm_new_ae },
[XFRM_MSG_GETAE - XFRM_MSG_BASE] = { .doit = xfrm_get_ae },
[XFRM_MSG_MIGRATE - XFRM_MSG_BASE] = { .doit = xfrm_do_migrate },
+ [XFRM_MSG_GETSADINFO - XFRM_MSG_BASE] = { .doit = xfrm_get_sadinfo },
};
static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp)
^ permalink raw reply related [flat|nested] 9+ messages in thread* Re: [PATCH][XFRM] export SAD info
2007-04-25 15:42 [PATCH][XFRM] export SAD info jamal
@ 2007-04-25 15:54 ` jamal
2007-04-26 7:18 ` David Miller
2007-04-26 7:10 ` David Miller
1 sibling, 1 reply; 9+ messages in thread
From: jamal @ 2007-04-25 15:54 UTC (permalink / raw)
To: davem; +Cc: netdev
That patch has xfrm_state_num being mucked with; just ignore that bit.
I need to send a patch against net-2.6.22 and i will clean that up -
just need some feedback.
Would it make sense to have those vars as u32 instead of unsigned int?
cheers,
jamal
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH][XFRM] export SAD info
2007-04-25 15:54 ` jamal
@ 2007-04-26 7:18 ` David Miller
2007-04-26 13:10 ` jamal
0 siblings, 1 reply; 9+ messages in thread
From: David Miller @ 2007-04-26 7:18 UTC (permalink / raw)
To: hadi; +Cc: netdev
From: jamal <hadi@cyberus.ca>
Date: Wed, 25 Apr 2007 11:54:50 -0400
> That patch has xfrm_state_num being mucked with; just ignore that bit.
> I need to send a patch against net-2.6.22 and i will clean that up -
> just need some feedback.
>
> Would it make sense to have those vars as u32 instead of unsigned int?
I'm ambivalent, "unsigned int" happens to be 32-bit on every platform.
So changing it would cause no harm :-)
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH][XFRM] export SAD info
2007-04-26 7:18 ` David Miller
@ 2007-04-26 13:10 ` jamal
2007-04-26 21:18 ` David Miller
0 siblings, 1 reply; 9+ messages in thread
From: jamal @ 2007-04-26 13:10 UTC (permalink / raw)
To: David Miller; +Cc: netdev
On Thu, 2007-26-04 at 00:18 -0700, David Miller wrote:
> From: jamal <hadi@cyberus.ca>
> > Would it make sense to have those vars as u32 instead of unsigned int?
>
> I'm ambivalent, "unsigned int" happens to be 32-bit on every platform.
> So changing it would cause no harm :-)
If unsigned int is always u32 i will leave it as is.
I would have liked to just do a read_lock_bh when retrieving the table
metadata; however, the state table lock is defined as DEFINE_SPINLOCK
unlike the policy table which is defined as DEFINE_RWLOCK.
Any objection to change the state lock to be RW?
BTW, if i can get the SADinfo, then i should be able to set it from user
space too;->
So that would be my next change unless there is objection.
One other angle is start rejecting additions to the table after some
point. To test, I wrote a little DOS tool that just kept adding entries
until an OOM hit. It is a lot of fun to watch when you hit a point that
swap is guzzling 2G or more. The add latency starts going up
exponentially.
I would like to enable the admin to set the proper param settings for
upper bound. Exceeding the upper bounds of the max entries a table
should have returns ENOMEM for any new entries. By default current
behavior is maintained.
Thoughts?
cheers,
jamal
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH][XFRM] export SAD info
2007-04-26 13:10 ` jamal
@ 2007-04-26 21:18 ` David Miller
2007-04-27 14:21 ` jamal
0 siblings, 1 reply; 9+ messages in thread
From: David Miller @ 2007-04-26 21:18 UTC (permalink / raw)
To: hadi; +Cc: netdev
From: jamal <hadi@cyberus.ca>
Date: Thu, 26 Apr 2007 09:10:10 -0400
> I would have liked to just do a read_lock_bh when retrieving the table
> metadata; however, the state table lock is defined as DEFINE_SPINLOCK
> unlike the policy table which is defined as DEFINE_RWLOCK.
> Any objection to change the state lock to be RW?
I wouldn't mind if it actually helped anything.
The SMP cache line transactions are more expensive than the
execution of the code blocks they are protecting. rwlock's
rarely help, and when they do (the execution path is more
expensive than the SMP atomic operations) then you're holding
the lock too long :-)
> One other angle is start rejecting additions to the table after some
> point. To test, I wrote a little DOS tool that just kept adding entries
> until an OOM hit. It is a lot of fun to watch when you hit a point that
> swap is guzzling 2G or more. The add latency starts going up
> exponentially.
I would prefer a dynamic algorithm that reacts to system memory
pressure and yet-another-knob that people will get wrong and
there is no sane default for.
I plan to do away with all the GC threshold madness in the
routing cache, for example, and just let the MM layer call
back into us when there is memory pressure to trigger GC.
See set_shrinker() and friends. The MM calls into these
handlers in response to memory pressure. There is no
reason the networking can't hook into this and do things
properly instead of the ad-hoc manner we currently use.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH][XFRM] export SAD info
2007-04-26 21:18 ` David Miller
@ 2007-04-27 14:21 ` jamal
0 siblings, 0 replies; 9+ messages in thread
From: jamal @ 2007-04-27 14:21 UTC (permalink / raw)
To: David Miller; +Cc: netdev
On Thu, 2007-26-04 at 14:18 -0700, David Miller wrote:
> I wouldn't mind if it actually helped anything.
>
> The SMP cache line transactions are more expensive than the
> execution of the code blocks they are protecting. rwlock's
> rarely help, and when they do (the execution path is more
> expensive than the SMP atomic operations) then you're holding
> the lock too long :-)
>
Ok ;->
So if i was to make any change, it would be for consistency
with SPD. If this is sufficiently compelling i will send a patch.
> I would prefer a dynamic algorithm that reacts to system memory
> pressure and yet-another-knob that people will get wrong and
> there is no sane default for.
>
This would certainly be a better approach if doable.
> I plan to do away with all the GC threshold madness in the
> routing cache, for example, and just let the MM layer call
> back into us when there is memory pressure to trigger GC.
>
> See set_shrinker() and friends. The MM calls into these
> handlers in response to memory pressure. There is no
> reason the networking can't hook into this and do things
> properly instead of the ad-hoc manner we currently use.
Scanning the kernel ...
I wasnt aware of this, neat; not many areas in the kernel seem to use
it. I find this stuff interesting, so i may get too verbose ;->
One approach i tried was to write an oom_handler - but it seemed to
get invoked a little too late, i.e when shit has already hit the fan.
If only i could get notified just before swap kicks in or just when some
preconfigured (by me) memmory threshold is hit.... This may do it? I
will experiment. Actually for it to work well, I will need to know when
the memory threshold is crossed as it goes down and when it is going up
as more memory gets freed.
I can see the shrinker working well with dynamically createable
entries (route cache, arp cache, contrack etc); shrinking a SAD, SPD,
FIB etc that was created by some user space app without user space
consent or at least notification may be unacceptable (imagine Quagga/BGP
adding FIB entries and the kernel deciding its gonna run out of mem and
starting to delete entries; worse deleting SAD entries may be a security
risk etc etc). My problem is more related to these sorts of user
controlled tables.
One approach that may work to address the above is to send a signal to
user space when the low mem threshold is approaching.. User space then
uses that info to slow down its abuse of memory. I think that signaling
maybe achievable by a genlmsg being sent to a multicast group which a
user space app will have to subscribe to.
Another approach is to use the shrinker callback to set a lowmem
condition to start rejecting any new table additions. A timer to
retry would take it back; a callback from the VM to say "you can go
ahead and alloc more now" would be better of course - i couldnt see this
anywhere in the VM code, but it is one of those subsystem i dont pay
attention to, it may be there.
Thoughts? ;->
cheers,
jamal
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH][XFRM] export SAD info
2007-04-25 15:42 [PATCH][XFRM] export SAD info jamal
2007-04-25 15:54 ` jamal
@ 2007-04-26 7:10 ` David Miller
2007-04-26 12:55 ` jamal
1 sibling, 1 reply; 9+ messages in thread
From: David Miller @ 2007-04-26 7:10 UTC (permalink / raw)
To: hadi; +Cc: netdev
From: jamal <hadi@cyberus.ca>
Date: Wed, 25 Apr 2007 11:42:41 -0400
> [XFRM] export SAD info
>
> On a system with a lot of SAs, counting SAD entries chews useful
> CPU time since you need to dump the whole SAD to user space;
> i.e something like ip xfrm state ls | grep -i src | wc -l
> I have seen taking literally minutes on a 40K SAs when the system
> is swapping.
> With this patch, some of the SAD info (that was already being tracked)
> is exposed to user space. i.e you do:
> ip xfrm state count
> And you get the count; you can also pass -s to the command line and
> get the hash info.
>
> Signed-off-by: Jamal Hadi Salim <hadi@cyberus.ca>
Applied, thanks Jamal.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH][XFRM] export SAD info
2007-04-26 7:10 ` David Miller
@ 2007-04-26 12:55 ` jamal
2007-04-26 21:12 ` David Miller
0 siblings, 1 reply; 9+ messages in thread
From: jamal @ 2007-04-26 12:55 UTC (permalink / raw)
To: David Miller; +Cc: netdev
[-- Attachment #1: Type: text/plain, Size: 147 bytes --]
Thanks Dave.
Here's a missing bit to get in sync with latest net-2.6
I will send you the policy side sometime tonight or tommorow.
cheers,
jamal
[-- Attachment #2: xfrm_sadinfo-net26 --]
[-- Type: text/plain, Size: 1536 bytes --]
[XFRM] missing bits to SAD info
This brings the SAD info in sync with net-2.6.22/net-2.6
Signed-off-by: Jamal Hadi Salim <hadi@cyberus.ca>
---
commit e1dbdd993637d14c17bc30ba55e5c66640d39907
tree fef4015f92e4e50f98e1a1630333cc59ed4ec523
parent bfbf3c0968498f5232c02965cf41695edae1bc4d
author Jamal Hadi Salim <hadi@cyberus.ca> Thu, 26 Apr 2007 08:52:29 -0400
committer Jamal Hadi Salim <hadi@cyberus.ca> Thu, 26 Apr 2007 08:52:29 -0400
net/xfrm/xfrm_user.c | 3 ++-
1 files changed, 2 insertions(+), 1 deletions(-)
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index cb4cc1b..69110fe 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -1878,6 +1878,7 @@ static const int xfrm_msg_min[XFRM_NR_MSGTYPES] = {
[XFRM_MSG_GETAE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_aevent_id),
[XFRM_MSG_REPORT - XFRM_MSG_BASE] = XMSGSIZE(xfrm_user_report),
[XFRM_MSG_MIGRATE - XFRM_MSG_BASE] = XMSGSIZE(xfrm_userpolicy_id),
+ [XFRM_MSG_GETSADINFO - XFRM_MSG_BASE] = NLMSG_LENGTH(sizeof(u32)),
};
#undef XMSGSIZE
@@ -1905,7 +1906,7 @@ static struct xfrm_link {
[XFRM_MSG_NEWAE - XFRM_MSG_BASE] = { .doit = xfrm_new_ae },
[XFRM_MSG_GETAE - XFRM_MSG_BASE] = { .doit = xfrm_get_ae },
[XFRM_MSG_MIGRATE - XFRM_MSG_BASE] = { .doit = xfrm_do_migrate },
- [XFRM_MSG_GETSADINFO - XFRM_MSG_BASE] = { .doit = xfrm_get_sadinfo },
+ [XFRM_MSG_GETSADINFO - XFRM_MSG_BASE] = { .doit = xfrm_get_sadinfo },
};
static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
^ permalink raw reply related [flat|nested] 9+ messages in thread
end of thread, other threads:[~2007-04-27 14:21 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2007-04-25 15:42 [PATCH][XFRM] export SAD info jamal
2007-04-25 15:54 ` jamal
2007-04-26 7:18 ` David Miller
2007-04-26 13:10 ` jamal
2007-04-26 21:18 ` David Miller
2007-04-27 14:21 ` jamal
2007-04-26 7:10 ` David Miller
2007-04-26 12:55 ` jamal
2007-04-26 21:12 ` David Miller
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).