* [patch v2.2 4/4] [PATCH v2.1 4/4] libxt_ipvs: user-space lib for netfilter matcher xt_ipvs
From: Simon Horman @ 2010-05-01 3:20 UTC (permalink / raw)
To: lvs-devel, netdev, linux-kernel, netfilter
Cc: Wensong Zhang, Julius Volz, Patrick McHardy, David S. Miller,
Hannes Eder
In-Reply-To: <20100501032014.406353538@vergenet.net>
[-- Attachment #1: 4.patch --]
[-- Type: text/plain, Size: 13571 bytes --]
From: Hannes Eder <heder@google.com>
The user-space library for the netfilter matcher xt_ipvs.
[ trivial up-port by Simon Horman <horms@verge.net.au> ]
Signed-off-by: Hannes Eder <heder@google.com>
Acked-by: Simon Horman <horms@verge.net.au>
configure.ac | 10 -
extensions/libxt_ipvs.c | 365 +++++++++++++++++++++++++++++++++++++
extensions/libxt_ipvs.man | 24 ++
include/linux/netfilter/xt_ipvs.h | 25 +++
4 files changed, 422 insertions(+), 2 deletions(-)
create mode 100644 extensions/libxt_ipvs.c
create mode 100644 extensions/libxt_ipvs.man
create mode 100644 include/linux/netfilter/xt_ipvs.h
diff --git a/configure.ac b/configure.ac
index 0419ea7..52e9223 100644
--- a/configure.ac
+++ b/configure.ac
@@ -47,12 +46,18 @@ AC_ARG_WITH([pkgconfigdir], AS_HELP_STRING([--with-pkgconfigdir=PATH],
[Path to the pkgconfig directory [[LIBDIR/pkgconfig]]]),
[pkgconfigdir="$withval"], [pkgconfigdir='${libdir}/pkgconfig'])
-AC_CHECK_HEADER([linux/dccp.h])
-
blacklist_modules="";
+
+AC_CHECK_HEADER([linux/dccp.h])
if test "$ac_cv_header_linux_dccp_h" != "yes"; then
blacklist_modules="$blacklist_modules dccp";
fi;
+
+AC_CHECK_HEADER([linux/ip_vs.h])
+if test "$ac_cv_header_linux_ip_vs_h" != "yes"; then
+ blacklist_modules="$blacklist_modules ipvs";
+fi;
+
AC_SUBST([blacklist_modules])
AM_CONDITIONAL([ENABLE_STATIC], [test "$enable_static" = "yes"])
diff --git a/extensions/libxt_ipvs.c b/extensions/libxt_ipvs.c
new file mode 100644
index 0000000..6843551
--- /dev/null
+++ b/extensions/libxt_ipvs.c
@@ -0,0 +1,365 @@
+/*
+ * Shared library add-on to iptables to add IPVS matching.
+ *
+ * Detailed doc is in the kernel module source net/netfilter/xt_ipvs.c
+ *
+ * Author: Hannes Eder <heder@google.com>
+ */
+#include <sys/types.h>
+#include <assert.h>
+#include <ctype.h>
+#include <errno.h>
+#include <getopt.h>
+#include <netdb.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <xtables.h>
+#include <linux/ip_vs.h>
+#include <linux/netfilter/xt_ipvs.h>
+
+static const struct option ipvs_mt_opts[] = {
+ { .name = "ipvs", .has_arg = false, .val = '0' },
+ { .name = "vproto", .has_arg = true, .val = '1' },
+ { .name = "vaddr", .has_arg = true, .val = '2' },
+ { .name = "vport", .has_arg = true, .val = '3' },
+ { .name = "vdir", .has_arg = true, .val = '4' },
+ { .name = "vmethod", .has_arg = true, .val = '5' },
+ { .name = "vportctl", .has_arg = true, .val = '6' },
+ { .name = NULL }
+};
+
+static void ipvs_mt_help(void)
+{
+ printf(
+"IPVS match options:\n"
+"[!] --ipvs packet belongs to an IPVS connection\n"
+"\n"
+"Any of the following options implies --ipvs (even negated)\n"
+"[!] --vproto protocol VIP protocol to match; by number or name,\n"
+" e.g. \"tcp\"\n"
+"[!] --vaddr address[/mask] VIP address to match\n"
+"[!] --vport port VIP port to match; by number or name,\n"
+" e.g. \"http\"\n"
+" --vdir {ORIGINAL|REPLY} flow direction of packet\n"
+"[!] --vmethod {GATE|IPIP|MASQ} IPVS forwarding method used\n"
+"[!] --vportctl port VIP port of the controlling connection to\n"
+" match, e.g. 21 for FTP\n"
+ );
+}
+
+static void ipvs_mt_parse_addr_and_mask(const char *arg,
+ union nf_inet_addr *address,
+ union nf_inet_addr *mask,
+ unsigned int family)
+{
+ struct in_addr *addr = NULL;
+ struct in6_addr *addr6 = NULL;
+ unsigned int naddrs = 0;
+
+ if (family == NFPROTO_IPV4) {
+ xtables_ipparse_any(arg, &addr, &mask->in, &naddrs);
+ if (naddrs > 1)
+ xtables_error(PARAMETER_PROBLEM,
+ "multiple IP addresses not allowed");
+ if (naddrs == 1)
+ memcpy(&address->in, addr, sizeof(*addr));
+ } else if (family == NFPROTO_IPV6) {
+ xtables_ip6parse_any(arg, &addr6, &mask->in6, &naddrs);
+ if (naddrs > 1)
+ xtables_error(PARAMETER_PROBLEM,
+ "multiple IP addresses not allowed");
+ if (naddrs == 1)
+ memcpy(&address->in6, addr6, sizeof(*addr6));
+ } else {
+ /* Hu? */
+ assert(false);
+ }
+}
+
+/* Function which parses command options; returns true if it ate an option */
+static int ipvs_mt_parse(int c, char **argv, int invert, unsigned int *flags,
+ const void *entry, struct xt_entry_match **match,
+ unsigned int family)
+{
+ struct xt_ipvs_mtinfo *data = (void *)(*match)->data;
+ char *p = NULL;
+ u_int8_t op = 0;
+
+ if ('0' <= c && c <= '6') {
+ static const int ops[] = {
+ XT_IPVS_IPVS_PROPERTY,
+ XT_IPVS_PROTO,
+ XT_IPVS_VADDR,
+ XT_IPVS_VPORT,
+ XT_IPVS_DIR,
+ XT_IPVS_METHOD,
+ XT_IPVS_VPORTCTL
+ };
+ op = ops[c - '0'];
+ } else
+ return 0;
+
+ if (*flags & op & XT_IPVS_ONCE_MASK)
+ goto multiple_use;
+
+ switch (c) {
+ case '0': /* --ipvs */
+ /* Nothing to do here. */
+ break;
+
+ case '1': /* --vproto */
+ /* Canonicalize into lower case */
+ for (p = optarg; *p != '\0'; ++p)
+ *p = tolower(*p);
+
+ data->l4proto = xtables_parse_protocol(optarg);
+ break;
+
+ case '2': /* --vaddr */
+ ipvs_mt_parse_addr_and_mask(optarg, &data->vaddr,
+ &data->vmask, family);
+ break;
+
+ case '3': /* --vport */
+ data->vport = htons(xtables_parse_port(optarg, "tcp"));
+ break;
+
+ case '4': /* --vdir */
+ xtables_param_act(XTF_NO_INVERT, "ipvs", "--vdir", invert);
+ if (strcasecmp(optarg, "ORIGINAL") == 0) {
+ data->bitmask |= XT_IPVS_DIR;
+ data->invert &= ~XT_IPVS_DIR;
+ } else if (strcasecmp(optarg, "REPLY") == 0) {
+ data->bitmask |= XT_IPVS_DIR;
+ data->invert |= XT_IPVS_DIR;
+ } else {
+ xtables_param_act(XTF_BAD_VALUE,
+ "ipvs", "--vdir", optarg);
+ }
+ break;
+
+ case '5': /* --vmethod */
+ if (strcasecmp(optarg, "GATE") == 0)
+ data->fwd_method = IP_VS_CONN_F_DROUTE;
+ else if (strcasecmp(optarg, "IPIP") == 0)
+ data->fwd_method = IP_VS_CONN_F_TUNNEL;
+ else if (strcasecmp(optarg, "MASQ") == 0)
+ data->fwd_method = IP_VS_CONN_F_MASQ;
+ else
+ xtables_param_act(XTF_BAD_VALUE,
+ "ipvs", "--vmethod", optarg);
+ break;
+
+ case '6': /* --vportctl */
+ data->vportctl = htons(xtables_parse_port(optarg, "tcp"));
+ break;
+
+ default:
+ /* Hu? How did we come here? */
+ assert(false);
+ return 0;
+ }
+
+ if (op & XT_IPVS_ONCE_MASK) {
+ if (data->invert & XT_IPVS_IPVS_PROPERTY)
+ xtables_error(PARAMETER_PROBLEM,
+ "! --ipvs cannot be together with"
+ " other options");
+ data->bitmask |= XT_IPVS_IPVS_PROPERTY;
+ }
+
+ data->bitmask |= op;
+ if (invert)
+ data->invert |= op;
+ *flags |= op;
+ return 1;
+
+multiple_use:
+ xtables_error(PARAMETER_PROBLEM,
+ "multiple use of the same IPVS option is not allowed");
+}
+
+static int ipvs_mt4_parse(int c, char **argv, int invert, unsigned int *flags,
+ const void *entry, struct xt_entry_match **match)
+{
+ return ipvs_mt_parse(c, argv, invert, flags, entry, match,
+ NFPROTO_IPV4);
+}
+
+static int ipvs_mt6_parse(int c, char **argv, int invert, unsigned int *flags,
+ const void *entry, struct xt_entry_match **match)
+{
+ return ipvs_mt_parse(c, argv, invert, flags, entry, match,
+ NFPROTO_IPV6);
+}
+
+static void ipvs_mt_check(unsigned int flags)
+{
+ if (flags == 0)
+ xtables_error(PARAMETER_PROBLEM,
+ "IPVS: At least one option is required");
+}
+
+/* Shamelessly copied from libxt_conntrack.c */
+static void ipvs_mt_dump_addr(const union nf_inet_addr *addr,
+ const union nf_inet_addr *mask,
+ unsigned int family, bool numeric)
+{
+ char buf[BUFSIZ];
+
+ if (family == NFPROTO_IPV4) {
+ if (!numeric && addr->ip == 0) {
+ printf("anywhere ");
+ return;
+ }
+ if (numeric)
+ strcpy(buf, xtables_ipaddr_to_numeric(&addr->in));
+ else
+ strcpy(buf, xtables_ipaddr_to_anyname(&addr->in));
+ strcat(buf, xtables_ipmask_to_numeric(&mask->in));
+ printf("%s ", buf);
+ } else if (family == NFPROTO_IPV6) {
+ if (!numeric && addr->ip6[0] == 0 && addr->ip6[1] == 0 &&
+ addr->ip6[2] == 0 && addr->ip6[3] == 0) {
+ printf("anywhere ");
+ return;
+ }
+ if (numeric)
+ strcpy(buf, xtables_ip6addr_to_numeric(&addr->in6));
+ else
+ strcpy(buf, xtables_ip6addr_to_anyname(&addr->in6));
+ strcat(buf, xtables_ip6mask_to_numeric(&mask->in6));
+ printf("%s ", buf);
+ }
+}
+
+static void ipvs_mt_dump(const void *ip, const struct xt_ipvs_mtinfo *data,
+ unsigned int family, bool numeric, const char *prefix)
+{
+ if (data->bitmask == XT_IPVS_IPVS_PROPERTY) {
+ if (data->invert & XT_IPVS_IPVS_PROPERTY)
+ printf("! ");
+ printf("%sipvs ", prefix);
+ }
+
+ if (data->bitmask & XT_IPVS_PROTO) {
+ if (data->invert & XT_IPVS_PROTO)
+ printf("! ");
+ printf("%sproto %u ", prefix, data->l4proto);
+ }
+
+ if (data->bitmask & XT_IPVS_VADDR) {
+ if (data->invert & XT_IPVS_VADDR)
+ printf("! ");
+
+ printf("%svaddr ", prefix);
+ ipvs_mt_dump_addr(&data->vaddr, &data->vmask, family, numeric);
+ }
+
+ if (data->bitmask & XT_IPVS_VPORT) {
+ if (data->invert & XT_IPVS_VPORT)
+ printf("! ");
+
+ printf("%svport %u ", prefix, ntohs(data->vport));
+ }
+
+ if (data->bitmask & XT_IPVS_DIR) {
+ if (data->invert & XT_IPVS_DIR)
+ printf("%svdir REPLY ", prefix);
+ else
+ printf("%svdir ORIGINAL ", prefix);
+ }
+
+ if (data->bitmask & XT_IPVS_METHOD) {
+ if (data->invert & XT_IPVS_METHOD)
+ printf("! ");
+
+ printf("%svmethod ", prefix);
+ switch (data->fwd_method) {
+ case IP_VS_CONN_F_DROUTE:
+ printf("GATE ");
+ break;
+ case IP_VS_CONN_F_TUNNEL:
+ printf("IPIP ");
+ break;
+ case IP_VS_CONN_F_MASQ:
+ printf("MASQ ");
+ break;
+ default:
+ /* Hu? */
+ printf("UNKNOWN ");
+ break;
+ }
+ }
+
+ if (data->bitmask & XT_IPVS_VPORTCTL) {
+ if (data->invert & XT_IPVS_VPORTCTL)
+ printf("! ");
+
+ printf("%svportctl %u ", prefix, ntohs(data->vportctl));
+ }
+}
+
+static void ipvs_mt4_print(const void *ip, const struct xt_entry_match *match,
+ int numeric)
+{
+ const struct xt_ipvs_mtinfo *data = (const void *)match->data;
+ ipvs_mt_dump(ip, data, NFPROTO_IPV4, numeric, "");
+}
+
+static void ipvs_mt6_print(const void *ip, const struct xt_entry_match *match,
+ int numeric)
+{
+ const struct xt_ipvs_mtinfo *data = (const void *)match->data;
+ ipvs_mt_dump(ip, data, NFPROTO_IPV6, numeric, "");
+}
+
+static void ipvs_mt4_save(const void *ip, const struct xt_entry_match *match)
+{
+ const struct xt_ipvs_mtinfo *data = (const void *)match->data;
+ ipvs_mt_dump(ip, data, NFPROTO_IPV4, true, "--");
+}
+
+static void ipvs_mt6_save(const void *ip, const struct xt_entry_match *match)
+{
+ const struct xt_ipvs_mtinfo *data = (const void *)match->data;
+ ipvs_mt_dump(ip, data, NFPROTO_IPV6, true, "--");
+}
+
+static struct xtables_match ipvs_matches_reg[] = {
+ {
+ .version = XTABLES_VERSION,
+ .name = "ipvs",
+ .revision = 0,
+ .family = NFPROTO_IPV4,
+ .size = XT_ALIGN(sizeof(struct xt_ipvs_mtinfo)),
+ .userspacesize = XT_ALIGN(sizeof(struct xt_ipvs_mtinfo)),
+ .help = ipvs_mt_help,
+ .parse = ipvs_mt4_parse,
+ .final_check = ipvs_mt_check,
+ .print = ipvs_mt4_print,
+ .save = ipvs_mt4_save,
+ .extra_opts = ipvs_mt_opts,
+ },
+ {
+ .version = XTABLES_VERSION,
+ .name = "ipvs",
+ .revision = 0,
+ .family = NFPROTO_IPV6,
+ .size = XT_ALIGN(sizeof(struct xt_ipvs_mtinfo)),
+ .userspacesize = XT_ALIGN(sizeof(struct xt_ipvs_mtinfo)),
+ .help = ipvs_mt_help,
+ .parse = ipvs_mt6_parse,
+ .final_check = ipvs_mt_check,
+ .print = ipvs_mt6_print,
+ .save = ipvs_mt6_save,
+ .extra_opts = ipvs_mt_opts,
+ },
+};
+
+void _init(void)
+{
+ xtables_register_matches(ipvs_matches_reg,
+ ARRAY_SIZE(ipvs_matches_reg));
+}
diff --git a/extensions/libxt_ipvs.man b/extensions/libxt_ipvs.man
new file mode 100644
index 0000000..8968e1a
--- /dev/null
+++ b/extensions/libxt_ipvs.man
@@ -0,0 +1,24 @@
+Match IPVS connection properties.
+.TP
+[\fB!\fR] \fB\-\-ipvs\fP
+packet belongs to an IPVS connection
+.TP
+Any of the following options implies \-\-ipvs (even negated)
+.TP
+[\fB!\fR] \fB\-\-vproto\fP \fIprotocol\fP
+VIP protocol to match; by number or name, e.g. "tcp"
+.TP
+[\fB!\fR] \fB\-\-vaddr\fP \fIaddress\fP[\fB/\fP\fImask\fP]
+VIP address to match
+.TP
+[\fB!\fR] \fB\-\-vport\fP \fIport\fP
+VIP port to match; by number or name, e.g. "http"
+.TP
+\fB\-\-vdir\fP {\fBORIGINAL\fP|\fBREPLY\fP}
+flow direction of packet
+.TP
+[\fB!\fR] \fB\-\-vmethod\fP {\fBGATE\fP|\fBIPIP\fP|\fBMASQ\fP}
+IPVS forwarding method used
+.TP
+[\fB!\fR] \fB\-\-vportctl\fP \fIport\fP
+VIP port of the controlling connection to match, e.g. 21 for FTP
diff --git a/include/linux/netfilter/xt_ipvs.h b/include/linux/netfilter/xt_ipvs.h
new file mode 100644
index 0000000..32f3051
--- /dev/null
+++ b/include/linux/netfilter/xt_ipvs.h
@@ -0,0 +1,25 @@
+#ifndef _XT_IPVS_H
+#define _XT_IPVS_H 1
+
+#define XT_IPVS_IPVS_PROPERTY (1 << 0) /* all other options imply this one */
+#define XT_IPVS_PROTO (1 << 1)
+#define XT_IPVS_VADDR (1 << 2)
+#define XT_IPVS_VPORT (1 << 3)
+#define XT_IPVS_DIR (1 << 4)
+#define XT_IPVS_METHOD (1 << 5)
+#define XT_IPVS_VPORTCTL (1 << 6)
+#define XT_IPVS_MASK ((1 << 7) - 1)
+#define XT_IPVS_ONCE_MASK (XT_IPVS_MASK & ~XT_IPVS_IPVS_PROPERTY)
+
+struct xt_ipvs_mtinfo {
+ union nf_inet_addr vaddr, vmask;
+ __be16 vport;
+ __u16 l4proto;
+ __u16 fwd_method;
+ __be16 vportctl;
+
+ __u8 invert;
+ __u8 bitmask;
+};
+
+#endif /* _XT_IPVS_H */
^ permalink raw reply related
* [patch v2.2 2/4] [PATCH v2.1 2/4] IPVS: make friends with nf_conntrack
From: Simon Horman @ 2010-05-01 3:20 UTC (permalink / raw)
To: lvs-devel, netdev, linux-kernel, netfilter
Cc: Wensong Zhang, Julius Volz, Patrick McHardy, David S. Miller,
Hannes Eder
In-Reply-To: <20100501032014.406353538@vergenet.net>
[-- Attachment #1: 2.patch --]
[-- Type: text/plain, Size: 5469 bytes --]
From: Hannes Eder <heder@google.com>
Update the nf_conntrack tuple in reply direction, as we will see
traffic from the real server (RIP) to the client (CIP). Once this is
done we can use netfilters SNAT in POSTROUTING, especially with
xt_ipvs, to do source NAT, e.g.:
% iptables -t nat -A POSTROUTING -m ipvs --vaddr 192.168.100.30/32 --vport 80 \
> -j SNAT --to-source 192.168.10.10
Signed-off-by: Hannes Eder <heder@google.com>
Signed-off-by: Simon Horman <horms@verge.net.au>
net/netfilter/ipvs/Kconfig | 2 +-
net/netfilter/ipvs/ip_vs_core.c | 36 ------------------------------------
net/netfilter/ipvs/ip_vs_xmit.c | 30 ++++++++++++++++++++++++++++++
3 files changed, 31 insertions(+), 37 deletions(-)
Index: nf-next-2.6/net/netfilter/ipvs/Kconfig
===================================================================
--- nf-next-2.6.orig/net/netfilter/ipvs/Kconfig 2010-04-29 20:11:51.000000000 +0900
+++ nf-next-2.6/net/netfilter/ipvs/Kconfig 2010-04-29 20:11:59.000000000 +0900
@@ -3,7 +3,7 @@
#
menuconfig IP_VS
tristate "IP virtual server support"
- depends on NET && INET && NETFILTER
+ depends on NET && INET && NETFILTER && NF_CONNTRACK
---help---
IP Virtual Server support will let you build a high-performance
virtual server based on cluster of two or more real servers. This
Index: nf-next-2.6/net/netfilter/ipvs/ip_vs_core.c
===================================================================
--- nf-next-2.6.orig/net/netfilter/ipvs/ip_vs_core.c 2010-04-29 20:11:51.000000000 +0900
+++ nf-next-2.6/net/netfilter/ipvs/ip_vs_core.c 2010-04-29 20:11:59.000000000 +0900
@@ -521,26 +521,6 @@ int ip_vs_leave(struct ip_vs_service *sv
return NF_DROP;
}
-
-/*
- * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING
- * chain, and is used for VS/NAT.
- * It detects packets for VS/NAT connections and sends the packets
- * immediately. This can avoid that iptable_nat mangles the packets
- * for VS/NAT.
- */
-static unsigned int ip_vs_post_routing(unsigned int hooknum,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- if (!skb->ipvs_property)
- return NF_ACCEPT;
- /* The packet was sent from IPVS, exit this chain */
- return NF_STOP;
-}
-
__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
{
return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
@@ -1443,14 +1423,6 @@ static struct nf_hook_ops ip_vs_ops[] __
.hooknum = NF_INET_FORWARD,
.priority = 99,
},
- /* Before the netfilter connection tracking, exit from POST_ROUTING */
- {
- .hook = ip_vs_post_routing,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_INET_POST_ROUTING,
- .priority = NF_IP_PRI_NAT_SRC-1,
- },
#ifdef CONFIG_IP_VS_IPV6
/* After packet filtering, forward packet through VS/DR, VS/TUN,
* or VS/NAT(change destination), so that filtering rules can be
@@ -1479,14 +1451,6 @@ static struct nf_hook_ops ip_vs_ops[] __
.hooknum = NF_INET_FORWARD,
.priority = 99,
},
- /* Before the netfilter connection tracking, exit from POST_ROUTING */
- {
- .hook = ip_vs_post_routing,
- .owner = THIS_MODULE,
- .pf = PF_INET6,
- .hooknum = NF_INET_POST_ROUTING,
- .priority = NF_IP6_PRI_NAT_SRC-1,
- },
#endif
};
Index: nf-next-2.6/net/netfilter/ipvs/ip_vs_xmit.c
===================================================================
--- nf-next-2.6.orig/net/netfilter/ipvs/ip_vs_xmit.c 2010-04-29 20:11:51.000000000 +0900
+++ nf-next-2.6/net/netfilter/ipvs/ip_vs_xmit.c 2010-04-29 20:11:59.000000000 +0900
@@ -27,6 +27,7 @@
#include <net/ip6_route.h>
#include <linux/icmpv6.h>
#include <linux/netfilter.h>
+#include <net/netfilter/nf_conntrack.h>
#include <linux/netfilter_ipv4.h>
#include <net/ip_vs.h>
@@ -347,6 +348,31 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb
}
#endif
+static void
+ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp)
+{
+ struct nf_conn *ct = (struct nf_conn *)skb->nfct;
+ struct nf_conntrack_tuple new_tuple;
+
+ if (ct == NULL || ct == &nf_conntrack_untracked ||
+ nf_ct_is_confirmed(ct))
+ return;
+
+ /*
+ * The connection is not yet in the hashtable, so we update it.
+ * CIP->VIP will remain the same, so leave the tuple in
+ * IP_CT_DIR_ORIGINAL untouched. When the reply comes back from the
+ * real-server we will see RIP->DIP.
+ */
+ new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+ new_tuple.src.u3 = cp->daddr;
+ /*
+ * This will also take care of UDP and other protocols.
+ */
+ new_tuple.src.u.tcp.port = cp->dport;
+ nf_conntrack_alter_reply(ct, &new_tuple);
+}
+
/*
* NAT transmitter (only for outside-to-inside nat forwarding)
* Not used for related ICMP
@@ -402,6 +428,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, stru
IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
+ ip_vs_update_conntrack(skb, cp);
+
/* FIXME: when application helper enlarges the packet and the length
is larger than the MTU of outgoing device, there will be still
MTU problem. */
@@ -478,6 +506,8 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, s
IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
+ ip_vs_update_conntrack(skb, cp);
+
/* FIXME: when application helper enlarges the packet and the length
is larger than the MTU of outgoing device, there will be still
MTU problem. */
^ permalink raw reply
* [patch v2.2 0/4] IPVS full NAT support + netfilter 'ipvs' match support
From: Simon Horman @ 2010-05-01 3:20 UTC (permalink / raw)
To: lvs-devel, netdev, linux-kernel, netfilter
Cc: Wensong Zhang, Julius Volz, Patrick McHardy, David S. Miller,
Hannes Eder
[re-reposting without bogus headers that vger dislikes]
This is a repost of a patch-series posted by Hannes Eder last Steptember.
This is v2 of the patch series and I don't see any outstanding objections to
it in the mailing list archives. I would like it considered for inclusion
in the nf-next-2.6 kernel tree and iptables.
The original cover-email from Hannes follows.
The diffstat output has been updated to reflect minor up-porting by me.
From: Hannes Eder <heder@google.com>
The following series implements full NAT support for IPVS. The
approach is via a minimal change to IPVS (make friends with
nf_conntrack) and adding a netfilter matcher, kernel- and user-space
part, i.e. xt_ipvs and libxt_ipvs.
Example usage:
% ipvsadm -A -t 192.168.100.30:80 -s rr
% ipvsadm -a -t 192.168.100.30:80 -r 192.168.10.20:80 -m
# ...
# Source NAT for VIP 192.168.100.30:80
% iptables -t nat -A POSTROUTING -m ipvs --vaddr 192.168.100.30/32 \
> --vport 80 -j SNAT --to-source 192.168.10.10
or SNAT-ing only a specific real server:
% iptables -t nat -A POSTROUTING --dst 192.168.11.20 \
> -m ipvs --vaddr 192.168.100.30/32 -j SNAT --to-source 192.168.10.10
First of all, thanks for all the feedback. This is the changelog for v2:
- Make ip_vs_ftp work again. Setup nf_conntrack expectations for
related data connections (based on Julian's patch see
http://www.ssi.bg/~ja/nfct/) and let nf_conntrack/nf_nat do the
packet mangling and the TCP sequence adjusting.
This change rises the question how to deal with ip_vs_sync? Does it
work together with conntrackd? Wild idea: what about getting rid of
ip_vs_sync and piggy packing all on nf_conntrack and use conntrackd?
Any comments on this?
- xt_ipvs: add new rule '--vportctl port' to match the VIP port of the
controlling connection, e.g. port 21 for FTP. Can be used to match
a related data connection for FTP:
# SNAT FTP control connection
% iptables -t nat -A POSTROUTING -m ipvs --vaddr 192.168.100.30/32 \
> --vport 21 -j SNAT --to-source 192.168.10.10
# SNAT FTP passive data connection
% iptables -t nat -A POSTROUTING -m ipvs --vaddr 192.168.100.30/32 \
> --vportctl 21 -j SNAT --to-source 192.168.10.10
- xt_ipvs: use 'par->family' instead of 'skb->protocol'
- xt_ipvs: add ipvs_mt_check and restrict to NFPROTO_IPV4 and NFPROTO_IPV6
- Call nf_conntrack_alter_reply(), so helper lookup is performed based
on the changed tuple.
Changes to the linux kernel (rebased to next-20090925):
Hannes Eder (3):
netfilter: xt_ipvs (netfilter matcher for IPVS)
IPVS: make friends with nf_conntrack
IPVS: make FTP work with full NAT support
include/linux/netfilter/xt_ipvs.h | 25 +++++
include/net/ip_vs.h | 2
net/netfilter/Kconfig | 9 ++
net/netfilter/Makefile | 1
net/netfilter/ipvs/Kconfig | 4 -
net/netfilter/ipvs/ip_vs_app.c | 43 ---------
net/netfilter/ipvs/ip_vs_core.c | 37 -------
net/netfilter/ipvs/ip_vs_ftp.c | 178 ++++++++++++++++++++++++++++++++---
net/netfilter/ipvs/ip_vs_proto.c | 1
net/netfilter/ipvs/ip_vs_xmit.c | 30 ++++++
net/netfilter/xt_ipvs.c | 187 +++++++++++++++++++++++++++++++++++++
11 files changed, 418 insertions(+), 99 deletions(-)
create mode 100644 include/linux/netfilter/xt_ipvs.h
create mode 100644 net/netfilter/xt_ipvs.c
Changes to iptables (relative to 1.4.5):
Hannes Eder (1):
libxt_ipvs: user-space lib for netfilter matcher xt_ipvs
configure.ac | 10 1
extensions/libxt_ipvs.c | 365 +++++++++++++++++++++++++++++++++++++
extensions/libxt_ipvs.man | 24 ++
include/linux/netfilter/xt_ipvs.h | 25 +++
4 files changed, 422 insertions(+), 2 deletions(-)
create mode 100644 extensions/libxt_ipvs.c
create mode 100644 extensions/libxt_ipvs.man
create mode 100644 include/linux/netfilter/xt_ipvs.h
^ permalink raw reply
* sctp pull request for net-next-2.6
From: Vlad Yasevich @ 2010-05-01 2:52 UTC (permalink / raw)
To: David Miller; +Cc: netdev
Hi David
The following changes since commit 83d7eb2979cd3390c375470225dd2d8f2009bc70:
Dan Carpenter (1):
ipv6: cleanup: remove unneeded null check
are available in the git repository at:
git://git.kernel.org/pub/scm/linux/kernel/git/vxy/lksctp-dev.git net-next
Dan Carpenter (1):
sctp: cleanup: remove duplicate assignment
Shan Wei (1):
sctp: use sctp_chunk_is_data macro to decide a chunk is data chunk
Vlad Yasevich (13):
sctp: Use correct address family in sctp_getsockopt_peer_addrs()
sctp: send SHUTDOWN-ACK chunk back to the source.
sctp: Do no select unconfirmed transports for retransmissions
sctp: Make sure we always return valid retransmit path
sctp: remove 'resent' bit from the chunk
sctp: Do not force T3 timer on fast retransmissions.
sctp: Save some room in the sctp_transport by using bitfields
sctp: update transport initializations
sctp: fast recovery algorithm is per association.
sctp: rwnd_press should be cumulative
sctp: correctly mark missing chunks in fast recovery
sctp: Optimize computation of highest new tsn in SACK.
sctp: Tag messages that can be Nagle delayed at creation.
Wei Yongjun (5):
sctp: assure at least one T3-rtx timer is running if a FORWARD TSN is sent
sctp: discard ABORT chunk with zero verification tag in COOKIE-WAIT state
sctp: missing set src and dest port while lookup output route
sctp: fix to retranmit at least one DATA chunk
sctp: implement sctp association probing module
include/net/sctp/sctp.h | 2 +-
include/net/sctp/sm.h | 2 +-
include/net/sctp/structs.h | 66 +++++++-------
net/sctp/Kconfig | 12 +++
net/sctp/Makefile | 3 +
net/sctp/associola.c | 13 ++--
net/sctp/chunk.c | 4 +-
net/sctp/endpointola.c | 2 -
net/sctp/output.c | 27 ++----
net/sctp/outqueue.c | 94 +++++++++-----------
net/sctp/probe.c | 213 ++++++++++++++++++++++++++++++++++++++++++++
net/sctp/protocol.c | 7 ++-
net/sctp/sm_make_chunk.c | 24 ++---
net/sctp/sm_sideeffect.c | 8 ++-
net/sctp/socket.c | 2 +-
net/sctp/transport.c | 61 ++++---------
16 files changed, 364 insertions(+), 176 deletions(-)
create mode 100644 net/sctp/probe.c
Please pull.
Thanks a lot
-vlad
^ permalink raw reply
* Re: [PATCH 1/1] net/usb: remove default in Kconfig for sierra_net driver
From: David Miller @ 2010-05-01 2:05 UTC (permalink / raw)
To: epasheva; +Cc: dbrownell, rfiler, netdev, linux-usb
In-Reply-To: <1272672300.7581.2.camel@Linuxdev4-laptop>
From: Elina Pasheva <epasheva@sierrawireless.com>
Date: Fri, 30 Apr 2010 17:05:00 -0700
> Subject: [PATCH 1/1] net/usb: remove default in Kconfig for sierra_net driver
> From: Elina Pasheva <epasheva@sierrawireless.com>
>
> The following patch removes the default from the Kconfig entry for sierra_net
> driver as recommended.
> All non-core drivers should default to "n".
> This patch has been checked against net-2.6 tree.
> Signed-off-by: Elina Pasheva <epasheva@sierrawireless.com>
> Signed-off-by: Rory Filer <rfiler@sierrawireless.com>
Applied, thanks.
^ permalink raw reply
* Re: question re: net-2.6 and net-next-2.6 trees re: patch submission
From: David Miller @ 2010-05-01 2:01 UTC (permalink / raw)
To: epasheva; +Cc: dbrownell, rfiler, netdev
In-Reply-To: <1272675194.21110.9.camel@Linuxdev3>
From: Elina Pasheva <epasheva@sierrawireless.com>
Date: Fri, 30 Apr 2010 17:53:14 -0700
> If I submit a new driver to net-2.6 tree (e.g. sierra_net driver that
> was applied to net-2.6 tree) where do I submit subsequent patches for
> that driver - net-2.6 tree or net-next-2.6 tree?
It depends upon the severity of the fix.
At this stage in the game on the most serious fixes are going
in, fixes for things that cause crashes and the like. However
since a new driver we might be a little bit more lenient since
changes to a new driver can harm less people.
^ permalink raw reply
* Re: [PATCH] [RFC] C/R: inet4 and inet6 unicast routes (v2)
From: Oren Laadan @ 2010-05-01 2:02 UTC (permalink / raw)
To: Dan Smith; +Cc: Daniel Lezcano, containers, Vlad Yasevich, David Miller, netdev
In-Reply-To: <87bpd0zl9l.fsf@caffeine.danplanet.com>
Dan Smith wrote:
> DL> Is it possible to enter the namespace and dump / restore the
> DL> routes with NETLINK_ROUTE from userspace ? Or is it something not
> DL> possible ?
>
> I'm sure it would be doable. However, checkpointing the routes that
> way would:
>
> (a) Be inconsistent with how we checkpoint all the other resources,
> including the other network resources we handle from the kernel
> with rtnl
> (b) Require merging of the data from the resources saved in userspace
> with those saved in kernelspace
See below suggestion for userspace.
> (c) Eliminate the ability for an application to easily checkpoint
> itself by making a single syscall
I can't think of a use-case of a networked application that takes
a checkpoint of itself (including live network).
Anyway, it's can still be useful to at least do the restore from
userspace (while checkpoint is done in kernel - like with pids).
We may reduce the complexity of restore (in kernel) greatly.
(BTW, instead of syscall one could have a library call that will
take care of the userspace "work").
> (d) Require this same sort of jumping back and forth between
> namespaces by the userspace task doing the checkpoint/restart
>
I wonder: if we could relatively simply recreate the network ns,
the interfaces in them, and then restore the routing information
all from userspace before calling sys_restart, it may be useful
in simplifying the kernel code, and allowing more flexibility for
userspace alterations.
I definitely should have asked the question much earlier when you
started the work on restoring network ns and interfaces ... (oh,
I reckon it's better late than never).
Just tossing out the idea, see what kind of thoughts it evokes.
Most likely I'll get a "that won't work because ...", but I'm
hoping for a "hmm.. maybe.. let me see.." :)
Oren.
^ permalink raw reply
* Re: [PATCH] [RFC] C/R: inet4 and inet6 unicast routes (v2)
From: Oren Laadan @ 2010-05-01 1:42 UTC (permalink / raw)
To: Daniel Lezcano
Cc: containers-qjLDD68F18O7TbgM5vRIOg, Vlad Yasevich, Dan Smith,
David Miller, netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <4BDB3F07.2030900-GANU6spQydw@public.gmane.org>
Daniel Lezcano wrote:
> Dan Smith wrote:
>> This patch adds support for checkpointing and restoring route information.
>> It keeps enough information to restore basic routes at the level of detail
>> of /proc/net/route. It uses RTNETLINK to extract the information during
>> checkpoint and also to insert it back during restore. This gives us a
>> nice layer of isolation between us and the various "fib" implementations.
>>
>> Changes in v2:
>>
>> This version of the patch actually moves the current task into the
>> desired network namespace temporarily, for the purposes of examining and
>> restoring the route information. This is a instead of creating a cross-
>> namespace socket to do the job, as was done in v1.
>>
>> This is just an RFC to see if this is an acceptable method. For a final
>> version, adding a helper to nsproxy.c would allow us to create a new
>> nsproxy with the desired netns instead of creating one with
>> copy_namespaces() just to kill it off and use the target one.
>>
>> I still think the previous method is cleaner, but this way may violate
>> fewer namespace boundaries (I'm still undecided :)
>>
>> Signed-off-by: Dan Smith <danms-r/Jw6+rmf7HQT0dZR+AlfA@public.gmane.org>
>> Cc: David Miller <davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
>> Cc: Vlad Yasevich <vladislav.yasevich-VXdhtT5mjnY@public.gmane.org>
>> Cc: jamal <hadi-fAAogVwAN2Kw5LPnMra/2Q@public.gmane.org>
>> ---
> Hi Dan,
>
> Eric did a patchset (as Jamal mentioned it) where you can have a process
> to enter a specific namespace from userspace.
>
> http://git.kernel.org/?p=linux/kernel/git/ebiederm/linux-2.6.33-nsfd-v5.git;a=commit;h=9c2f86a44d9ca93e78fd8e81a4e2a8c2a4cdb054
>
> Is it possible to enter the namespace and dump / restore the routes with
> NETLINK_ROUTE from userspace ? Or is it something not possible ?
>
I also think that restoring routes from userspace, if feasible,
will be advantageous.
Besides, that will simplify cases in which userspace would like to
restore something different (in terms of routes) than what was
saved in the checkpoint.
So the question is, what would it take ?
Oren.
^ permalink raw reply
* [RFC PATCH] sctp: Fix a race between ICMP protocol unreachable and connect()
From: Vlad Yasevich @ 2010-05-01 1:22 UTC (permalink / raw)
To: Linux SCTP Dev Mailing list, netdev
ICMP protocol unreachable handling completely disregarded
the fact that the user may have locked the socket. It proceeded
to destroy the association, even though the user may have
held the lock and had a ref on the association. This resulted
in the following bug:
Attempt to release alive inet socket f6afcc00
=========================
[ BUG: held lock freed! ]
-------------------------
somenu/2672 is freeing memory f6afcc00-f6afcfff, with a lock still held
there!
(sk_lock-AF_INET){+.+.+.}, at: [<c122098a>] sctp_connect+0x13/0x4c
1 lock held by somenu/2672:
#0: (sk_lock-AF_INET){+.+.+.}, at: [<c122098a>] sctp_connect+0x13/0x4c
stack backtrace:
Pid: 2672, comm: somenu Not tainted 2.6.32-telco #55
Call Trace:
[<c1232266>] ? printk+0xf/0x11
[<c1038553>] debug_check_no_locks_freed+0xce/0xff
[<c10620b4>] kmem_cache_free+0x21/0x66
[<c1185f25>] __sk_free+0x9d/0xab
[<c1185f9c>] sk_free+0x1c/0x1e
[<c1216e38>] sctp_association_put+0x32/0x89
[<c1220865>] __sctp_connect+0x36d/0x3f4
[<c122098a>] ? sctp_connect+0x13/0x4c
[<c102d073>] ? autoremove_wake_function+0x0/0x33
[<c12209a8>] sctp_connect+0x31/0x4c
[<c11d1e80>] inet_dgram_connect+0x4b/0x55
[<c11834fa>] sys_connect+0x54/0x71
[<c103a3a2>] ? lock_release_non_nested+0x88/0x239
[<c1054026>] ? might_fault+0x42/0x7c
[<c1054026>] ? might_fault+0x42/0x7c
[<c11847ab>] sys_socketcall+0x6d/0x178
[<c10da994>] ? trace_hardirqs_on_thunk+0xc/0x10
[<c1002959>] syscall_call+0x7/0xb
This was because the sctp_wait_for_connect() would acquire the socket
lock and then proceed to release the last reference count on the
association, thus causing the fully destruction path to finish freeing
the socket.
The simplest solution is to start a very short timer in case the socket
is owned by user. When the timer expires, we can do some verification
and be able to do the release properly.
Signed-off-by: Vlad Yasevich <vladislav.yasevich@hp.com>
---
include/net/sctp/sm.h | 1 +
include/net/sctp/structs.h | 3 +++
net/sctp/input.c | 23 +++++++++++++++++++----
net/sctp/sm_sideeffect.c | 35 +++++++++++++++++++++++++++++++++++
net/sctp/transport.c | 2 ++
5 files changed, 60 insertions(+), 4 deletions(-)
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index 851c813..61d73e3 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -279,6 +279,7 @@ int sctp_do_sm(sctp_event_t event_type, sctp_subtype_t subtype,
/* 2nd level prototypes */
void sctp_generate_t3_rtx_event(unsigned long peer);
void sctp_generate_heartbeat_event(unsigned long peer);
+void sctp_generate_proto_unreach_event(unsigned long peer);
void sctp_ootb_pkt_free(struct sctp_packet *);
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 597f8e2..219043a 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1010,6 +1010,9 @@ struct sctp_transport {
/* Heartbeat timer is per destination. */
struct timer_list hb_timer;
+ /* Timer to handle ICMP proto unreachable envets */
+ struct timer_list proto_unreach_timer;
+
/* Since we're using per-destination retransmission timers
* (see above), we're also using per-destination "transmitted"
* queues. This probably ought to be a private struct
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 2a57018..94b2eb2 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -440,11 +440,25 @@ void sctp_icmp_proto_unreachable(struct sock *sk,
{
SCTP_DEBUG_PRINTK("%s\n", __func__);
- sctp_do_sm(SCTP_EVENT_T_OTHER,
- SCTP_ST_OTHER(SCTP_EVENT_ICMP_PROTO_UNREACH),
- asoc->state, asoc->ep, asoc, t,
- GFP_ATOMIC);
+ if (sock_owned_by_user(sk)) {
+ if (timer_pending(&t->proto_unreach_timer))
+ return;
+ else {
+ if (!mod_timer(&t->proto_unreach_timer,
+ jiffies + (HZ/20)))
+ sctp_association_hold(asoc);
+ }
+
+ } else {
+ if (timer_pending(&t->proto_unreach_timer) &&
+ del_timer(&t->proto_unreach_timer))
+ sctp_association_put(asoc);
+ sctp_do_sm(SCTP_EVENT_T_OTHER,
+ SCTP_ST_OTHER(SCTP_EVENT_ICMP_PROTO_UNREACH),
+ asoc->state, asoc->ep, asoc, t,
+ GFP_ATOMIC);
+ }
}
/* Common lookup code for icmp/icmpv6 error handler. */
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index d5ae450..eb1f42f 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -397,6 +397,41 @@ out_unlock:
sctp_transport_put(transport);
}
+/* Handle the timeout of the ICMP protocol unreachable timer. Trigger
+ * the correct state machine transition that will close the association.
+ */
+void sctp_generate_proto_unreach_event(unsigned long data)
+{
+ struct sctp_transport *transport = (struct sctp_transport *) data;
+ struct sctp_association *asoc = transport->asoc;
+
+ sctp_bh_lock_sock(asoc->base.sk);
+ if (sock_owned_by_user(asoc->base.sk)) {
+ SCTP_DEBUG_PRINTK("%s:Sock is busy.\n", __func__);
+
+ /* Try again later. */
+ if (!mod_timer(&transport->proto_unreach_timer,
+ jiffies + (HZ/20)))
+ sctp_association_hold(asoc);
+ goto out_unlock;
+ }
+
+ /* Is this structure just waiting around for us to actually
+ * get destroyed?
+ */
+ if (asoc->base.dead)
+ goto out_unlock;
+
+ sctp_do_sm(SCTP_EVENT_T_OTHER,
+ SCTP_ST_OTHER(SCTP_EVENT_ICMP_PROTO_UNREACH),
+ asoc->state, asoc->ep, asoc, transport, GFP_ATOMIC);
+
+out_unlock:
+ sctp_bh_unlock_sock(asoc->base.sk);
+ sctp_association_put(asoc);
+}
+
+
/* Inject a SACK Timeout event into the state machine. */
static void sctp_generate_sack_event(unsigned long data)
{
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index be4d63d..4a36803 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -108,6 +108,8 @@ static struct sctp_transport *sctp_transport_init(struct
sctp_transport *peer,
(unsigned long)peer);
setup_timer(&peer->hb_timer, sctp_generate_heartbeat_event,
(unsigned long)peer);
+ setup_timer(&peer->proto_unreach_timer,
+ sctp_generate_proto_unreach_event, (unsigned long)peer);
/* Initialize the 64-bit random nonce sent with heartbeat. */
get_random_bytes(&peer->hb_nonce, sizeof(peer->hb_nonce));
--
1.6.0.4
^ permalink raw reply related
* question re: net-2.6 and net-next-2.6 trees re: patch submission
From: Elina Pasheva @ 2010-05-01 0:53 UTC (permalink / raw)
To: davem, David Brownell; +Cc: Rory Filer, epasheva, netdev
Hi,
If I submit a new driver to net-2.6 tree (e.g. sierra_net driver that
was applied to net-2.6 tree) where do I submit subsequent patches for
that driver - net-2.6 tree or net-next-2.6 tree?
Thanks,
Elina
^ permalink raw reply
* Re: [PATCH] [RFC] C/R: inet4 and inet6 unicast routes (v2)
From: jamal @ 2010-05-01 0:26 UTC (permalink / raw)
To: Dan Smith
Cc: containers-qjLDD68F18O7TbgM5vRIOg, Vlad Yasevich, David Miller,
netdev-u79uwXL29TY76Z2rM5mHXA
In-Reply-To: <87bpd0zl9l.fsf-FLMGYpZoEPULwtHQx/6qkW3U47Q5hpJU@public.gmane.org>
On Fri, 2010-04-30 at 14:24 -0700, Dan Smith wrote:
>
> I'm sure it would be doable. However, checkpointing the routes that
> way would:
>
> (a) Be inconsistent with how we checkpoint all the other resources,
> including the other network resources we handle from the kernel
> with rtnl
My 2c:
The problem as i see it (with all net structures not just routes - i was
equally pessimistic when i saw those other net structure
checkpoint/restore changes) is you are faced with a herculean
high-maintainance effort...
You have a separate piece of code which populates structures that _you_
maintain for attributes that are defined elsewhere by other people.
Nobody adding a new attribute that is very important to route
restoration for example is likely to change your code. Unless you tie
the two together (so changing one forces the coder to change the other).
And once people deploy kernels it is hard to change. Historically (for
pragmatic reasons) such rich interfaces sit in user space - much easier
to update user space.
cheers,
jamal
^ permalink raw reply
* Re: [PATCH net-next-2.6] net: speedup udp receive path
From: jamal @ 2010-05-01 0:06 UTC (permalink / raw)
To: Eric Dumazet
Cc: Changli Gao, David Miller, therbert, shemminger, netdev,
Eilon Greenstein, Brian Bloniarz
In-Reply-To: <1272660000.2230.4.camel@edumazet-laptop>
On Fri, 2010-04-30 at 22:40 +0200, Eric Dumazet wrote:
>
> I used your program, and with RPS off, I can get at most 220.000 pps
> with my "old" hardware. I dont understand how you can reach 700.000 pps
> with RPS off. Or is it with your Nehalem ?
Yes, Nehalem.
RPS off is better (~700Kpp) than RPS on(~650kpps). Are you seeing the
same trend on the old hardware?
cheers,
jamal
^ permalink raw reply
* [PATCH 1/1] net/usb: remove default in Kconfig for sierra_net driver
From: Elina Pasheva @ 2010-05-01 0:05 UTC (permalink / raw)
To: dbrownell-Rn4VEauK+AKRv+LV9MX5uipxlwaOVQ5f,
davem-fT/PcQaiUtIeIZ0/mPfg9Q
Cc: epasheva-ywE8TTl5eJHWpu6QEFMNjNBPR1lH4CV8,
rfiler-ywE8TTl5eJHWpu6QEFMNjNBPR1lH4CV8,
netdev-u79uwXL29TY76Z2rM5mHXA, linux-usb-u79uwXL29TY76Z2rM5mHXA
Subject: [PATCH 1/1] net/usb: remove default in Kconfig for sierra_net driver
From: Elina Pasheva <epasheva-ywE8TTl5eJHWpu6QEFMNjNBPR1lH4CV8@public.gmane.org>
The following patch removes the default from the Kconfig entry for sierra_net
driver as recommended.
All non-core drivers should default to "n".
This patch has been checked against net-2.6 tree.
Signed-off-by: Elina Pasheva <epasheva-ywE8TTl5eJHWpu6QEFMNjNBPR1lH4CV8@public.gmane.org>
Signed-off-by: Rory Filer <rfiler-ywE8TTl5eJHWpu6QEFMNjNBPR1lH4CV8@public.gmane.org>
---
drivers/net/usb/Kconfig | 1 -
1 files changed, 0 insertions(+), 1 deletions(-)
diff --git a/drivers/net/usb/Kconfig b/drivers/net/usb/Kconfig
index 5d58abc..d7b7018 100644
--- a/drivers/net/usb/Kconfig
+++ b/drivers/net/usb/Kconfig
@@ -400,7 +400,6 @@ config USB_IPHETH
config USB_SIERRA_NET
tristate "USB-to-WWAN Driver for Sierra Wireless modems"
depends on USB_USBNET
- default y
help
Choose this option if you have a Sierra Wireless USB-to-WWAN device.
--
1.5.4.3
--
To unsubscribe from this list: send the line "unsubscribe linux-usb" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply related
* Re: [patch] ipv6: cleanup: remove unneeded null check
From: David Miller @ 2010-04-30 23:42 UTC (permalink / raw)
To: error27
Cc: netdev, kuznet, pekkas, jmorris, yoshfuji, kaber, eric.dumazet,
sri, herbert, emils.tantilov, kernel-janitors
In-Reply-To: <20100429143034.GI29093@bicker>
From: Dan Carpenter <error27@gmail.com>
Date: Thu, 29 Apr 2010 16:30:35 +0200
> We dereference "sk" unconditionally elsewhere in the function.
>
> This was left over from: b30bd282 "ip6_xmit: remove unnecessary NULL
> ptr check". According to that commit message, "the sk argument to
> ip6_xmit is never NULL nowadays since the skb->priority assigment
> expects a valid socket."
>
> Signed-off-by: Dan Carpenter <error27@gmail.com>
Applied, thanks Dan.
^ permalink raw reply
* Re: [PATCH] tcp: SO_TIMESTAMP implementation for TCP
From: David Miller @ 2010-04-30 23:41 UTC (permalink / raw)
To: therbert; +Cc: netdev
In-Reply-To: <i2i65634d661004300058s5ee1b177oee249d3d22baad62@mail.gmail.com>
From: Tom Herbert <therbert@google.com>
Date: Fri, 30 Apr 2010 00:58:32 -0700
>> All these new checks and branches for a feature of questionable value.
>
>> If you can modify you apps to grab this information you can also probe
>> for the information using external probing tools.
>>
> I don't see an nice way to do that, we're profiling a significant
> percentage of millions of connections over thousands of paths as part
> of standard operations while incurring negligible overhead. The app
> can can easily timestamp its operations, but without some mechanism
> for getting timestamps out of a TCP connection, the networking portion
> of servicing requests is pretty much a black box in that.
If other people have an opinion about this, now would be the time
to speak up. :-)
^ permalink raw reply
* Re: [PATCH] xfrm: potential uninitialized variable num_xfrms
From: David Miller @ 2010-04-30 23:40 UTC (permalink / raw)
To: xiaosuo; +Cc: hadi, timo.teras, herbert, adobriyan, netdev
In-Reply-To: <1272439222-2935-1-git-send-email-xiaosuo@gmail.com>
From: Changli Gao <xiaosuo@gmail.com>
Date: Wed, 28 Apr 2010 15:20:22 +0800
> potential uninitialized variable num_xfrms
>
> fix compiler warning: 'num_xfrms' may be used uninitialized in this function.
>
> Signed-off-by: Changli Gao <xiaosuo@gmail.com>
I decided to apply this after all, thanks!
^ permalink raw reply
* Re: [PATCH v6] net: batch skb dequeueing from softnet input_pkt_queue
From: David Miller @ 2010-04-30 23:38 UTC (permalink / raw)
To: ak
Cc: eric.dumazet, andi, hadi, xiaosuo, therbert, shemminger, netdev,
lenb, arjan
In-Reply-To: <20100429214144.GA10663@gargoyle.fritz.box>
From: Andi Kleen <ak@gargoyle.fritz.box>
Date: Thu, 29 Apr 2010 23:41:44 +0200
> Use io_schedule() in network stack to tell cpuidle governour to guarantee lower latencies
>
> XXX: probably too aggressive, some of these sleeps are not under high load.
>
> Based on a bug report from Eric Dumazet.
>
> Signed-off-by: Andi Kleen <ak@linux.intel.com>
I like this, except that we probably don't want the delayacct_blkio_*() calls
these things do.
Probably the rest of what these things do should remain in the io_schedule*()
functions and the block layer can call it's own versions which add in the
delayacct_blkio_*() bits.
Or, if the delacct stuff is useful for socket I/O too, then it's interfaces
names should have the "blk" stripped from them :-)
^ permalink raw reply
* Re: [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion
From: David Miller @ 2010-04-30 23:35 UTC (permalink / raw)
To: eric.dumazet; +Cc: hadi, xiaosuo, therbert, shemminger, netdev, eilong, bmb
In-Reply-To: <1272574909.2209.150.camel@edumazet-laptop>
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 29 Apr 2010 23:01:49 +0200
> [PATCH net-next-2.6] net: sock_def_readable() and friends RCU conversion
So what's the difference between call_rcu() freeing this little waitqueue
struct and doing it for the entire socket?
We'll still be doing an RCU call every socket destroy, and now we also have
a new memory allocation/free per connection.
This has to show up in things like 'lat_connect' and friends, does it not?
^ permalink raw reply
* Re: [PATCH net-next-2.6] net: speedup sock_recv_ts_and_drops()
From: David Miller @ 2010-04-30 23:30 UTC (permalink / raw)
To: eric.dumazet; +Cc: netdev
In-Reply-To: <1272518083.2201.119.camel@edumazet-laptop>
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 29 Apr 2010 07:14:43 +0200
> sock_recv_ts_and_drops() is fat and slow (~ 4% of cpu time on some
> profiles)
>
> We can test all socket flags at once to make fast path fast again.
>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Nice, applied, thanks Eric.
I have a sneaking suspicion that there may be a few other places
like this. :-)
^ permalink raw reply
* Re: [PATCH/RFC Resubmission] cdc_ether: Identify MBM devices by GUID in MDLM descriptor
From: David Miller @ 2010-04-30 23:28 UTC (permalink / raw)
To: oneukum; +Cc: jonas.sjoquist, netdev
In-Reply-To: <201004281051.44212.oneukum@suse.de>
From: Oliver Neukum <oneukum@suse.de>
Date: Wed, 28 Apr 2010 10:51:44 +0200
> Am Mittwoch, 28. April 2010 02:09:59 schrieb David Miller:
>> From: Jonas Sjoquist <jonas.sjoquist@ericsson.com>
>> Date: Fri, 23 Apr 2010 13:07:45 +0200
>>
>> > From: Jonas Sjöquist <jonas.sjoquist@ericsson.com>
>> >
>> > This patch removes vid/pid for Ericsson MBM devices from the whitelist set of
>> > devices. The MBM devices are instead identified by GUID.
>> >
>> > In order for cdc_ether to handle these devices the GUID in the MDLM descriptor
>> > is tested. All MBM devices currently handled by cdc_ether as well as future
>> > CDC Ethernet MBM devices can be identified by the GUID.
>> >
>> > This is the same solution used in Carl Nordbeck's mbm driver,
>> > http://kerneltrap.org/mailarchive/linux-usb/2008/11/17/4141384/thread
>> >
>> > I post this as RFC to get feedback on however cdc_ether is the correct place to
>> > do the binding, or if it should be done in a separate driver, e.g. zaurus.
>> >
>> > Signed-off-by: Jonas Sjöquist <jonas.sjoquist@ericsson.com>
>>
>> Can someone knowledgable with the cdc_ether driver review this change?
>
> The patch looks good.
Applied to net-next-2.6, thanks everyone.
^ permalink raw reply
* Re: r8169 INFO: inconsistent lock state
From: David Miller @ 2010-04-30 23:20 UTC (permalink / raw)
To: romieu
Cc: eric.dumazet, sergey.senozhatsky, oleg, mingo, a.p.zijlstra,
netdev, linux-kernel
In-Reply-To: <20100430211556.GA4903@electric-eye.fr.zoreil.com>
From: Francois Romieu <romieu@fr.zoreil.com>
Date: Fri, 30 Apr 2010 23:15:56 +0200
> Eric Dumazet <eric.dumazet@gmail.com> :
> [...]
>> So we have following illegal chain (process context, not softirq)
>>
>> rtl8169_reset_task() -> tl8169_rx_interrupt() -> netif_receive_skb()
>>
>> And normally, commit 630b943c tried to change this chain to :
>>
>> rtl8169_reset_task() -> tl8169_rx_interrupt() -> netif_rx()
>>
>> I have no idea why it doesnt work.
>
> 630b943c appears to be in net-next.
>
> Oops ?
I just tossed this into net-2.6, thanks for noticing.
^ permalink raw reply
* Re: [PATCH] forcedeth: Stay in NAPI as long as there's work
From: David Miller @ 2010-04-30 23:17 UTC (permalink / raw)
To: therbert; +Cc: shemminger, joe, netdev, aabdulla
In-Reply-To: <h2m65634d661004281656j9f29fc2btdec0d6bcbee20f35@mail.gmail.com>
From: Tom Herbert <therbert@google.com>
Date: Wed, 28 Apr 2010 16:56:30 -0700
> On Wed, Apr 28, 2010 at 2:25 PM, David Miller <davem@davemloft.net> wrote:
>> From: Stephen Hemminger <shemminger@vyatta.com>
>> Date: Wed, 28 Apr 2010 11:25:28 -0700
>>
>>> The following does the same thing without the extra overhead
>>> of testing all the registers. It also handles the out of memory
>>> case.
>>>
>>> Compile tested only...
>>>
>>> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>>
>> Tom can you test this version?
>>
>
> Looks good. 406038 tps in my quick test which still is showing the
> benefits. Thanks for cleaning this up Stephen!
Thanks for testing, applied, thanks everyone.
^ permalink raw reply
* Re: [PATCH]PM QOS refresh against next-20100430
From: Rafael J. Wysocki @ 2010-04-30 23:08 UTC (permalink / raw)
To: mgross
Cc: Kevin Hilman, aili, dwalker, tiwai, bruce.w.allan, davidb, mcgrof,
pavel, linux-pm, lkml, NetDev, Johannes Berg,
ACPI Devel Maling List, Len Brown, John W. Linville
In-Reply-To: <20100430230529.GA31391@linux.intel.com>
On Saturday 01 May 2010, mark gross wrote:
> On Sat, May 01, 2010 at 12:13:16AM +0200, Rafael J. Wysocki wrote:
> > On Friday 30 April 2010, mark gross wrote:
> > > The following is a refresh of the PM_QOS implementation, this patch
> > > updates some documentation input I got from Randy.
> > >
> > > This patch changes the string based list management to a handle base
> > > implementation to help with the hot path use of pm-qos, it also renames
> > > much of the API to use "request" as opposed to "requirement" that was
> > > used in the initial implementation. I did this because request more
> > > accurately represents what it actually does.
> > >
> > > Also, I added a string based ABI for users wanting to use a string
> > > interface. So if the user writes 0xDDDDDDDD formatted hex it will be
> > > accepted by the interface. (someone asked me for it and I don't think
> > > it hurts anything.)
> > >
> > > I really would like to get this refresh taken care of. Its been taking
> > > me too long to close this. please review or include it in next.
> > >
> > > Thanks!
> >
> > Well, I'd take it to suspend-2.6/linux-next, but first, it touches
> > subsystems whose maintainers were not in the Cc list, like the network
> > drivers, wireless and ACPI. The changes are trivial, so I hope they don't
> > mind.
> >
> > Second, my tree is based on the Linus' tree rather than linux-next and
> > the change in net/mac80211/scan.c doesn't seem to match that. Please tell me
> > what I'm supposed to do about that.
>
> You can waite for monday and I'll send a rebased version to linus' tree.
>
> I thought linux-next was where folks wanted me to put it.
>
> I'll email out a new one monday.
Great, thanks!
Rafael
^ permalink raw reply
* Re: [PATCH]PM QOS refresh against next-20100430
From: mark gross @ 2010-04-30 23:05 UTC (permalink / raw)
To: Rafael J. Wysocki
Cc: Kevin Hilman, aili, dwalker, tiwai, bruce.w.allan, davidb, mcgrof,
pavel, linux-pm, lkml, NetDev, Johannes Berg,
ACPI Devel Maling List, Len Brown, John W. Linville
In-Reply-To: <201005010013.16262.rjw@sisk.pl>
On Sat, May 01, 2010 at 12:13:16AM +0200, Rafael J. Wysocki wrote:
> On Friday 30 April 2010, mark gross wrote:
> > The following is a refresh of the PM_QOS implementation, this patch
> > updates some documentation input I got from Randy.
> >
> > This patch changes the string based list management to a handle base
> > implementation to help with the hot path use of pm-qos, it also renames
> > much of the API to use "request" as opposed to "requirement" that was
> > used in the initial implementation. I did this because request more
> > accurately represents what it actually does.
> >
> > Also, I added a string based ABI for users wanting to use a string
> > interface. So if the user writes 0xDDDDDDDD formatted hex it will be
> > accepted by the interface. (someone asked me for it and I don't think
> > it hurts anything.)
> >
> > I really would like to get this refresh taken care of. Its been taking
> > me too long to close this. please review or include it in next.
> >
> > Thanks!
>
> Well, I'd take it to suspend-2.6/linux-next, but first, it touches
> subsystems whose maintainers were not in the Cc list, like the network
> drivers, wireless and ACPI. The changes are trivial, so I hope they don't
> mind.
>
> Second, my tree is based on the Linus' tree rather than linux-next and
> the change in net/mac80211/scan.c doesn't seem to match that. Please tell me
> what I'm supposed to do about that.
You can waite for monday and I'll send a rebased version to linus' tree.
I thought linux-next was where folks wanted me to put it.
I'll email out a new one monday.
Thanks,
--mgross
> Thanks,
> Rafael
>
>
> > Ooops! forgot the signed off by line!
> >
> > Signed-off-by: mark gross <mgross@linux.intel.com>
> >
> > From c45d8d86f89ac55fbb9a499fbc754e35258bf818 Mon Sep 17 00:00:00 2001
> > From: mgross <mark.gross@gmail.com>
> > Date: Sat, 13 Mar 2010 08:18:36 -0800
> > Subject: [PATCH 1/2] PM_QOS to use handle based list implementation and exported function name changes to be more descriptive of what is actually happening.
> >
> > ---
> > Documentation/power/pm_qos_interface.txt | 48 ++++---
> > drivers/acpi/processor_idle.c | 2 +-
> > drivers/cpuidle/governors/ladder.c | 2 +-
> > drivers/cpuidle/governors/menu.c | 2 +-
> > drivers/net/e1000e/netdev.c | 22 ++--
> > drivers/net/igbvf/netdev.c | 6 +-
> > drivers/net/wireless/ipw2x00/ipw2100.c | 11 +-
> > include/linux/netdevice.h | 4 +
> > include/linux/pm_qos_params.h | 14 +-
> > include/sound/pcm.h | 3 +-
> > kernel/pm_qos_params.c | 214 ++++++++++++++---------------
> > net/mac80211/mlme.c | 2 +-
> > net/mac80211/scan.c | 2 +-
> > sound/core/pcm.c | 3 -
> > sound/core/pcm_native.c | 14 +-
> > 15 files changed, 177 insertions(+), 172 deletions(-)
> >
> > diff --git a/Documentation/power/pm_qos_interface.txt b/Documentation/power/pm_qos_interface.txt
> > index c40866e..bfed898 100644
> > --- a/Documentation/power/pm_qos_interface.txt
> > +++ b/Documentation/power/pm_qos_interface.txt
> > @@ -18,44 +18,46 @@ and pm_qos_params.h. This is done because having the available parameters
> > being runtime configurable or changeable from a driver was seen as too easy to
> > abuse.
> >
> > -For each parameter a list of performance requirements is maintained along with
> > +For each parameter a list of performance requests is maintained along with
> > an aggregated target value. The aggregated target value is updated with
> > -changes to the requirement list or elements of the list. Typically the
> > -aggregated target value is simply the max or min of the requirement values held
> > +changes to the request list or elements of the list. Typically the
> > +aggregated target value is simply the max or min of the request values held
> > in the parameter list elements.
> >
> > From kernel mode the use of this interface is simple:
> > -pm_qos_add_requirement(param_id, name, target_value):
> > -Will insert a named element in the list for that identified PM_QOS parameter
> > -with the target value. Upon change to this list the new target is recomputed
> > -and any registered notifiers are called only if the target value is now
> > -different.
> >
> > -pm_qos_update_requirement(param_id, name, new_target_value):
> > -Will search the list identified by the param_id for the named list element and
> > -then update its target value, calling the notification tree if the aggregated
> > -target is changed. with that name is already registered.
> > +handle = pm_qos_add_request(param_class, target_value):
> > +Will insert an element into the list for that identified PM_QOS class with the
> > +target value. Upon change to this list the new target is recomputed and any
> > +registered notifiers are called only if the target value is now different.
> > +Clients of pm_qos need to save the returned handle.
> >
> > -pm_qos_remove_requirement(param_id, name):
> > -Will search the identified list for the named element and remove it, after
> > -removal it will update the aggregate target and call the notification tree if
> > -the target was changed as a result of removing the named requirement.
> > +void pm_qos_update_request(handle, new_target_value):
> > +Will update the list element pointed to by the handle with the new target value
> > +and recompute the new aggregated target, calling the notification tree if the
> > +target is changed.
> > +
> > +void pm_qos_remove_request(handle):
> > +Will remove the element. After removal it will update the aggregate target and
> > +call the notification tree if the target was changed as a result of removing
> > +the request.
> >
> >
> > From user mode:
> > -Only processes can register a pm_qos requirement. To provide for automatic
> > -cleanup for process the interface requires the process to register its
> > -parameter requirements in the following way:
> > +Only processes can register a pm_qos request. To provide for automatic
> > +cleanup of a process, the interface requires the process to register its
> > +parameter requests in the following way:
> >
> > To register the default pm_qos target for the specific parameter, the process
> > must open one of /dev/[cpu_dma_latency, network_latency, network_throughput]
> >
> > As long as the device node is held open that process has a registered
> > -requirement on the parameter. The name of the requirement is "process_<PID>"
> > -derived from the current->pid from within the open system call.
> > +request on the parameter.
> >
> > -To change the requested target value the process needs to write a s32 value to
> > -the open device node. This translates to a pm_qos_update_requirement call.
> > +To change the requested target value the process needs to write an s32 value to
> > +the open device node. Alternatively the user mode program could write a hex
> > +string for the value using 10 char long format e.g. "0x12345678". This
> > +translates to a pm_qos_update_request call.
> >
> > To remove the user mode request for a target value simply close the device
> > node.
> > diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
> > index 5939e7f..c3817e1 100644
> > --- a/drivers/acpi/processor_idle.c
> > +++ b/drivers/acpi/processor_idle.c
> > @@ -698,7 +698,7 @@ static int acpi_processor_power_seq_show(struct seq_file *seq, void *offset)
> > "max_cstate: C%d\n"
> > "maximum allowed latency: %d usec\n",
> > pr->power.state ? pr->power.state - pr->power.states : 0,
> > - max_cstate, pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY));
> > + max_cstate, pm_qos_request(PM_QOS_CPU_DMA_LATENCY));
> >
> > seq_puts(seq, "states:\n");
> >
> > diff --git a/drivers/cpuidle/governors/ladder.c b/drivers/cpuidle/governors/ladder.c
> > index 1c1ceb4..12c9890 100644
> > --- a/drivers/cpuidle/governors/ladder.c
> > +++ b/drivers/cpuidle/governors/ladder.c
> > @@ -67,7 +67,7 @@ static int ladder_select_state(struct cpuidle_device *dev)
> > struct ladder_device *ldev = &__get_cpu_var(ladder_devices);
> > struct ladder_device_state *last_state;
> > int last_residency, last_idx = ldev->last_state_idx;
> > - int latency_req = pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY);
> > + int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
> >
> > /* Special case when user has set very strict latency requirement */
> > if (unlikely(latency_req == 0)) {
> > diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
> > index 1aea715..61ca939 100644
> > --- a/drivers/cpuidle/governors/menu.c
> > +++ b/drivers/cpuidle/governors/menu.c
> > @@ -183,7 +183,7 @@ static u64 div_round64(u64 dividend, u32 divisor)
> > static int menu_select(struct cpuidle_device *dev)
> > {
> > struct menu_device *data = &__get_cpu_var(menu_devices);
> > - int latency_req = pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY);
> > + int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
> > int i;
> > int multiplier;
> >
> > diff --git a/drivers/net/e1000e/netdev.c b/drivers/net/e1000e/netdev.c
> > index 904bd6b..7550879 100644
> > --- a/drivers/net/e1000e/netdev.c
> > +++ b/drivers/net/e1000e/netdev.c
> > @@ -2882,12 +2882,12 @@ static void e1000_configure_rx(struct e1000_adapter *adapter)
> > * excessive C-state transition latencies result in
> > * dropped transactions.
> > */
> > - pm_qos_update_requirement(PM_QOS_CPU_DMA_LATENCY,
> > - adapter->netdev->name, 55);
> > + pm_qos_update_request(
> > + adapter->netdev->pm_qos_req, 55);
> > } else {
> > - pm_qos_update_requirement(PM_QOS_CPU_DMA_LATENCY,
> > - adapter->netdev->name,
> > - PM_QOS_DEFAULT_VALUE);
> > + pm_qos_update_request(
> > + adapter->netdev->pm_qos_req,
> > + PM_QOS_DEFAULT_VALUE);
> > }
> > }
> >
> > @@ -3181,8 +3181,8 @@ int e1000e_up(struct e1000_adapter *adapter)
> >
> > /* DMA latency requirement to workaround early-receive/jumbo issue */
> > if (adapter->flags & FLAG_HAS_ERT)
> > - pm_qos_add_requirement(PM_QOS_CPU_DMA_LATENCY,
> > - adapter->netdev->name,
> > + adapter->netdev->pm_qos_req =
> > + pm_qos_add_request(PM_QOS_CPU_DMA_LATENCY,
> > PM_QOS_DEFAULT_VALUE);
> >
> > /* hardware has been reset, we need to reload some things */
> > @@ -3244,9 +3244,11 @@ void e1000e_down(struct e1000_adapter *adapter)
> > e1000_clean_tx_ring(adapter);
> > e1000_clean_rx_ring(adapter);
> >
> > - if (adapter->flags & FLAG_HAS_ERT)
> > - pm_qos_remove_requirement(PM_QOS_CPU_DMA_LATENCY,
> > - adapter->netdev->name);
> > + if (adapter->flags & FLAG_HAS_ERT) {
> > + pm_qos_remove_request(
> > + adapter->netdev->pm_qos_req);
> > + adapter->netdev->pm_qos_req = NULL;
> > + }
> >
> > /*
> > * TODO: for power management, we could drop the link and
> > diff --git a/drivers/net/igbvf/netdev.c b/drivers/net/igbvf/netdev.c
> > index 7012e3d..5e2b2a8 100644
> > --- a/drivers/net/igbvf/netdev.c
> > +++ b/drivers/net/igbvf/netdev.c
> > @@ -48,6 +48,7 @@
> > #define DRV_VERSION "1.0.0-k0"
> > char igbvf_driver_name[] = "igbvf";
> > const char igbvf_driver_version[] = DRV_VERSION;
> > +struct pm_qos_request_list *igbvf_driver_pm_qos_req;
> > static const char igbvf_driver_string[] =
> > "Intel(R) Virtual Function Network Driver";
> > static const char igbvf_copyright[] = "Copyright (c) 2009 Intel Corporation.";
> > @@ -2901,7 +2902,7 @@ static int __init igbvf_init_module(void)
> > printk(KERN_INFO "%s\n", igbvf_copyright);
> >
> > ret = pci_register_driver(&igbvf_driver);
> > - pm_qos_add_requirement(PM_QOS_CPU_DMA_LATENCY, igbvf_driver_name,
> > + igbvf_driver_pm_qos_req = pm_qos_add_request(PM_QOS_CPU_DMA_LATENCY,
> > PM_QOS_DEFAULT_VALUE);
> >
> > return ret;
> > @@ -2917,7 +2918,8 @@ module_init(igbvf_init_module);
> > static void __exit igbvf_exit_module(void)
> > {
> > pci_unregister_driver(&igbvf_driver);
> > - pm_qos_remove_requirement(PM_QOS_CPU_DMA_LATENCY, igbvf_driver_name);
> > + pm_qos_remove_request(igbvf_driver_pm_qos_req);
> > + igbvf_driver_pm_qos_req = NULL;
> > }
> > module_exit(igbvf_exit_module);
> >
> > diff --git a/drivers/net/wireless/ipw2x00/ipw2100.c b/drivers/net/wireless/ipw2x00/ipw2100.c
> > index 2088ac0..7040e3b 100644
> > --- a/drivers/net/wireless/ipw2x00/ipw2100.c
> > +++ b/drivers/net/wireless/ipw2x00/ipw2100.c
> > @@ -174,6 +174,8 @@ that only one external action is invoked at a time.
> > #define DRV_DESCRIPTION "Intel(R) PRO/Wireless 2100 Network Driver"
> > #define DRV_COPYRIGHT "Copyright(c) 2003-2006 Intel Corporation"
> >
> > +struct pm_qos_request_list *ipw2100_pm_qos_req;
> > +
> > /* Debugging stuff */
> > #ifdef CONFIG_IPW2100_DEBUG
> > #define IPW2100_RX_DEBUG /* Reception debugging */
> > @@ -1739,7 +1741,7 @@ static int ipw2100_up(struct ipw2100_priv *priv, int deferred)
> > /* the ipw2100 hardware really doesn't want power management delays
> > * longer than 175usec
> > */
> > - pm_qos_update_requirement(PM_QOS_CPU_DMA_LATENCY, "ipw2100", 175);
> > + pm_qos_update_request(ipw2100_pm_qos_req, 175);
> >
> > /* If the interrupt is enabled, turn it off... */
> > spin_lock_irqsave(&priv->low_lock, flags);
> > @@ -1887,8 +1889,7 @@ static void ipw2100_down(struct ipw2100_priv *priv)
> > ipw2100_disable_interrupts(priv);
> > spin_unlock_irqrestore(&priv->low_lock, flags);
> >
> > - pm_qos_update_requirement(PM_QOS_CPU_DMA_LATENCY, "ipw2100",
> > - PM_QOS_DEFAULT_VALUE);
> > + pm_qos_update_request(ipw2100_pm_qos_req, PM_QOS_DEFAULT_VALUE);
> >
> > /* We have to signal any supplicant if we are disassociating */
> > if (associated)
> > @@ -6669,7 +6670,7 @@ static int __init ipw2100_init(void)
> > if (ret)
> > goto out;
> >
> > - pm_qos_add_requirement(PM_QOS_CPU_DMA_LATENCY, "ipw2100",
> > + ipw2100_pm_qos_req = pm_qos_add_request(PM_QOS_CPU_DMA_LATENCY,
> > PM_QOS_DEFAULT_VALUE);
> > #ifdef CONFIG_IPW2100_DEBUG
> > ipw2100_debug_level = debug;
> > @@ -6692,7 +6693,7 @@ static void __exit ipw2100_exit(void)
> > &driver_attr_debug_level);
> > #endif
> > pci_unregister_driver(&ipw2100_pci_driver);
> > - pm_qos_remove_requirement(PM_QOS_CPU_DMA_LATENCY, "ipw2100");
> > + pm_qos_remove_request(ipw2100_pm_qos_req);
> > }
> >
> > module_init(ipw2100_init);
> > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > index 40d4c20..5dd6d8c 100644
> > --- a/include/linux/netdevice.h
> > +++ b/include/linux/netdevice.h
> > @@ -31,6 +31,7 @@
> > #include <linux/if_link.h>
> >
> > #ifdef __KERNEL__
> > +#include <linux/pm_qos_params.h>
> > #include <linux/timer.h>
> > #include <linux/delay.h>
> > #include <linux/mm.h>
> > @@ -778,6 +779,9 @@ struct net_device {
> > * the interface.
> > */
> > char name[IFNAMSIZ];
> > +
> > + struct pm_qos_request_list *pm_qos_req;
> > +
> > /* device name hash chain */
> > struct hlist_node name_hlist;
> > /* snmp alias */
> > diff --git a/include/linux/pm_qos_params.h b/include/linux/pm_qos_params.h
> > index d74f75e..8ba440e 100644
> > --- a/include/linux/pm_qos_params.h
> > +++ b/include/linux/pm_qos_params.h
> > @@ -14,12 +14,14 @@
> > #define PM_QOS_NUM_CLASSES 4
> > #define PM_QOS_DEFAULT_VALUE -1
> >
> > -int pm_qos_add_requirement(int qos, char *name, s32 value);
> > -int pm_qos_update_requirement(int qos, char *name, s32 new_value);
> > -void pm_qos_remove_requirement(int qos, char *name);
> > +struct pm_qos_request_list;
> >
> > -int pm_qos_requirement(int qos);
> > +struct pm_qos_request_list *pm_qos_add_request(int pm_qos_class, s32 value);
> > +void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req,
> > + s32 new_value);
> > +void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req);
> >
> > -int pm_qos_add_notifier(int qos, struct notifier_block *notifier);
> > -int pm_qos_remove_notifier(int qos, struct notifier_block *notifier);
> > +int pm_qos_request(int pm_qos_class);
> > +int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier);
> > +int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier);
> >
> > diff --git a/include/sound/pcm.h b/include/sound/pcm.h
> > index 8b611a5..dd76cde 100644
> > --- a/include/sound/pcm.h
> > +++ b/include/sound/pcm.h
> > @@ -29,6 +29,7 @@
> > #include <linux/poll.h>
> > #include <linux/mm.h>
> > #include <linux/bitops.h>
> > +#include <linux/pm_qos_params.h>
> >
> > #define snd_pcm_substream_chip(substream) ((substream)->private_data)
> > #define snd_pcm_chip(pcm) ((pcm)->private_data)
> > @@ -365,7 +366,7 @@ struct snd_pcm_substream {
> > int number;
> > char name[32]; /* substream name */
> > int stream; /* stream (direction) */
> > - char latency_id[20]; /* latency identifier */
> > + struct pm_qos_request_list *latency_pm_qos_req; /* pm_qos request */
> > size_t buffer_bytes_max; /* limit ring buffer size */
> > struct snd_dma_buffer dma_buffer;
> > unsigned int dma_buf_id;
> > diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
> > index 3db49b9..a1aea04 100644
> > --- a/kernel/pm_qos_params.c
> > +++ b/kernel/pm_qos_params.c
> > @@ -2,7 +2,7 @@
> > * This module exposes the interface to kernel space for specifying
> > * QoS dependencies. It provides infrastructure for registration of:
> > *
> > - * Dependents on a QoS value : register requirements
> > + * Dependents on a QoS value : register requests
> > * Watchers of QoS value : get notified when target QoS value changes
> > *
> > * This QoS design is best effort based. Dependents register their QoS needs.
> > @@ -14,19 +14,21 @@
> > * timeout: usec <-- currently not used.
> > * throughput: kbs (kilo byte / sec)
> > *
> > - * There are lists of pm_qos_objects each one wrapping requirements, notifiers
> > + * There are lists of pm_qos_objects each one wrapping requests, notifiers
> > *
> > - * User mode requirements on a QOS parameter register themselves to the
> > + * User mode requests on a QOS parameter register themselves to the
> > * subsystem by opening the device node /dev/... and writing there request to
> > * the node. As long as the process holds a file handle open to the node the
> > * client continues to be accounted for. Upon file release the usermode
> > - * requirement is removed and a new qos target is computed. This way when the
> > - * requirement that the application has is cleaned up when closes the file
> > + * request is removed and a new qos target is computed. This way when the
> > + * request that the application has is cleaned up when closes the file
> > * pointer or exits the pm_qos_object will get an opportunity to clean up.
> > *
> > * Mark Gross <mgross@linux.intel.com>
> > */
> >
> > +/*#define DEBUG*/
> > +
> > #include <linux/pm_qos_params.h>
> > #include <linux/sched.h>
> > #include <linux/spinlock.h>
> > @@ -42,25 +44,25 @@
> > #include <linux/uaccess.h>
> >
> > /*
> > - * locking rule: all changes to requirements or notifiers lists
> > + * locking rule: all changes to requests or notifiers lists
> > * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
> > * held, taken with _irqsave. One lock to rule them all
> > */
> > -struct requirement_list {
> > +struct pm_qos_request_list {
> > struct list_head list;
> > union {
> > s32 value;
> > s32 usec;
> > s32 kbps;
> > };
> > - char *name;
> > + int pm_qos_class;
> > };
> >
> > static s32 max_compare(s32 v1, s32 v2);
> > static s32 min_compare(s32 v1, s32 v2);
> >
> > struct pm_qos_object {
> > - struct requirement_list requirements;
> > + struct pm_qos_request_list requests;
> > struct blocking_notifier_head *notifiers;
> > struct miscdevice pm_qos_power_miscdev;
> > char *name;
> > @@ -72,7 +74,7 @@ struct pm_qos_object {
> > static struct pm_qos_object null_pm_qos;
> > static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
> > static struct pm_qos_object cpu_dma_pm_qos = {
> > - .requirements = {LIST_HEAD_INIT(cpu_dma_pm_qos.requirements.list)},
> > + .requests = {LIST_HEAD_INIT(cpu_dma_pm_qos.requests.list)},
> > .notifiers = &cpu_dma_lat_notifier,
> > .name = "cpu_dma_latency",
> > .default_value = 2000 * USEC_PER_SEC,
> > @@ -82,7 +84,7 @@ static struct pm_qos_object cpu_dma_pm_qos = {
> >
> > static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
> > static struct pm_qos_object network_lat_pm_qos = {
> > - .requirements = {LIST_HEAD_INIT(network_lat_pm_qos.requirements.list)},
> > + .requests = {LIST_HEAD_INIT(network_lat_pm_qos.requests.list)},
> > .notifiers = &network_lat_notifier,
> > .name = "network_latency",
> > .default_value = 2000 * USEC_PER_SEC,
> > @@ -93,8 +95,7 @@ static struct pm_qos_object network_lat_pm_qos = {
> >
> > static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
> > static struct pm_qos_object network_throughput_pm_qos = {
> > - .requirements =
> > - {LIST_HEAD_INIT(network_throughput_pm_qos.requirements.list)},
> > + .requests = {LIST_HEAD_INIT(network_throughput_pm_qos.requests.list)},
> > .notifiers = &network_throughput_notifier,
> > .name = "network_throughput",
> > .default_value = 0,
> > @@ -135,31 +136,34 @@ static s32 min_compare(s32 v1, s32 v2)
> > }
> >
> >
> > -static void update_target(int target)
> > +static void update_target(int pm_qos_class)
> > {
> > s32 extreme_value;
> > - struct requirement_list *node;
> > + struct pm_qos_request_list *node;
> > unsigned long flags;
> > int call_notifier = 0;
> >
> > spin_lock_irqsave(&pm_qos_lock, flags);
> > - extreme_value = pm_qos_array[target]->default_value;
> > + extreme_value = pm_qos_array[pm_qos_class]->default_value;
> > list_for_each_entry(node,
> > - &pm_qos_array[target]->requirements.list, list) {
> > - extreme_value = pm_qos_array[target]->comparitor(
> > + &pm_qos_array[pm_qos_class]->requests.list, list) {
> > + extreme_value = pm_qos_array[pm_qos_class]->comparitor(
> > extreme_value, node->value);
> > }
> > - if (atomic_read(&pm_qos_array[target]->target_value) != extreme_value) {
> > + if (atomic_read(&pm_qos_array[pm_qos_class]->target_value) !=
> > + extreme_value) {
> > call_notifier = 1;
> > - atomic_set(&pm_qos_array[target]->target_value, extreme_value);
> > - pr_debug(KERN_ERR "new target for qos %d is %d\n", target,
> > - atomic_read(&pm_qos_array[target]->target_value));
> > + atomic_set(&pm_qos_array[pm_qos_class]->target_value,
> > + extreme_value);
> > + pr_debug(KERN_ERR "new target for qos %d is %d\n", pm_qos_class,
> > + atomic_read(&pm_qos_array[pm_qos_class]->target_value));
> > }
> > spin_unlock_irqrestore(&pm_qos_lock, flags);
> >
> > if (call_notifier)
> > - blocking_notifier_call_chain(pm_qos_array[target]->notifiers,
> > - (unsigned long) extreme_value, NULL);
> > + blocking_notifier_call_chain(
> > + pm_qos_array[pm_qos_class]->notifiers,
> > + (unsigned long) extreme_value, NULL);
> > }
> >
> > static int register_pm_qos_misc(struct pm_qos_object *qos)
> > @@ -185,125 +189,110 @@ static int find_pm_qos_object_by_minor(int minor)
> > }
> >
> > /**
> > - * pm_qos_requirement - returns current system wide qos expectation
> > + * pm_qos_request - returns current system wide qos expectation
> > * @pm_qos_class: identification of which qos value is requested
> > *
> > * This function returns the current target value in an atomic manner.
> > */
> > -int pm_qos_requirement(int pm_qos_class)
> > +int pm_qos_request(int pm_qos_class)
> > {
> > return atomic_read(&pm_qos_array[pm_qos_class]->target_value);
> > }
> > -EXPORT_SYMBOL_GPL(pm_qos_requirement);
> > +EXPORT_SYMBOL_GPL(pm_qos_request);
> >
> > /**
> > - * pm_qos_add_requirement - inserts new qos request into the list
> > + * pm_qos_add_request - inserts new qos request into the list
> > * @pm_qos_class: identifies which list of qos request to us
> > - * @name: identifies the request
> > * @value: defines the qos request
> > *
> > * This function inserts a new entry in the pm_qos_class list of requested qos
> > * performance characteristics. It recomputes the aggregate QoS expectations
> > - * for the pm_qos_class of parameters.
> > + * for the pm_qos_class of parameters, and returns the pm_qos_request list
> > + * element as a handle for use in updating and removal. Call needs to save
> > + * this handle for later use.
> > */
> > -int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value)
> > +struct pm_qos_request_list *pm_qos_add_request(int pm_qos_class, s32 value)
> > {
> > - struct requirement_list *dep;
> > + struct pm_qos_request_list *dep;
> > unsigned long flags;
> >
> > - dep = kzalloc(sizeof(struct requirement_list), GFP_KERNEL);
> > + dep = kzalloc(sizeof(struct pm_qos_request_list), GFP_KERNEL);
> > if (dep) {
> > if (value == PM_QOS_DEFAULT_VALUE)
> > dep->value = pm_qos_array[pm_qos_class]->default_value;
> > else
> > dep->value = value;
> > - dep->name = kstrdup(name, GFP_KERNEL);
> > - if (!dep->name)
> > - goto cleanup;
> > + dep->pm_qos_class = pm_qos_class;
> >
> > spin_lock_irqsave(&pm_qos_lock, flags);
> > list_add(&dep->list,
> > - &pm_qos_array[pm_qos_class]->requirements.list);
> > + &pm_qos_array[pm_qos_class]->requests.list);
> > spin_unlock_irqrestore(&pm_qos_lock, flags);
> > update_target(pm_qos_class);
> > -
> > - return 0;
> > }
> >
> > -cleanup:
> > - kfree(dep);
> > - return -ENOMEM;
> > + return dep;
> > }
> > -EXPORT_SYMBOL_GPL(pm_qos_add_requirement);
> > +EXPORT_SYMBOL_GPL(pm_qos_add_request);
> >
> > /**
> > - * pm_qos_update_requirement - modifies an existing qos request
> > - * @pm_qos_class: identifies which list of qos request to us
> > - * @name: identifies the request
> > + * pm_qos_update_request - modifies an existing qos request
> > + * @pm_qos_req : handle to list element holding a pm_qos request to use
> > * @value: defines the qos request
> > *
> > - * Updates an existing qos requirement for the pm_qos_class of parameters along
> > + * Updates an existing qos request for the pm_qos_class of parameters along
> > * with updating the target pm_qos_class value.
> > *
> > - * If the named request isn't in the list then no change is made.
> > + * Attempts are made to make this code callable on hot code paths.
> > */
> > -int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value)
> > +void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req,
> > + s32 new_value)
> > {
> > unsigned long flags;
> > - struct requirement_list *node;
> > int pending_update = 0;
> > + s32 temp;
> >
> > spin_lock_irqsave(&pm_qos_lock, flags);
> > - list_for_each_entry(node,
> > - &pm_qos_array[pm_qos_class]->requirements.list, list) {
> > - if (strcmp(node->name, name) == 0) {
> > - if (new_value == PM_QOS_DEFAULT_VALUE)
> > - node->value =
> > - pm_qos_array[pm_qos_class]->default_value;
> > - else
> > - node->value = new_value;
> > - pending_update = 1;
> > - break;
> > - }
> > + if (new_value == PM_QOS_DEFAULT_VALUE)
> > + temp = pm_qos_array[pm_qos_req->pm_qos_class]->default_value;
> > + else
> > + temp = new_value;
> > +
> > + if (temp != pm_qos_req->value) {
> > + pending_update = 1;
> > + pm_qos_req->value = temp;
> > }
> > spin_unlock_irqrestore(&pm_qos_lock, flags);
> > if (pending_update)
> > - update_target(pm_qos_class);
> > -
> > - return 0;
> > + update_target(pm_qos_req->pm_qos_class);
> > }
> > -EXPORT_SYMBOL_GPL(pm_qos_update_requirement);
> > +EXPORT_SYMBOL_GPL(pm_qos_update_request);
> >
> > /**
> > - * pm_qos_remove_requirement - modifies an existing qos request
> > - * @pm_qos_class: identifies which list of qos request to us
> > - * @name: identifies the request
> > + * pm_qos_remove_request - modifies an existing qos request
> > + * @pm_qos_req: handle to request list element
> > *
> > - * Will remove named qos request from pm_qos_class list of parameters and
> > - * recompute the current target value for the pm_qos_class.
> > + * Will remove pm qos request from the list of requests and
> > + * recompute the current target value for the pm_qos_class. Call this
> > + * on slow code paths.
> > */
> > -void pm_qos_remove_requirement(int pm_qos_class, char *name)
> > +void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req)
> > {
> > unsigned long flags;
> > - struct requirement_list *node;
> > - int pending_update = 0;
> > + int qos_class;
> > +
> > + if (pm_qos_req == NULL)
> > + return;
> > + /* silent return to keep pcm code cleaner */
> >
> > + qos_class = pm_qos_req->pm_qos_class;
> > spin_lock_irqsave(&pm_qos_lock, flags);
> > - list_for_each_entry(node,
> > - &pm_qos_array[pm_qos_class]->requirements.list, list) {
> > - if (strcmp(node->name, name) == 0) {
> > - kfree(node->name);
> > - list_del(&node->list);
> > - kfree(node);
> > - pending_update = 1;
> > - break;
> > - }
> > - }
> > + list_del(&pm_qos_req->list);
> > + kfree(pm_qos_req);
> > spin_unlock_irqrestore(&pm_qos_lock, flags);
> > - if (pending_update)
> > - update_target(pm_qos_class);
> > + update_target(qos_class);
> > }
> > -EXPORT_SYMBOL_GPL(pm_qos_remove_requirement);
> > +EXPORT_SYMBOL_GPL(pm_qos_remove_request);
> >
> > /**
> > * pm_qos_add_notifier - sets notification entry for changes to target value
> > @@ -313,7 +302,7 @@ EXPORT_SYMBOL_GPL(pm_qos_remove_requirement);
> > * will register the notifier into a notification chain that gets called
> > * upon changes to the pm_qos_class target value.
> > */
> > - int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
> > +int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
> > {
> > int retval;
> >
> > @@ -343,21 +332,16 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
> > }
> > EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
> >
> > -#define PID_NAME_LEN 32
> > -
> > static int pm_qos_power_open(struct inode *inode, struct file *filp)
> > {
> > - int ret;
> > long pm_qos_class;
> > - char name[PID_NAME_LEN];
> >
> > pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
> > if (pm_qos_class >= 0) {
> > - filp->private_data = (void *)pm_qos_class;
> > - snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
> > - ret = pm_qos_add_requirement(pm_qos_class, name,
> > - PM_QOS_DEFAULT_VALUE);
> > - if (ret >= 0)
> > + filp->private_data = (void *) pm_qos_add_request(pm_qos_class,
> > + PM_QOS_DEFAULT_VALUE);
> > +
> > + if (filp->private_data)
> > return 0;
> > }
> > return -EPERM;
> > @@ -365,32 +349,40 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
> >
> > static int pm_qos_power_release(struct inode *inode, struct file *filp)
> > {
> > - int pm_qos_class;
> > - char name[PID_NAME_LEN];
> > + struct pm_qos_request_list *req;
> >
> > - pm_qos_class = (long)filp->private_data;
> > - snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
> > - pm_qos_remove_requirement(pm_qos_class, name);
> > + req = (struct pm_qos_request_list *)filp->private_data;
> > + pm_qos_remove_request(req);
> >
> > return 0;
> > }
> >
> > +
> > static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
> > size_t count, loff_t *f_pos)
> > {
> > s32 value;
> > - int pm_qos_class;
> > - char name[PID_NAME_LEN];
> > -
> > - pm_qos_class = (long)filp->private_data;
> > - if (count != sizeof(s32))
> > + int x;
> > + char ascii_value[11];
> > + struct pm_qos_request_list *pm_qos_req;
> > +
> > + if (count == sizeof(s32)) {
> > + if (copy_from_user(&value, buf, sizeof(s32)))
> > + return -EFAULT;
> > + } else if (count == 11) { /* len('0x12345678/0') */
> > + if (copy_from_user(ascii_value, buf, 11))
> > + return -EFAULT;
> > + x = sscanf(ascii_value, "%x", &value);
> > + if (x != 1)
> > + return -EINVAL;
> > + pr_debug(KERN_ERR "%s, %d, 0x%x\n", ascii_value, x, value);
> > + } else
> > return -EINVAL;
> > - if (copy_from_user(&value, buf, sizeof(s32)))
> > - return -EFAULT;
> > - snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
> > - pm_qos_update_requirement(pm_qos_class, name, value);
> >
> > - return sizeof(s32);
> > + pm_qos_req = (struct pm_qos_request_list *)filp->private_data;
> > + pm_qos_update_request(pm_qos_req, value);
> > +
> > + return count;
> > }
> >
> >
> > diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
> > index 358226f..3deaac3 100644
> > --- a/net/mac80211/mlme.c
> > +++ b/net/mac80211/mlme.c
> > @@ -507,7 +507,7 @@ void ieee80211_recalc_ps(struct ieee80211_local *local, s32 latency)
> > s32 beaconint_us;
> >
> > if (latency < 0)
> > - latency = pm_qos_requirement(PM_QOS_NETWORK_LATENCY);
> > + latency = pm_qos_request(PM_QOS_NETWORK_LATENCY);
> >
> > beaconint_us = ieee80211_tu_to_usec(
> > found->vif.bss_conf.beacon_int);
> > diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
> > index 8bc961f..be52bb8 100644
> > --- a/net/mac80211/scan.c
> > +++ b/net/mac80211/scan.c
> > @@ -510,7 +510,7 @@ static int ieee80211_scan_state_decision(struct ieee80211_local *local,
> > bad_latency = time_after(jiffies +
> > ieee80211_scan_get_channel_time(next_chan),
> > local->leave_oper_channel_time +
> > - usecs_to_jiffies(pm_qos_requirement(PM_QOS_NETWORK_LATENCY)));
> > + usecs_to_jiffies(pm_qos_request(PM_QOS_NETWORK_LATENCY)));
> >
> > listen_int_exceeded = time_after(jiffies +
> > ieee80211_scan_get_channel_time(next_chan),
> > diff --git a/sound/core/pcm.c b/sound/core/pcm.c
> > index 0d428d0..cbe815d 100644
> > --- a/sound/core/pcm.c
> > +++ b/sound/core/pcm.c
> > @@ -648,9 +648,6 @@ int snd_pcm_new_stream(struct snd_pcm *pcm, int stream, int substream_count)
> > substream->number = idx;
> > substream->stream = stream;
> > sprintf(substream->name, "subdevice #%i", idx);
> > - snprintf(substream->latency_id, sizeof(substream->latency_id),
> > - "ALSA-PCM%d-%d%c%d", pcm->card->number, pcm->device,
> > - (stream ? 'c' : 'p'), idx);
> > substream->buffer_bytes_max = UINT_MAX;
> > if (prev == NULL)
> > pstr->substream = substream;
> > diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
> > index c22ebb0..1bb0e23 100644
> > --- a/sound/core/pcm_native.c
> > +++ b/sound/core/pcm_native.c
> > @@ -481,11 +481,13 @@ static int snd_pcm_hw_params(struct snd_pcm_substream *substream,
> > snd_pcm_timer_resolution_change(substream);
> > runtime->status->state = SNDRV_PCM_STATE_SETUP;
> >
> > - pm_qos_remove_requirement(PM_QOS_CPU_DMA_LATENCY,
> > - substream->latency_id);
> > + if (substream->latency_pm_qos_req) {
> > + pm_qos_remove_request(substream->latency_pm_qos_req);
> > + substream->latency_pm_qos_req = NULL;
> > + }
> > if ((usecs = period_to_usecs(runtime)) >= 0)
> > - pm_qos_add_requirement(PM_QOS_CPU_DMA_LATENCY,
> > - substream->latency_id, usecs);
> > + substream->latency_pm_qos_req = pm_qos_add_request(
> > + PM_QOS_CPU_DMA_LATENCY, usecs);
> > return 0;
> > _error:
> > /* hardware might be unuseable from this time,
> > @@ -540,8 +542,8 @@ static int snd_pcm_hw_free(struct snd_pcm_substream *substream)
> > if (substream->ops->hw_free)
> > result = substream->ops->hw_free(substream);
> > runtime->status->state = SNDRV_PCM_STATE_OPEN;
> > - pm_qos_remove_requirement(PM_QOS_CPU_DMA_LATENCY,
> > - substream->latency_id);
> > + pm_qos_remove_request(substream->latency_pm_qos_req);
> > + substream->latency_pm_qos_req = NULL;
> > return result;
> > }
> >
> >
^ permalink raw reply
* Re: [Patch 1/3] sysctl: refactor integer handling proc code
From: Changli Gao @ 2010-04-30 22:49 UTC (permalink / raw)
To: Amerigo Wang
Cc: linux-kernel, Octavian Purdila, Eric Dumazet, penguin-kernel,
netdev, Neil Horman, ebiederm, David Miller, adobriyan
In-Reply-To: <20100430082925.5630.58453.sendpatchset@localhost.localdomain>
On Fri, Apr 30, 2010 at 4:25 PM, Amerigo Wang <amwang@redhat.com> wrote:
> (Based on Octavian's work, and I modified a lot.)
>
> As we are about to add another integer handling proc function a little
> bit of cleanup is in order: add a few helper functions to improve code
> readability and decrease code duplication.
>
> In the process a bug is also fixed: if the user specifies a number
> with more then 20 digits it will be interpreted as two integers
> (e.g. 10000...13 will be interpreted as 100.... and 13).
>
> Behavior for EFAULT handling was changed as well. Previous to this
> patch, when an EFAULT error occurred in the middle of a write
> operation, although some of the elements were set, that was not
> acknowledged to the user (by shorting the write and returning the
> number of bytes accepted). EFAULT is now treated just like any other
> errors by acknowledging the amount of bytes accepted.
>
> Signed-off-by: Octavian Purdila <opurdila@ixiacom.com>
> Signed-off-by: WANG Cong <amwang@redhat.com>
> Cc: Eric W. Biederman <ebiederm@xmission.com>
> ---
>
> Index: linux-2.6/kernel/sysctl.c
> ===================================================================
> --- linux-2.6.orig/kernel/sysctl.c
> +++ linux-2.6/kernel/sysctl.c
> @@ -2040,8 +2040,122 @@ int proc_dostring(struct ctl_table *tabl
> buffer, lenp, ppos);
> }
>
> +static size_t proc_skip_spaces(char **buf)
> +{
> + size_t ret;
> + char *tmp = skip_spaces(*buf);
> + ret = tmp - *buf;
> + *buf = tmp;
> + return ret;
> +}
> +
> +#define TMPBUFLEN 22
> +/**
> + * proc_get_long - reads an ASCII formated integer from a user buffer
> + *
> + * @buf - a kernel buffer
> + * @size - size of the kernel buffer
> + * @val - this is where the number will be stored
> + * @neg - set to %TRUE if number is negative
> + * @perm_tr - a vector which contains the allowed trailers
> + * @perm_tr_len - size of the perm_tr vector
> + * @tr - pointer to store the trailer character
> + *
> + * In case of success 0 is returned and buf and size are updated with
> + * the amount of bytes read. If tr is non NULL and a trailing
> + * character exist (size is non zero after returning from this
> + * function) tr is updated with the trailing character.
> + */
> +static int proc_get_long(char **buf, size_t *size,
> + unsigned long *val, bool *neg,
> + const char *perm_tr, unsigned perm_tr_len, char *tr)
> +{
> + int len;
> + char *p, tmp[TMPBUFLEN];
> +
> + if (!*size)
> + return -EINVAL;
> +
> + len = *size;
> + if (len > TMPBUFLEN-1)
> + len = TMPBUFLEN-1;
> +
> + memcpy(tmp, *buf, len);
> +
> + tmp[len] = 0;
> + p = tmp;
> + if (*p == '-' && *size > 1) {
> + *neg = 1;
As neg is bool*, you should use true and false instead of 1 and 0.
> + p++;
> + } else
> + *neg = 0;
> + if (!isdigit(*p))
> + return -EINVAL;
> +
> + *val = simple_strtoul(p, &p, 0);
>
> -static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
> + len = p - tmp;
> +
> + /* We don't know if the next char is whitespace thus we may accept
> + * invalid integers (e.g. 1234...a) or two integers instead of one
> + * (e.g. 123...1). So lets not allow such large numbers. */
> + if (len == TMPBUFLEN - 1)
> + return -EINVAL;
> +
> + if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len))
> + return -EINVAL;
> +
> + if (tr && (len < *size))
> + *tr = *p;
> +
> + *buf += len;
> + *size -= len;
> +
> + return 0;
> +}
> +
> +/**
> + * proc_put_long - coverts an integer to a decimal ASCII formated string
> + *
> + * @buf - the user buffer
> + * @size - the size of the user buffer
> + * @val - the integer to be converted
> + * @neg - sign of the number, %TRUE for negative
> + *
> + * In case of success 0 is returned and buf and size are updated with
> + * the amount of bytes read.
> + */
> +static int proc_put_long(void __user **buf, size_t *size, unsigned long val,
> + bool neg)
> +{
> + int len;
> + char tmp[TMPBUFLEN], *p = tmp;
> +
> + sprintf(p, "%s%lu", neg ? "-" : "", val);
> + len = strlen(tmp);
> + if (len > *size)
> + len = *size;
> + if (copy_to_user(*buf, tmp, len))
> + return -EFAULT;
> + *size -= len;
> + *buf += len;
> + return 0;
> +}
> +#undef TMPBUFLEN
> +
> +static int proc_put_char(void __user **buf, size_t *size, char c)
> +{
> + if (*size) {
> + char __user **buffer = (char __user **)buf;
> + if (put_user(c, *buffer))
> + return -EFAULT;
> + (*size)--, (*buffer)++;
> + *buf = *buffer;
> + }
> + return 0;
> +}
> +
> +static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
> int *valp,
> int write, void *data)
> {
> @@ -2050,7 +2164,7 @@ static int do_proc_dointvec_conv(int *ne
> } else {
> int val = *valp;
> if (val < 0) {
> - *negp = -1;
> + *negp = 1;
> *lvalp = (unsigned long)-val;
> } else {
> *negp = 0;
> @@ -2060,23 +2174,21 @@ static int do_proc_dointvec_conv(int *ne
> return 0;
> }
>
> +static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
> +
> static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
> int write, void __user *buffer,
> size_t *lenp, loff_t *ppos,
> - int (*conv)(int *negp, unsigned long *lvalp, int *valp,
> + int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
> int write, void *data),
> void *data)
> {
> -#define TMPBUFLEN 21
> - int *i, vleft, first = 1, neg;
> - unsigned long lval;
> - size_t left, len;
> + int *i, vleft, first = 1, err = 0;
> + unsigned long page = 0;
> + size_t left;
> + char *kbuf;
>
> - char buf[TMPBUFLEN], *p;
> - char __user *s = buffer;
> -
> - if (!tbl_data || !table->maxlen || !*lenp ||
> - (*ppos && !write)) {
> + if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
> *lenp = 0;
> return 0;
> }
> @@ -2088,89 +2200,69 @@ static int __do_proc_dointvec(void *tbl_
> if (!conv)
> conv = do_proc_dointvec_conv;
>
> + if (write) {
> + if (left > PAGE_SIZE - 1)
> + left = PAGE_SIZE - 1;
> + page = __get_free_page(GFP_TEMPORARY);
> + kbuf = (char *) page;
> + if (!kbuf)
> + return -ENOMEM;
> + if (copy_from_user(kbuf, buffer, left)) {
> + err = -EFAULT;
> + goto free;
> + }
> + kbuf[left] = 0;
> + }
> +
> for (; left && vleft--; i++, first=0) {
> - if (write) {
> - while (left) {
> - char c;
> - if (get_user(c, s))
> - return -EFAULT;
> - if (!isspace(c))
> - break;
> - left--;
> - s++;
> - }
> - if (!left)
> - break;
> - neg = 0;
> - len = left;
> - if (len > sizeof(buf) - 1)
> - len = sizeof(buf) - 1;
> - if (copy_from_user(buf, s, len))
> - return -EFAULT;
> - buf[len] = 0;
> - p = buf;
> - if (*p == '-' && left > 1) {
> - neg = 1;
> - p++;
> - }
> - if (*p < '0' || *p > '9')
> - break;
> + unsigned long lval;
> + bool neg;
>
> - lval = simple_strtoul(p, &p, 0);
> + if (write) {
> + left -= proc_skip_spaces(&kbuf);
>
> - len = p-buf;
> - if ((len < left) && *p && !isspace(*p))
> + err = proc_get_long(&kbuf, &left, &lval, &neg,
> + proc_wspace_sep,
> + sizeof(proc_wspace_sep), NULL);
> + if (err)
> break;
> - s += len;
> - left -= len;
> -
> - if (conv(&neg, &lval, i, 1, data))
> + if (conv(&neg, &lval, i, 1, data)) {
> + err = -EINVAL;
> break;
> + }
> } else {
> - p = buf;
> + if (conv(&neg, &lval, i, 0, data)) {
> + err = -EINVAL;
> + break;
> + }
> if (!first)
> - *p++ = '\t';
> -
> - if (conv(&neg, &lval, i, 0, data))
> + err = proc_put_char(&buffer, &left, '\t');
> + if (err)
> + break;
> + err = proc_put_long(&buffer, &left, lval, neg);
> + if (err)
> break;
> -
> - sprintf(p, "%s%lu", neg ? "-" : "", lval);
> - len = strlen(buf);
> - if (len > left)
> - len = left;
> - if(copy_to_user(s, buf, len))
> - return -EFAULT;
> - left -= len;
> - s += len;
> }
> }
>
> - if (!write && !first && left) {
> - if(put_user('\n', s))
> - return -EFAULT;
> - left--, s++;
> - }
> + if (!write && !first && left && !err)
> + err = proc_put_char(&buffer, &left, '\n');
> + if (write && !err)
> + left -= proc_skip_spaces(&kbuf);
> +free:
> if (write) {
> - while (left) {
> - char c;
> - if (get_user(c, s++))
> - return -EFAULT;
> - if (!isspace(c))
> - break;
> - left--;
> - }
> + free_page(page);
> + if (first)
> + return err ? : -EINVAL;
> }
> - if (write && first)
> - return -EINVAL;
> *lenp -= left;
> *ppos += *lenp;
> - return 0;
> -#undef TMPBUFLEN
> + return err;
> }
>
> static int do_proc_dointvec(struct ctl_table *table, int write,
> void __user *buffer, size_t *lenp, loff_t *ppos,
> - int (*conv)(int *negp, unsigned long *lvalp, int *valp,
> + int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
> int write, void *data),
> void *data)
> {
> @@ -2238,8 +2330,8 @@ struct do_proc_dointvec_minmax_conv_para
> int *max;
> };
>
> -static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
> - int *valp,
> +static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
> + int *valp,
> int write, void *data)
> {
> struct do_proc_dointvec_minmax_conv_param *param = data;
> @@ -2252,7 +2344,7 @@ static int do_proc_dointvec_minmax_conv(
> } else {
> int val = *valp;
> if (val < 0) {
> - *negp = -1;
> + *negp = 1;
> *lvalp = (unsigned long)-val;
> } else {
> *negp = 0;
> @@ -2295,102 +2387,78 @@ static int __do_proc_doulongvec_minmax(v
> unsigned long convmul,
> unsigned long convdiv)
> {
> -#define TMPBUFLEN 21
> - unsigned long *i, *min, *max, val;
> - int vleft, first=1, neg;
> - size_t len, left;
> - char buf[TMPBUFLEN], *p;
> - char __user *s = buffer;
> -
> - if (!data || !table->maxlen || !*lenp ||
> - (*ppos && !write)) {
> + unsigned long *i, *min, *max;
> + int vleft, first = 1, err = 0;
> + unsigned long page = 0;
> + size_t left;
> + char *kbuf;
> +
> + if (!data || !table->maxlen || !*lenp || (*ppos && !write)) {
> *lenp = 0;
> return 0;
> }
> -
> +
> i = (unsigned long *) data;
> min = (unsigned long *) table->extra1;
> max = (unsigned long *) table->extra2;
> vleft = table->maxlen / sizeof(unsigned long);
> left = *lenp;
> -
> +
> + if (write) {
> + if (left > PAGE_SIZE - 1)
> + left = PAGE_SIZE - 1;
> + page = __get_free_page(GFP_TEMPORARY);
> + kbuf = (char *) page;
> + if (!kbuf)
> + return -ENOMEM;
> + if (copy_from_user(kbuf, buffer, left)) {
> + err = -EFAULT;
> + goto free;
> + }
> + kbuf[left] = 0;
> + }
> +
> for (; left && vleft--; i++, min++, max++, first=0) {
> + unsigned long val;
> +
> if (write) {
> - while (left) {
> - char c;
> - if (get_user(c, s))
> - return -EFAULT;
> - if (!isspace(c))
> - break;
> - left--;
> - s++;
> - }
> - if (!left)
> - break;
> - neg = 0;
> - len = left;
> - if (len > TMPBUFLEN-1)
> - len = TMPBUFLEN-1;
> - if (copy_from_user(buf, s, len))
> - return -EFAULT;
> - buf[len] = 0;
> - p = buf;
> - if (*p == '-' && left > 1) {
> - neg = 1;
> - p++;
> - }
> - if (*p < '0' || *p > '9')
> - break;
> - val = simple_strtoul(p, &p, 0) * convmul / convdiv ;
> - len = p-buf;
> - if ((len < left) && *p && !isspace(*p))
> + bool neg;
> +
> + left -= proc_skip_spaces(&kbuf);
> +
> + err = proc_get_long(&kbuf, &left, &val, &neg,
> + proc_wspace_sep,
> + sizeof(proc_wspace_sep), NULL);
> + if (err)
> break;
> if (neg)
> - val = -val;
> - s += len;
> - left -= len;
> -
> - if(neg)
> continue;
> if ((min && val < *min) || (max && val > *max))
> continue;
> *i = val;
> } else {
> - p = buf;
> + val = convdiv * (*i) / convmul;
> if (!first)
> - *p++ = '\t';
> - sprintf(p, "%lu", convdiv * (*i) / convmul);
> - len = strlen(buf);
> - if (len > left)
> - len = left;
> - if(copy_to_user(s, buf, len))
> - return -EFAULT;
> - left -= len;
> - s += len;
> + err = proc_put_char(&buffer, &left, '\t');
> + err = proc_put_long(&buffer, &left, val, false);
> + if (err)
> + break;
> }
> }
>
> - if (!write && !first && left) {
> - if(put_user('\n', s))
> - return -EFAULT;
> - left--, s++;
> - }
> + if (!write && !first && left && !err)
> + err = proc_put_char(&buffer, &left, '\n');
> + if (write && !err)
> + left -= proc_skip_spaces(&kbuf);
> +free:
> if (write) {
> - while (left) {
> - char c;
> - if (get_user(c, s++))
> - return -EFAULT;
> - if (!isspace(c))
> - break;
> - left--;
> - }
> + free_page(page);
> + if (first)
> + return err ? : -EINVAL;
> }
> - if (write && first)
> - return -EINVAL;
> *lenp -= left;
> *ppos += *lenp;
> - return 0;
> -#undef TMPBUFLEN
> + return err;
> }
>
> static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
> @@ -2451,7 +2519,7 @@ int proc_doulongvec_ms_jiffies_minmax(st
> }
>
>
> -static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,
> +static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
> int *valp,
> int write, void *data)
> {
> @@ -2463,7 +2531,7 @@ static int do_proc_dointvec_jiffies_conv
> int val = *valp;
> unsigned long lval;
> if (val < 0) {
> - *negp = -1;
> + *negp = 1;
> lval = (unsigned long)-val;
> } else {
> *negp = 0;
> @@ -2474,7 +2542,7 @@ static int do_proc_dointvec_jiffies_conv
> return 0;
> }
>
> -static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp,
> +static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp,
> int *valp,
> int write, void *data)
> {
> @@ -2486,7 +2554,7 @@ static int do_proc_dointvec_userhz_jiffi
> int val = *valp;
> unsigned long lval;
> if (val < 0) {
> - *negp = -1;
> + *negp = 1;
> lval = (unsigned long)-val;
> } else {
> *negp = 0;
> @@ -2497,7 +2565,7 @@ static int do_proc_dointvec_userhz_jiffi
> return 0;
> }
>
> -static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
> +static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
> int *valp,
> int write, void *data)
> {
> @@ -2507,7 +2575,7 @@ static int do_proc_dointvec_ms_jiffies_c
> int val = *valp;
> unsigned long lval;
> if (val < 0) {
> - *negp = -1;
> + *negp = 1;
> lval = (unsigned long)-val;
> } else {
> *negp = 0;
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
--
Regards,
Changli Gao(xiaosuo@gmail.com)
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox