* [PATCH iproute2 0/3] CAN Filter/Classifier
From: Rostislav Lisovy @ 2012-05-25 9:11 UTC (permalink / raw)
To: netdev; +Cc: linux-can, pisa, sojkam1, oliver, Rostislav Lisovy
The CAN classifier may be used with any available qdisc on Controller
Area Network (CAN) frames passed through AF_CAN networking subsystem.
The classifier classifies CAN frames according to their identifiers.
It can be used on CAN frames with either SFF or EFF identifiers.
The filtering rules for EFF frames are stored in an array, which
is traversed during classification. A bitmap is used to store SFF
rules -- one bit for each ID.
More info about the project:
http://rtime.felk.cvut.cz/can/socketcan-qdisc-final.pdf
Rostislav Lisovy (3):
Added missing can.h
CAN Filter/Classifier -- Source code
CAN Filter/Classifier -- Documentation
include/linux/can.h | 112 ++++++++++++++++++++++
include/linux/pkt_cls.h | 10 ++
man/man8/tc-can.8 | 97 +++++++++++++++++++
tc/Makefile | 1 +
tc/f_can.c | 238 +++++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 458 insertions(+)
create mode 100644 include/linux/can.h
create mode 100644 man/man8/tc-can.8
create mode 100644 tc/f_can.c
--
1.7.9.5
^ permalink raw reply
* [PATCH iproute2 1/3] Added missing can.h
From: Rostislav Lisovy @ 2012-05-25 9:11 UTC (permalink / raw)
To: netdev; +Cc: linux-can, pisa, sojkam1, oliver, Rostislav Lisovy
In-Reply-To: <1337937106-7640-1-git-send-email-lisovy@gmail.com>
This header file is a slightly modified copy of the one shipped with
Linux kernel v3.3. It contains the definitions necessary for AF_CAN
communication.
Signed-off-by: Rostislav Lisovy <lisovy@gmail.com>
---
include/linux/can.h | 112 +++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 112 insertions(+)
create mode 100644 include/linux/can.h
diff --git a/include/linux/can.h b/include/linux/can.h
new file mode 100644
index 0000000..08d1610
--- /dev/null
+++ b/include/linux/can.h
@@ -0,0 +1,112 @@
+/*
+ * linux/can.h
+ *
+ * Definitions for CAN network layer (socket addr / CAN frame / CAN filter)
+ *
+ * Authors: Oliver Hartkopp <oliver.hartkopp@volkswagen.de>
+ * Urs Thuermann <urs.thuermann@volkswagen.de>
+ * Copyright (c) 2002-2007 Volkswagen Group Electronic Research
+ * All rights reserved.
+ *
+ */
+
+#ifndef CAN_H
+#define CAN_H
+
+#include <linux/types.h>
+#include <linux/socket.h>
+
+/* controller area network (CAN) kernel definitions */
+
+/* special address description flags for the CAN_ID */
+#define CAN_EFF_FLAG 0x80000000U /* EFF/SFF is set in the MSB */
+#define CAN_RTR_FLAG 0x40000000U /* remote transmission request */
+#define CAN_ERR_FLAG 0x20000000U /* error frame */
+
+/* valid bits in CAN ID for frame formats */
+#define CAN_SFF_MASK 0x000007FFU /* standard frame format (SFF) */
+#define CAN_EFF_MASK 0x1FFFFFFFU /* extended frame format (EFF) */
+#define CAN_ERR_MASK 0x1FFFFFFFU /* omit EFF, RTR, ERR flags */
+
+/*
+ * Controller Area Network Identifier structure
+ *
+ * bit 0-28 : CAN identifier (11/29 bit)
+ * bit 29 : error frame flag (0 = data frame, 1 = error frame)
+ * bit 30 : remote transmission request flag (1 = rtr frame)
+ * bit 31 : frame format flag (0 = standard 11 bit, 1 = extended 29 bit)
+ */
+typedef __u32 canid_t;
+
+#define CAN_SFF_ID_BITS 11
+#define CAN_EFF_ID_BITS 29
+
+/*
+ * Controller Area Network Error Frame Mask structure
+ *
+ * bit 0-28 : error class mask (see include/linux/can/error.h)
+ * bit 29-31 : set to zero
+ */
+typedef __u32 can_err_mask_t;
+
+/**
+ * struct can_frame - basic CAN frame structure
+ * @can_id: the CAN ID of the frame and CAN_*_FLAG flags, see above.
+ * @can_dlc: the data length field of the CAN frame (0 .. 8)
+ * @data: the CAN frame payload; an 8 byte array that is forced to an
+ *        8 byte alignment by the aligned(8) attribute.
+ */
+struct can_frame {
+ canid_t can_id; /* 32 bit CAN_ID + EFF/RTR/ERR flags */
+ __u8 can_dlc; /* data length code: 0 .. 8 */
+ __u8 data[8] __attribute__((aligned(8)));
+};
+
+/* particular protocols of the protocol family PF_CAN */
+#define CAN_RAW 1 /* RAW sockets */
+#define CAN_BCM 2 /* Broadcast Manager */
+#define CAN_TP16 3 /* VAG Transport Protocol v1.6 */
+#define CAN_TP20 4 /* VAG Transport Protocol v2.0 */
+#define CAN_MCNET 5 /* Bosch MCNet */
+#define CAN_ISOTP 6 /* ISO 15765-2 Transport Protocol */
+#define CAN_NPROTO 7
+
+#define SOL_CAN_BASE 100
+
+/**
+ * struct sockaddr_can - the sockaddr structure for CAN sockets
+ * @can_family: address family number AF_CAN.
+ * @can_ifindex: CAN network interface index.
+ * @can_addr: protocol specific address information
+ *
+ * The only member of the @can_addr union defined so far is @tp, a pair
+ * of CAN identifiers (rx_id, tx_id) for transport protocols.
+ */
+struct sockaddr_can {
+ __kernel_sa_family_t can_family;
+ int can_ifindex;
+ union {
+ /* transport protocol class address information (e.g. ISOTP) */
+ struct { canid_t rx_id, tx_id; } tp;
+
+ /* reserved for future CAN protocols address information */
+ } can_addr;
+};
+
+/**
+ * struct can_filter - CAN ID based filter in can_register().
+ * @can_id: relevant bits of CAN ID which are not masked out.
+ * @can_mask: CAN mask (see description); only the bits set here take
+ *            part in the comparison.
+ *
+ * Description:
+ * A filter matches, when
+ *
+ * <received_can_id> & mask == can_id & mask
+ *
+ * The filter can be inverted (CAN_INV_FILTER bit set in can_id) or it can
+ * filter for error frames (CAN_ERR_FLAG bit set in mask).
+ */
+struct can_filter {
+ canid_t can_id;
+ canid_t can_mask;
+};
+
+#define CAN_INV_FILTER 0x20000000U /* to be set in can_filter.can_id */
+
+#endif /* CAN_H */
--
1.7.9.5
^ permalink raw reply related
* [PATCH iproute2 3/3] CAN Filter/Classifier -- Documentation
From: Rostislav Lisovy @ 2012-05-25 9:11 UTC (permalink / raw)
To: netdev; +Cc: linux-can, pisa, sojkam1, oliver, Rostislav Lisovy
In-Reply-To: <1337937106-7640-1-git-send-email-lisovy@gmail.com>
Added manpage describing usage of CAN Filter.
Signed-off-by: Rostislav Lisovy <lisovy@gmail.com>
---
man/man8/tc-can.8 | 97 +++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 97 insertions(+)
create mode 100644 man/man8/tc-can.8
diff --git a/man/man8/tc-can.8 b/man/man8/tc-can.8
new file mode 100644
index 0000000..54ee96a
--- /dev/null
+++ b/man/man8/tc-can.8
@@ -0,0 +1,97 @@
+.TH CAN 8 "8 May 2012" "iproute2" "Linux"
+.SH NAME
+CAN \- Controller Area Network classifier
+.SH SYNOPSIS
+.B tc filter ... dev
+DEV
+.B parent
+CLASSID
+.B [ prio
+PRIORITY
+.B ] [ protocol can ] [ handle
+HANDLE
+.B ] can [
+MATCHSPEC
+.B ] [ flowid
+FLOWID
+.B ]
+
+.B CLASSID := major:minor
+.br
+.B FLOWID := major:minor
+.br
+.B MATCHSPEC := { sffid
+FILTERID
+.B | effid
+FILTERID
+.B | MATCHSPEC ... }
+.br
+.B FILTERID := canid[:mask]
+
+.BR CLASSID ,
+.BR FLOWID ,
+.BR canid
+and
+.B mask
+are parsed as hexadecimal input.
+
+
+.SH DESCRIPTION
+The CAN classifier may be used with any available
+.B qdisc
+on Controller Area Network (CAN) frames passed through AF_CAN
+networking subsystem. The classifier classifies CAN frames according
+to their identifiers. It can be used on CAN frames with either SFF or
+EFF identifiers.
+
+It is possible to add CAN classifier to any qdisc configured on any networking
+device, however it will ignore non-CAN packets.
+
+
+.SH CLASSIFICATION
+The filtering rules for EFF frames are stored in an array, which is traversed
+during classification. This means that the worst-case time needed for
+classification of EFF frames increases with the number of configured rules.
+
+The filter implements an optimization for matching SFF frames using a bitmap
+with one bit for every ID. With this optimization, the classification time
+for SFF frames is nearly constant independently of the number of rules.
+
+.SH EXAMPLE
+This example shows how to set
+.B prio qdisc
+with
+.B CAN
+classifier.
+
+.nf
+tc qdisc add dev can0 root handle 1: prio
+
+tc filter add dev can0 parent 1:0 prio 1 handle 0xa \\
+ can sffid 0x7ff:0xf flowid 1:1
+tc filter add dev can0 parent 1:0 prio 2 handle 0xb \\
+ can sffid 0xC0:0x7ff effid 0x80:0x7ff flowid 1:2
+tc filter add dev can0 parent 1:0 prio 3 \\
+ can sffid 0x80:0x7ff flowid 1:2
+tc filter add dev can0 parent 1:0 prio 4 \\
+ can sffid 0x0:0x0 effid 0x0:0x0 flowid 1:3
+.fi
+
+
+.SH BUGS
+The maximum number of rules passed from
+.BR tc(8)
+utility to CAN classifier is fixed. The limit is set at compilation time
+(default is 128).
+
+
+.SH SEE ALSO
+.BR tc(8)
+
+
+.SH AUTHORS
+Michal Sojka <sojkam1@fel.cvut.cz>, Pavel Pisa <pisa@cmp.felk.cvut.cz>,
+Rostislav Lisovy <lisovy@gmail.cz>.
+
+This manpage is maintained by Rostislav Lisovy <lisovy@gmail.com>
+
--
1.7.9.5
^ permalink raw reply related
* [PATCH iproute2 2/3] CAN Filter/Classifier -- Source code
From: Rostislav Lisovy @ 2012-05-25 9:11 UTC (permalink / raw)
To: netdev; +Cc: linux-can, pisa, sojkam1, oliver, Rostislav Lisovy
In-Reply-To: <1337937106-7640-1-git-send-email-lisovy@gmail.com>
The CAN classifier may be used with any available qdisc on Controller
Area Network (CAN) frames passed through AF_CAN networking subsystem.
The classifier classifies CAN frames according to their identifiers.
It can be used on CAN frames with either SFF or EFF identifiers.
The filtering rules for EFF frames are stored in an array, which
is traversed during classification. A bitmap is used to store SFF
rules -- one bit for each ID.
More info about the project:
http://rtime.felk.cvut.cz/can/socketcan-qdisc-final.pdf
Signed-off-by: Rostislav Lisovy <lisovy@gmail.com>
---
include/linux/pkt_cls.h | 10 ++
tc/Makefile | 1 +
tc/f_can.c | 238 +++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 249 insertions(+)
create mode 100644 tc/f_can.c
diff --git a/include/linux/pkt_cls.h b/include/linux/pkt_cls.h
index defbde2..83f9241 100644
--- a/include/linux/pkt_cls.h
+++ b/include/linux/pkt_cls.h
@@ -375,6 +375,16 @@ enum {
#define TCA_BASIC_MAX (__TCA_BASIC_MAX - 1)
+/* CAN filter: attribute types the "can" filter places inside TCA_OPTIONS */
+
+enum {
+ TCA_CANFLTR_UNSPEC,
+ TCA_CANFLTR_CLASSID, /* 4 byte class ID (flowid) */
+ TCA_CANFLTR_RULES, /* array of struct can_filter */
+ __TCA_CANFLTR_MAX
+};
+
+#define TCA_CANFLTR_MAX (__TCA_CANFLTR_MAX - 1)
/* Cgroup classifier */
diff --git a/tc/Makefile b/tc/Makefile
index 64d93ad..1281568 100644
--- a/tc/Makefile
+++ b/tc/Makefile
@@ -22,6 +22,7 @@ TCMODULES += f_u32.o
TCMODULES += f_route.o
TCMODULES += f_fw.o
TCMODULES += f_basic.o
+TCMODULES += f_can.o
TCMODULES += f_flow.o
TCMODULES += f_cgroup.o
TCMODULES += q_dsmark.o
diff --git a/tc/f_can.c b/tc/f_can.c
new file mode 100644
index 0000000..208625f
--- /dev/null
+++ b/tc/f_can.c
@@ -0,0 +1,238 @@
+/*
+ * f_can.c Filter for CAN packets
+ *
+ * This program is free software; you can distribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Idea: Oliver Hartkopp <oliver.hartkopp@volkswagen.de>
+ * Copyright: (c) 2011 Czech Technical University in Prague
+ * (c) 2011 Volkswagen Group Research
+ * Authors: Michal Sojka <sojkam1@fel.cvut.cz>
+ * Pavel Pisa <pisa@cmp.felk.cvut.cz>
+ * Rostislav Lisovy <lisovy@gmail.cz>
+ * Funded by: Volkswagen Group Research
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+#include <linux/if.h>
+#include <limits.h>
+#include <inttypes.h>
+#include "utils.h"
+#include "tc_util.h"
+#include "linux/can.h"
+
+#define RULES_SIZE 128 /* Maximum number of rules sent via the
+ netlink message during creation/configuration */
+
+
+/*
+ * canfltr_explain() - Print the usage summary of the "can" filter to stderr.
+ *
+ * The closing note names the same placeholders the synopsis uses
+ * (FLOWID, CANID, MASK); all of them are read as hexadecimal numbers.
+ */
+static void canfltr_explain(void)
+{
+	fprintf(stderr,
+		"Usage: ... can [ MATCHSPEC ] [ flowid FLOWID ]\n"
+		"\n"
+		"Where: MATCHSPEC := { sffid FILTERID | effid FILTERID |\n"
+		" MATCHSPEC ... }\n"
+		" FILTERID := CANID[:MASK]\n"
+		"\n"
+		"NOTE: FLOWID, CANID and MASK are parsed as hexadecimal input.\n");
+}
+
+/*
+ * canfltr_parse_rule() - Parse one FILTERID ("CANID[:MASK]", hexadecimal).
+ *
+ * @arg:  command line word that followed "sffid" or "effid"
+ * @eff:  non-zero for an extended (29 bit) ID, zero for a standard
+ *        (11 bit) ID
+ * @rule: filled in on success
+ *
+ * Without an explicit mask the full SFF/EFF mask is used. EFF rules get
+ * CAN_EFF_FLAG set in both can_id and can_mask; SFF rules never carry it
+ * (RTR frames get no special handling because of the bitmap approach).
+ *
+ * Returns 0 on success, -1 (after printing a message to stderr) when the
+ * input is malformed or the ID does not fit the selected range.
+ */
+static int canfltr_parse_rule(const char *arg, int eff,
+			      struct can_filter *rule)
+{
+	const canid_t id_mask = eff ? CAN_EFF_MASK : CAN_SFF_MASK;
+	canid_t can_id;
+	canid_t can_mask;
+
+	if (sscanf(arg, "%"SCNx32 ":" "%"SCNx32, &can_id, &can_mask) != 2) {
+		if (sscanf(arg, "%"SCNx32, &can_id) != 1) {
+			fprintf(stderr, "Improperly formed CAN ID & mask '%s'\n",
+				arg);
+			return -1;
+		}
+		can_mask = id_mask;
+	}
+
+	if (can_id & ~id_mask) {
+		fprintf(stderr, "ID 0x%lx exceeded %s CAN ID range.\n",
+			(unsigned long)can_id, eff ? "extended" : "standard");
+		return -1;
+	}
+
+	rule->can_id = can_id;
+	rule->can_mask = can_mask & id_mask;
+	if (eff) {
+		rule->can_id |= CAN_EFF_FLAG;
+		rule->can_mask |= CAN_EFF_FLAG;
+	}
+
+	return 0;
+}
+
+/*
+ * canfltr_parse_opt() - Translate the "can" filter command line into
+ * netlink attributes appended to @n.
+ *
+ * @qu:     filter descriptor (unused)
+ * @handle: textual filter handle or NULL; has to be a complete number
+ * @argc:   number of remaining command line words
+ * @argv:   remaining command line words
+ * @n:      netlink message under construction
+ *
+ * All sffid/effid rules are collected first and emitted as one
+ * TCA_CANFLTR_RULES attribute (at most RULES_SIZE entries); classid/flowid
+ * is emitted as TCA_CANFLTR_CLASSID. Both live inside a TCA_OPTIONS nest.
+ *
+ * Returns 0 on success (also when there are no arguments at all), -1 on
+ * any parse error.
+ */
+static int canfltr_parse_opt(struct filter_util *qu, char *handle,
+			     int argc, char **argv, struct nlmsghdr *n)
+{
+	struct tcmsg *t = NLMSG_DATA(n);
+	struct rtattr *tail;
+	struct can_filter canfltr_rules[RULES_SIZE];
+	int rules_count = 0;
+	long h = 0;
+
+	if (!argc)
+		return 0;
+
+	if (handle) {
+		char *end;
+
+		/* strtol() alone only reports overflow; also insist that
+		   the whole string was consumed so "abc" is rejected */
+		h = strtol(handle, &end, 0);
+		if (end == handle || *end != '\0' ||
+		    h == LONG_MIN || h == LONG_MAX) {
+			fprintf(stderr, "Illegal handle \"%s\", must be numeric.\n",
+				handle);
+			return -1;
+		}
+	}
+
+	t->tcm_handle = h;
+
+	tail = NLMSG_TAIL(n);
+	addattr_l(n, MAX_MSG, TCA_OPTIONS, NULL, 0);
+
+	while (argc > 0) {
+		int is_sff = (matches(*argv, "sffid") == 0);
+
+		if (is_sff || matches(*argv, "effid") == 0) {
+			/* parse SFF/EFF CAN ID optionally with mask */
+			if (rules_count >= RULES_SIZE) {
+				fprintf(stderr, "Too many rules on input. "
+					"Maximum number of rules is: %d\n",
+					RULES_SIZE);
+				return -1;
+			}
+
+			NEXT_ARG();
+
+			if (canfltr_parse_rule(*argv, !is_sff,
+					       &canfltr_rules[rules_count]))
+				return -1;
+			rules_count++;
+
+		} else if (matches(*argv, "classid") == 0 ||
+			   strcmp(*argv, "flowid") == 0) {
+			/* named classid so it does not shadow @handle */
+			unsigned classid;
+
+			NEXT_ARG();
+			if (get_tc_classid(&classid, *argv)) {
+				fprintf(stderr, "Illegal \"classid\"\n");
+				return -1;
+			}
+			addattr_l(n, MAX_MSG, TCA_CANFLTR_CLASSID, &classid, 4);
+
+		} else if (strcmp(*argv, "help") == 0) {
+			canfltr_explain();
+			return -1;
+
+		} else {
+			fprintf(stderr, "What is \"%s\"?\n", *argv);
+			canfltr_explain();
+			return -1;
+		}
+		argc--; argv++;
+	}
+
+	addattr_l(n, MAX_MSG, TCA_CANFLTR_RULES, &canfltr_rules,
+		  sizeof(struct can_filter) * rules_count);
+
+	/* close the TCA_OPTIONS nest opened above */
+	tail->rta_len = (void *)NLMSG_TAIL(n) - (void *)tail;
+	return 0;
+}
+
+/*
+ * canfltr_print_opt() - Pretty-print one dumped "can" filter instance.
+ *
+ * @qu:     filter descriptor (unused)
+ * @f:      output stream
+ * @opt:    TCA_OPTIONS attribute of the dumped filter, may be NULL
+ * @handle: filter handle, printed when non-zero
+ *
+ * This is the userspace end of "tc filter show dev XY": the nested
+ * attributes are decoded and printed in the same syntax the parser
+ * accepts (flowid, sffid/effid with the mask omitted when it is the
+ * full SFF/EFF mask).
+ *
+ * Returns 0 on success, -1 if the rules attribute is too short to hold
+ * a single struct can_filter.
+ */
+static int canfltr_print_opt(struct filter_util *qu, FILE *f,
+			     struct rtattr *opt, __u32 handle)
+{
+	struct rtattr *tb[TCA_CANFLTR_MAX+1];
+	struct can_filter *canfltr_rules = NULL;
+	int rules_count = 0;
+	int i;
+
+	if (opt == NULL)
+		return 0;
+
+	parse_rtattr_nested(tb, TCA_CANFLTR_MAX, opt);
+
+	if (handle)
+		fprintf(f, "handle 0x%x ", handle);
+
+	/* tb[] is sized and filled for TCA_CANFLTR_*; index it with this
+	   filter's own constant, not with the one of the basic classifier */
+	if (tb[TCA_CANFLTR_CLASSID]) {
+		SPRINT_BUF(b1); /* allocates buffer b1 */
+		fprintf(f, "flowid %s ",
+			sprint_tc_classid(*(__u32 *)RTA_DATA(tb[TCA_CANFLTR_CLASSID]), b1));
+	}
+
+	if (tb[TCA_CANFLTR_RULES]) {
+		if (RTA_PAYLOAD(tb[TCA_CANFLTR_RULES]) < sizeof(struct can_filter))
+			return -1;
+
+		canfltr_rules = RTA_DATA(tb[TCA_CANFLTR_RULES]);
+		rules_count = (RTA_PAYLOAD(tb[TCA_CANFLTR_RULES]) /
+			sizeof(struct can_filter));
+
+		for (i = 0; i < rules_count; i++) {
+			struct can_filter *pcfltr = &canfltr_rules[i];
+
+			if (pcfltr->can_id & CAN_EFF_FLAG) {
+				if (pcfltr->can_mask == (CAN_EFF_FLAG|CAN_EFF_MASK))
+					fprintf(f, "effid 0x%"PRIX32" ",
+						pcfltr->can_id & CAN_EFF_MASK);
+				else
+					fprintf(f, "effid 0x%"PRIX32":0x%"PRIX32" ",
+						pcfltr->can_id & CAN_EFF_MASK,
+						pcfltr->can_mask & CAN_EFF_MASK);
+			} else {
+				if (pcfltr->can_mask == CAN_SFF_MASK)
+					fprintf(f, "sffid 0x%"PRIX32" ",
+						pcfltr->can_id);
+				else
+					fprintf(f, "sffid 0x%"PRIX32":0x%"PRIX32" ",
+						pcfltr->can_id,
+						pcfltr->can_mask);
+			}
+		}
+	}
+
+	return 0;
+}
+
+/* Descriptor of the "can" filter kind: its name plus the option parser
+ * and the option printer implemented in this file. */
+struct filter_util can_filter_util = {
+ .id = "can",
+ .parse_fopt = canfltr_parse_opt,
+ .print_fopt = canfltr_print_opt,
+};
+
--
1.7.9.5
^ permalink raw reply related
* [PATCH 1/2] can: Added constants containing length of CAN identifiers
From: Rostislav Lisovy @ 2012-05-25 9:12 UTC (permalink / raw)
To: netdev; +Cc: linux-can, pisa, sojkam1, oliver, Rostislav Lisovy
The necessary information might be determined out of the CAN_*_MASK,
however it is undesirable to misuse masks for such purpose.
Signed-off-by: Rostislav Lisovy <lisovy@gmail.com>
---
include/linux/can.h | 3 +++
1 file changed, 3 insertions(+)
diff --git a/include/linux/can.h b/include/linux/can.h
index 9a19bcb..08d1610 100644
--- a/include/linux/can.h
+++ b/include/linux/can.h
@@ -38,6 +38,9 @@
*/
typedef __u32 canid_t;
+#define CAN_SFF_ID_BITS 11
+#define CAN_EFF_ID_BITS 29
+
/*
* Controller Area Network Error Frame Mask structure
*
--
1.7.9.5
^ permalink raw reply related
* [PATCH 2/2] net/sched: CAN Filter/Classifier
From: Rostislav Lisovy @ 2012-05-25 9:12 UTC (permalink / raw)
To: netdev; +Cc: linux-can, pisa, sojkam1, oliver, Rostislav Lisovy
In-Reply-To: <1337937157-7680-1-git-send-email-lisovy@gmail.com>
The CAN classifier may be used with any available qdisc on Controller
Area Network (CAN) frames passed through AF_CAN networking subsystem.
The classifier classifies CAN frames according to their identifiers.
It can be used on CAN frames with either SFF or EFF identifiers.
The filtering rules for EFF frames are stored in an array, which
is traversed during classification. A bitmap is used to store SFF
rules -- one bit for each ID.
More info about the project:
http://rtime.felk.cvut.cz/can/socketcan-qdisc-final.pdf
Signed-off-by: Rostislav Lisovy <lisovy@gmail.com>
---
net/sched/Kconfig | 10 +
net/sched/Makefile | 1 +
net/sched/cls_can.c | 571 +++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 582 insertions(+)
create mode 100644 net/sched/cls_can.c
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index e7a8976..aeb3c29 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -323,6 +323,16 @@ config NET_CLS_BASIC
To compile this code as a module, choose M here: the
module will be called cls_basic.
+config NET_CLS_CAN
+ tristate "Controller Area Network classifier (CAN)"
+ select NET_CLS
+ ---help---
+ Say Y here if you want to be able to classify CAN frames according
+ to their CAN identifiers (can_id).
+
+ To compile this code as a module, choose M here: the
+ module will be called cls_can.
+
config NET_CLS_TCINDEX
tristate "Traffic-Control Index (TCINDEX)"
select NET_CLS
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 5940a19..0217341 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -47,6 +47,7 @@ obj-$(CONFIG_NET_CLS_RSVP) += cls_rsvp.o
obj-$(CONFIG_NET_CLS_TCINDEX) += cls_tcindex.o
obj-$(CONFIG_NET_CLS_RSVP6) += cls_rsvp6.o
obj-$(CONFIG_NET_CLS_BASIC) += cls_basic.o
+obj-$(CONFIG_NET_CLS_CAN) += cls_can.o
obj-$(CONFIG_NET_CLS_FLOW) += cls_flow.o
obj-$(CONFIG_NET_CLS_CGROUP) += cls_cgroup.o
obj-$(CONFIG_NET_EMATCH) += ematch.o
diff --git a/net/sched/cls_can.c b/net/sched/cls_can.c
new file mode 100644
index 0000000..111668e
--- /dev/null
+++ b/net/sched/cls_can.c
@@ -0,0 +1,571 @@
+/*
+ * cls_can.c -- Controller Area Network classifier.
+ * Makes decisions according to Controller Area Network identifiers (can_id).
+ *
+ * This program is free software; you can distribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2 of
+ * the License.
+ *
+ * Idea: Oliver Hartkopp <oliver.hartkopp@volkswagen.de>
+ * Copyright: (c) 2011 Czech Technical University in Prague
+ * (c) 2011 Volkswagen Group Research
+ * Authors: Michal Sojka <sojkam1@fel.cvut.cz>
+ * Pavel Pisa <pisa@cmp.felk.cvut.cz>
+ * Rostislav Lisovy <lisovy@gmail.cz>
+ * Funded by: Volkswagen Group Research
+ *
+ * Some function descriptions are heavily inspired by article "Linux Network
+ * Traffic Control -- Implementation Overview" by Werner Almesberger
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/act_api.h>
+#include <net/pkt_cls.h>
+#include <linux/bitmap.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/can.h>
+
+/* Definition of Netlink message parts (attributes nested in TCA_OPTIONS) */
+enum {
+ TCA_CANFLTR_UNSPEC,
+ TCA_CANFLTR_CLASSID, /* u32 class ID the filter instance is bound to */
+ TCA_CANFLTR_RULES, /* Array of can_filter structs; the number of
+ entries is derived from the attribute length
+ on reception */
+ __TCA_CANFLTR_MAX
+};
+#define TCA_CANFLTR_MAX (__TCA_CANFLTR_MAX - 1)
+
+/*
+ * Validation policy for the attributes nested in TCA_OPTIONS.
+ *
+ * TCA_CANFLTR_CLASSID is a plain u32. TCA_CANFLTR_RULES carries a flat
+ * array of struct can_filter (canfltr_set_parms() reads it via
+ * nla_data()/nla_len()), not a container of further attributes, so it is
+ * declared as NLA_BINARY rather than NLA_NESTED.
+ */
+static const struct nla_policy canfltr_policy[TCA_CANFLTR_MAX + 1] = {
+	[TCA_CANFLTR_CLASSID]	= { .type = NLA_U32 },
+	[TCA_CANFLTR_RULES]	= { .type = NLA_BINARY },
+};
+
+/*
+ * Rule set of one filter instance. canfltr_set_parms() builds a fresh one
+ * for every configuration request and swaps it into canfltr_state.rules.
+ */
+struct canfltr_rules {
+ struct can_filter *rules_raw; /* Raw rules copied from the netlink
+ message, EFF rules first, SFF rules after
+ them; dumped back to userspace (when
+ 'tc filter show' is invoked) AND walked
+ when matching EFF frames */
+ DECLARE_BITMAP(match_sff, (1 << CAN_SFF_ID_BITS)); /* For each SFF CAN
+ ID (11 bit) there is one bit in this
+ bitmap; set means "matches a rule" */
+ int rules_count; /* Total number of entries in rules_raw */
+ int eff_rules_count; /* Number of leading EFF entries in rules_raw */
+ int sff_rules_count; /* Number of SFF entries following them */
+
+ struct rcu_head rcu; /* For the deferred free through call_rcu() */
+};
+
+/* Per-tcf_proto root object, stored in tp->root by canfltr_init() */
+struct canfltr_head {
+ u32 hgenerator; /* Last handle tried by canfltr_gen_handle() */
+ struct list_head flist; /* List of canfltr_state, linked via ->link */
+};
+
+/* One filter instance (one "tc filter add ... can ..." element) */
+struct canfltr_state {
+ u32 handle; /* Handle given by userspace or generated */
+ struct canfltr_rules *rules; /* All rules necessary for
+ classification */
+ struct tcf_result res; /* Class ID (flow id) the instance
+ of a filter is bound to */
+ struct list_head link; /* Membership in canfltr_head.flist */
+};
+
+/*
+ * ----------------------------------------------------------------------------
+ */
+
+/*
+ * canfltr_sff_match_add() - Record one SFF rule in the match bitmap.
+ *
+ * Every 11 bit identifier accepted by the (can_id, can_mask) pair gets
+ * its bit set in rls->match_sff. Bits set by earlier rules stay set.
+ */
+static void canfltr_sff_match_add(struct canfltr_rules *rls,
+				  u32 can_id, u32 can_mask)
+{
+	int id;
+
+	/* Clamp both values to the SFF range so that no bit beyond the
+	   end of the bitmap can ever be written */
+	can_mask &= CAN_SFF_MASK;
+	can_id &= can_mask;
+
+	if (can_mask == CAN_SFF_MASK) {
+		/* full mask: exactly one identifier matches */
+		set_bit(can_id, rls->match_sff);
+	} else if (can_mask == 0) {
+		/* empty mask: every identifier matches */
+		bitmap_fill(rls->match_sff, (1 << CAN_SFF_ID_BITS));
+	} else {
+		/* partial mask: test each possible identifier in turn */
+		for (id = 0; id < (1 << CAN_SFF_ID_BITS); id++)
+			if ((id & can_mask) == can_id)
+				set_bit(id, rls->match_sff);
+	}
+}
+
+/**
+ * canfltr_get_id() - Read the CAN ID (including flag bits) of a frame.
+ *
+ * The data area of the socket buffer is interpreted as a struct can_frame.
+ */
+static canid_t canfltr_get_id(struct sk_buff *skb)
+{
+	return ((struct can_frame *)skb->data)->can_id;
+}
+
+/**
+ * canfltr_classify() - Performs the classification.
+ *
+ * @skb: Socket buffer holding the CAN frame
+ * @tp: Filter the frame is run through; tp->root is the canfltr_head
+ * @res: Is used for setting Class ID as a result of classification
+ *
+ * Iterates over all instances of filter, checking for CAN ID match.
+ * EFF frames are compared against the EFF rules stored at the start of
+ * rules_raw, SFF frames are looked up in the match_sff bitmap.
+ *
+ * The frame format is determined once, before the loop. Masking the ID
+ * inside the loop would strip CAN_EFF_FLAG during the first iteration
+ * and make every following filter instance treat an EFF frame as SFF.
+ *
+ * Returns value relevant for policing. Used return values:
+ * TC_POLICE_OK if successfully classified (without regard to policing rules)
+ * TC_POLICE_UNSPEC if no matching rule was found
+ */
+static int canfltr_classify(struct sk_buff *skb, const struct tcf_proto *tp,
+			    struct tcf_result *res)
+{
+	struct canfltr_head *head = (struct canfltr_head *)tp->root;
+	struct canfltr_state *f;
+	struct canfltr_rules *r;
+	canid_t can_id;
+	bool is_eff;
+	int i;
+
+	can_id = canfltr_get_id(skb);
+	is_eff = (can_id & CAN_EFF_FLAG) != 0;
+
+	/* Drop the flag bits, keep only the identifier itself */
+	can_id &= is_eff ? CAN_EFF_MASK : CAN_SFF_MASK;
+
+	rcu_read_lock();
+	list_for_each_entry(f, &head->flist, link) {
+		bool match = false;
+
+		r = rcu_dereference(f->rules);
+
+		if (is_eff) {
+			for (i = 0; i < r->eff_rules_count; i++) {
+				if (!(((r->rules_raw[i].can_id ^ can_id) &
+				       r->rules_raw[i].can_mask) & CAN_EFF_MASK)) {
+					match = true;
+					break;
+				}
+			}
+		} else { /* SFF */
+			match = test_bit(can_id, r->match_sff);
+		}
+
+		if (match) {
+			*res = f->res;
+			rcu_read_unlock();
+			return TC_POLICE_OK;
+		}
+	}
+
+	rcu_read_unlock();
+	return TC_POLICE_UNSPEC;
+}
+
+/**
+ * canfltr_get() - Find the filter element carrying a given handle.
+ *
+ * Returns the element's address cast to unsigned long (the "internal
+ * filter ID"), or 0 when the filter has no root yet or no element with
+ * that handle exists.
+ */
+static unsigned long canfltr_get(struct tcf_proto *tp, u32 handle)
+{
+	struct canfltr_head *head = (struct canfltr_head *)tp->root;
+	struct canfltr_state *entry;
+	unsigned long found = 0UL;
+
+	if (head != NULL) {
+		list_for_each_entry(entry, &head->flist, link) {
+			if (entry->handle == handle) {
+				found = (unsigned long) entry;
+				break;
+			}
+		}
+	}
+
+	return found;
+}
+
+/**
+ * canfltr_put() - Is invoked when a filter element previously referenced
+ * with get() is no longer used
+ *
+ * Intentionally empty: canfltr_get() only returns a pointer and takes no
+ * reference, so there is nothing to release here.
+ */
+static void canfltr_put(struct tcf_proto *tp, unsigned long f)
+{
+}
+
+/**
+ * canfltr_gen_handle() - Pick an unused handle for a new filter element
+ *
+ * Advances head->hgenerator (wrapping to 1 once it reaches 0x7FFFFFFF)
+ * until canfltr_get() no longer finds an element with that handle or
+ * the attempt budget of 0x80000000 tries is used up.
+ *
+ * Returns the free handle, or 0 when the budget ran out.
+ *
+ * This code is heavily inspired by handle generator in cls_basic.c
+ */
+static unsigned int canfltr_gen_handle(struct tcf_proto *tp)
+{
+	struct canfltr_head *head = (struct canfltr_head *)tp->root;
+	unsigned int tries_left = 0x80000000;
+
+	do {
+		head->hgenerator++;
+		if (head->hgenerator == 0x7FFFFFFF)
+			head->hgenerator = 1;
+	} while (--tries_left > 0 && canfltr_get(tp, head->hgenerator));
+
+	return tries_left ? head->hgenerator : 0;
+}
+
+/*
+ * canfltr_rules_free_rcu() - RCU callback releasing a replaced rule set.
+ *
+ * Queued by canfltr_set_parms() for the rule set it swapped out. The raw
+ * rule array is a separate allocation owned by the rule set, so it has
+ * to be freed along with it; otherwise every change of an existing
+ * filter leaks the old array.
+ */
+static void canfltr_rules_free_rcu(struct rcu_head *rcu)
+{
+	struct canfltr_rules *rules =
+		container_of(rcu, struct canfltr_rules, rcu);
+
+	kfree(rules->rules_raw);
+	kfree(rules);
+}
+
+/* True when a rule received from userspace is an EFF rule, i.e. carries
+ * CAN_EFF_FLAG in both its ID and its mask. */
+static inline bool canfltr_rule_is_eff(const struct can_filter *rule)
+{
+	return (rule->can_id & CAN_EFF_FLAG) &&
+	       (rule->can_mask & CAN_EFF_FLAG);
+}
+
+/*
+ * canfltr_set_parms() - Build a rule set from the parsed attributes and
+ * install it in filter instance @f.
+ *
+ * A class ID attribute is mandatory (-EINVAL otherwise). The rules are
+ * copied into rules_raw as two contiguous groups, EFF rules first and
+ * SFF rules after them; each SFF rule is additionally expanded into the
+ * match_sff bitmap. A previously installed rule set is handed to
+ * call_rcu() for release.
+ *
+ * Returns 0 on success, -ENOBUFS/-ENOMEM on allocation failure.
+ */
+static int canfltr_set_parms(struct tcf_proto *tp, struct canfltr_state *f,
+			     unsigned long base, struct nlattr **tb,
+			     struct nlattr *est)
+{
+	struct canfltr_rules *new_rules;
+	struct canfltr_rules *old_rules;
+	struct can_filter *src;
+	struct can_filter *dst;
+	int idx;
+
+	new_rules = kzalloc(sizeof(*new_rules), GFP_KERNEL);
+	if (new_rules == NULL)
+		return -ENOBUFS;
+
+	if (tb[TCA_CANFLTR_CLASSID] == NULL) {
+		kfree(new_rules);
+		return -EINVAL;
+	}
+
+	if (tb[TCA_CANFLTR_RULES] != NULL) {
+		src = nla_data(tb[TCA_CANFLTR_RULES]);
+		new_rules->rules_count = (nla_len(tb[TCA_CANFLTR_RULES]) /
+			sizeof(struct can_filter));
+
+		/* counters start at zero thanks to kzalloc() above */
+		new_rules->rules_raw = kzalloc(sizeof(struct can_filter) *
+			new_rules->rules_count, GFP_KERNEL);
+		if (new_rules->rules_raw == NULL) {
+			kfree(new_rules);
+			return -ENOMEM;
+		}
+		dst = new_rules->rules_raw;
+
+		/* First pass: EFF rules go to the front of rules_raw */
+		for (idx = 0; idx < new_rules->rules_count; idx++) {
+			if (!canfltr_rule_is_eff(&src[idx]))
+				continue;
+
+			dst[new_rules->eff_rules_count] = src[idx];
+			new_rules->eff_rules_count++;
+		}
+
+		/* Second pass: SFF rules follow and feed the bitmap */
+		for (idx = 0; idx < new_rules->rules_count; idx++) {
+			if (canfltr_rule_is_eff(&src[idx]))
+				continue;
+
+			dst[new_rules->eff_rules_count +
+			    new_rules->sff_rules_count] = src[idx];
+			new_rules->sff_rules_count++;
+			canfltr_sff_match_add(new_rules, src[idx].can_id,
+					      src[idx].can_mask);
+		}
+	}
+
+	/* Newly created filter: nothing to replace */
+	if (f->rules == NULL) {
+		rcu_assign_pointer(f->rules, new_rules);
+		return 0;
+	}
+
+	/* Existing filter: swap and release the old set later */
+	old_rules = xchg(&f->rules, new_rules);
+	call_rcu(&old_rules->rcu, canfltr_rules_free_rcu);
+
+	return 0;
+}
+
+/**
+ * canfltr_change() - Called for changing properties of an existing filter or
+ * after addition of a new filter to a class (by calling bind_tcf which binds
+ * an instance of a filter to the class).
+ *
+ * @tp: Structure representing instance of a filter.
+ * Part of a linked list of all filters.
+ * @base: Passed through to tcf_bind_filter() when binding to the class.
+ * @handle: Handle requested by userspace; 0 asks for a generated one.
+ * @tca: Messages passed through the Netlink from userspace.
+ * @arg: In: existing filter element (as returned by canfltr_get()) or 0.
+ * Out: set to the newly created element when one was created.
+ *
+ * Returns 0 on success or a negative errno. When creating an element
+ * fails, the class binding made for it is undone before it is freed.
+ */
+static int canfltr_change(struct tcf_proto *tp, unsigned long base, u32 handle,
+			  struct nlattr **tca, unsigned long *arg)
+{
+	struct canfltr_head *head = (struct canfltr_head *)tp->root;
+	struct canfltr_state *f = (struct canfltr_state *)*arg;
+	struct nlattr *tb[TCA_CANFLTR_MAX + 1];
+	int err;
+
+	if (tca[TCA_OPTIONS] == NULL)
+		return -EINVAL;
+
+	/* Parses a stream of attributes and stores a pointer to each
+	   attribute in the tb array accessible via the attribute type. */
+	err = nla_parse_nested(tb, TCA_CANFLTR_MAX, tca[TCA_OPTIONS],
+			       canfltr_policy);
+	if (err < 0)
+		return err;
+
+	/* Change existing filter (remove all settings and add
+	   them thereafter as if filter was newly created) */
+	if (f != NULL) {
+		if (handle && f->handle != handle)
+			return -EINVAL;
+
+		return canfltr_set_parms(tp, f, base, tb, tca[TCA_RATE]);
+	}
+
+	/* Create new filter */
+	err = -ENOBUFS;
+	f = kzalloc(sizeof(*f), GFP_KERNEL);
+	if (f == NULL)
+		goto errout;
+
+	/* tb[] was parsed against TCA_CANFLTR_*; read it with this
+	   classifier's own constant, not with the one of cls_u32 */
+	if (tb[TCA_CANFLTR_CLASSID]) {
+		f->res.classid = nla_get_u32(tb[TCA_CANFLTR_CLASSID]);
+		tcf_bind_filter(tp, &f->res, base);
+	}
+
+	err = -EINVAL;
+	if (handle) /* handle passed from userspace */
+		f->handle = handle;
+	else {
+		f->handle = canfltr_gen_handle(tp);
+		if (f->handle == 0)
+			goto errout;
+	}
+
+	/* Configure filter */
+	err = canfltr_set_parms(tp, f, base, tb, tca[TCA_RATE]);
+	if (err < 0)
+		goto errout;
+
+	/* Add newly created filter to list of all filters */
+	tcf_tree_lock(tp);
+	list_add(&f->link, &head->flist);
+	tcf_tree_unlock(tp);
+	*arg = (unsigned long) f;
+
+	return 0;
+
+errout:
+	/* Only the create path gets here, so f (if allocated) is ours */
+	if (f) {
+		tcf_unbind_filter(tp, &f->res);
+		kfree(f);
+	}
+
+	return err;
+}
+
+
+/*
+ * canfltr_delete_filter() - Release one filter instance.
+ *
+ * Unbinds the instance from its class, waits for outstanding RCU
+ * callbacks and then frees the raw rule array, the rule set and the
+ * instance itself. Both callers unlink the instance from the list first.
+ */
+static void canfltr_delete_filter(struct tcf_proto *tp,
+				  struct canfltr_state *f)
+{
+	struct canfltr_rules *rules;
+
+	tcf_unbind_filter(tp, &f->res);
+
+	rcu_barrier();
+
+	rules = f->rules;
+	kfree(rules->rules_raw);
+	kfree(rules);
+	kfree(f);
+}
+
+/**
+ * canfltr_destroy() - Tear down the whole filter: every instance on the
+ * list is unlinked and released, then the list head itself is freed.
+ */
+static void canfltr_destroy(struct tcf_proto *tp)
+{
+	struct canfltr_head *head = tp->root;
+	struct canfltr_state *first;
+
+	while (!list_empty(&head->flist)) {
+		first = list_first_entry(&head->flist,
+					 struct canfltr_state, link);
+		list_del(&first->link);
+		canfltr_delete_filter(tp, first);
+	}
+
+	kfree(head);
+}
+
+/**
+ * canfltr_delete() - Delete one instance of a filter.
+ *
+ * @arg is the instance as returned by canfltr_get(). It is only released
+ * when it is actually found on this filter's list.
+ *
+ * Returns 0 on success, -ENOENT when the instance is not on the list.
+ */
+static int canfltr_delete(struct tcf_proto *tp, unsigned long arg)
+{
+	struct canfltr_head *head = (struct canfltr_head *)tp->root;
+	struct canfltr_state *victim = (struct canfltr_state *)arg;
+	struct canfltr_state *pos;
+
+	rcu_barrier(); /* Wait for completion of call_rcu()'s */
+
+	list_for_each_entry(pos, &head->flist, link) {
+		if (pos != victim)
+			continue;
+
+		tcf_tree_lock(tp);
+		list_del(&pos->link);
+		tcf_tree_unlock(tp);
+		canfltr_delete_filter(tp, pos);
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+
+/**
+ * canfltr_init() - Initialize filter
+ *
+ * Accepts filters attached for ETH_P_CAN or ETH_P_ALL; in the latter
+ * case tp->protocol is narrowed to ETH_P_CAN so that only CAN frames
+ * reach the classifier. Allocates the list head and stores it in
+ * tp->root.
+ *
+ * Returns 0 on success, -EPROTONOSUPPORT for any other protocol (a bare
+ * -1 would be reported to userspace as -EPERM) and -ENOBUFS when the
+ * head cannot be allocated.
+ */
+static int canfltr_init(struct tcf_proto *tp)
+{
+	struct canfltr_head *head;
+
+	if ((tp->protocol != htons(ETH_P_ALL)) &&
+	    (tp->protocol != htons(ETH_P_CAN)))
+		return -EPROTONOSUPPORT;
+
+	/* Work only on CAN frames */
+	if (tp->protocol == htons(ETH_P_ALL))
+		tp->protocol = htons(ETH_P_CAN);
+
+	head = kzalloc(sizeof(*head), GFP_KERNEL);
+	if (head == NULL)
+		return -ENOBUFS;
+
+	INIT_LIST_HEAD(&head->flist);
+	tp->root = head;
+
+	return 0;
+}
+
+/**
+ * canfltr_walk() - Visit every filter element and hand it to arg->fn.
+ * This is used to obtain diagnostic data.
+ *
+ * Elements below arg->skip are only counted. A negative result of the
+ * callback sets arg->stop and ends the walk without counting that
+ * element.
+ */
+static void canfltr_walk(struct tcf_proto *tp, struct tcf_walker *arg)
+{
+	struct canfltr_head *head = (struct canfltr_head *) tp->root;
+	struct canfltr_state *f;
+
+	list_for_each_entry(f, &head->flist, link) {
+		if (arg->count >= arg->skip &&
+		    arg->fn(tp, (unsigned long) f, arg) < 0) {
+			arg->stop = 1;
+			break;
+		}
+
+		arg->count++;
+	}
+}
+
+/**
+ * canfltr_dump() - Returns diagnostic data for a filter or one of its elements.
+ * @tp: classifier the filter belongs to (not referenced by the body)
+ * @fh: filter handle; a struct canfltr_state pointer cast to unsigned long,
+ *      or 0 when there is no filter instance to describe
+ * @skb: message buffer the attributes are appended to
+ * @t: tc message header; its tcm_handle is set to the filter's handle
+ *
+ * Emits a TCA_OPTIONS nest holding TCA_CANFLTR_CLASSID (only when a classid
+ * is set) and TCA_CANFLTR_RULES, whose payload is rules_count *
+ * sizeof(struct can_filter) bytes taken from rules_raw. The rules pointer is
+ * read inside an rcu_read_lock() section.
+ *
+ * Return: skb->len on success or when @fh is 0; -1 after cancelling the
+ * nest when the message could not be completed.
+ */
+static int canfltr_dump(struct tcf_proto *tp, unsigned long fh,
+ struct sk_buff *skb, struct tcmsg *t)
+{
+ struct canfltr_state *f = (struct canfltr_state *) fh;
+ struct nlattr *nest;
+ struct canfltr_rules *r;
+
+ if (f == NULL)
+ return skb->len; /* nothing is added to skb in this case */
+
+ rcu_read_lock();
+ r = rcu_dereference(f->rules);
+ t->tcm_handle = f->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (nest == NULL)
+ goto nla_put_failure; /* NOTE(review): reaches nla_nest_cancel() with nest == NULL -- confirm that is tolerated */
+
+ if (f->res.classid)
+ NLA_PUT_U32(skb, TCA_CANFLTR_CLASSID, f->res.classid);
+ /* NOTE(review): the NLA_PUT*() macros presumably jump to nla_put_failure
+  * when the attribute does not fit -- confirm against their definition. */
+ NLA_PUT(skb, TCA_CANFLTR_RULES, r->rules_count *
+ sizeof(struct can_filter), r->rules_raw);
+
+
+ nla_nest_end(skb, nest);
+
+ rcu_read_unlock();
+ return skb->len;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ rcu_read_unlock(); /* the failure path leaves the RCU read section too */
+ return -1;
+}
+
+
+static struct tcf_proto_ops cls_canfltr_ops __read_mostly = { /* registered in init_canfltr(), unregistered in exit_canfltr() */
+ .kind = "can",
+ .classify = canfltr_classify, /* classify/get/put/change are defined outside this excerpt */
+ .init = canfltr_init,
+ .destroy = canfltr_destroy,
+ .get = canfltr_get,
+ .put = canfltr_put,
+ .change = canfltr_change,
+ .delete = canfltr_delete,
+ .walk = canfltr_walk,
+ .dump = canfltr_dump,
+ .owner = THIS_MODULE,
+};
+
+static int __init init_canfltr(void) /* wired up through module_init() below */
+{
+ pr_debug("canfltr: CAN filter loaded\n");
+ return register_tcf_proto_ops(&cls_canfltr_ops); /* registration result is returned as-is */
+}
+
+static void __exit exit_canfltr(void) /* wired up through module_exit() below */
+{
+ pr_debug("canfltr: CAN filter removed\n");
+ unregister_tcf_proto_ops(&cls_canfltr_ops); /* counterpart of the registration in init_canfltr() */
+}
+
+/* Module entry/exit hooks and metadata. */
+module_init(init_canfltr);
+module_exit(exit_canfltr);
+MODULE_LICENSE("GPL");
+/* Contact address as used in the Signed-off-by line of this series. */
+MODULE_AUTHOR("Rostislav Lisovy <lisovy@gmail.com>");
+MODULE_DESCRIPTION("Controller Area Network classifier");
--
1.7.9.5
^ permalink raw reply related
* Re: [PATCH 1/2] can: Added constants containing length of CAN identifiers
From: David Miller @ 2012-05-25 9:22 UTC (permalink / raw)
To: lisovy; +Cc: netdev, linux-can, pisa, sojkam1, oliver
In-Reply-To: <1337937157-7680-1-git-send-email-lisovy@gmail.com>
It is not appropriate to submit new features at this time,
as I described in detail in:
http://marc.info/?l=netfilter-devel&m=133763475402372&w=2
I used a subject line with BIG CAPITAL LETTERS in that posting so
there is really no reason you should have overlooked it.
^ permalink raw reply
* [PATCH v7 0/2] fixes for sock memcg static branch disablement
From: Glauber Costa @ 2012-05-25 9:32 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-mm, cgroups, devel, kamezawa.hiroyu, netdev, Tejun Heo,
Li Zefan, David Miller
Hi Andrew,
I believe this one addresses all of your previous comments.
Besides merging your patch, I tried to improve the comments so they would
be more informative.
The first patch, I believe, is already merged at your tree. But I am including
it here for completeness. I had no changes since last submission, so feel free
to pick the second - or if there are still missing changes you'd like to see,
point me to them.
Thanks
Glauber Costa (2):
Always free struct memcg through schedule_work()
decrement static keys on real destroy time
include/net/sock.h | 22 ++++++++++++++++++
mm/memcontrol.c | 55 ++++++++++++++++++++++++++++++++++----------
net/ipv4/tcp_memcontrol.c | 34 ++++++++++++++++++++++-----
3 files changed, 91 insertions(+), 20 deletions(-)
--
1.7.7.6
^ permalink raw reply
* [PATCH v7 1/2] Always free struct memcg through schedule_work()
From: Glauber Costa @ 2012-05-25 9:32 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-mm, cgroups, devel, kamezawa.hiroyu, netdev, Tejun Heo,
Li Zefan, David Miller, Glauber Costa, Johannes Weiner,
Michal Hocko
In-Reply-To: <1337938328-11537-1-git-send-email-glommer@parallels.com>
Right now we free struct memcg with kfree right after a
rcu grace period, but defer it if we need to use vfree() to get
rid of that memory area. We do that by need, because we need vfree
to be called in a process context.
This patch unifies this behavior, by ensuring that even kfree will
happen in a separate thread. The goal is to have a stable place to
call the upcoming jump label destruction function outside the realm
of the complicated and quite far-reaching cgroup lock (that can't be
held when calling neither the cpu_hotplug.lock nor the jump_label_mutex)
Signed-off-by: Glauber Costa <glommer@parallels.com>
Acked-by: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
CC: Tejun Heo <tj@kernel.org>
CC: Li Zefan <lizefan@huawei.com>
CC: Johannes Weiner <hannes@cmpxchg.org>
CC: Michal Hocko <mhocko@suse.cz>
CC: Andrew Morton <akpm@linux-foundation.org>
---
mm/memcontrol.c | 24 +++++++++++++-----------
1 files changed, 13 insertions(+), 11 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 932a734..0b4b4c8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -245,8 +245,8 @@ struct mem_cgroup {
*/
struct rcu_head rcu_freeing;
/*
- * But when using vfree(), that cannot be done at
- * interrupt time, so we must then queue the work.
+ * We also need some space for a worker in deferred freeing.
+ * By the time we call it, rcu_freeing is no longer in use.
*/
struct work_struct work_freeing;
};
@@ -4826,23 +4826,28 @@ out_free:
}
/*
- * Helpers for freeing a vzalloc()ed mem_cgroup by RCU,
+ * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
* but in process context. The work_freeing structure is overlaid
* on the rcu_freeing structure, which itself is overlaid on memsw.
*/
-static void vfree_work(struct work_struct *work)
+static void free_work(struct work_struct *work)
{
struct mem_cgroup *memcg;
+ int size = sizeof(struct mem_cgroup);
memcg = container_of(work, struct mem_cgroup, work_freeing);
- vfree(memcg);
+ if (size < PAGE_SIZE)
+ kfree(memcg);
+ else
+ vfree(memcg);
}
-static void vfree_rcu(struct rcu_head *rcu_head)
+
+static void free_rcu(struct rcu_head *rcu_head)
{
struct mem_cgroup *memcg;
memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
- INIT_WORK(&memcg->work_freeing, vfree_work);
+ INIT_WORK(&memcg->work_freeing, free_work);
schedule_work(&memcg->work_freeing);
}
@@ -4868,10 +4873,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
free_mem_cgroup_per_zone_info(memcg, node);
free_percpu(memcg->stat);
- if (sizeof(struct mem_cgroup) < PAGE_SIZE)
- kfree_rcu(memcg, rcu_freeing);
- else
- call_rcu(&memcg->rcu_freeing, vfree_rcu);
+ call_rcu(&memcg->rcu_freeing, free_rcu);
}
static void mem_cgroup_get(struct mem_cgroup *memcg)
--
1.7.7.6
^ permalink raw reply related
* [PATCH v7 2/2] decrement static keys on real destroy time
From: Glauber Costa @ 2012-05-25 9:32 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-mm, cgroups, devel, kamezawa.hiroyu, netdev, Tejun Heo,
Li Zefan, David Miller, Glauber Costa, Johannes Weiner,
Michal Hocko
In-Reply-To: <1337938328-11537-1-git-send-email-glommer@parallels.com>
We call the destroy function when a cgroup starts to be removed,
such as by a rmdir event.
However, because of our reference counters, some objects are still
inflight. Right now, we are decrementing the static_keys at destroy()
time, meaning that if we get rid of the last static_key reference,
some objects will still have charges, but the code to properly
uncharge them won't be run.
This becomes a problem especially if it is ever enabled again, because
new charges will then be added on top of the stale charges, making it
pretty much impossible to keep the accounting consistent.
We just need to be careful with the static branch activation:
since there is no particular preferred order of their activation,
we need to make sure that we only start using it after all
call sites are active. This is achieved by having a per-memcg
flag that is only updated after static_key_slow_inc() returns.
At this time, we are sure all sites are active.
This is made per-memcg, not global, for a reason:
it also has the effect of making socket accounting more
consistent. The first memcg to be limited will trigger static_key()
activation, therefore, accounting. But all the others will then be
accounted no matter what. After this patch, only limited memcgs
will have its sockets accounted.
[v2: changed a tcp limited flag for a generic proto limited flag ]
[v3: update the current active flag only after the static_key update ]
[v4: disarm_static_keys() inside free_work ]
[v5: got rid of tcp_limit_mutex, now in the static_key interface ]
[v6: changed active and activated to a flags field, as suggested by akpm ]
[v7: merged more comments from akpm ]
Signed-off-by: Glauber Costa <glommer@parallels.com>
CC: Tejun Heo <tj@kernel.org>
CC: Li Zefan <lizefan@huawei.com>
CC: Kamezawa Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
CC: Johannes Weiner <hannes@cmpxchg.org>
CC: Michal Hocko <mhocko@suse.cz>
CC: Andrew Morton <akpm@linux-foundation.org>
---
include/net/sock.h | 22 ++++++++++++++++++++++
mm/memcontrol.c | 31 +++++++++++++++++++++++++++++--
net/ipv4/tcp_memcontrol.c | 34 +++++++++++++++++++++++++++-------
3 files changed, 78 insertions(+), 9 deletions(-)
diff --git a/include/net/sock.h b/include/net/sock.h
index b3ebe6b..d6a8ae3 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -46,6 +46,7 @@
#include <linux/list_nulls.h>
#include <linux/timer.h>
#include <linux/cache.h>
+#include <linux/bitops.h>
#include <linux/lockdep.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h> /* struct sk_buff */
@@ -907,12 +908,23 @@ struct proto {
#endif
};
+/*
+ * Bits in struct cg_proto.flags
+ */
+enum cg_proto_flags {
+ /* Currently active and new sockets should be assigned to cgroups */
+ MEMCG_SOCK_ACTIVE,
+ /* It was ever activated; we must disarm static keys on destruction */
+ MEMCG_SOCK_ACTIVATED,
+};
+
struct cg_proto {
void (*enter_memory_pressure)(struct sock *sk);
struct res_counter *memory_allocated; /* Current allocated memory. */
struct percpu_counter *sockets_allocated; /* Current number of sockets. */
int *memory_pressure;
long *sysctl_mem;
+ unsigned long flags;
/*
* memcg field is used to find which memcg we belong directly
* Each memcg struct can hold more than one cg_proto, so container_of
@@ -928,6 +940,16 @@ struct cg_proto {
extern int proto_register(struct proto *prot, int alloc_slab);
extern void proto_unregister(struct proto *prot);
+static inline bool memcg_proto_active(struct cg_proto *cg_proto)
+{
+ return test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); /* set while new sockets should be assigned to cgroups */
+}
+
+static inline bool memcg_proto_activated(struct cg_proto *cg_proto)
+{
+ return test_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags); /* set once it was ever activated; static keys must then be disarmed on destruction */
+}
+
#ifdef SOCK_REFCNT_DEBUG
static inline void sk_refcnt_debug_inc(struct sock *sk)
{
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0b4b4c8..788be2e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -404,6 +404,7 @@ void sock_update_memcg(struct sock *sk)
{
if (mem_cgroup_sockets_enabled) {
struct mem_cgroup *memcg;
+ struct cg_proto *cg_proto;
BUG_ON(!sk->sk_prot->proto_cgroup);
@@ -423,9 +424,10 @@ void sock_update_memcg(struct sock *sk)
rcu_read_lock();
memcg = mem_cgroup_from_task(current);
- if (!mem_cgroup_is_root(memcg)) {
+ cg_proto = sk->sk_prot->proto_cgroup(memcg);
+ if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) {
mem_cgroup_get(memcg);
- sk->sk_cgrp = sk->sk_prot->proto_cgroup(memcg);
+ sk->sk_cgrp = cg_proto;
}
rcu_read_unlock();
}
@@ -454,6 +456,19 @@ EXPORT_SYMBOL(tcp_proto_cgroup);
#endif /* CONFIG_INET */
#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
+#if defined(CONFIG_INET) && defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM)
+static void disarm_sock_keys(struct mem_cgroup *memcg)
+{
+ if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
+ return; /* never activated: this memcg holds no static key reference */
+ static_key_slow_dec(&memcg_socket_limit_enabled); /* pairs with the static_key_slow_inc() done on first activation */
+}
+#else
+static void disarm_sock_keys(struct mem_cgroup *memcg)
+{
+} /* empty stub when CONFIG_INET or CONFIG_CGROUP_MEM_RES_CTLR_KMEM is not set */
+#endif
+
static void drain_all_stock_async(struct mem_cgroup *memcg);
static struct mem_cgroup_per_zone *
@@ -4836,6 +4851,18 @@ static void free_work(struct work_struct *work)
int size = sizeof(struct mem_cgroup);
memcg = container_of(work, struct mem_cgroup, work_freeing);
+ /*
+ * We need to make sure that (at least for now), the jump label
+ * destruction code runs outside of the cgroup lock. This is because
+ * get_online_cpus(), which is called from the static_branch update,
+ * can't be called inside the cgroup_lock. cpusets are the ones
+ * enforcing this dependency, so if they ever change, we might as well.
+ *
+ * schedule_work() will guarantee this happens. Be careful if you need
+ * to move this code around, and make sure it is outside
+ * the cgroup_lock.
+ */
+ disarm_sock_keys(memcg);
if (size < PAGE_SIZE)
kfree(memcg);
else
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index 1517037..b6f3583 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -74,9 +74,6 @@ void tcp_destroy_cgroup(struct mem_cgroup *memcg)
percpu_counter_destroy(&tcp->tcp_sockets_allocated);
val = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT);
-
- if (val != RESOURCE_MAX)
- static_key_slow_dec(&memcg_socket_limit_enabled);
}
EXPORT_SYMBOL(tcp_destroy_cgroup);
@@ -107,10 +104,33 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val)
tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT,
net->ipv4.sysctl_tcp_mem[i]);
- if (val == RESOURCE_MAX && old_lim != RESOURCE_MAX)
- static_key_slow_dec(&memcg_socket_limit_enabled);
- else if (old_lim == RESOURCE_MAX && val != RESOURCE_MAX)
- static_key_slow_inc(&memcg_socket_limit_enabled);
+ if (val == RESOURCE_MAX)
+ clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
+ else if (val != RESOURCE_MAX) {
+ /*
+ * The active bit needs to be written after the static_key
+ * update. This is what guarantees that the socket activation
+ * function is the last one to run. See sock_update_memcg() for
+ * details, and note that we don't mark any socket as belonging
+ * to this memcg until that flag is up.
+ *
+ * We need to do this, because static_keys will span multiple
+ * sites, but we can't control their order. If we mark a socket
+ * as accounted, but the accounting functions are not patched in
+ * yet, we'll lose accounting.
+ *
+ * We never race with the readers in sock_update_memcg(),
+ * because when this value changes, the code to process it is not
+ * patched in yet.
+ *
+ * The activated bit is used to guarantee that no two writers
+ * will do the update in the same memcg. Without that, we can't
+ * properly shutdown the static key.
+ */
+ if (!test_and_set_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags))
+ static_key_slow_inc(&memcg_socket_limit_enabled);
+ set_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
+ }
return 0;
}
--
1.7.7.6
^ permalink raw reply related
* Re: [PATCH v7 1/2] Always free struct memcg through schedule_work()
From: Michal Hocko @ 2012-05-25 9:50 UTC (permalink / raw)
To: Glauber Costa
Cc: Andrew Morton, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
cgroups-u79uwXL29TY76Z2rM5mHXA, devel-GEFAQzZX7r8dnm+yROfE0A,
kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A,
netdev-u79uwXL29TY76Z2rM5mHXA, Tejun Heo, Li Zefan, David Miller,
Johannes Weiner
In-Reply-To: <1337938328-11537-2-git-send-email-glommer-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org>
On Fri 25-05-12 13:32:07, Glauber Costa wrote:
> Right now we free struct memcg with kfree right after a
> rcu grace period, but defer it if we need to use vfree() to get
> rid of that memory area. We do that by need, because we need vfree
> to be called in a process context.
>
> This patch unifies this behavior, by ensuring that even kfree will
> happen in a separate thread. The goal is to have a stable place to
> call the upcoming jump label destruction function outside the realm
> of the complicated and quite far-reaching cgroup lock (that can't be
> held when calling neither the cpu_hotplug.lock nor the jump_label_mutex)
>
> Signed-off-by: Glauber Costa <glommer-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org>
> Acked-by: Kamezawa Hiroyuki <kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A@public.gmane.org>
Acked-by: Michal Hocko <mhocko-AlSwsSmVLrQ@public.gmane.org>
Just one comment below
> CC: Tejun Heo <tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
> CC: Li Zefan <lizefan-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
> CC: Johannes Weiner <hannes-druUgvl0LCNAfugRpC6u6w@public.gmane.org>
> CC: Michal Hocko <mhocko-AlSwsSmVLrQ@public.gmane.org>
> CC: Andrew Morton <akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
> ---
> mm/memcontrol.c | 24 +++++++++++++-----------
> 1 files changed, 13 insertions(+), 11 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 932a734..0b4b4c8 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
[...]
> @@ -4826,23 +4826,28 @@ out_free:
> }
>
> /*
> - * Helpers for freeing a vzalloc()ed mem_cgroup by RCU,
> + * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
> * but in process context. The work_freeing structure is overlaid
> * on the rcu_freeing structure, which itself is overlaid on memsw.
> */
> -static void vfree_work(struct work_struct *work)
> +static void free_work(struct work_struct *work)
> {
> struct mem_cgroup *memcg;
> + int size = sizeof(struct mem_cgroup);
>
> memcg = container_of(work, struct mem_cgroup, work_freeing);
> - vfree(memcg);
> + if (size < PAGE_SIZE)
What about
if (is_vmalloc_addr(memcg))
> + kfree(memcg);
> + else
> + vfree(memcg);
> }
--
Michal Hocko
SUSE Labs
SUSE LINUX s.r.o.
Lihovarska 1060/12
190 00 Praha 9
Czech Republic
^ permalink raw reply
* Re: [PATCH v7 1/2] Always free struct memcg through schedule_work()
From: Glauber Costa @ 2012-05-25 9:51 UTC (permalink / raw)
To: Michal Hocko
Cc: Andrew Morton, linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
cgroups-u79uwXL29TY76Z2rM5mHXA, devel-GEFAQzZX7r8dnm+yROfE0A,
kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A,
netdev-u79uwXL29TY76Z2rM5mHXA, Tejun Heo, Li Zefan, David Miller,
Johannes Weiner
In-Reply-To: <20120525095007.GA30527-VqjxzfR4DlwKmadIfiO5sKVXKuFTiq87@public.gmane.org>
On 05/25/2012 01:50 PM, Michal Hocko wrote:
> On Fri 25-05-12 13:32:07, Glauber Costa wrote:
>> Right now we free struct memcg with kfree right after a
>> rcu grace period, but defer it if we need to use vfree() to get
>> rid of that memory area. We do that by need, because we need vfree
>> to be called in a process context.
>>
>> This patch unifies this behavior, by ensuring that even kfree will
>> happen in a separate thread. The goal is to have a stable place to
>> call the upcoming jump label destruction function outside the realm
>> of the complicated and quite far-reaching cgroup lock (that can't be
>> held when calling neither the cpu_hotplug.lock nor the jump_label_mutex)
>>
>> Signed-off-by: Glauber Costa<glommer-bzQdu9zFT3WakBO8gow8eQ@public.gmane.org>
>> Acked-by: Kamezawa Hiroyuki<kamezawa.hiroyu-+CUm20s59erQFUHtdCDX3A@public.gmane.org>
>
> Acked-by: Michal Hocko<mhocko-AlSwsSmVLrQ@public.gmane.org>
>
> Just one comment below
>
>> CC: Tejun Heo<tj-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
>> CC: Li Zefan<lizefan-hv44wF8Li93QT0dZR+AlfA@public.gmane.org>
>> CC: Johannes Weiner<hannes-druUgvl0LCNAfugRpC6u6w@public.gmane.org>
>> CC: Michal Hocko<mhocko-AlSwsSmVLrQ@public.gmane.org>
>> CC: Andrew Morton<akpm-de/tnXTf+JLsfHDXvbKv3WD2FQJk+8+b@public.gmane.org>
>> ---
>> mm/memcontrol.c | 24 +++++++++++++-----------
>> 1 files changed, 13 insertions(+), 11 deletions(-)
>>
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index 932a734..0b4b4c8 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
> [...]
>> @@ -4826,23 +4826,28 @@ out_free:
>> }
>>
>> /*
>> - * Helpers for freeing a vzalloc()ed mem_cgroup by RCU,
>> + * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
>> * but in process context. The work_freeing structure is overlaid
>> * on the rcu_freeing structure, which itself is overlaid on memsw.
>> */
>> -static void vfree_work(struct work_struct *work)
>> +static void free_work(struct work_struct *work)
>> {
>> struct mem_cgroup *memcg;
>> + int size = sizeof(struct mem_cgroup);
>>
>> memcg = container_of(work, struct mem_cgroup, work_freeing);
>> - vfree(memcg);
>> + if (size< PAGE_SIZE)
>
> What about
> if (is_vmalloc_addr(memcg))
>> + kfree(memcg);
>> + else
>> + vfree(memcg);
>> }
>
Could be, but I believe this one is already in Andrew's tree from last
submission (might be wrong)
^ permalink raw reply
* Re: [PATCH 1/2] can: Added constants containing length of CAN identifiers
From: Rostislav Lisovy @ 2012-05-25 10:44 UTC (permalink / raw)
To: David Miller; +Cc: netdev, linux-can, pisa, sojkam1, oliver
In-Reply-To: <20120525.052256.2147003730285745711.davem@davemloft.net>
On Fri, 2012-05-25 at 05:22 -0400, David Miller wrote:
> It is not appropriate to submit new features at this time,
> as I described in detail in:
>
> http://marc.info/?l=netfilter-devel&m=133763475402372&w=2
>
> I used a subject line with BIG CAPITAL LETTERS in that posting so
> there is really no reason you should have overlooked it.
I am very sorry for not going through the mailing list history
thoroughly enough and thus overlooking your announcement. This was
however meant more like a [RFC]. If anybody has any comments, please
send them to me.
I will resend the patches as soon as the net-next is open.
Best regards,
Rostislav Lisovy
^ permalink raw reply
* Re: [PATCH] ip.7: Improve explanation about calling listen or connect
From: Peter Schiffer @ 2012-05-25 11:02 UTC (permalink / raw)
To: mtk.manpages-Re5JQEeQqe8AvxtiuMwx3w
Cc: Flavio Leitner, linux-man-u79uwXL29TY76Z2rM5mHXA, netdev
In-Reply-To: <1336566636-14713-1-git-send-email-fbl-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Hi Michael,
do you have any comments for this update? Or do you need some supporting
info?
peter
On 05/09/2012 02:30 PM, Flavio Leitner wrote:
> Signed-off-by: Flavio Leitner<fbl-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
> ---
> man7/ip.7 | 15 +++++++++------
> 1 files changed, 9 insertions(+), 6 deletions(-)
>
> diff --git a/man7/ip.7 b/man7/ip.7
> index 9f560df..84fe32d 100644
> --- a/man7/ip.7
> +++ b/man7/ip.7
> @@ -69,12 +69,11 @@ For
> you may specify a valid IANA IP protocol defined in
> RFC\ 1700 assigned numbers.
> .PP
> -.\" FIXME ip current does an autobind in listen, but I'm not sure
> -.\" if that should be documented.
> When a process wants to receive new incoming packets or connections, it
> should bind a socket to a local interface address using
> .BR bind (2).
> -Only one IP socket may be bound to any given local (address, port) pair.
> +In this case, only one IP socket may be bound to any given local
> +(address, port) pair.
> When
> .B INADDR_ANY
> is specified in the bind call, the socket will be bound to
> @@ -82,10 +81,14 @@ is specified in the bind call, the socket will be bound to
> local interfaces.
> When
> .BR listen (2)
> -or
> +is called on an unbound socket, the socket is automatically bound
> +to a random free port with the local address set to
> +.BR INADDR_ANY .
> +When
> .BR connect (2)
> -are called on an unbound socket, it is automatically bound to a
> -random free port with the local address set to
> +is called on an unbound socket, the socket is automatically bound
> +to a random free port or a usable shared port with the local address
> +set to
> .BR INADDR_ANY .
>
> A TCP local socket address that has been bound is unavailable for
--
To unsubscribe from this list: send the line "unsubscribe linux-man" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply
* [PATCH] ieee802154: pass source address in dgram_recvmsg
From: Stephen Röttger @ 2012-05-25 12:14 UTC (permalink / raw)
To: dbaryshkov, slapin
Cc: davem, linux-zigbee-devel, netdev, linux-kernel,
Stephen Röttger
This patch lets dgram_recvmsg fill in the sockaddr struct in
msg->msg_name with the source address of the packet.
This is used by the userland functions recvmsg and recvfrom to get the
sender's address.
The patch is based on the devel branch of
git://linux-zigbee.git.sourceforge.net/gitroot/linux-zigbee/kernel
Signed-off-by: Stephen Röttger <stephen.roettger@zero-entropy.de>
---
net/ieee802154/dgram.c | 10 ++++++++++
1 files changed, 10 insertions(+), 0 deletions(-)
diff --git a/net/ieee802154/dgram.c b/net/ieee802154/dgram.c
index 7883fa6..d0a6ebc 100644
--- a/net/ieee802154/dgram.c
+++ b/net/ieee802154/dgram.c
@@ -290,6 +290,9 @@ static int dgram_recvmsg(struct kiocb *iocb, struct sock *sk,
size_t copied = 0;
int err = -EOPNOTSUPP;
struct sk_buff *skb;
+ struct sockaddr_ieee802154 *saddr;
+
+ saddr = (struct sockaddr_ieee802154 *)msg->msg_name;
skb = skb_recv_datagram(sk, flags, noblock, &err);
if (!skb)
@@ -308,6 +311,13 @@ static int dgram_recvmsg(struct kiocb *iocb, struct sock *sk,
sock_recv_ts_and_drops(msg, sk, skb);
+ if (saddr) {
+ saddr->family = AF_IEEE802154;
+ saddr->addr = mac_cb(skb)->sa;
+ }
+ if (addr_len)
+ *addr_len = sizeof(*saddr);
+
if (flags & MSG_TRUNC)
copied = skb->len;
done:
--
1.7.8
^ permalink raw reply related
* skb_release_data oops
From: kendo @ 2012-05-25 14:19 UTC (permalink / raw)
To: netdev
I am using Linux kernel 2.6.38.8 and hit an oops when an skb is freed (in skb_release_data). What could be causing this failure? Can you give me some suggestions? Thanks!
Best regards.
---------------------------------------------------------------
May 25 19:30:54 AnShion <9> klogd: [164619.378640] BUG: unable to handle kernel paging request at 000095a3
May 25 19:30:54 AnShion <9> klogd: [164619.454609] IP: [<c01c2353>] put_page+0x3/0x40
May 25 19:30:54 AnShion <12> klogd: [164619.508726] *pde = 00000000
May 25 19:30:54 AnShion <8> klogd: [164619.544185] Oops: 0000 [#1] SMP
May 25 19:30:54 AnShion <8> klogd: [164619.583891] last sysfs file: /sys/devices/virtual/net/tunl_FJ/uevent
May 25 19:30:54 AnShion <12> klogd: [164619.660716] Modules linked in: dpi_engine ipmi_watchdog nf_connmark ip_set_hash_netiface ip_set_hash_net ip_set_hash_ip xt_set ip_set xt_hashrate xt_dpi xt_pcc xt_nth xt_random xt_nflog xt_replace igb e1000e [last unloaded: dpi_engine]
May 25 19:30:54 AnShion <12> klogd: [164619.912644]
May 25 19:30:54 AnShion <12> klogd: [164619.931412] Pid: 0, comm: kworker/0:1 Not tainted 2.6.38.8 #347 To be filled by O.E.M. To be filled by O.E.M./P8B-X series
May 25 19:30:54 AnShion <12> klogd: [164620.064736] EIP: 0060:[<c01c2353>] EFLAGS: 00010202 CPU: 5
May 25 19:30:54 AnShion <12> klogd: [164620.131193] EIP is at put_page+0x3/0x40
May 25 19:30:54 AnShion <12> klogd: [164620.177950] EAX: 000095a3 EBX: 00000001 ECX: 00000000 EDX: 000095a3
May 25 19:30:54 AnShion <12> klogd: [164620.253737] ESI: dbda30c0 EDI: dbda30c0 EBP: f3cffd4c ESP: f3cffd3c
May 25 19:30:54 AnShion <12> klogd: [164620.329522] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068
May 25 19:30:54 AnShion <8> klogd: [164620.394942] Process kworker/0:1 (pid: 0, ti=f3cfe000 task=f3cd4b00 task.ti=f3ce6000)
May 25 19:30:54 AnShion <8> klogd: [164620.488351] Stack:
May 25 19:30:54 AnShion <12> klogd: [164620.513337] f3cffd4c c069dbe4 dbda30c0 00000001 f3cffd58 c069d7a2 00000000 f3cffd70
May 25 19:30:54 AnShion <12> klogd: [164620.607577] c069d8ea c06dba0f 00000000 00000001 dbda30c0 f3cffda4 c06dba0f f30e0000
May 25 19:30:54 AnShion <12> klogd: [164620.701819] 00000000 f3cffd94 c0700ba0 80000000 00000002 c0b407a8 f34d3240 dbda30c0
May 25 19:30:54 AnShion <8> klogd: [164620.796059] Call Trace:
May 25 19:30:54 AnShion <12> klogd: [164620.826232] [<c069dbe4>] ? skb_release_data+0x84/0xa0
May 25 19:30:54 AnShion <12> klogd: [164620.888539] [<c069d7a2>] __kfree_skb+0x12/0x90
May 25 19:30:54 AnShion <12> klogd: [164620.943589] [<c069d8ea>] kfree_skb+0x5a/0x70
May 25 19:30:54 AnShion <12> klogd: [164620.996568] [<c06dba0f>] ? nf_hook_slow+0xcf/0xf0
May 25 19:30:54 AnShion <12> klogd: [164621.054730] [<c06dba0f>] nf_hook_slow+0xcf/0xf0
May 25 19:30:54 AnShion <12> klogd: [164621.110820] [<c0700ba0>] ? ip_local_deliver_finish+0x0/0x260
May 25 19:30:54 AnShion <12> klogd: [164621.180385] [<c0700e52>] ip_local_deliver+0x52/0xa0
May 25 19:30:54 AnShion <12> klogd: [164621.240620] [<c0700ba0>] ? ip_local_deliver_finish+0x0/0x260
May 25 19:30:54 AnShion <12> klogd: [164621.310186] [<c0700701>] ip_rcv_finish+0x241/0x3c0
May 25 19:30:54 AnShion <12> klogd: [164621.369383] [<c0700b26>] ip_rcv+0x2a6/0x320
May 25 19:30:54 AnShion <12> klogd: [164621.421324] [<c07004c0>] ? ip_rcv_finish+0x0/0x3c0
May 25 19:30:54 AnShion <12> klogd: [164621.480523] [<c06a9468>] __netif_receive_skb+0x258/0x520
May 25 19:30:54 AnShion <12> klogd: [164621.545942] [<c01e8cc0>] ? add_partial+0x40/0x70
May 25 19:30:54 AnShion <12> klogd: [164621.603068] [<c06a9863>] netif_receive_skb+0x23/0x50
May 25 19:30:54 AnShion <12> klogd: [164621.664340] [<c06a9987>] napi_skb_finish+0x37/0x50
May 25 19:30:54 AnShion <12> klogd: [164621.723537] [<c06a9fcb>] napi_gro_receive+0xdb/0xf0
May 25 19:30:54 AnShion <12> klogd: [164621.783774] [<c0192a4f>] ? irq_to_desc+0xf/0x20
May 25 19:30:54 AnShion <12> klogd: [164621.839860] [<c0105606>] ? handle_irq+0x16/0x90
May 25 19:30:54 AnShion <12> klogd: [164621.895954] [<f81d679c>] igb_poll+0x5fc/0xef0 [igb]
May 25 19:30:54 AnShion <12> klogd: [164621.956184] [<c0104bc5>] ? do_IRQ+0x45/0xb0
May 25 19:30:54 AnShion <12> klogd: [164622.008128] [<c06a9dfa>] net_rx_action+0xaa/0x1a0
May 25 19:30:54 AnShion <12> klogd: [164622.066288] [<c014bfa1>] __do_softirq+0xb1/0x190
May 25 19:30:54 AnShion <12> klogd: [164622.123411] [<c014bef0>] ? __do_softirq+0x0/0x190
May 25 19:30:54 AnShion <8> klogd: [164622.181572] <IRQ>
May 25 19:30:54 AnShion <12> klogd: [164622.207700] [<c014be6d>] ? irq_exit+0x5d/0x80
May 25 19:30:54 AnShion <12> klogd: [164622.261716] [<c011c9a6>] ? smp_apic_timer_interrupt+0x56/0x90
May 25 19:30:54 AnShion <12> klogd: [164622.332317] [<c0819c61>] ? apic_timer_interrupt+0x31/0x38
May 25 19:30:54 AnShion <12> klogd: [164622.398773] [<c0124b05>] ? native_safe_halt+0x5/0x10
May 25 19:30:54 AnShion <12> klogd: [164622.460046] [<c040e345>] ? acpi_idle_do_entry+0x33/0x54
May 25 19:30:54 AnShion <12> klogd: [164622.524426] [<c040e3bd>] ? acpi_idle_enter_c1+0x57/0x95
May 25 19:30:54 AnShion <12> klogd: [164622.588809] [<c0674089>] ? cpuidle_idle_call+0xd9/0x1c0
May 25 19:30:54 AnShion <12> klogd: [164622.653190] [<c010214a>] ? cpu_idle+0x8a/0xc0
May 25 19:30:54 AnShion <12> klogd: [164622.707206] [<c0813699>] ? start_secondary+0x1a1/0x1e8
May 25 19:30:54 AnShion <8> klogd: [164622.770550] Code: 04 f0 ff 0e 0f 94 c0 84 c0 74 d4 89 f8 e8 96 fe ff ff eb cb 0f ae e8 89 f6 8b 03 eb de 8d 74 26 00 8d bc 27 00 00 00 00 55 89 c2 <66> f7 00 00 c0 89 e5 75 1d 8b 40 04 f0 ff 4a 04 0f 94 c0 84 c0
^ permalink raw reply
* Investment
From: Alexander Eric @ 2012-05-25 14:28 UTC (permalink / raw)
In-Reply-To: <909505805.23519491337955964409.JavaMail.root@zim-store04.web.westnet.com.au>
[-- Attachment #1: Type: text/plain, Size: 51 bytes --]
Attached are details of Investment
cooperation
[-- Attachment #2: Investment!.pdf --]
[-- Type: application/pdf, Size: 25258 bytes --]
^ permalink raw reply
* [PATCH v8] tilegx network driver: initial support
From: Chris Metcalf @ 2012-05-25 14:42 UTC (permalink / raw)
To: bhutchings, arnd, David Miller, linux-kernel, netdev
In-Reply-To: <20120524.003148.700603156196416506.davem@davemloft.net>
This change adds support for the tilegx network driver based on the
GXIO IORPC support in the tilegx software stack, using the on-chip
mPIPE packet processing engine.
Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
---
This version of the patch fixes the issue where we were failing
to properly stop the net_device queue when the mpipe egress queue
filled up. I also removed the internal bug numbers from the sources.
drivers/net/ethernet/tile/Kconfig | 1 +
drivers/net/ethernet/tile/Makefile | 4 +-
drivers/net/ethernet/tile/tilegx.c | 1854 ++++++++++++++++++++++++++++++++++++
3 files changed, 1857 insertions(+), 2 deletions(-)
create mode 100644 drivers/net/ethernet/tile/tilegx.c
diff --git a/drivers/net/ethernet/tile/Kconfig b/drivers/net/ethernet/tile/Kconfig
index 2d9218f..9184b61 100644
--- a/drivers/net/ethernet/tile/Kconfig
+++ b/drivers/net/ethernet/tile/Kconfig
@@ -7,6 +7,7 @@ config TILE_NET
depends on TILE
default y
select CRC32
+ select TILE_GXIO_MPIPE if TILEGX
---help---
This is a standard Linux network device driver for the
on-chip Tilera Gigabit Ethernet and XAUI interfaces.
diff --git a/drivers/net/ethernet/tile/Makefile b/drivers/net/ethernet/tile/Makefile
index f634f14..0ef9eef 100644
--- a/drivers/net/ethernet/tile/Makefile
+++ b/drivers/net/ethernet/tile/Makefile
@@ -4,7 +4,7 @@
obj-$(CONFIG_TILE_NET) += tile_net.o
ifdef CONFIG_TILEGX
-tile_net-objs := tilegx.o mpipe.o iorpc_mpipe.o dma_queue.o
+tile_net-y := tilegx.o
else
-tile_net-objs := tilepro.o
+tile_net-y := tilepro.o
endif
diff --git a/drivers/net/ethernet/tile/tilegx.c b/drivers/net/ethernet/tile/tilegx.c
new file mode 100644
index 0000000..cc00ba5
--- /dev/null
+++ b/drivers/net/ethernet/tile/tilegx.c
@@ -0,0 +1,1854 @@
+/*
+ * Copyright 2012 Tilera Corporation. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation, version 2.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/kernel.h> /* printk() */
+#include <linux/slab.h> /* kmalloc() */
+#include <linux/errno.h> /* error codes */
+#include <linux/types.h> /* size_t */
+#include <linux/interrupt.h>
+#include <linux/in.h>
+#include <linux/irq.h>
+#include <linux/netdevice.h> /* struct device, and other headers */
+#include <linux/etherdevice.h> /* eth_type_trans */
+#include <linux/skbuff.h>
+#include <linux/ioctl.h>
+#include <linux/cdev.h>
+#include <linux/hugetlb.h>
+#include <linux/in6.h>
+#include <linux/timer.h>
+#include <linux/hrtimer.h>
+#include <linux/ktime.h>
+#include <linux/io.h>
+#include <linux/ctype.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+
+#include <asm/checksum.h>
+#include <asm/homecache.h>
+#include <gxio/mpipe.h>
+#include <arch/sim.h>
+
+/* Default transmit lockup timeout period, in jiffies. */
+#define TILE_NET_TIMEOUT (5 * HZ)
+
+/* The maximum number of distinct channels (idesc.channel is 5 bits). */
+#define TILE_NET_CHANNELS 32
+
+/* Maximum number of idescs to handle per "poll". */
+#define TILE_NET_BATCH 128
+
+/* Maximum number of packets to handle per "poll".
+ * Passed to netif_napi_add() as the NAPI weight.
+ */
+#define TILE_NET_WEIGHT 64
+
+/* Number of entries in each iqueue. */
+#define IQUEUE_ENTRIES 512
+
+/* Number of entries in each equeue. */
+#define EQUEUE_ENTRIES 2048
+
+/* Total header bytes per equeue slot. Must be big enough for 2 bytes
+ * of NET_IP_ALIGN alignment, plus 14 bytes (?) of L2 header, plus up to
+ * 60 bytes of actual TCP header. We round up to align to cache lines.
+ * NOTE(review): the sum above does not mention the IP header; confirm
+ * that 128 bytes is sufficient for the largest header that is copied.
+ */
+#define HEADER_BYTES 128
+
+/* Maximum completions per cpu per device (must be a power of two).
+ * ISSUE: What is the right number here? If this is too small, then
+ * egress might block waiting for free space in a completions array.
+ * ISSUE: At the least, allocate these only for initialized echannels.
+ */
+#define TILE_NET_MAX_COMPS 64
+
+#define MAX_FRAGS (MAX_SKB_FRAGS + 1)
+
+/* Size of completions data to allocate.
+ * One "struct tile_net_comps" per channel; allocated once per cpu.
+ * ISSUE: Probably more than needed since we don't use all the channels.
+ */
+#define COMPS_SIZE (TILE_NET_CHANNELS * sizeof(struct tile_net_comps))
+
+/* Size of NotifRing data to allocate. */
+#define NOTIF_RING_SIZE (IQUEUE_ENTRIES * sizeof(gxio_mpipe_idesc_t))
+
+/* Timeout to wake the per-device TX timer after we stop the queue.
+ * We don't want the timeout too short (adds overhead, and might end
+ * up causing stop/wake/stop/wake cycles) or too long (affects performance).
+ * For the 10 Gb NIC, 30 usec means roughly 30+ 1500-byte packets.
+ */
+#define TX_TIMER_DELAY_USEC 30
+
+/* Timeout to wake the per-cpu egress timer to free completions. */
+#define EGRESS_TIMER_DELAY_USEC 1000
+
+MODULE_AUTHOR("Tilera Corporation");
+MODULE_LICENSE("GPL");
+
+/* A "packet fragment" (a chunk of memory). */
+struct frag {
+ /* Start of the chunk (presumably a kernel VA; confirm at the users). */
+ void *buf;
+ /* Size of the chunk, in bytes. */
+ size_t length;
+};
+
+/* A single completion.
+ * Filled in by add_comp() and released by tile_net_free_comps().
+ */
+struct tile_net_comp {
+ /* The "complete_count" when the completion will be complete.
+  * This value is handed to gxio_mpipe_equeue_is_complete().
+  */
+ s64 when;
+ /* The buffer to be freed when the completion is complete. */
+ struct sk_buff *skb;
+};
+
+/* The completions for a given cpu and device.
+ * "comp_next" and "comp_last" are running counters; the slot used in
+ * "comp_queue" is the counter value modulo TILE_NET_MAX_COMPS.
+ */
+struct tile_net_comps {
+ /* The completions. */
+ struct tile_net_comp comp_queue[TILE_NET_MAX_COMPS];
+ /* The number of completions used. */
+ unsigned long comp_next;
+ /* The number of completions freed. */
+ unsigned long comp_last;
+};
+
+/* Info for a specific cpu. One instance exists per cpu ("per_cpu_info"). */
+struct tile_net_info {
+ /* The NAPI struct. */
+ struct napi_struct napi;
+ /* Packet queue. */
+ gxio_mpipe_iqueue_t iqueue;
+ /* Our cpu. */
+ int my_cpu;
+ /* True if iqueue is valid (set once gxio_mpipe_iqueue_init() succeeds). */
+ bool has_iqueue;
+ /* NAPI flags: whether netif_napi_add() / napi_enable() have been done. */
+ bool napi_added;
+ bool napi_enabled;
+ /* Number of small sk_buffs which must still be provided. */
+ unsigned int num_needed_small_buffers;
+ /* Number of large sk_buffs which must still be provided. */
+ unsigned int num_needed_large_buffers;
+ /* A timer for handling egress completions. */
+ struct hrtimer egress_timer;
+ /* True if "egress_timer" is scheduled. */
+ bool egress_timer_scheduled;
+ /* Comps for each egress channel. All entries point into a single
+  * COMPS_SIZE allocation whose start is entry [0].
+  */
+ struct tile_net_comps *comps_for_echannel[TILE_NET_CHANNELS];
+};
+
+/* Info for egress on a particular egress channel.
+ * Both fields are filled in by tile_net_init_egress(); a NULL "equeue"
+ * means the channel has not been initialized yet.
+ */
+struct tile_net_egress {
+ /* The "equeue". */
+ gxio_mpipe_equeue_t *equeue;
+ /* The headers for TSO (EQUEUE_ENTRIES * HEADER_BYTES bytes requested). */
+ unsigned char *headers;
+};
+
+/* Info for a specific device (obtained via netdev_priv()). */
+struct tile_net_priv {
+ /* Our network device. */
+ struct net_device *dev;
+ /* The primary link. */
+ gxio_mpipe_link_t link;
+ /* The primary channel, if open, else -1. */
+ int channel;
+ /* The "loopify" egress link, if needed. */
+ gxio_mpipe_link_t loopify_link;
+ /* The "loopify" egress channel, if open, else -1. */
+ int loopify_channel;
+ /* The egress channel (channel or loopify_channel). */
+ int echannel;
+ /* Total stats; updated atomically through tile_net_stats_add(). */
+ struct net_device_stats stats;
+ /* Timer to wake up tx queue; its handler recovers this struct
+  * with container_of().
+  */
+ struct hrtimer tx_wake_timer;
+};
+
+/* Egress info, indexed by "priv->echannel" (lazily created as needed). */
+static struct tile_net_egress egress_for_echannel[TILE_NET_CHANNELS];
+
+/* Devices currently associated with each channel.
+ * NOTE: The array entry can become NULL after ifconfig down, but
+ * we do not free the underlying net_device structures, so it is
+ * safe to use a pointer after reading it from this array.
+ */
+static struct net_device *tile_net_devs_for_channel[TILE_NET_CHANNELS];
+
+/* A mutex for "tile_net_devs_for_channel". */
+static DEFINE_MUTEX(tile_net_devs_for_channel_mutex);
+
+/* The per-cpu info. */
+static DEFINE_PER_CPU(struct tile_net_info, per_cpu_info);
+
+/* The "context" for all devices. */
+static gxio_mpipe_context_t context;
+
+/* The small/large "buffer stacks" (-1 until init_buffer_stacks() runs). */
+static int small_buffer_stack = -1;
+static int large_buffer_stack = -1;
+
+/* Amount of memory allocated for each buffer stack. */
+static size_t buffer_stack_size;
+
+/* The actual memory allocated for the buffer stacks. */
+static void *small_buffer_stack_va;
+static void *large_buffer_stack_va;
+
+/* The buckets; both are set by init_notif_group_and_buckets(). */
+static int first_bucket = -1;
+static int num_buckets = 1;
+
+/* The ingress irq (-1 until tile_net_setup_interrupts() succeeds). */
+static int ingress_irq = -1;
+
+/* Text value of tile_net.cpus if passed as a module parameter. */
+static char *network_cpus_string;
+
+/* The actual cpus in "network_cpus". */
+static struct cpumask network_cpus_map;
+
+/* If "loopify=LINK" was specified, this is "LINK". */
+static char *loopify_link_name;
+
+/* If "tile_net.custom" was specified, this is non-NULL. */
+static char *custom_str;
+
+/* Parse the "tile_net.cpus" module parameter into "network_cpus_map".
+ *
+ * The parameter names the cpus that handle ingress packets and has the
+ * form "tile_net.cpus=m-n[,x-y]"; the listed cpus should be neither
+ * dedicated cpus nor dataplane cpus.  After parsing, the mask is
+ * intersected with cpu_possible_mask.
+ *
+ * Returns true if the parameter was given, parsed cleanly, and left at
+ * least one cpu in the mask; returns false otherwise.
+ */
+static bool network_cpus_init(void)
+{
+	char cpulist[1024];
+
+	if (!network_cpus_string)
+		return false;
+
+	if (cpulist_parse_crop(network_cpus_string, &network_cpus_map) != 0) {
+		pr_warn("tile_net.cpus=%s: malformed cpu list\n",
+			network_cpus_string);
+		return false;
+	}
+
+	/* Keep only the cpus that are also in cpu_possible_mask. */
+	cpumask_and(&network_cpus_map, &network_cpus_map, cpu_possible_mask);
+	if (cpumask_empty(&network_cpus_map)) {
+		pr_warn("Ignoring empty tile_net.cpus='%s'.\n",
+			network_cpus_string);
+		return false;
+	}
+
+	/* Report the resulting mask in cpulist form. */
+	cpulist_scnprintf(cpulist, sizeof(cpulist), &network_cpus_map);
+	pr_info("Linux network CPUs: %s\n", cpulist);
+	return true;
+}
+
+/* "tile_net.cpus": stored in network_cpus_string and parsed by
+ * network_cpus_init().
+ */
+module_param_named(cpus, network_cpus_string, charp, 0444);
+MODULE_PARM_DESC(cpus, "cpulist of cores that handle network interrupts");
+
+/* The "tile_net.loopify=LINK" argument causes the named device to
+ * actually use "loop0" for ingress, and "loop1" for egress. This
+ * allows an app to sit between the actual link and linux, passing
+ * (some) packets along to linux, and forwarding (some) packets sent
+ * out by linux.
+ */
+module_param_named(loopify, loopify_link_name, charp, 0444);
+MODULE_PARM_DESC(loopify, "name the device to use loop0/1 for ingress/egress");
+
+/* The "tile_net.custom" argument causes us to ignore the "conventional"
+ * classifier metadata, in particular, the "l2_offset".
+ * Only the presence of the argument is tested (custom_str != NULL).
+ */
+module_param_named(custom, custom_str, charp, 0444);
+MODULE_PARM_DESC(custom, "indicates a (heavily) customized classifier");
+
+/* Atomically add "value" to the statistics counter at "field".
+ *
+ * The counter is a plain "unsigned long" that is accessed through an
+ * atomic_long_t view; the build fails if the two types differ in size.
+ * Per the original author, on TILE-Gx this is a fire-and-forget
+ * operation on the issuing core and costs only a few cycles more than
+ * a regular store once the request reaches the home cache.
+ */
+static void tile_net_stats_add(unsigned long value, unsigned long *field)
+{
+	atomic_long_t *counter = (atomic_long_t *)field;
+
+	BUILD_BUG_ON(sizeof(atomic_long_t) != sizeof(unsigned long));
+	atomic_long_add(value, counter);
+}
+
+/* Allocate one sk_buff and push its data area onto a buffer stack.
+ *
+ * "small" selects the small stack (128-byte payload); otherwise the
+ * large stack (1664-byte payload) is used.  A pointer to the sk_buff
+ * is stored immediately before the 128-byte-aligned data area so that
+ * the sk_buff can be found again from the raw buffer address.
+ *
+ * Returns false if the sk_buff could not be allocated, else true.
+ */
+static bool tile_net_provide_buffer(bool small)
+{
+	const unsigned long buffer_alignment = 128;
+	struct sk_buff *skb;
+	int stack;
+	int len;
+
+	if (small) {
+		stack = small_buffer_stack;
+		len = 128;
+	} else {
+		stack = large_buffer_stack;
+		len = 1664;
+	}
+
+	/* Leave room for the back-pointer and for alignment padding. */
+	len += sizeof(struct sk_buff **) + buffer_alignment;
+
+	skb = dev_alloc_skb(len);
+	if (skb == NULL)
+		return false;
+
+	/* Skip the back-pointer slot, then pad up to the alignment. */
+	skb_reserve(skb, sizeof(struct sk_buff **));
+	skb_reserve(skb, -(long)skb->data & (buffer_alignment - 1));
+
+	/* Record the owning "skb" just below the data area. */
+	((struct sk_buff **)skb->data)[-1] = skb;
+
+	/* Make sure "skb" and the back-pointer have been flushed. */
+	wmb();
+
+	gxio_mpipe_push_buffer(&context, stack,
+			       (void *)va_to_tile_io_addr(skb->data));
+
+	return true;
+}
+
+/* Pop every buffer from buffer stack "stack" and free the sk_buff
+ * whose pointer is stored immediately before each buffer.
+ */
+static void tile_net_pop_all_buffers(int stack)
+{
+	for (;;) {
+		void *va = gxio_mpipe_pop_buffer(&context, stack);
+
+		if (va == NULL)
+			break;
+
+		/* The owning sk_buff pointer sits just below the buffer. */
+		dev_kfree_skb_irq(((struct sk_buff **)va)[-1]);
+	}
+}
+
+/* Replenish mPIPE with the buffers this cpu still owes it.
+ *
+ * Small buffers are provided first, then large ones.  The outstanding
+ * counts are decremented only for buffers actually provided; on the
+ * first allocation failure a notice is logged and the rest are left
+ * outstanding.
+ */
+static void tile_net_provide_needed_buffers(struct tile_net_info *info)
+{
+	bool ok = true;
+
+	while (ok && info->num_needed_small_buffers != 0) {
+		ok = tile_net_provide_buffer(true);
+		if (ok)
+			info->num_needed_small_buffers--;
+	}
+
+	while (ok && info->num_needed_large_buffers != 0) {
+		ok = tile_net_provide_buffer(false);
+		if (ok)
+			info->num_needed_large_buffers--;
+	}
+
+	/* Add a description to the page allocation failure dump. */
+	if (!ok)
+		pr_notice("Tile %d still needs some buffers\n", info->my_cpu);
+}
+
+/* Decide whether an ingress packet should be discarded.
+ *
+ * "buf" points at the start of the L2 header, i.e. at the destination
+ * MAC address.  Returns true to filter (drop) the packet, false to
+ * accept it.
+ */
+static inline bool filter_packet(struct net_device *dev, void *buf)
+{
+	/* Nothing can be received before the device exists and is up. */
+	if (dev == NULL)
+		return true;
+	if (!(dev->flags & IFF_UP))
+		return true;
+
+	/* Promiscuous mode and multicast destinations are always accepted. */
+	if (dev->flags & IFF_PROMISC)
+		return false;
+	if (is_multicast_ether_addr(buf))
+		return false;
+
+	/* Otherwise accept only frames addressed to this device. */
+	return compare_ether_addr(dev->dev_addr, buf) != 0;
+}
+
+/* Map a raw mpipe buffer address back to the sk_buff that owns it.
+ *
+ * The sk_buff pointer is read from the slot immediately before "va".
+ * If that sk_buff's data pointer does not equal "va" the system
+ * panics.
+ */
+static struct sk_buff *mpipe_buf_to_skb(void *va)
+{
+	struct sk_buff *skb = ((struct sk_buff **)va)[-1];
+
+	/* Paranoia: panic here since there's a reasonable chance that
+	 * corrupt buffers means generic memory corruption, with
+	 * unpredictable system effects.
+	 */
+	if (skb->data != va)
+		panic("Corrupt linux buffer! va=%p, skb=%p, skb->data=%p",
+		      va, skb, skb->data);
+
+	return skb;
+}
+
+/* Hand a received packet to the network stack and account for it.
+ *
+ * "len" is the packet length to record in "skb".  After delivery the
+ * device's rx counters are updated and this cpu notes that it owes
+ * mPIPE a replacement buffer of the size class given by "idesc".
+ */
+static void tile_net_receive_skb(struct net_device *dev, struct sk_buff *skb,
+				 struct tile_net_info *info,
+				 gxio_mpipe_idesc_t *idesc, unsigned long len)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	unsigned int *needed;
+
+	/* Encode the actual packet length. */
+	skb_put(skb, len);
+
+	skb->protocol = eth_type_trans(skb, dev);
+
+	/* Acknowledge "good" hardware checksums. */
+	if (idesc->cs && idesc->csum_seed_val == 0xFFFF)
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	netif_receive_skb(skb);
+
+	/* Update stats. */
+	tile_net_stats_add(1, &priv->stats.rx_packets);
+	tile_net_stats_add(len, &priv->stats.rx_bytes);
+
+	/* Need a new buffer of the same size class. */
+	if (idesc->size == GXIO_MPIPE_BUFFER_SIZE_128)
+		needed = &info->num_needed_small_buffers;
+	else
+		needed = &info->num_needed_large_buffers;
+	(*needed)++;
+}
+
+/* Handle one ingress descriptor from this cpu's iqueue.
+ *
+ * The descriptor is always consumed.  Returns true if the packet was
+ * "processed" (delivered to the stack), false if it was "filtered"
+ * or dropped because mPIPE had no buffer for it.
+ *
+ * The device is looked up by the descriptor's channel and may be
+ * NULL, since entries of "tile_net_devs_for_channel" can become NULL
+ * after ifconfig down.
+ */
+static bool tile_net_handle_packet(struct tile_net_info *info,
+				   gxio_mpipe_idesc_t *idesc)
+{
+	struct net_device *dev = tile_net_devs_for_channel[idesc->channel];
+	uint8_t l2_offset;
+	void *va;
+	void *buf;
+	unsigned long len;
+	bool filter;
+
+	/* Drop packets for which no buffer was available.
+	 * NOTE: This happens under heavy load.
+	 */
+	if (idesc->be) {
+		/* Only account the drop if a device still owns the
+		 * channel; netdev_priv(NULL) must not be dereferenced.
+		 */
+		if (dev != NULL) {
+			struct tile_net_priv *priv = netdev_priv(dev);
+			tile_net_stats_add(1, &priv->stats.rx_dropped);
+		}
+		gxio_mpipe_iqueue_consume(&info->iqueue, idesc);
+		if (net_ratelimit())
+			pr_info("Dropping packet (insufficient buffers).\n");
+		return false;
+	}
+
+	/* Get the "l2_offset", if allowed. */
+	l2_offset = custom_str ? 0 : gxio_mpipe_idesc_get_l2_offset(idesc);
+
+	/* Get the raw buffer VA (includes "headroom"). */
+	va = tile_io_addr_to_va((unsigned long)(long)idesc->va);
+
+	/* Get the actual packet start/length. */
+	buf = va + l2_offset;
+	len = idesc->l2_size - l2_offset;
+
+	/* Point "va" at the raw buffer. */
+	va -= NET_IP_ALIGN;
+
+	/* filter_packet() itself copes with a NULL "dev". */
+	filter = filter_packet(dev, buf);
+	if (filter) {
+		gxio_mpipe_iqueue_drop(&info->iqueue, idesc);
+	} else {
+		struct sk_buff *skb = mpipe_buf_to_skb(va);
+
+		/* Skip headroom, and any custom header. */
+		skb_reserve(skb, NET_IP_ALIGN + l2_offset);
+
+		tile_net_receive_skb(dev, skb, info, idesc, len);
+	}
+
+	gxio_mpipe_iqueue_consume(&info->iqueue, idesc);
+	return !filter;
+}
+
+/* Handle some packets for the current CPU.
+ *
+ * This is the NAPI poll callback registered via netif_napi_add().
+ * The "napi" argument is not used; the current cpu's per_cpu_info is
+ * used instead.  Returns the number of packets "processed"; filtered
+ * packets are not counted.
+ *
+ * This function handles up to TILE_NET_BATCH idescs per call.
+ *
+ * ISSUE: Since we do not provide new buffers until this function is
+ * complete, we must initially provide enough buffers for each network
+ * cpu to fill its iqueue and also its batched idescs.
+ *
+ * ISSUE: The "rotting packet" race condition occurs if a packet
+ * arrives after the queue appears to be empty, and before the
+ * hypervisor interrupt is re-enabled.
+ */
+static int tile_net_poll(struct napi_struct *napi, int budget)
+{
+ struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+ unsigned int work = 0;
+ gxio_mpipe_idesc_t *idesc;
+ int i, n;
+
+ /* Process packets. */
+ while ((n = gxio_mpipe_iqueue_try_peek(&info->iqueue, &idesc)) > 0) {
+ for (i = 0; i < n; i++) {
+ /* Stop early (without completing NAPI) once a full
+  * batch from one peek has been handled.
+  */
+ if (i == TILE_NET_BATCH)
+ goto done;
+ if (tile_net_handle_packet(info, idesc + i)) {
+ /* Budget exhausted: leave NAPI scheduled. */
+ if (++work >= budget)
+ goto done;
+ }
+ }
+ }
+
+ /* There are no packets left. */
+ napi_complete(&info->napi);
+
+ /* Re-enable hypervisor interrupts. */
+ gxio_mpipe_enable_notif_ring_interrupt(&context, info->iqueue.ring);
+
+ /* HACK: Avoid the "rotting packet" problem. */
+ if (gxio_mpipe_iqueue_try_peek(&info->iqueue, &idesc) > 0)
+ napi_schedule(&info->napi);
+
+ /* ISSUE: Handle completions? */
+
+done:
+ /* Replace the buffers consumed by the packets delivered above. */
+ tile_net_provide_needed_buffers(info);
+
+ return work;
+}
+
+/* Ingress interrupt handler: schedule NAPI polling on the current cpu.
+ * Neither argument is used.
+ */
+static irqreturn_t tile_net_handle_ingress_irq(int irq, void *unused)
+{
+	napi_schedule(&__get_cpu_var(per_cpu_info).napi);
+
+	return IRQ_HANDLED;
+}
+
+/* Free completed entries of "comps", oldest first.
+ *
+ * Stops at the first entry that "equeue" does not report complete, or
+ * once "limit" entries have been freed (a negative "limit" never
+ * matches, so it means "no limit").  The update flag passed to
+ * gxio_mpipe_equeue_is_complete() is set for the first entry checked,
+ * and for every entry when "force_update" is true.
+ *
+ * This must be called with interrupts blocked.  Returns the number of
+ * completions freed.
+ */
+static int tile_net_free_comps(gxio_mpipe_equeue_t *equeue,
+			       struct tile_net_comps *comps,
+			       int limit, bool force_update)
+{
+	int freed = 0;
+
+	while (comps->comp_last < comps->comp_next) {
+		unsigned int slot = comps->comp_last % TILE_NET_MAX_COMPS;
+		struct tile_net_comp *entry = &comps->comp_queue[slot];
+		bool update = force_update || freed == 0;
+
+		if (!gxio_mpipe_equeue_is_complete(equeue, entry->when,
+						   update))
+			break;
+
+		dev_kfree_skb_irq(entry->skb);
+		comps->comp_last++;
+		freed++;
+
+		if (freed == limit)
+			break;
+	}
+
+	return freed;
+}
+
+/* Record that "skb" may be freed once completion count "when" is
+ * reached, using the next slot of "comps".  "equeue" is not used.
+ *
+ * This must be called with interrupts blocked.
+ * tile_net_equeue_try_reserve() will have ensured a free completion entry.
+ */
+static void add_comp(gxio_mpipe_equeue_t *equeue,
+		     struct tile_net_comps *comps,
+		     uint64_t when, struct sk_buff *skb)
+{
+	int slot = comps->comp_next % TILE_NET_MAX_COMPS;
+	struct tile_net_comp *entry = &comps->comp_queue[slot];
+
+	entry->when = when;
+	entry->skb = skb;
+	comps->comp_next++;
+}
+
+/* Arm the device's tx wake timer to fire TX_TIMER_DELAY_USEC
+ * microseconds from now.
+ */
+static void tile_net_schedule_tx_wake_timer(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	ktime_t delay = ktime_set(0, TX_TIMER_DELAY_USEC * 1000UL);
+
+	hrtimer_start(&priv->tx_wake_timer, delay, HRTIMER_MODE_REL);
+}
+
+/* The "function" for "priv->tx_wake_timer".
+ *
+ * Wakes the device's tx queue if it is currently stopped.  The timer
+ * is one-shot.
+ */
+static enum hrtimer_restart tile_net_handle_tx_wake_timer(struct hrtimer *t)
+{
+	struct tile_net_priv *priv =
+		container_of(t, struct tile_net_priv, tx_wake_timer);
+	struct net_device *dev = priv->dev;
+
+	if (netif_queue_stopped(dev))
+		netif_wake_queue(dev);
+
+	return HRTIMER_NORESTART;
+}
+
+/* Arm this cpu's egress timer unless it is already armed.
+ *
+ * "Schedule if not scheduled" is used instead of the more obvious
+ * "reschedule" logic, because "reschedule" is fairly expensive.
+ */
+static void tile_net_schedule_egress_timer(struct tile_net_info *info)
+{
+	if (info->egress_timer_scheduled)
+		return;
+
+	hrtimer_start(&info->egress_timer,
+		      ktime_set(0, EGRESS_TIMER_DELAY_USEC * 1000UL),
+		      HRTIMER_MODE_REL);
+	info->egress_timer_scheduled = true;
+}
+
+/* The "function" for "info->egress_timer".
+ *
+ * With interrupts blocked, frees every completion that is done on
+ * each egress channel of the current cpu, and re-arms the timer if
+ * any completion is still outstanding afterwards.  The "t" argument
+ * is not used; the current cpu's per_cpu_info is used instead.
+ */
+static enum hrtimer_restart tile_net_handle_egress_timer(struct hrtimer *t)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	unsigned long irqflags;
+	bool pending = false;
+	int chan;
+
+	local_irq_save(irqflags);
+
+	/* The timer is no longer scheduled. */
+	info->egress_timer_scheduled = false;
+
+	/* Free all possible comps for this tile. */
+	for (chan = 0; chan < TILE_NET_CHANNELS; chan++) {
+		struct tile_net_comps *comps = info->comps_for_echannel[chan];
+
+		/* Only channels with outstanding completions need work. */
+		if (comps->comp_last < comps->comp_next) {
+			tile_net_free_comps(egress_for_echannel[chan].equeue,
+					    comps, -1, true);
+			if (comps->comp_last < comps->comp_next)
+				pending = true;
+		}
+	}
+
+	/* Reschedule timer if needed. */
+	if (pending)
+		tile_net_schedule_egress_timer(info);
+
+	local_irq_restore(irqflags);
+
+	return HRTIMER_NORESTART;
+}
+
+/* Helper function for "tile_net_update()"; runs on each cpu in turn.
+ *
+ * "arg" is the device being brought up or down, or NULL if all
+ * devices are now down.  With a device, NAPI is added and enabled (if
+ * not already) and the per-cpu ingress irq is enabled; with NULL, the
+ * irq is disabled and NAPI is disabled if it was enabled.  Cpus
+ * without an iqueue do nothing.
+ */
+static void tile_net_update_cpu(void *arg)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	struct net_device *dev = arg;
+
+	if (!info->has_iqueue)
+		return;
+
+	/* All devices are down: quiesce ingress on this cpu. */
+	if (dev == NULL) {
+		disable_percpu_irq(ingress_irq);
+		if (info->napi_enabled) {
+			napi_disable(&info->napi);
+			info->napi_enabled = false;
+		}
+		/* FIXME: Drain the iqueue. */
+		return;
+	}
+
+	/* At least one device is up: make sure ingress is running. */
+	if (!info->napi_added) {
+		netif_napi_add(dev, &info->napi,
+			       tile_net_poll, TILE_NET_WEIGHT);
+		info->napi_added = true;
+	}
+	if (!info->napi_enabled) {
+		napi_enable(&info->napi);
+		info->napi_enabled = true;
+	}
+	enable_percpu_irq(ingress_irq, 0);
+}
+
+/* Helper function for tile_net_open() and tile_net_stop().
+ * Always called under tile_net_devs_for_channel_mutex.
+ *
+ * Rebuilds the classifier rules from the channels that currently have
+ * a device, commits them, and then runs tile_net_update_cpu() on every
+ * online cpu.  Returns 0 on success, or -EIO if the rules could not
+ * be committed.
+ */
+static int tile_net_update(struct net_device *dev)
+{
+ static gxio_mpipe_rules_t rules; /* too big to fit on the stack */
+ bool saw_channel = false;
+ int channel;
+ int rc;
+ int cpu;
+
+ gxio_mpipe_rules_init(&rules, &context);
+
+ for (channel = 0; channel < TILE_NET_CHANNELS; channel++) {
+ if (tile_net_devs_for_channel[channel] == NULL)
+ continue;
+ /* Begin the (single) rule lazily, on the first active channel. */
+ if (!saw_channel) {
+ saw_channel = true;
+ gxio_mpipe_rules_begin(&rules, first_bucket,
+ num_buckets, NULL);
+ gxio_mpipe_rules_set_headroom(&rules, NET_IP_ALIGN);
+ }
+ gxio_mpipe_rules_add_channel(&rules, channel);
+ }
+
+ /* NOTE: This can fail if there is no classifier.
+ * ISSUE: Can anything else cause it to fail?
+ */
+ rc = gxio_mpipe_rules_commit(&rules);
+ if (rc != 0) {
+ netdev_warn(dev, "gxio_mpipe_rules_commit failed: %d\n", rc);
+ return -EIO;
+ }
+
+ /* Update all cpus, sequentially (to protect "netif_napi_add()").
+ * NULL is passed instead of "dev" when no channel has a device.
+ */
+ for_each_online_cpu(cpu)
+ smp_call_function_single(cpu, tile_net_update_cpu,
+ (saw_channel ? dev : NULL), 1);
+
+ /* HACK: Allow packets to flow in the simulator. */
+ if (saw_channel)
+ sim_enable_mpipe_links(0, -1);
+
+ return 0;
+}
+
+/* Allocate and initialize mpipe buffer stacks, and register them in
+ * the mPIPE TLBs, for both small and large packet sizes.
+ * This routine supports tile_net_init_mpipe(), below.
+ *
+ * "num_buffers" is the number of buffers each stack must be able to
+ * hold.  Returns 0 on success or a negative error code.  On failure
+ * nothing is undone here: the stack indices and memory recorded in
+ * the file-scope variables are left for tile_net_init_mpipe_fail().
+ */
+static int init_buffer_stacks(struct net_device *dev, int num_buffers)
+{
+	pte_t hash_pte = pte_set_home((pte_t) { 0 }, PAGE_HOME_HASH);
+	int rc;
+
+	/* Compute stack bytes; we round up to 64KB and then use
+	 * alloc_pages_exact(), which is expected to give us the
+	 * required 64KB alignment as well.
+	 */
+	buffer_stack_size =
+		ALIGN(gxio_mpipe_calc_buffer_stack_bytes(num_buffers),
+		      64 * 1024);
+
+	/* Allocate two buffer stack indices. */
+	rc = gxio_mpipe_alloc_buffer_stacks(&context, 2, 0, 0);
+	if (rc < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_buffer_stacks failed: %d\n",
+			   rc);
+		return rc;
+	}
+	small_buffer_stack = rc;
+	large_buffer_stack = rc + 1;
+
+	/* Allocate the small memory stack. */
+	small_buffer_stack_va =
+		alloc_pages_exact(buffer_stack_size, GFP_KERNEL);
+	if (small_buffer_stack_va == NULL) {
+		netdev_err(dev,
+			   "Could not alloc %zu bytes for buffer stacks\n",
+			   buffer_stack_size);
+		return -ENOMEM;
+	}
+	rc = gxio_mpipe_init_buffer_stack(&context, small_buffer_stack,
+					  GXIO_MPIPE_BUFFER_SIZE_128,
+					  small_buffer_stack_va,
+					  buffer_stack_size, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_init_buffer_stack failed: %d\n",
+			   rc);
+		return rc;
+	}
+	rc = gxio_mpipe_register_client_memory(&context, small_buffer_stack,
+					       hash_pte, 0);
+	if (rc != 0) {
+		netdev_err(dev,
+			   "gxio_mpipe_register_client_memory failed: %d\n",
+			   rc);
+		return rc;
+	}
+
+	/* Allocate the large buffer stack. */
+	large_buffer_stack_va =
+		alloc_pages_exact(buffer_stack_size, GFP_KERNEL);
+	if (large_buffer_stack_va == NULL) {
+		netdev_err(dev,
+			   "Could not alloc %zu bytes for buffer stacks\n",
+			   buffer_stack_size);
+		return -ENOMEM;
+	}
+	rc = gxio_mpipe_init_buffer_stack(&context, large_buffer_stack,
+					  GXIO_MPIPE_BUFFER_SIZE_1664,
+					  large_buffer_stack_va,
+					  buffer_stack_size, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_init_buffer_stack failed: %d\n",
+			   rc);
+		return rc;
+	}
+	rc = gxio_mpipe_register_client_memory(&context, large_buffer_stack,
+					       hash_pte, 0);
+	if (rc != 0) {
+		netdev_err(dev,
+			   "gxio_mpipe_register_client_memory failed: %d\n",
+			   rc);
+		return rc;
+	}
+
+	return 0;
+}
+
+/* Allocate per-cpu resources (memory for completions and idescs).
+ * This routine supports tile_net_init_mpipe(), below.
+ *
+ * Every cpu gets a zeroed COMPS_SIZE area for its completions; only
+ * cpus in "network_cpus_map" also get an iqueue bound to NotifRing
+ * "ring" ("ring" is ignored for other cpus).  Returns 0 on success or
+ * a negative error code; nothing is freed here on failure.
+ */
+static int alloc_percpu_mpipe_resources(struct net_device *dev,
+ int cpu, int ring)
+{
+ struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+ int order, i, rc;
+ struct page *page;
+ void *addr;
+
+ /* Allocate the "comps". */
+ order = get_order(COMPS_SIZE);
+ page = homecache_alloc_pages(GFP_KERNEL, order, cpu);
+ if (page == NULL) {
+ netdev_err(dev, "Failed to alloc %zd bytes comps memory\n",
+ COMPS_SIZE);
+ return -ENOMEM;
+ }
+ addr = pfn_to_kaddr(page_to_pfn(page));
+ memset(addr, 0, COMPS_SIZE);
+ /* Carve the area into one "struct tile_net_comps" per channel. */
+ for (i = 0; i < TILE_NET_CHANNELS; i++)
+ info->comps_for_echannel[i] =
+ addr + i * sizeof(struct tile_net_comps);
+
+ /* If this is a network cpu, create an iqueue. */
+ if (cpu_isset(cpu, network_cpus_map)) {
+ order = get_order(NOTIF_RING_SIZE);
+ page = homecache_alloc_pages(GFP_KERNEL, order, cpu);
+ if (page == NULL) {
+ netdev_err(dev,
+ "Failed to alloc %zd bytes iqueue memory\n",
+ NOTIF_RING_SIZE);
+ return -ENOMEM;
+ }
+ addr = pfn_to_kaddr(page_to_pfn(page));
+ rc = gxio_mpipe_iqueue_init(&info->iqueue, &context, ring,
+ addr, NOTIF_RING_SIZE, 0);
+ if (rc != 0) {
+ /* NOTE(review): "addr" is not freed here; the failure
+  * path frees info->iqueue.idescs instead, so confirm
+  * that field is set even when init fails.
+  */
+ netdev_err(dev,
+ "gxio_mpipe_iqueue_init failed: %d\n", rc);
+ return rc;
+ }
+ info->has_iqueue = true;
+ }
+
+ return 0;
+}
+
+/* Initialize NotifGroup and buckets.
+ * This routine supports tile_net_init_mpipe(), below.
+ *
+ * Allocates one NotifGroup, sizes and allocates the buckets (setting
+ * the file-scope "num_buckets" and "first_bucket"), then binds the
+ * group to the "network_cpus_count" rings starting at "ring".
+ * Returns 0 on success or a negative error code.
+ */
+static int init_notif_group_and_buckets(struct net_device *dev,
+					int ring, int network_cpus_count)
+{
+	int group;
+	int rc;
+
+	/* Allocate one NotifGroup. */
+	group = gxio_mpipe_alloc_notif_groups(&context, 1, 0, 0);
+	if (group < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_notif_groups failed: %d\n",
+			   group);
+		return group;
+	}
+
+	/* More network cpus get more buckets; otherwise keep the default. */
+	if (network_cpus_count > 4)
+		num_buckets = 256;
+	else if (network_cpus_count > 1)
+		num_buckets = 16;
+
+	/* Allocate some buckets, and set global first_bucket value. */
+	rc = gxio_mpipe_alloc_buckets(&context, num_buckets, 0, 0);
+	if (rc < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_buckets failed: %d\n", rc);
+		return rc;
+	}
+	first_bucket = rc;
+
+	/* Init group and buckets. */
+	rc = gxio_mpipe_init_notif_group_and_buckets(
+		&context, group, ring, network_cpus_count,
+		first_bucket, num_buckets,
+		GXIO_MPIPE_BUCKET_STICKY_FLOW_LOCALITY);
+	if (rc == 0)
+		return 0;
+
+	netdev_err(dev,
+		   "gxio_mpipe_init_notif_group_and_buckets failed: %d\n",
+		   rc);
+	return rc;
+}
+
+/* Create an irq and register it, then activate the irq and request
+ * interrupts on all cores that have an iqueue.  Note that
+ * "ingress_irq" being initialized is how we know not to call
+ * tile_net_init_mpipe() again.
+ * This routine supports tile_net_init_mpipe(), below.
+ *
+ * Returns 0 on success or a negative error code; if request_irq()
+ * fails the irq is destroyed and "ingress_irq" is reset to -1.
+ */
+static int tile_net_setup_interrupts(struct net_device *dev)
+{
+	int cpu;
+	int rc;
+
+	rc = create_irq();
+	if (rc < 0) {
+		netdev_err(dev, "create_irq failed: %d\n", rc);
+		return rc;
+	}
+	ingress_irq = rc;
+
+	tile_irq_activate(ingress_irq, TILE_IRQ_PERCPU);
+
+	rc = request_irq(ingress_irq, tile_net_handle_ingress_irq,
+			 0, NULL, NULL);
+	if (rc != 0) {
+		netdev_err(dev, "request_irq failed: %d\n", rc);
+		destroy_irq(ingress_irq);
+		ingress_irq = -1;
+		return rc;
+	}
+
+	/* Route each iqueue's NotifRing interrupt to its own cpu. */
+	for_each_online_cpu(cpu) {
+		struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+
+		if (!info->has_iqueue)
+			continue;
+
+		gxio_mpipe_request_notif_ring_interrupt(
+			&context, cpu_x(cpu), cpu_y(cpu),
+			1, ingress_irq, info->iqueue.ring);
+	}
+
+	return 0;
+}
+
+/* Undo any state set up partially by a failed call to tile_net_init_mpipe.
+ *
+ * Order matters: buffers are popped while the mpipe context is still
+ * valid, then the context is destroyed, and only then is the memory
+ * freed.
+ * NOTE(review): info->has_iqueue and comps_for_echannel[1..] are not
+ * reset here; confirm nothing relies on them after a failed init.
+ */
+static void tile_net_init_mpipe_fail(void)
+{
+ int cpu;
+
+ /* Do cleanups that require the mpipe context first. */
+ if (small_buffer_stack >= 0)
+ tile_net_pop_all_buffers(small_buffer_stack);
+ if (large_buffer_stack >= 0)
+ tile_net_pop_all_buffers(large_buffer_stack);
+
+ /* Destroy mpipe context so the hardware no longer owns any memory. */
+ gxio_mpipe_destroy(&context);
+
+ for_each_online_cpu(cpu) {
+ struct tile_net_info *info = &per_cpu(per_cpu_info, cpu);
+ /* Entry [0] is the start of the whole per-cpu comps area. */
+ free_pages((unsigned long)(info->comps_for_echannel[0]),
+ get_order(COMPS_SIZE));
+ info->comps_for_echannel[0] = NULL;
+ free_pages((unsigned long)(info->iqueue.idescs),
+ get_order(NOTIF_RING_SIZE));
+ info->iqueue.idescs = NULL;
+ }
+
+ if (small_buffer_stack_va)
+ free_pages_exact(small_buffer_stack_va, buffer_stack_size);
+ if (large_buffer_stack_va)
+ free_pages_exact(large_buffer_stack_va, buffer_stack_size);
+
+ /* Return the file-scope state to its "uninitialized" values. */
+ small_buffer_stack_va = NULL;
+ large_buffer_stack_va = NULL;
+ large_buffer_stack = -1;
+ small_buffer_stack = -1;
+ first_bucket = -1;
+}
+
+/* The first time any tilegx network device is opened, we initialize
+ * the global mpipe state. If this step fails, we fail to open the
+ * device, but if it succeeds, we never need to do it again, and since
+ * tile_net can't be unloaded, we never undo it.
+ *
+ * Note that some resources in this path (buffer stack indices,
+ * bindings from init_buffer_stack, etc.) are hypervisor resources
+ * that are freed implicitly by gxio_mpipe_destroy().
+ *
+ * Returns 0 on success or a negative error code.  Once the mpipe
+ * context exists, any later failure is unwound through
+ * tile_net_init_mpipe_fail().
+ */
+static int tile_net_init_mpipe(struct net_device *dev)
+{
+	int i, num_buffers, rc;
+	int cpu;
+	int first_ring, ring;
+	int network_cpus_count = cpus_weight(network_cpus_map);
+
+	if (!hash_default) {
+		netdev_err(dev, "Networking requires hash_default!\n");
+		return -EIO;
+	}
+
+	rc = gxio_mpipe_init(&context, 0);
+	if (rc != 0) {
+		netdev_err(dev, "gxio_mpipe_init failed: %d\n", rc);
+		return -EIO;
+	}
+
+	/* Set up the buffer stacks.  Each network cpu may hold a full
+	 * iqueue plus one batch of idescs before it provides new buffers.
+	 */
+	num_buffers =
+		network_cpus_count * (IQUEUE_ENTRIES + TILE_NET_BATCH);
+	rc = init_buffer_stacks(dev, num_buffers);
+	if (rc != 0)
+		goto fail;
+
+	/* Provide initial buffers. */
+	rc = -ENOMEM;
+	for (i = 0; i < num_buffers; i++) {
+		if (!tile_net_provide_buffer(true)) {
+			netdev_err(dev, "Cannot allocate initial sk_bufs!\n");
+			goto fail;
+		}
+	}
+	for (i = 0; i < num_buffers; i++) {
+		if (!tile_net_provide_buffer(false)) {
+			netdev_err(dev, "Cannot allocate initial sk_bufs!\n");
+			goto fail;
+		}
+	}
+
+	/* Allocate one NotifRing for each network cpu. */
+	rc = gxio_mpipe_alloc_notif_rings(&context, network_cpus_count, 0, 0);
+	if (rc < 0) {
+		netdev_err(dev, "gxio_mpipe_alloc_notif_rings failed %d\n",
+			   rc);
+		goto fail;
+	}
+
+	/* Init NotifRings per-cpu. */
+	first_ring = rc;
+	ring = first_ring;
+	for_each_online_cpu(cpu) {
+		rc = alloc_percpu_mpipe_resources(dev, cpu, ring);
+		if (rc != 0)
+			goto fail;
+		/* Only network cpus bind an iqueue to a ring, so only
+		 * they may consume a ring number; otherwise the rings
+		 * handed out would run past the ones allocated above.
+		 */
+		if (cpu_isset(cpu, network_cpus_map))
+			ring++;
+	}
+
+	/* Initialize NotifGroup and buckets. */
+	rc = init_notif_group_and_buckets(dev, first_ring, network_cpus_count);
+	if (rc != 0)
+		goto fail;
+
+	/* Create and enable interrupts. */
+	rc = tile_net_setup_interrupts(dev);
+	if (rc != 0)
+		goto fail;
+
+	return 0;
+
+fail:
+	tile_net_init_mpipe_fail();
+	return rc;
+}
+
+/* Create persistent egress info for a given egress channel.
+ * Note that this may be shared between, say, "gbe0" and "xgbe0".
+ * ISSUE: Defer header allocation until TSO is actually needed?
+ *
+ * Returns 0 on success (including when the channel was already
+ * initialized) or a negative error code.  On failure the memory
+ * allocated here is released through the fail_* labels in reverse
+ * order of allocation.
+ */
+static int tile_net_init_egress(struct net_device *dev, int echannel)
+{
+ struct page *headers_page, *edescs_page, *equeue_page;
+ gxio_mpipe_edesc_t *edescs;
+ gxio_mpipe_equeue_t *equeue;
+ unsigned char *headers;
+ int headers_order, edescs_order, equeue_order;
+ size_t edescs_size;
+ int edma;
+ int rc = -ENOMEM;
+
+ /* Only initialize once. */
+ if (egress_for_echannel[echannel].equeue != NULL)
+ return 0;
+
+ /* Allocate memory for the "headers". */
+ headers_order = get_order(EQUEUE_ENTRIES * HEADER_BYTES);
+ headers_page = alloc_pages(GFP_KERNEL, headers_order);
+ if (headers_page == NULL) {
+ netdev_warn(dev,
+ "Could not alloc %zd bytes for TSO headers.\n",
+ PAGE_SIZE << headers_order);
+ goto fail;
+ }
+ headers = pfn_to_kaddr(page_to_pfn(headers_page));
+
+ /* Allocate memory for the "edescs". */
+ edescs_size = EQUEUE_ENTRIES * sizeof(*edescs);
+ edescs_order = get_order(edescs_size);
+ edescs_page = alloc_pages(GFP_KERNEL, edescs_order);
+ if (edescs_page == NULL) {
+ netdev_warn(dev,
+ "Could not alloc %zd bytes for eDMA ring.\n",
+ edescs_size);
+ goto fail_headers;
+ }
+ edescs = pfn_to_kaddr(page_to_pfn(edescs_page));
+
+ /* Allocate memory for the "equeue". */
+ equeue_order = get_order(sizeof(*equeue));
+ equeue_page = alloc_pages(GFP_KERNEL, equeue_order);
+ if (equeue_page == NULL) {
+ netdev_warn(dev,
+ "Could not alloc %zd bytes for equeue info.\n",
+ PAGE_SIZE << equeue_order);
+ goto fail_edescs;
+ }
+ equeue = pfn_to_kaddr(page_to_pfn(equeue_page));
+
+ /* Allocate an edma ring. Note that in practice this can't
+ * fail, which is good, because we will leak an edma ring if so.
+ */
+ rc = gxio_mpipe_alloc_edma_rings(&context, 1, 0, 0);
+ if (rc < 0) {
+ netdev_warn(dev, "gxio_mpipe_alloc_edma_rings failed: %d\n",
+ rc);
+ goto fail_equeue;
+ }
+ edma = rc;
+
+ /* Initialize the equeue. If this fails, the edma ring allocated
+ * just above is not released by the cleanup labels below.
+ */
+ rc = gxio_mpipe_equeue_init(equeue, &context, edma, echannel,
+ edescs, edescs_size, 0);
+ if (rc != 0) {
+ netdev_err(dev, "gxio_mpipe_equeue_init failed: %d\n", rc);
+ goto fail_equeue;
+ }
+
+ /* Done. Publish the equeue last-allocated state for this channel. */
+ egress_for_echannel[echannel].equeue = equeue;
+ egress_for_echannel[echannel].headers = headers;
+ return 0;
+
+fail_equeue:
+ __free_pages(equeue_page, equeue_order);
+
+fail_edescs:
+ __free_pages(edescs_page, edescs_order);
+
+fail_headers:
+ __free_pages(headers_page, headers_order);
+
+fail:
+ return rc;
+}
+
+/* Open the mPIPE link called "link_name" and look up its channel.
+ *
+ * Returns the channel number (0 <= channel < TILE_NET_CHANNELS) on
+ * success.  If the link cannot be opened, the negative value from
+ * gxio_mpipe_link_open() is returned; if the reported channel is out of
+ * range the link is closed again and -EINVAL is returned.
+ */
+static int tile_net_link_open(struct net_device *dev, gxio_mpipe_link_t *link,
+			      const char *link_name)
+{
+	int channel;
+	int rc;
+
+	rc = gxio_mpipe_link_open(link, &context, link_name, 0);
+	if (rc < 0) {
+		netdev_err(dev, "Failed to open '%s'\n", link_name);
+		return rc;
+	}
+
+	channel = gxio_mpipe_link_channel(link);
+	if (channel >= 0 && channel < TILE_NET_CHANNELS)
+		return channel;
+
+	/* The channel cannot be used as an index; undo the open. */
+	netdev_err(dev, "gxio_mpipe_link_channel bad value: %d\n", channel);
+	gxio_mpipe_link_close(link);
+	return -EINVAL;
+}
+
+/* Help the kernel activate the given network interface.
+ *
+ * Runs the one-time mPIPE initialization if it has not happened yet
+ * (ingress_irq < 0), opens the link(s) for this device, sets up egress
+ * state for its egress channel, publishes the device in
+ * tile_net_devs_for_channel[] and calls tile_net_update().  Everything
+ * up to that point happens under tile_net_devs_for_channel_mutex.
+ *
+ * Returns 0 on success.  On failure every link opened here is closed
+ * again, the channel fields are reset to -1 and a negative error code
+ * is returned (values at or below -512 are reported as -EIO).
+ */
+static int tile_net_open(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	int rc;
+
+	mutex_lock(&tile_net_devs_for_channel_mutex);
+
+	/* Do one-time initialization the first time any device is opened. */
+	if (ingress_irq < 0) {
+		rc = tile_net_init_mpipe(dev);
+		if (rc != 0)
+			goto fail;
+	}
+
+	/* Determine if this is the "loopify" device. */
+	if (unlikely((loopify_link_name != NULL) &&
+		     !strcmp(dev->name, loopify_link_name))) {
+		rc = tile_net_link_open(dev, &priv->link, "loop0");
+		if (rc < 0)
+			goto fail;
+		priv->channel = rc;
+		rc = tile_net_link_open(dev, &priv->loopify_link, "loop1");
+		if (rc < 0)
+			goto fail;
+		priv->loopify_channel = rc;
+		priv->echannel = rc;
+	} else {
+		rc = tile_net_link_open(dev, &priv->link, dev->name);
+		if (rc < 0)
+			goto fail;
+		priv->channel = rc;
+		priv->echannel = rc;
+	}
+
+	/* Initialize egress info (if needed).  Once ever, per echannel. */
+	rc = tile_net_init_egress(dev, priv->echannel);
+	if (rc != 0)
+		goto fail;
+
+	tile_net_devs_for_channel[priv->channel] = dev;
+
+	rc = tile_net_update(dev);
+	if (rc != 0)
+		goto fail;
+
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	netif_start_queue(dev);
+	netif_carrier_on(dev);
+	return 0;
+
+fail:
+	/* Unpublish the device while priv->channel is still a valid index.
+	 * This has to happen before the field is reset to -1 below;
+	 * otherwise the store would land at tile_net_devs_for_channel[-1].
+	 * Only clear the slot if it really refers to this device.
+	 */
+	if (priv->channel >= 0 &&
+	    tile_net_devs_for_channel[priv->channel] == dev)
+		tile_net_devs_for_channel[priv->channel] = NULL;
+
+	if (priv->loopify_channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->loopify_link) != 0)
+			netdev_warn(dev, "Failed to close loopify link!\n");
+		priv->loopify_channel = -1;
+	}
+	if (priv->channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->link) != 0)
+			netdev_warn(dev, "Failed to close link!\n");
+		priv->channel = -1;
+	}
+	priv->echannel = -1;
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	/* Don't return raw gxio error codes to generic Linux. */
+	return (rc > -512) ? rc : -EIO;
+}
+
+/* Help the kernel deactivate the given network interface.
+ *
+ * Stops the transmit queue, then, under tile_net_devs_for_channel_mutex,
+ * removes the device from tile_net_devs_for_channel[], calls
+ * tile_net_update() (ignoring its result), closes whichever links are
+ * open and resets the channel fields to -1.  Always returns 0.
+ */
+static int tile_net_stop(struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+
+	netif_stop_queue(dev);
+
+	mutex_lock(&tile_net_devs_for_channel_mutex);
+
+	/* Unpublish the device, then let tile_net_update() run. */
+	tile_net_devs_for_channel[priv->channel] = NULL;
+	(void)tile_net_update(dev);
+
+	/* Close the loopify link first, if there is one. */
+	if (priv->loopify_channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->loopify_link))
+			netdev_warn(dev, "Failed to close loopify link!\n");
+		priv->loopify_channel = -1;
+	}
+
+	/* Then the main link. */
+	if (priv->channel >= 0) {
+		if (gxio_mpipe_link_close(&priv->link))
+			netdev_warn(dev, "Failed to close link!\n");
+		priv->channel = -1;
+	}
+
+	priv->echannel = -1;
+
+	mutex_unlock(&tile_net_devs_for_channel_mutex);
+
+	return 0;
+}
+
+/* Return the kernel virtual address of the data in fragment "f":
+ * the address of the fragment's page plus the fragment's page_offset.
+ */
+static inline void *tile_net_frag_buf(skb_frag_t *f)
+{
+	return pfn_to_kaddr(page_to_pfn(skb_frag_page(f))) + f->page_offset;
+}
+
+/* Reserve "num_edescs" consecutive egress slots, making sure a
+ * completion entry is available as well.
+ *
+ * Returns the first reserved slot (>= 0) on success.  If no completion
+ * entry or no egress slots can be obtained, the queue is stopped, the
+ * tx_wake timer is scheduled and -1 is returned.
+ */
+static s64 tile_net_equeue_try_reserve(struct net_device *dev,
+				       struct tile_net_comps *comps,
+				       gxio_mpipe_equeue_t *equeue,
+				       int num_edescs)
+{
+	s64 slot;
+
+	/* No completion entry left, and freeing some did not help. */
+	if (comps->comp_next - comps->comp_last >= TILE_NET_MAX_COMPS - 1 &&
+	    tile_net_free_comps(equeue, comps, 32, false) == 0)
+		goto busy;
+
+	/* First attempt at the egress slots. */
+	slot = gxio_mpipe_equeue_try_reserve(equeue, num_edescs);
+	if (slot >= 0)
+		return slot;
+
+	/* Freeing some completions gives the equeue time to drain. */
+	tile_net_free_comps(equeue, comps, TILE_NET_MAX_COMPS, false);
+
+	/* Second and last attempt. */
+	slot = gxio_mpipe_equeue_try_reserve(equeue, num_edescs);
+	if (slot >= 0)
+		return slot;
+
+busy:
+	/* Still nothing; give up and stop the queue for a short while. */
+	netif_stop_queue(dev);
+	tile_net_schedule_tx_wake_timer(dev);
+	return -1;
+}
+
+/* Determine how many edesc's are needed for TSO.
+ *
+ * Each of the sh->gso_segs segments needs one edesc for its header, plus
+ * one edesc per contiguous piece of payload; a new piece starts whenever
+ * the payload crosses into the next entry of sh->frags[].
+ *
+ * Sometimes, if "sendfile()" requires copying, we will be called with
+ * "data" containing the header and payload, with "frags" being empty.
+ * Sometimes, for example when using NFS over TCP, a single segment can
+ * span 3 fragments. This requires special care.
+ *
+ * NOTE(review): the walk below advances f_id without checking it against
+ * sh->nr_frags, so it presumably relies on the fragments covering all of
+ * the payload -- confirm for the "frags being empty" case above.
+ *
+ * Returns the total number of edescs (headers plus payload pieces).
+ */
+static int tso_count_edescs(struct sk_buff *skb)
+{
+ struct skb_shared_info *sh = skb_shinfo(skb);
+ unsigned int len = skb->len;
+ unsigned int p_len = sh->gso_size;
+ /* All three start at -1 so that the first pass through the
+ * "advance" loop below moves to fragment 0.
+ */
+ long f_id = -1; /* id of the current fragment */
+ long f_size = -1; /* size of the current fragment */
+ long f_used = -1; /* bytes used from the current fragment */
+ long n; /* size of the current piece of payload */
+ int num_edescs = 0;
+ int segment;
+
+ for (segment = 0; segment < sh->gso_segs; segment++) {
+
+ /* Payload bytes of this segment accounted for so far. */
+ unsigned int p_used = 0;
+
+ /* The last segment may be less than gso_size. */
+ len -= p_len;
+ if (len < p_len)
+ p_len = len;
+
+ /* One edesc for header and for each piece of the payload.
+ * The loop initializer counts the header; every iteration
+ * counts one payload piece.
+ */
+ for (num_edescs++; p_used < p_len; num_edescs++) {
+
+ /* Advance as needed. */
+ while (f_used >= f_size) {
+ f_id++;
+ f_size = sh->frags[f_id].size;
+ f_used = 0;
+ }
+
+ /* Use bytes from the current fragment: the piece is
+ * limited by what is left of the segment and by what
+ * is left of the fragment.
+ */
+ n = p_len - p_used;
+ if (n > f_size - f_used)
+ n = f_size - f_used;
+ f_used += n;
+ p_used += n;
+ }
+ }
+
+ return num_edescs;
+}
+
+/* Prepare modified copies of the skbuff headers.
+ *
+ * For every one of the sh->gso_segs segments, the first sh_len bytes of
+ * skb->data (everything up to the end of the TCP header) are copied into
+ * the header area entry selected by the segment's header slot, and the
+ * copy's IP total length, IP id, IP checksum, TCP sequence number and
+ * TCP checksum field are rewritten for that segment.
+ *
+ * "slot" is the first reserved egress slot.  It is advanced once for
+ * each header and once for each payload piece, using the same fragment
+ * walk as tso_count_edescs() and tso_egress(), so that each header copy
+ * lands in the entry matching the slot its edesc will occupy.
+ *
+ * FIXME: add support for IPv6.
+ */
+static void tso_headers_prepare(struct sk_buff *skb, unsigned char *headers,
+ s64 slot)
+{
+ struct skb_shared_info *sh = skb_shinfo(skb);
+ struct iphdr *ih;
+ struct tcphdr *th;
+ unsigned int len = skb->len;
+ unsigned char *data = skb->data;
+ unsigned int ih_off, th_off, sh_len, total_len, p_len;
+ unsigned int isum_start, tsum_start, id, seq;
+ /* All three start at -1 so that the first pass through the
+ * "advance" loop below moves to fragment 0.
+ */
+ long f_id = -1; /* id of the current fragment */
+ long f_size = -1; /* size of the current fragment */
+ long f_used = -1; /* bytes used from the current fragment */
+ long n; /* size of the current piece of payload */
+ int segment;
+
+ /* Locate original headers and compute various lengths.
+ * ih_off/th_off are offsets from skb->data; sh_len is the number of
+ * header bytes copied per segment.
+ */
+ ih = ip_hdr(skb);
+ th = tcp_hdr(skb);
+ ih_off = (unsigned char *)ih - data;
+ th_off = (unsigned char *)th - data;
+ sh_len = th_off + tcp_hdrlen(skb);
+ p_len = sh->gso_size;
+ total_len = p_len + sh_len;
+
+ /* Set up seed values for IP and TCP csum and initialize id and seq.
+ * The IP seed takes the original check, tot_len and id fields out,
+ * so that per-segment values can be added back in below.
+ * NOTE(review): the TCP value written below is presumably a seed
+ * that the hardware checksum completes -- confirm.
+ */
+ isum_start = ((0xFFFF - ih->check) +
+ (0xFFFF - ih->tot_len) +
+ (0xFFFF - ih->id));
+ tsum_start = th->check + (0xFFFF ^ htons(len));
+ id = ntohs(ih->id);
+ seq = ntohl(th->seq);
+
+ /* Prepare all the headers. */
+ for (segment = 0; segment < sh->gso_segs; segment++) {
+ unsigned char *buf;
+ unsigned int p_used = 0;
+
+ /* The last segment may be less than gso_size. */
+ len -= p_len;
+ if (len < p_len) {
+ p_len = len;
+ total_len = p_len + sh_len;
+ }
+
+ /* Copy to the header memory for this segment. */
+ buf = headers + (slot % EQUEUE_ENTRIES) * HEADER_BYTES +
+ NET_IP_ALIGN;
+ memcpy(buf, data, sh_len);
+
+ /* Update copied ip header. */
+ ih = (struct iphdr *)(buf + ih_off);
+ ih->tot_len = htons(total_len - ih_off);
+ ih->id = htons(id);
+ ih->check = csum_long(isum_start + htons(total_len - ih_off) +
+ htons(id)) ^ 0xffff;
+
+ /* Update copied tcp header.  FIN and PSH are cleared on every
+ * segment except the last one.
+ */
+ th = (struct tcphdr *)(buf + th_off);
+ th->seq = htonl(seq);
+ th->check = csum_long(tsum_start + htons(total_len));
+ if (segment != sh->gso_segs - 1) {
+ th->fin = 0;
+ th->psh = 0;
+ }
+
+ /* Skip past the header. */
+ slot++;
+
+ /* Skip past the payload: one slot per payload piece. */
+ while (p_used < p_len) {
+
+ /* Advance as needed. */
+ while (f_used >= f_size) {
+ f_id++;
+ f_size = sh->frags[f_id].size;
+ f_used = 0;
+ }
+
+ /* Use bytes from the current fragment. */
+ n = p_len - p_used;
+ if (n > f_size - f_used)
+ n = f_size - f_used;
+ f_used += n;
+ p_used += n;
+
+ slot++;
+ }
+
+ /* The next segment gets the next IP id and a sequence number
+ * advanced by this segment's payload length.
+ */
+ id++;
+ seq += p_len;
+ }
+
+ /* Flush the headers so they are ready for hardware DMA. */
+ wmb();
+}
+
+/* Pass all the data to mpipe for egress.
+ *
+ * Starting at "slot", writes one header edesc per segment (pointing at
+ * the matching entry of "headers") followed by one body edesc per
+ * payload piece (pointing into the skb fragments).  The last body edesc
+ * of each segment has "bound" set.  Afterwards the device's tx_packets
+ * and tx_bytes statistics are updated.
+ */
+static void tso_egress(struct net_device *dev, gxio_mpipe_equeue_t *equeue,
+ struct sk_buff *skb, unsigned char *headers, s64 slot)
+{
+ struct tile_net_priv *priv = netdev_priv(dev);
+ struct skb_shared_info *sh = skb_shinfo(skb);
+ unsigned int len = skb->len;
+ unsigned int p_len = sh->gso_size;
+ gxio_mpipe_edesc_t edesc_head = { { 0 } };
+ gxio_mpipe_edesc_t edesc_body = { { 0 } };
+ /* All three start at -1 so that the first pass through the
+ * "advance" loop below moves to fragment 0.
+ */
+ long f_id = -1; /* id of the current fragment */
+ long f_size = -1; /* size of the current fragment */
+ long f_used = -1; /* bytes used from the current fragment */
+ long n; /* size of the current piece of payload */
+ unsigned long tx_packets = 0, tx_bytes = 0;
+ unsigned int csum_start, sh_len;
+ int segment;
+
+ /* Prepare to egress the headers: set up header edesc.  Checksum
+ * insertion is requested on the header edesc only.
+ */
+ csum_start = skb_checksum_start_offset(skb);
+ sh_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
+ edesc_head.csum = 1;
+ edesc_head.csum_start = csum_start;
+ edesc_head.csum_dest = csum_start + skb->csum_offset;
+ edesc_head.xfer_size = sh_len;
+
+ /* This is only used to specify the TLB. */
+ edesc_head.stack_idx = large_buffer_stack;
+ edesc_body.stack_idx = large_buffer_stack;
+
+ /* Egress all the edescs. */
+ for (segment = 0; segment < sh->gso_segs; segment++) {
+ void *va;
+ unsigned char *buf;
+ unsigned int p_used = 0;
+
+ /* The last segment may be less than gso_size. */
+ len -= p_len;
+ if (len < p_len)
+ p_len = len;
+
+ /* Egress the header, using the same address computation as
+ * tso_headers_prepare().
+ */
+ buf = headers + (slot % EQUEUE_ENTRIES) * HEADER_BYTES +
+ NET_IP_ALIGN;
+ edesc_head.va = va_to_tile_io_addr(buf);
+ gxio_mpipe_equeue_put_at(equeue, edesc_head, slot);
+ slot++;
+
+ /* Egress the payload. */
+ while (p_used < p_len) {
+
+ /* Advance as needed. */
+ while (f_used >= f_size) {
+ f_id++;
+ f_size = sh->frags[f_id].size;
+ f_used = 0;
+ }
+
+ /* Address of the first unused byte of the fragment. */
+ va = tile_net_frag_buf(&sh->frags[f_id]) + f_used;
+
+ /* Use bytes from the current fragment. */
+ n = p_len - p_used;
+ if (n > f_size - f_used)
+ n = f_size - f_used;
+ f_used += n;
+ p_used += n;
+
+ /* Egress a piece of the payload; "bound" is set on the
+ * piece that completes this segment.
+ */
+ edesc_body.va = va_to_tile_io_addr(va);
+ edesc_body.xfer_size = n;
+ edesc_body.bound = !(p_used < p_len);
+ gxio_mpipe_equeue_put_at(equeue, edesc_body, slot);
+ slot++;
+ }
+
+ /* Each segment counts as one packet of header + payload bytes. */
+ tx_packets++;
+ tx_bytes += sh_len + p_len;
+ }
+
+ /* Update stats. */
+ tile_net_stats_add(tx_packets, &priv->stats.tx_packets);
+ tile_net_stats_add(tx_bytes, &priv->stats.tx_bytes);
+}
+
+/* Transmit a GSO skb by segmenting it onto the egress queue.
+ *
+ * Counts the edescs required, reserves that many egress slots with
+ * interrupts disabled, prepares the per-segment header copies, queues
+ * all edescs and records a completion for the last slot.
+ *
+ * Returns NETDEV_TX_OK on success, or NETDEV_TX_BUSY if the slots could
+ * not be reserved.
+ */
+static int tile_net_tx_tso(struct sk_buff *skb, struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	int echannel = priv->echannel;
+	struct tile_net_egress *egress = &egress_for_echannel[echannel];
+	struct tile_net_comps *comps = info->comps_for_echannel[echannel];
+	gxio_mpipe_equeue_t *equeue = egress->equeue;
+	unsigned long flags;
+	int edescs_needed;
+	s64 first_slot;
+
+	/* Determine how many mpipe edesc's are needed. */
+	edescs_needed = tso_count_edescs(skb);
+
+	local_irq_save(flags);
+
+	first_slot = tile_net_equeue_try_reserve(dev, comps, equeue,
+						 edescs_needed);
+	if (first_slot >= 0) {
+		/* Set up copies of header data properly. */
+		tso_headers_prepare(skb, egress->headers, first_slot);
+
+		/* Actually pass the data to the network hardware. */
+		tso_egress(dev, equeue, skb, egress->headers, first_slot);
+
+		/* The completion is tied to the last slot used. */
+		add_comp(equeue, comps, first_slot + edescs_needed - 1, skb);
+	}
+
+	local_irq_restore(flags);
+
+	if (first_slot < 0)
+		return NETDEV_TX_BUSY;
+
+	/* Make sure the egress timer is scheduled. */
+	tile_net_schedule_egress_timer(info);
+
+	return NETDEV_TX_OK;
+}
+
+/* Fill "frags" with the pieces of a transmit request.
+ *
+ * The linear part (b_data/b_len) comes first, but only if b_len is
+ * non-zero; it is followed by one entry per page fragment of the skb.
+ * Returns the number of entries written.
+ */
+static unsigned int tile_net_tx_frags(struct frag *frags,
+				       struct sk_buff *skb,
+				       void *b_data, unsigned int b_len)
+{
+	struct skb_shared_info *sh = skb_shinfo(skb);
+	struct frag *out = frags;
+	unsigned int i;
+
+	if (b_len != 0) {
+		out->buf = b_data;
+		out->length = b_len;
+		out++;
+	}
+
+	for (i = 0; i < sh->nr_frags; i++, out++) {
+		skb_frag_t *f = &sh->frags[i];
+
+		out->buf = tile_net_frag_buf(f);
+		out->length = skb_frag_size(f);
+	}
+
+	return out - frags;
+}
+
+/* Help the kernel transmit a packet.
+ *
+ * GSO packets are handed to tile_net_tx_tso().  For everything else one
+ * edesc is built per piece of the skb (linear part plus page fragments),
+ * the last edesc is marked "bound", checksum information is attached to
+ * the first edesc for CHECKSUM_PARTIAL packets, and the edescs are
+ * queued into freshly reserved egress slots with interrupts disabled.
+ *
+ * Returns NETDEV_TX_OK on success, or NETDEV_TX_BUSY if no egress slots
+ * could be reserved.
+ */
+static int tile_net_tx(struct sk_buff *skb, struct net_device *dev)
+{
+	struct tile_net_priv *priv = netdev_priv(dev);
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+	int echannel = priv->echannel;
+	struct tile_net_egress *egress = &egress_for_echannel[echannel];
+	struct tile_net_comps *comps = info->comps_for_echannel[echannel];
+	gxio_mpipe_equeue_t *equeue = egress->equeue;
+	unsigned int len = skb->len;
+	struct frag frags[MAX_FRAGS];
+	gxio_mpipe_edesc_t edescs[MAX_FRAGS];
+	gxio_mpipe_edesc_t tmpl = { { 0 } };
+	unsigned int num_edescs;
+	unsigned long flags;
+	unsigned int i;
+	s64 slot;
+
+	/* Save the timestamp. */
+	dev->trans_start = jiffies;
+
+	if (skb_is_gso(skb))
+		return tile_net_tx_tso(skb, dev);
+
+	num_edescs = tile_net_tx_frags(frags, skb, skb->data,
+				       skb_headlen(skb));
+
+	/* This is only used to specify the TLB. */
+	tmpl.stack_idx = large_buffer_stack;
+
+	/* Prepare the edescs from the template. */
+	for (i = 0; i < num_edescs; i++) {
+		edescs[i] = tmpl;
+		edescs[i].xfer_size = frags[i].length;
+		edescs[i].va = va_to_tile_io_addr(frags[i].buf);
+	}
+
+	/* Mark the final edesc. */
+	edescs[num_edescs - 1].bound = 1;
+
+	/* Add checksum info to the initial edesc, if needed. */
+	if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		unsigned int csum_start = skb_checksum_start_offset(skb);
+
+		edescs[0].csum = 1;
+		edescs[0].csum_start = csum_start;
+		edescs[0].csum_dest = csum_start + skb->csum_offset;
+	}
+
+	local_irq_save(flags);
+
+	/* Set first reserved egress slot. */
+	slot = tile_net_equeue_try_reserve(dev, comps, equeue, num_edescs);
+	if (slot < 0) {
+		local_irq_restore(flags);
+		return NETDEV_TX_BUSY;
+	}
+
+	for (i = 0; i < num_edescs; i++)
+		gxio_mpipe_equeue_put_at(equeue, edescs[i], slot + i);
+
+	/* Add a completion record for the last slot used. */
+	add_comp(equeue, comps, slot + num_edescs - 1, skb);
+
+	/* NOTE: Use ETH_ZLEN for short packets (e.g. 42 < 60). */
+	tile_net_stats_add(1, &priv->stats.tx_packets);
+	tile_net_stats_add(max_t(unsigned int, len, ETH_ZLEN),
+			   &priv->stats.tx_bytes);
+
+	local_irq_restore(flags);
+
+	/* Make sure the egress timer is scheduled. */
+	tile_net_schedule_egress_timer(info);
+
+	return NETDEV_TX_OK;
+}
+
+/* Deal with a transmit timeout.
+ *
+ * Nothing is reset here; the handler only calls netif_wake_queue() on
+ * the device.
+ */
+static void tile_net_tx_timeout(struct net_device *dev)
+{
+ netif_wake_queue(dev);
+}
+
+/* Ioctl commands.
+ *
+ * No device-specific ioctl is implemented: every request is rejected
+ * with -EOPNOTSUPP, and "dev", "rq" and "cmd" are unused.
+ */
+static int tile_net_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
+{
+ return -EOPNOTSUPP;
+}
+
+/* Get system network statistics for device.
+ *
+ * Returns a pointer to the "stats" member of the device's private data;
+ * nothing is copied or recomputed here.
+ */
+static struct net_device_stats *tile_net_get_stats(struct net_device *dev)
+{
+ struct tile_net_priv *priv = netdev_priv(dev);
+ return &priv->stats;
+}
+
+/* Change the MTU.
+ *
+ * Accepts values from 68 to 1500 inclusive and stores them in dev->mtu.
+ * Returns 0 on success, -EINVAL for anything outside that range.
+ */
+static int tile_net_change_mtu(struct net_device *dev, int new_mtu)
+{
+	if (new_mtu >= 68 && new_mtu <= 1500) {
+		dev->mtu = new_mtu;
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/* Change the Ethernet address of the NIC.
+ *
+ * The hypervisor driver does not support changing MAC address.  However,
+ * the hardware does not do anything with the MAC address, so the address
+ * which gets used on outgoing packets, and which is accepted on incoming
+ * packets, is completely up to us.
+ *
+ * "p" points to a struct sockaddr whose sa_data holds the new address.
+ * Returns 0 after copying dev->addr_len bytes into dev->dev_addr, or
+ * -EINVAL if the address fails is_valid_ether_addr().
+ */
+static int tile_net_set_mac_address(struct net_device *dev, void *p)
+{
+	struct sockaddr *addr = p;
+
+	if (is_valid_ether_addr(addr->sa_data)) {
+		memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+/* Polling 'interrupt' - used by things like netconsole to send skbs
+ * without having to re-enable interrupts. It's not called while
+ * the interrupt routine is executing.
+ *
+ * The ingress handler is invoked directly, with the per-cpu ingress irq
+ * disabled around the call.  "dev" itself is not used.
+ */
+static void tile_net_netpoll(struct net_device *dev)
+{
+ disable_percpu_irq(ingress_irq);
+ tile_net_handle_ingress_irq(ingress_irq, NULL);
+ enable_percpu_irq(ingress_irq, 0);
+}
+#endif
+
+/* Network device operations for this driver; installed on each device
+ * by tile_net_setup().  The poll-controller hook is only present when
+ * CONFIG_NET_POLL_CONTROLLER is enabled.
+ */
+static const struct net_device_ops tile_net_ops = {
+ .ndo_open = tile_net_open,
+ .ndo_stop = tile_net_stop,
+ .ndo_start_xmit = tile_net_tx,
+ .ndo_do_ioctl = tile_net_ioctl,
+ .ndo_get_stats = tile_net_get_stats,
+ .ndo_change_mtu = tile_net_change_mtu,
+ .ndo_tx_timeout = tile_net_tx_timeout,
+ .ndo_set_mac_address = tile_net_set_mac_address,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+ .ndo_poll_controller = tile_net_netpoll,
+#endif
+};
+
+/* The setup function, passed to alloc_netdev().
+ *
+ * ether_setup() assigns various fields in dev, including setting
+ * IFF_BROADCAST and IFF_MULTICAST.  On top of that this installs the
+ * driver's operations and watchdog timeout, enables the LLTX, HW_CSUM,
+ * SG and TSO features, and sets tx_queue_len to 0 and the MTU to 1500.
+ */
+static void tile_net_setup(struct net_device *dev)
+{
+	ether_setup(dev);
+
+	dev->netdev_ops = &tile_net_ops;
+	dev->watchdog_timeo = TILE_NET_TIMEOUT;
+	dev->features |= NETIF_F_LLTX | NETIF_F_HW_CSUM | NETIF_F_SG |
+			 NETIF_F_TSO;
+	dev->tx_queue_len = 0;
+	dev->mtu = 1500;
+}
+
+/* Create and register the network device for the link called "name".
+ *
+ * "mac" points to the 6-byte MAC address reported for the link.  Links
+ * whose name starts with "loop" are skipped.  Failures are logged and
+ * otherwise ignored; nothing is returned to the caller.
+ */
+static void tile_net_dev_init(const char *name, const uint8_t *mac)
+{
+	struct net_device *dev;
+	struct tile_net_priv *priv;
+	bool mac_is_set = false;
+	int err;
+	int i;
+
+	/* HACK: Ignore "loop" links. */
+	if (!strncmp(name, "loop", 4))
+		return;
+
+	/* Allocate the device structure.  Normally, "name" is a
+	 * template, instantiated by register_netdev(), but not for us.
+	 */
+	dev = alloc_netdev(sizeof(*priv), name, tile_net_setup);
+	if (dev == NULL) {
+		pr_err("alloc_netdev(%s) failed\n", name);
+		return;
+	}
+
+	/* Start "priv" from a clean slate with no channels assigned. */
+	priv = netdev_priv(dev);
+	memset(priv, 0, sizeof(*priv));
+	priv->dev = dev;
+	priv->channel = -1;
+	priv->loopify_channel = -1;
+	priv->echannel = -1;
+
+	/* The MAC address must be set in the device struct before the
+	 * device is opened.  If the MAC is all zeroes, we use a random
+	 * address, since we're probably on the simulator.
+	 */
+	for (i = 0; i < 6; i++) {
+		if (mac[i] != 0) {
+			mac_is_set = true;
+			break;
+		}
+	}
+
+	if (mac_is_set) {
+		memcpy(dev->dev_addr, mac, 6);
+		dev->addr_len = 6;
+	} else {
+		random_ether_addr(dev->dev_addr);
+	}
+
+	/* Initialize the transmit wake timer. */
+	hrtimer_init(&priv->tx_wake_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	priv->tx_wake_timer.function = tile_net_handle_tx_wake_timer;
+
+	/* Register the network device; give it back if that fails. */
+	err = register_netdev(dev);
+	if (err != 0) {
+		netdev_err(dev, "register_netdev failed %d\n", err);
+		free_netdev(dev);
+	}
+}
+
+/* Per-cpu module initialization, run on each cpu via on_each_cpu().
+ *
+ * Records the cpu number in this cpu's tile_net_info, marks it as
+ * having no iqueue yet, and sets up its egress timer.  "unused" is
+ * ignored.
+ */
+static void tile_net_init_module_percpu(void *unused)
+{
+	struct tile_net_info *info = &__get_cpu_var(per_cpu_info);
+
+	info->has_iqueue = false;
+	info->my_cpu = smp_processor_id();
+
+	/* Initialize the egress timer. */
+	hrtimer_init(&info->egress_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	info->egress_timer.function = tile_net_handle_egress_timer;
+}
+
+/* Module initialization.
+ *
+ * Sets up the channel mutex and the per-cpu state, creates a network
+ * device for every link that gxio_mpipe_link_enumerate_mac() reports,
+ * and falls back to all online cpus as network cpus when
+ * network_cpus_init() returns false.  Always returns 0.
+ */
+static int __init tile_net_init_module(void)
+{
+	char name[GXIO_MPIPE_LINK_NAME_LEN];
+	uint8_t mac[6];
+	int idx = 0;
+
+	pr_info("Tilera Network Driver\n");
+
+	mutex_init(&tile_net_devs_for_channel_mutex);
+
+	/* Initialize each CPU. */
+	on_each_cpu(tile_net_init_module_percpu, NULL, 1);
+
+	/* Find out what devices we have, and initialize them.
+	 * Enumeration stops at the first index that reports an error.
+	 */
+	while (gxio_mpipe_link_enumerate_mac(idx, name, mac) >= 0) {
+		tile_net_dev_init(name, mac);
+		idx++;
+	}
+
+	if (!network_cpus_init())
+		network_cpus_map = *cpu_online_mask;
+
+	return 0;
+}
+
+module_init(tile_net_init_module);
--
1.6.5.2
^ permalink raw reply related
* Re: skb_release_data oops
From: Eric Dumazet @ 2012-05-25 15:11 UTC (permalink / raw)
To: kendo; +Cc: netdev
In-Reply-To: <4FBF9876.032A7B.32344@m12-16.163.com>
On Fri, 2012-05-25 at 22:19 +0800, kendo wrote:
> I use the Linux kernel 2.6..38.8,found a bug when free skb,This failure may occur because what was it? Can you give some suggestions, thanks!!!!
>
> Best reguards.
>
> ---------------------------------------------------------------
>
> May 25 19:30:54 AnShion <9> klogd: [164619.378640] BUG: unable to handle kernel paging request at 000095a3
> May 25 19:30:54 AnShion <9> klogd: [164619.454609] IP: [<c01c2353>] put_page+0x3/0x40
> May 25 19:30:54 AnShion <12> klogd: [164619.508726] *pde = 00000000
> May 25 19:30:54 AnShion <8> klogd: [164619.544185] Oops: 0000 [#1] SMP
> May 25 19:30:54 AnShion <8> klogd: [164619.583891] last sysfs file: /sys/devices/virtual/net/tunl_FJ/uevent
> May 25 19:30:54 AnShion <12> klogd: [164619.660716] Modules linked in:
> dpi_engine ipmi_watchdog nf_connmark ip_set_hash_netiface ip_set_hash_net ip_set_hash_ip xt_set ip_set \
> xt_hashrate xt_dpi xt_pcc xt_nth xt_random xt_nflog xt_replace igb e1000e
Looks like you use a bunch of alien modules.
netdev is not the place to discuss their bugs.
^ permalink raw reply
* on the the two IPs unreachable if second interface is up
From: lejeczek @ 2012-05-25 15:12 UTC (permalink / raw)
To: netdev
hello everybody
apologies if this may feel off the topic, I was hoping some
net experts could shed some lights on some peculiar symptoms
I experience with one linux box
a BOX has two net interfaces, a public and private one
public IP is reachable from/via the Internet just fine
public IP is not reachable from the same private network the
BOX's second interface is on
public IP becomes reachable to private subnet immediately
after second(private) interface was turned down
BOX's firewall whether on or off makes no difference
this is the most peculiar problem of this nature I've ever
experienced
the goal is simple, have other systems on the same private
subnet as the BOX's second interface to be able to talk to
the BOX's public IP
default gateway for the private subnet is a separate another
system.
any suggestions as to how to troubleshoot it I would very
much appreciate
many thanks!
^ permalink raw reply
* Re: [RFC] mac80211: Use correct originator sequence number in a Path Reply
From: Yeoh Chun-Yeow @ 2012-05-25 15:30 UTC (permalink / raw)
To: devel-ZwoEplunGu1xMJw8dq7oimD2FQJk+8+b
Cc: netdev-u79uwXL29TY76Z2rM5mHXA,
linux-wireless-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA, ravip-DNmUmOh1Rg72fBVCVOL8/A
In-Reply-To: <1337934071-29342-1-git-send-email-qasimj-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
Hi, Qasim Javed
I think that you are referring to the target HWMP sequence number in
PREP element.
Based on the 802.11s standard, it has specified that
dot11MeshHWMPnetDiameterTraversalTime is only applied to original HWMP
sequence number for PREQ as mentioned in the "Contents of a PREQ
element" in section 11C.
For PREP element, it should be based on the description in section 11C.9.8.3:
"If it is a target mesh STA, it shall update its own HWMP sequence
number to maximum (current HWMP sequence number, target HWMP sequence
number in the PREQ) + 1 immediately before it generates a PREP in
response to a PREQ. The target HWMP sequence number of the PREQ is
relevant when a link was broken along the path and the stored sequence
number was increased at an intermediate mesh STA."
So the target HWMP sequence number should be modified as follows:
diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
index 70ac7d1..5988e82 100644
--- a/net/mac80211/mesh_hwmp.c
+++ b/net/mac80211/mesh_hwmp.c
@@ -538,12 +538,10 @@ static void hwmp_preq_frame_process(struct
ieee80211_sub_if_data *sdata,
forward = false;
reply = true;
metric = 0;
- if (time_after(jiffies, ifmsh->last_sn_update +
- net_traversal_jiffies(sdata)) ||
- time_before(jiffies, ifmsh->last_sn_update)) {
- target_sn = ++ifmsh->sn;
- ifmsh->last_sn_update = jiffies;
- }
+ if (SN_LT(ifmsh->sn, target_sn))
+ ifmsh->sn = target_sn;
+ target_sn = ++ifmsh->sn;
+ ifmsh->last_sn_update = jiffies;
Comments.
Regards,
Chun-Yeow
^ permalink raw reply related
* Re: on the the two IPs unreachable if second interface is up
From: lejeczek @ 2012-05-25 15:35 UTC (permalink / raw)
To: netdev
In-Reply-To: <4FBFA172.2060604@yahoo.co.uk>
here is something potentially interesting!
when I arping the BOX's public IP from a system/client of
the private subnet I get replied with a mac of the BOX's
second(private) interface :-[
could this be normal?
On 25/05/12 16:12, lejeczek wrote:
> hello everybody
>
> apologies if this may feel off the topic, I was hoping
> some net experts could shed some lights on some peculiar
> symptoms I experience with one linux box
>
> a BOX has two net interfaces, a public and private one
> public IP is reachable from/via the Internet just fine
> public IP is not reachable from the same private network
> the BOX's second interface is on
> public IP becomes reachable to private subnet immediately
> after second(private) interface was turned down
> BOX's firewall whether on or off makes no difference
>
> this is the most peculiar problem of this nature I've ever
> experienced
>
> the goal is simple, have other systems on the same private
> subnet as the BOX's second interface to be able to talk to
> the BOX's public IP
> default gateway for the private subnet is a separate
> another system.
>
> any suggestions as to how to troubleshoot it I would very!
> much appreciate
> many thanks!
> --
> To unsubscribe from this list: send the line "unsubscribe
> netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at
> http://vger.kernel.org/majordomo-info.html
>
^ permalink raw reply
* Re: [RFC] mac80211: Use correct originator sequence number in a Path Reply
From: Qasim Javed @ 2012-05-25 15:50 UTC (permalink / raw)
To: devel-ZwoEplunGu1xMJw8dq7oimD2FQJk+8+b
Cc: netdev-u79uwXL29TY76Z2rM5mHXA,
linux-wireless-u79uwXL29TY76Z2rM5mHXA,
linux-kernel-u79uwXL29TY76Z2rM5mHXA, ravip-DNmUmOh1Rg72fBVCVOL8/A
In-Reply-To: <CAEFj985HxSrOwOoevDjG1jxPxobLda-X_LZUtj6LgwXZwozBog-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
Please see my comments below.
On Fri, May 25, 2012 at 10:30 AM, Yeoh Chun-Yeow <yeohchunyeow-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
> Hi, Qasim Javed
>
> I think that you are referring to the target HWMP sequence number in
> PREP element.
No. I am referring to the originator sequence number in PREP because
when the PREP reaches the originator of the PREQ, the originator
sequence number in the PREP and the value of the metric is used to
determine which PREP will be accepted. If the originator sequence
numbers in the PREPs are different, then the PREP with the higher
sequence number will be accepted irrespective of the value of the
metric. Only if the originator sequence numbers in the PREP are equal
will the metric values in the PREPs be examined.
>
> Based on the 802.11s standard, it has specified that
> dot11MeshHWMPnetDiameterTraversalTime is only applied to original HWMP
> sequence number for PREQ as mentioned in the "Contents of a PREQ
> element" in section 11C.
>
> For PREP element, it should be based on the description in section 11C.9.8.3:
> "If it is a target mesh STA, it shall update its own HWMP sequence
> number to maximum (current HWMP sequence number, target HWMP sequence
> number in the PREQ) + 1 immediately before it generates a PREP in
> response to a PREQ. The target HWMP sequence number of the PREQ is
> relevant when a link was broken along the path and the stored sequence
> number was increased at an intermediate mesh STA."
>
> So the target HWMP sequence number should be modified as follow:
>
> diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
> index 70ac7d1..5988e82 100644
> --- a/net/mac80211/mesh_hwmp.c
> +++ b/net/mac80211/mesh_hwmp.c
> @@ -538,12 +538,10 @@ static void hwmp_preq_frame_process(struct
> ieee80211_sub_if_data *sdata,
> forward = false;
> reply = true;
> metric = 0;
> - if (time_after(jiffies, ifmsh->last_sn_update +
> - net_traversal_jiffies(sdata)) ||
> - time_before(jiffies, ifmsh->last_sn_update)) {
> - target_sn = ++ifmsh->sn;
> - ifmsh->last_sn_update = jiffies;
> - }
> + if (SN_LT(ifmsh->sn, target_sn))
> + ifmsh->sn = target_sn;
> + target_sn = ++ifmsh->sn;
> + ifmsh->last_sn_update = jiffies;
>
> Comments.
I agree with the description in the standard but you seem to be
misinterpreting it. Please note that the function being considered
here is hwmp_preq_frame_process, which evidently processes a PREQ.
However, because a PREP is generated in response to a PREQ, this
function also checks whether a PREP needs to be generated and then
calls mesh_path_sel_frame_tx with frame type being MPATH_PREP. This
function is also passed the originator and target sequence numbers.
What I am saying is that, in the scenario described in my original
email, the wrong originator sequence number is being used for the
PREP.
Please notice that in hwmp_preq_frame_process, target_sn ends up being
used as orig_sn for the PREP. This is probably what is causing the
confusion in your case.
Your patch is definitely not what I was pointing out; in fact, it
deviates from the standard functionality since it removes the check for
HWMPNetDiameterTraversalTime.
>
> Regards,
> Chun-Yeow
> _______________________________________________
> Devel mailing list
> Devel-ZwoEplunGu1xMJw8dq7oimD2FQJk+8+b@public.gmane.org
> http://lists.open80211s.org/cgi-bin/mailman/listinfo/devel
^ permalink raw reply
* Re: [PATCH] gianfar:don't add FCB length to hard_header_len
From: Paul Gortmaker @ 2012-05-25 15:58 UTC (permalink / raw)
To: Joe Perches; +Cc: Jan Ceuleers, David Miller, b06378, netdev, linuxppc-dev
In-Reply-To: <1337876210.5070.4.camel@joe2Laptop>
[Re: [PATCH] gianfar:don't add FCB length to hard_header_len] On 24/05/2012 (Thu 09:16) Joe Perches wrote:
> On Thu, 2012-05-24 at 17:04 +0200, Jan Ceuleers wrote:
> > On 05/22/2012 09:18 PM, David Miller wrote:
> > > From: Jiajun Wu <b06378@freescale.com>
> > > Date: Tue, 22 May 2012 17:00:48 +0800
> > >
> > >> FCB (Frame Control Block) isn't part of the netdev hard header.
> > >> Adding FCB to hard_header_len will make GRO fail at the MAC comparison stage.
> > >>
> > >> Signed-off-by: Jiajun Wu <b06378@freescale.com>
> > >
> > > Applied, thanks.
> > >
> > > Someone needs to go through this driver when net-next opens up
> > > and fix all of the indentation in this driver.
> >
> > May I give that a go?
>
> I have scripts that automate most of this.
> I don't have the card though.
There is no card. The gianfar is a SOC for freescale 83xx, 85xx, 86xx
CPUs. If need be, I can test just as I did for your name overrun fix
in commit 0015e551e.
But you really shouldn't need the hardware to validate this kind of
patch anyways -- aside from your code flow change in the irq routine of
gianfar_ptp, you should have been simply able to check for object file
equivalence before and after your change.
Paul.
>
> Maybe this is a starting point?
> It doesn't fix most 80 column warnings.
>
> drivers/net/ethernet/freescale/gianfar.c | 299 +++++++++++-----------
> drivers/net/ethernet/freescale/gianfar_ethtool.c | 131 +++++-----
> drivers/net/ethernet/freescale/gianfar_ptp.c | 8 +-
> drivers/net/ethernet/freescale/gianfar_sysfs.c | 2 +-
> 4 files changed, 225 insertions(+), 215 deletions(-)
>
[...]
> diff --git a/drivers/net/ethernet/freescale/gianfar_ptp.c b/drivers/net/ethernet/freescale/gianfar_ptp.c
> index c08e5d4..3f7b81d 100644
> --- a/drivers/net/ethernet/freescale/gianfar_ptp.c
> +++ b/drivers/net/ethernet/freescale/gianfar_ptp.c
> @@ -268,11 +268,11 @@ static irqreturn_t isr(int irq, void *priv)
> ptp_clock_event(etsects->clock, &event);
> }
>
> - if (ack) {
> - gfar_write(&etsects->regs->tmr_tevent, ack);
> - return IRQ_HANDLED;
> - } else
> + if (!ack)
> return IRQ_NONE;
> +
> + gfar_write(&etsects->regs->tmr_tevent, ack);
> + return IRQ_HANDLED;
> }
>
> /*
^ permalink raw reply
* Re: Using jiffies for tcp_time_stamp?
From: Chris Friesen @ 2012-05-25 15:58 UTC (permalink / raw)
To: Srećko Jurić-Kavelj; +Cc: netdev
In-Reply-To: <CAACrLC2CXU-DNeonWQGJTfX53ssm_asK7WQrFuWRBB77cg-YdA@mail.gmail.com>
On 05/22/2012 11:21 AM, Srećko Jurić-Kavelj wrote:
> By looking at the code it's clear that the time stamping is done with
> jiffies, and my kernel has CONFIG_HZ=100.
>
> I understand that this is for performance reasons (and the RTT
> smoothing filter is implemented with bit shifting operations), but
> would using a more precise time stamp have significant impact on
> performance? Since RTT is used to compute RTO, wouldn't there be a
> benefit to having a more accurate estimate of this value?
I don't know if it would make any difference to the tcp algorithms, but
certainly on some architectures you can get a fast and accurate hardware
timestamp.
Chris
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox