* [RFC PATCH ghak32 V2 05/13] audit: add containerid support for ptrace and signals
From: Richard Guy Briggs @ 2018-03-16 9:00 UTC (permalink / raw)
To: cgroups-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
linux-api-u79uwXL29TY76Z2rM5mHXA, Linux-Audit Mailing List,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, LKML,
netdev-u79uwXL29TY76Z2rM5mHXA
Cc: ebiederm-aS9lmoZGLiVWk0Htik3J/w, luto-DgEjT+Ai2ygdnm+yROfE0A,
jlayton-H+wXaHxf7aLQT0dZR+AlfA, carlos-H+wXaHxf7aLQT0dZR+AlfA,
dhowells-H+wXaHxf7aLQT0dZR+AlfA, madzcar-Re5JQEeQqe8AvxtiuMwx3w,
viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn,
simo-H+wXaHxf7aLQT0dZR+AlfA, eparis-FjpueFixGhCM4zKIHC2jIg
In-Reply-To: <cover.1521179281.git.rgb-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Add container ID support to ptrace and signals. In particular, the "op"
field provides a way to label the auxiliary record to which it is
associated.
Signed-off-by: Richard Guy Briggs <rgb-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
---
include/linux/audit.h | 16 +++++++++++-----
kernel/audit.c | 12 ++++++++----
kernel/audit.h | 2 ++
kernel/auditsc.c | 19 +++++++++++++++----
4 files changed, 36 insertions(+), 13 deletions(-)
diff --git a/include/linux/audit.h b/include/linux/audit.h
index f10ca1b..ed16bb6 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -35,6 +35,7 @@ struct audit_sig_info {
uid_t uid;
pid_t pid;
char ctx[0];
+ u64 cid;
};
struct audit_buffer;
@@ -155,8 +156,8 @@ extern void audit_log_link_denied(const char *operation,
extern int audit_log_task_context(struct audit_buffer *ab);
extern void audit_log_task_info(struct audit_buffer *ab,
struct task_struct *tsk);
-extern int audit_log_container_info(struct task_struct *tsk,
- struct audit_context *context);
+extern int audit_log_container_info(struct audit_context *context,
+ char *op, u64 containerid);
extern int audit_update_lsm_rules(void);
@@ -208,8 +209,8 @@ static inline int audit_log_task_context(struct audit_buffer *ab)
static inline void audit_log_task_info(struct audit_buffer *ab,
struct task_struct *tsk)
{ }
-static inline int audit_log_container_info(struct task_struct *tsk,
- struct audit_context *context);
+static inline int audit_log_container_info(struct audit_context *context,
+ char *op, u64 containerid);
{ }
#define audit_enabled 0
#endif /* CONFIG_AUDIT */
@@ -598,9 +599,14 @@ static inline bool audit_loginuid_set(struct task_struct *tsk)
return uid_valid(audit_get_loginuid(tsk));
}
+static inline bool cid_valid(u64 containerid)
+{
+ return containerid != INVALID_CID;
+}
+
static inline bool audit_containerid_set(struct task_struct *tsk)
{
- return audit_get_containerid(tsk) != INVALID_CID;
+ return cid_valid(audit_get_containerid(tsk));
}
static inline void audit_log_string(struct audit_buffer *ab, const char *buf)
diff --git a/kernel/audit.c b/kernel/audit.c
index a12f21f..b238be5 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -142,6 +142,7 @@ struct audit_net {
kuid_t audit_sig_uid = INVALID_UID;
pid_t audit_sig_pid = -1;
u32 audit_sig_sid = 0;
+u64 audit_sig_cid = INVALID_CID;
/* Records can be lost in several ways:
0) [suppressed in audit_alloc]
@@ -1438,6 +1439,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
memcpy(sig_data->ctx, ctx, len);
security_release_secctx(ctx, len);
}
+ sig_data->cid = audit_sig_cid;
audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0,
sig_data, sizeof(*sig_data) + len);
kfree(sig_data);
@@ -2051,20 +2053,22 @@ void audit_log_session_info(struct audit_buffer *ab)
/*
* audit_log_container_info - report container info
- * @tsk: task to be recorded
* @context: task or local context for record
+ * @op: containerid string description
+ * @containerid: container ID to report
*/
-int audit_log_container_info(struct task_struct *tsk, struct audit_context *context)
+int audit_log_container_info(struct audit_context *context,
+ char *op, u64 containerid)
{
struct audit_buffer *ab;
- if (!audit_containerid_set(tsk))
+ if (!cid_valid(containerid))
return 0;
/* Generate AUDIT_CONTAINER_INFO with container ID */
ab = audit_log_start(context, GFP_KERNEL, AUDIT_CONTAINER_INFO);
if (!ab)
return -ENOMEM;
- audit_log_format(ab, "contid=%llu", audit_get_containerid(tsk));
+ audit_log_format(ab, "op=%s contid=%llu", op, containerid);
audit_log_end(ab);
return 0;
}
diff --git a/kernel/audit.h b/kernel/audit.h
index aaa651a..743d445 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -147,6 +147,7 @@ struct audit_context {
kuid_t target_uid;
unsigned int target_sessionid;
u32 target_sid;
+ u64 target_cid;
char target_comm[TASK_COMM_LEN];
struct audit_tree_refs *trees, *first_trees;
@@ -330,6 +331,7 @@ extern void audit_log_d_path_exe(struct audit_buffer *ab,
extern pid_t audit_sig_pid;
extern kuid_t audit_sig_uid;
extern u32 audit_sig_sid;
+extern u64 audit_sig_cid;
extern int audit_filter(int msgtype, unsigned int listtype);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 2bba324..2932ef1 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -113,6 +113,7 @@ struct audit_aux_data_pids {
kuid_t target_uid[AUDIT_AUX_PIDS];
unsigned int target_sessionid[AUDIT_AUX_PIDS];
u32 target_sid[AUDIT_AUX_PIDS];
+ u64 target_cid[AUDIT_AUX_PIDS];
char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN];
int pid_count;
};
@@ -1422,21 +1423,27 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
for (aux = context->aux_pids; aux; aux = aux->next) {
struct audit_aux_data_pids *axs = (void *)aux;
- for (i = 0; i < axs->pid_count; i++)
+ for (i = 0; i < axs->pid_count; i++) {
+ char axsn[sizeof("aux0xN ")];
+
+ sprintf(axsn, "aux0x%x", i);
if (audit_log_pid_context(context, axs->target_pid[i],
axs->target_auid[i],
axs->target_uid[i],
axs->target_sessionid[i],
axs->target_sid[i],
- axs->target_comm[i]))
+ axs->target_comm[i])
+ && audit_log_container_info(context, axsn, axs->target_cid[i]))
call_panic = 1;
+ }
}
if (context->target_pid &&
audit_log_pid_context(context, context->target_pid,
context->target_auid, context->target_uid,
context->target_sessionid,
- context->target_sid, context->target_comm))
+ context->target_sid, context->target_comm)
+ && audit_log_container_info(context, "target", context->target_cid))
call_panic = 1;
if (context->pwd.dentry && context->pwd.mnt) {
@@ -1456,7 +1463,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
audit_log_proctitle(tsk, context);
- audit_log_container_info(tsk, context);
+ audit_log_container_info(context, "task", audit_get_containerid(tsk));
/* Send end of event record to help user space know we are finished */
ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
@@ -2356,6 +2363,7 @@ void __audit_ptrace(struct task_struct *t)
context->target_uid = task_uid(t);
context->target_sessionid = audit_get_sessionid(t);
security_task_getsecid(t, &context->target_sid);
+ context->target_cid = audit_get_containerid(t);
memcpy(context->target_comm, t->comm, TASK_COMM_LEN);
}
@@ -2383,6 +2391,7 @@ int audit_signal_info(int sig, struct task_struct *t)
else
audit_sig_uid = uid;
security_task_getsecid(tsk, &audit_sig_sid);
+ audit_sig_cid = audit_get_containerid(tsk);
}
if (!audit_signals || audit_dummy_context())
@@ -2396,6 +2405,7 @@ int audit_signal_info(int sig, struct task_struct *t)
ctx->target_uid = t_uid;
ctx->target_sessionid = audit_get_sessionid(t);
security_task_getsecid(t, &ctx->target_sid);
+ ctx->target_cid = audit_get_containerid(t);
memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN);
return 0;
}
@@ -2417,6 +2427,7 @@ int audit_signal_info(int sig, struct task_struct *t)
axp->target_uid[axp->pid_count] = t_uid;
axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
security_task_getsecid(t, &axp->target_sid[axp->pid_count]);
+ axp->target_cid[axp->pid_count] = audit_get_containerid(t);
memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN);
axp->pid_count++;
--
1.8.3.1
^ permalink raw reply related
* [RFC PATCH ghak32 V2 04/13] audit: add containerid filtering
From: Richard Guy Briggs @ 2018-03-16 9:00 UTC (permalink / raw)
To: cgroups, containers, linux-api, Linux-Audit Mailing List,
linux-fsdevel, LKML, netdev
Cc: luto, jlayton, carlos, viro, dhowells, simo, eparis, serge,
ebiederm, madzcar, Richard Guy Briggs
In-Reply-To: <cover.1521179281.git.rgb@redhat.com>
Implement container ID filtering using the AUDIT_CONTAINERID field name
to send an 8-character string representing a u64 since the value field
is only u32.
Sending it as two u32 was considered, but gathering and comparing two
fields was more complex.
The feature indicator is AUDIT_FEATURE_BITMAP_CONTAINERID_FILTER.
This requires support from userspace to be useful.
See: https://github.com/linux-audit/audit-userspace/issues/40
Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
---
include/linux/audit.h | 1 +
include/uapi/linux/audit.h | 5 ++++-
kernel/audit.h | 1 +
kernel/auditfilter.c | 47 ++++++++++++++++++++++++++++++++++++++++++++++
kernel/auditsc.c | 3 +++
5 files changed, 56 insertions(+), 1 deletion(-)
diff --git a/include/linux/audit.h b/include/linux/audit.h
index 3acbe9d..f10ca1b 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -76,6 +76,7 @@ struct audit_field {
u32 type;
union {
u32 val;
+ u64 val64;
kuid_t uid;
kgid_t gid;
struct {
diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index e83ccbd..8443a8f 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -262,6 +262,7 @@
#define AUDIT_LOGINUID_SET 24
#define AUDIT_SESSIONID 25 /* Session ID */
#define AUDIT_FSTYPE 26 /* FileSystem Type */
+#define AUDIT_CONTAINERID 27 /* Container ID */
/* These are ONLY useful when checking
* at syscall exit time (AUDIT_AT_EXIT). */
@@ -342,6 +343,7 @@ enum {
#define AUDIT_FEATURE_BITMAP_SESSIONID_FILTER 0x00000010
#define AUDIT_FEATURE_BITMAP_LOST_RESET 0x00000020
#define AUDIT_FEATURE_BITMAP_FILTER_FS 0x00000040
+#define AUDIT_FEATURE_BITMAP_CONTAINERID_FILTER 0x00000080
#define AUDIT_FEATURE_BITMAP_ALL (AUDIT_FEATURE_BITMAP_BACKLOG_LIMIT | \
AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME | \
@@ -349,7 +351,8 @@ enum {
AUDIT_FEATURE_BITMAP_EXCLUDE_EXTEND | \
AUDIT_FEATURE_BITMAP_SESSIONID_FILTER | \
AUDIT_FEATURE_BITMAP_LOST_RESET | \
- AUDIT_FEATURE_BITMAP_FILTER_FS)
+ AUDIT_FEATURE_BITMAP_FILTER_FS | \
+ AUDIT_FEATURE_BITMAP_CONTAINERID_FILTER)
/* deprecated: AUDIT_VERSION_* */
#define AUDIT_VERSION_LATEST AUDIT_FEATURE_BITMAP_ALL
diff --git a/kernel/audit.h b/kernel/audit.h
index 214e149..aaa651a 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -234,6 +234,7 @@ static inline int audit_hash_ino(u32 ino)
extern int audit_match_class(int class, unsigned syscall);
extern int audit_comparator(const u32 left, const u32 op, const u32 right);
+extern int audit_comparator64(const u64 left, const u32 op, const u64 right);
extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right);
extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right);
extern int parent_len(const char *path);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index d7a807e..c4c8746 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -410,6 +410,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
/* FALL THROUGH */
case AUDIT_ARCH:
case AUDIT_FSTYPE:
+ case AUDIT_CONTAINERID:
if (f->op != Audit_not_equal && f->op != Audit_equal)
return -EINVAL;
break;
@@ -584,6 +585,14 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
}
entry->rule.exe = audit_mark;
break;
+ case AUDIT_CONTAINERID:
+ if (f->val != sizeof(u64))
+ goto exit_free;
+ str = audit_unpack_string(&bufp, &remain, f->val);
+ if (IS_ERR(str))
+ goto exit_free;
+ f->val64 = ((u64 *)str)[0];
+ break;
}
}
@@ -666,6 +675,11 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
data->buflen += data->values[i] =
audit_pack_string(&bufp, audit_mark_path(krule->exe));
break;
+ case AUDIT_CONTAINERID:
+ data->buflen += data->values[i] = sizeof(u64);
+ for (i = 0; i < sizeof(u64); i++)
+ ((char *)bufp)[i] = ((char *)&f->val64)[i];
+ break;
case AUDIT_LOGINUID_SET:
if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) {
data->fields[i] = AUDIT_LOGINUID;
@@ -752,6 +766,10 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
if (!gid_eq(a->fields[i].gid, b->fields[i].gid))
return 1;
break;
+ case AUDIT_CONTAINERID:
+ if (a->fields[i].val64 != b->fields[i].val64)
+ return 1;
+ break;
default:
if (a->fields[i].val != b->fields[i].val)
return 1;
@@ -1210,6 +1228,31 @@ int audit_comparator(u32 left, u32 op, u32 right)
}
}
+int audit_comparator64(u64 left, u32 op, u64 right)
+{
+ switch (op) {
+ case Audit_equal:
+ return (left == right);
+ case Audit_not_equal:
+ return (left != right);
+ case Audit_lt:
+ return (left < right);
+ case Audit_le:
+ return (left <= right);
+ case Audit_gt:
+ return (left > right);
+ case Audit_ge:
+ return (left >= right);
+ case Audit_bitmask:
+ return (left & right);
+ case Audit_bittest:
+ return ((left & right) == right);
+ default:
+ BUG();
+ return 0;
+ }
+}
+
int audit_uid_comparator(kuid_t left, u32 op, kuid_t right)
{
switch (op) {
@@ -1348,6 +1391,10 @@ int audit_filter(int msgtype, unsigned int listtype)
result = audit_comparator(audit_loginuid_set(current),
f->op, f->val);
break;
+ case AUDIT_CONTAINERID:
+ result = audit_comparator64(audit_get_containerid(current),
+ f->op, f->val64);
+ break;
case AUDIT_MSGTYPE:
result = audit_comparator(msgtype, f->op, f->val);
break;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 65be110..2bba324 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -614,6 +614,9 @@ static int audit_filter_rules(struct task_struct *tsk,
case AUDIT_LOGINUID_SET:
result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val);
break;
+ case AUDIT_CONTAINERID:
+ result = audit_comparator64(audit_get_containerid(tsk), f->op, f->val64);
+ break;
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
case AUDIT_SUBJ_TYPE:
--
1.8.3.1
^ permalink raw reply related
* [RFC PATCH ghak32 V2 03/13] audit: log container info of syscalls
From: Richard Guy Briggs @ 2018-03-16 9:00 UTC (permalink / raw)
To: cgroups, containers, linux-api, Linux-Audit Mailing List,
linux-fsdevel, LKML, netdev
Cc: luto, jlayton, carlos, viro, dhowells, simo, eparis, serge,
ebiederm, madzcar, Richard Guy Briggs
In-Reply-To: <cover.1521179281.git.rgb@redhat.com>
Create a new audit record AUDIT_CONTAINER_INFO to document the container
ID of a process if it is present.
Called from audit_log_exit(), syscalls are covered.
A sample raw event:
type=SYSCALL msg=audit(1519924845.499:257): arch=c000003e syscall=257 success=yes exit=3 a0=ffffff9c a1=56374e1cef30 a2=241 a3=1b6 items=2 ppid=606 pid=635 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=pts0 ses=3 comm="bash" exe="/usr/bin/bash" subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key="tmpcontainerid"
type=CWD msg=audit(1519924845.499:257): cwd="/root"
type=PATH msg=audit(1519924845.499:257): item=0 name="/tmp/" inode=13863 dev=00:27 mode=041777 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:tmp_t:s0 nametype= PARENT cap_fp=0000000000000000 cap_fi=0000000000000000 cap_fe=0 cap_fver=0
type=PATH msg=audit(1519924845.499:257): item=1 name="/tmp/tmpcontainerid" inode=17729 dev=00:27 mode=0100644 ouid=0 ogid=0 rdev=00:00 obj=unconfined_u:object_r:user_tmp_t:s0 nametype=CREATE cap_fp=0000000000000000 cap_fi=0000000000000000 cap_fe=0 cap_fver=0
type=PROCTITLE msg=audit(1519924845.499:257): proctitle=62617368002D6300736C65657020313B206563686F2074657374203E202F746D702F746D70636F6E7461696E65726964
type=CONTAINER_INFO msg=audit(1519924845.499:257): op=task contid=123458
See: https://github.com/linux-audit/audit-kernel/issues/32
Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
---
include/linux/audit.h | 5 +++++
include/uapi/linux/audit.h | 1 +
kernel/audit.c | 20 ++++++++++++++++++++
kernel/auditsc.c | 2 ++
4 files changed, 28 insertions(+)
diff --git a/include/linux/audit.h b/include/linux/audit.h
index fe4ba3f..3acbe9d 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -154,6 +154,8 @@ extern void audit_log_link_denied(const char *operation,
extern int audit_log_task_context(struct audit_buffer *ab);
extern void audit_log_task_info(struct audit_buffer *ab,
struct task_struct *tsk);
+extern int audit_log_container_info(struct task_struct *tsk,
+ struct audit_context *context);
extern int audit_update_lsm_rules(void);
@@ -205,6 +207,9 @@ static inline int audit_log_task_context(struct audit_buffer *ab)
static inline void audit_log_task_info(struct audit_buffer *ab,
struct task_struct *tsk)
{ }
+static inline int audit_log_container_info(struct task_struct *tsk,
+ struct audit_context *context);
+{ }
#define audit_enabled 0
#endif /* CONFIG_AUDIT */
diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index 921a71f..e83ccbd 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -115,6 +115,7 @@
#define AUDIT_REPLACE 1329 /* Replace auditd if this packet unanswerd */
#define AUDIT_KERN_MODULE 1330 /* Kernel Module events */
#define AUDIT_FANOTIFY 1331 /* Fanotify access decision */
+#define AUDIT_CONTAINER_INFO 1332 /* Container ID information */
#define AUDIT_AVC 1400 /* SE Linux avc denial or grant */
#define AUDIT_SELINUX_ERR 1401 /* Internal SE Linux Errors */
diff --git a/kernel/audit.c b/kernel/audit.c
index 3f2f143..a12f21f 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -2049,6 +2049,26 @@ void audit_log_session_info(struct audit_buffer *ab)
audit_log_format(ab, " auid=%u ses=%u", auid, sessionid);
}
+/*
+ * audit_log_container_info - report container info
+ * @tsk: task to be recorded
+ * @context: task or local context for record
+ */
+int audit_log_container_info(struct task_struct *tsk, struct audit_context *context)
+{
+ struct audit_buffer *ab;
+
+ if (!audit_containerid_set(tsk))
+ return 0;
+ /* Generate AUDIT_CONTAINER_INFO with container ID */
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_CONTAINER_INFO);
+ if (!ab)
+ return -ENOMEM;
+ audit_log_format(ab, "contid=%llu", audit_get_containerid(tsk));
+ audit_log_end(ab);
+ return 0;
+}
+
void audit_log_key(struct audit_buffer *ab, char *key)
{
audit_log_format(ab, " key=");
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index a6b0a52..65be110 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1453,6 +1453,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
audit_log_proctitle(tsk, context);
+ audit_log_container_info(tsk, context);
+
/* Send end of event record to help user space know we are finished */
ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
if (ab)
--
1.8.3.1
^ permalink raw reply related
* [RFC PATCH ghak32 V2 02/13] audit: check children and threading before allowing containerid
From: Richard Guy Briggs @ 2018-03-16 9:00 UTC (permalink / raw)
To: cgroups-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
linux-api-u79uwXL29TY76Z2rM5mHXA, Linux-Audit Mailing List,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, LKML,
netdev-u79uwXL29TY76Z2rM5mHXA
Cc: ebiederm-aS9lmoZGLiVWk0Htik3J/w, luto-DgEjT+Ai2ygdnm+yROfE0A,
jlayton-H+wXaHxf7aLQT0dZR+AlfA, carlos-H+wXaHxf7aLQT0dZR+AlfA,
dhowells-H+wXaHxf7aLQT0dZR+AlfA, madzcar-Re5JQEeQqe8AvxtiuMwx3w,
viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn,
simo-H+wXaHxf7aLQT0dZR+AlfA, eparis-FjpueFixGhCM4zKIHC2jIg
In-Reply-To: <cover.1521179281.git.rgb-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
Check if a task has existing children or co-threads and refuse to set
the container ID if either are present. Failure to check this could
permit games where a child scratches its parent's back to work around
inheritance and double-setting policy.
Signed-off-by: Richard Guy Briggs <rgb-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
---
kernel/auditsc.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 29c8482..a6b0a52 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2087,6 +2087,10 @@ static int audit_set_containerid_perm(struct task_struct *task, u64 containerid)
/* if we don't have caps, reject */
if (!capable(CAP_AUDIT_CONTROL))
return -EPERM;
+ /* if task has children or is not single-threaded, deny */
+ if (!list_empty(&task->children) ||
+ !(thread_group_leader(task) && thread_group_empty(task)))
+ return -EPERM;
/* if containerid is unset, allow */
if (!audit_containerid_set(task))
return 0;
--
1.8.3.1
^ permalink raw reply related
* [RFC PATCH ghak32 V2 01/13] audit: add container id
From: Richard Guy Briggs @ 2018-03-16 9:00 UTC (permalink / raw)
To: cgroups, containers, linux-api, Linux-Audit Mailing List,
linux-fsdevel, LKML, netdev
Cc: luto, jlayton, carlos, viro, dhowells, simo, eparis, serge,
ebiederm, madzcar, Richard Guy Briggs
In-Reply-To: <cover.1521179281.git.rgb@redhat.com>
Implement the proc fs write to set the audit container ID of a process,
emitting an AUDIT_CONTAINER record to document the event.
This is a write from the container orchestrator task to a proc entry of
the form /proc/PID/containerid where PID is the process ID of the newly
created task that is to become the first task in a container, or an
additional task added to a container.
The write expects up to a u64 value (unset: 18446744073709551615).
This will produce a record such as this:
type=CONTAINER msg=audit(1519903238.968:261): op=set pid=596 uid=0 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 auid=0 tty=pts0 ses=1 opid=596 old-contid=18446744073709551615 contid=123455 res=0
The "op" field indicates an initial set. The "pid" to "ses" fields are
the orchestrator while the "opid" field is the object's PID, the process
being "contained". Old and new container ID values are given in the
"contid" fields, while res indicates its success.
It is not permitted to self-set, unset or re-set the container ID. A
child inherits its parent's container ID, but then can be set only once
after.
See: https://github.com/linux-audit/audit-kernel/issues/32
Signed-off-by: Richard Guy Briggs <rgb@redhat.com>
---
fs/proc/base.c | 37 ++++++++++++++++++++
include/linux/audit.h | 16 +++++++++
include/linux/init_task.h | 4 ++-
include/linux/sched.h | 1 +
include/uapi/linux/audit.h | 2 ++
kernel/auditsc.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++
6 files changed, 143 insertions(+), 1 deletion(-)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 60316b5..6ce4fbe 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1299,6 +1299,41 @@ static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
.read = proc_sessionid_read,
.llseek = generic_file_llseek,
};
+
+static ssize_t proc_containerid_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct inode *inode = file_inode(file);
+ u64 containerid;
+ int rv;
+ struct task_struct *task = get_proc_task(inode);
+
+ if (!task)
+ return -ESRCH;
+ if (*ppos != 0) {
+ /* No partial writes. */
+ put_task_struct(task);
+ return -EINVAL;
+ }
+
+ rv = kstrtou64_from_user(buf, count, 10, &containerid);
+ if (rv < 0) {
+ put_task_struct(task);
+ return rv;
+ }
+
+ rv = audit_set_containerid(task, containerid);
+ put_task_struct(task);
+ if (rv < 0)
+ return rv;
+ return count;
+}
+
+static const struct file_operations proc_containerid_operations = {
+ .write = proc_containerid_write,
+ .llseek = generic_file_llseek,
+};
+
#endif
#ifdef CONFIG_FAULT_INJECTION
@@ -2961,6 +2996,7 @@ static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns,
#ifdef CONFIG_AUDITSYSCALL
REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
REG("sessionid", S_IRUGO, proc_sessionid_operations),
+ REG("containerid", S_IWUSR, proc_containerid_operations),
#endif
#ifdef CONFIG_FAULT_INJECTION
REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
@@ -3355,6 +3391,7 @@ static int proc_tid_comm_permission(struct inode *inode, int mask)
#ifdef CONFIG_AUDITSYSCALL
REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
REG("sessionid", S_IRUGO, proc_sessionid_operations),
+ REG("containerid", S_IWUSR, proc_containerid_operations),
#endif
#ifdef CONFIG_FAULT_INJECTION
REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
diff --git a/include/linux/audit.h b/include/linux/audit.h
index af410d9..fe4ba3f 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -29,6 +29,7 @@
#define AUDIT_INO_UNSET ((unsigned long)-1)
#define AUDIT_DEV_UNSET ((dev_t)-1)
+#define INVALID_CID AUDIT_CID_UNSET
struct audit_sig_info {
uid_t uid;
@@ -321,6 +322,7 @@ static inline void audit_ptrace(struct task_struct *t)
extern int auditsc_get_stamp(struct audit_context *ctx,
struct timespec64 *t, unsigned int *serial);
extern int audit_set_loginuid(kuid_t loginuid);
+extern int audit_set_containerid(struct task_struct *tsk, u64 containerid);
static inline kuid_t audit_get_loginuid(struct task_struct *tsk)
{
@@ -332,6 +334,11 @@ static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
return tsk->sessionid;
}
+static inline u64 audit_get_containerid(struct task_struct *tsk)
+{
+ return tsk->containerid;
+}
+
extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp);
extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode);
extern void __audit_bprm(struct linux_binprm *bprm);
@@ -517,6 +524,10 @@ static inline unsigned int audit_get_sessionid(struct task_struct *tsk)
{
return -1;
}
+static inline kuid_t audit_get_containerid(struct task_struct *tsk)
+{
+ return INVALID_CID;
+}
static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp)
{ }
static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid,
@@ -581,6 +592,11 @@ static inline bool audit_loginuid_set(struct task_struct *tsk)
return uid_valid(audit_get_loginuid(tsk));
}
+static inline bool audit_containerid_set(struct task_struct *tsk)
+{
+ return audit_get_containerid(tsk) != INVALID_CID;
+}
+
static inline void audit_log_string(struct audit_buffer *ab, const char *buf)
{
audit_log_n_string(ab, buf, strlen(buf));
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 6a53262..046bd0a 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -18,6 +18,7 @@
#include <linux/sched/rt.h>
#include <linux/livepatch.h>
#include <linux/mm_types.h>
+#include <linux/audit.h>
#include <asm/thread_info.h>
@@ -120,7 +121,8 @@
#ifdef CONFIG_AUDITSYSCALL
#define INIT_IDS \
.loginuid = INVALID_UID, \
- .sessionid = (unsigned int)-1,
+ .sessionid = (unsigned int)-1, \
+ .containerid = INVALID_CID,
#else
#define INIT_IDS
#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d258826..1b82191 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -796,6 +796,7 @@ struct task_struct {
#ifdef CONFIG_AUDITSYSCALL
kuid_t loginuid;
unsigned int sessionid;
+ u64 containerid;
#endif
struct seccomp seccomp;
diff --git a/include/uapi/linux/audit.h b/include/uapi/linux/audit.h
index 4e61a9e..921a71f 100644
--- a/include/uapi/linux/audit.h
+++ b/include/uapi/linux/audit.h
@@ -71,6 +71,7 @@
#define AUDIT_TTY_SET 1017 /* Set TTY auditing status */
#define AUDIT_SET_FEATURE 1018 /* Turn an audit feature on or off */
#define AUDIT_GET_FEATURE 1019 /* Get which features are enabled */
+#define AUDIT_CONTAINER 1020 /* Define the container id and information */
#define AUDIT_FIRST_USER_MSG 1100 /* Userspace messages mostly uninteresting to kernel */
#define AUDIT_USER_AVC 1107 /* We filter this differently */
@@ -465,6 +466,7 @@ struct audit_tty_status {
};
#define AUDIT_UID_UNSET (unsigned int)-1
+#define AUDIT_CID_UNSET ((u64)-1)
/* audit_rule_data supports filter rules with both integer and string
* fields. It corresponds with AUDIT_ADD_RULE, AUDIT_DEL_RULE and
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 4e0a4ac..29c8482 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2073,6 +2073,90 @@ int audit_set_loginuid(kuid_t loginuid)
return rc;
}
+static int audit_set_containerid_perm(struct task_struct *task, u64 containerid)
+{
+ struct task_struct *parent;
+ u64 pcontainerid, ccontainerid;
+
+ /* Don't allow to set our own containerid */
+ if (current == task)
+ return -EPERM;
+ /* Don't allow the containerid to be unset */
+ if (!cid_valid(containerid))
+ return -EINVAL;
+ /* if we don't have caps, reject */
+ if (!capable(CAP_AUDIT_CONTROL))
+ return -EPERM;
+ /* if containerid is unset, allow */
+ if (!audit_containerid_set(task))
+ return 0;
+ /* it is already set, and not inherited from the parent, reject */
+ ccontainerid = audit_get_containerid(task);
+ rcu_read_lock();
+ parent = rcu_dereference(task->real_parent);
+ rcu_read_unlock();
+ task_lock(parent);
+ pcontainerid = audit_get_containerid(parent);
+ task_unlock(parent);
+ if (ccontainerid != pcontainerid)
+ return -EPERM;
+ return 0;
+}
+
+static void audit_log_set_containerid(struct task_struct *task, u64 oldcontainerid,
+ u64 containerid, int rc)
+{
+ struct audit_buffer *ab;
+ uid_t uid;
+ struct tty_struct *tty;
+
+ if (!audit_enabled)
+ return;
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONTAINER);
+ if (!ab)
+ return;
+
+ uid = from_kuid(&init_user_ns, task_uid(current));
+ tty = audit_get_tty(current);
+
+ audit_log_format(ab, "op=set pid=%d uid=%u", task_tgid_nr(current), uid);
+ audit_log_task_context(ab);
+ audit_log_format(ab, " auid=%u tty=%s ses=%u opid=%d old-contid=%llu contid=%llu res=%d",
+ from_kuid(&init_user_ns, audit_get_loginuid(current)),
+ tty ? tty_name(tty) : "(none)", audit_get_sessionid(current),
+ task_tgid_nr(task), oldcontainerid, containerid, !rc);
+
+ audit_put_tty(tty);
+ audit_log_end(ab);
+}
+
+/**
+ * audit_set_containerid - set current task's audit_context containerid
+ * @containerid: containerid value
+ *
+ * Returns 0 on success, -EPERM on permission failure.
+ *
+ * Called (set) from fs/proc/base.c::proc_containerid_write().
+ */
+int audit_set_containerid(struct task_struct *task, u64 containerid)
+{
+ u64 oldcontainerid;
+ int rc;
+
+ oldcontainerid = audit_get_containerid(task);
+
+ rc = audit_set_containerid_perm(task, containerid);
+ if (!rc) {
+ task_lock(task);
+ task->containerid = containerid;
+ task_unlock(task);
+ }
+
+ audit_log_set_containerid(task, oldcontainerid, containerid, rc);
+ return rc;
+}
+
/**
* __audit_mq_open - record audit data for a POSIX MQ open
* @oflag: open flag
--
1.8.3.1
^ permalink raw reply related
* [RFC PATCH ghak32 V2 00/13] audit: implement container id
From: Richard Guy Briggs @ 2018-03-16 9:00 UTC (permalink / raw)
To: cgroups-u79uwXL29TY76Z2rM5mHXA,
containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
linux-api-u79uwXL29TY76Z2rM5mHXA, Linux-Audit Mailing List,
linux-fsdevel-u79uwXL29TY76Z2rM5mHXA, LKML,
netdev-u79uwXL29TY76Z2rM5mHXA
Cc: ebiederm-aS9lmoZGLiVWk0Htik3J/w, luto-DgEjT+Ai2ygdnm+yROfE0A,
jlayton-H+wXaHxf7aLQT0dZR+AlfA, carlos-H+wXaHxf7aLQT0dZR+AlfA,
dhowells-H+wXaHxf7aLQT0dZR+AlfA, madzcar-Re5JQEeQqe8AvxtiuMwx3w,
viro-RmSDqhL/yNMiFSDQTTA3OLVCufUGDwFn,
simo-H+wXaHxf7aLQT0dZR+AlfA, eparis-FjpueFixGhCM4zKIHC2jIg
Implement audit kernel container ID.
This patchset is a second RFC based on the proposal document (V3)
posted:
https://www.redhat.com/archives/linux-audit/2018-January/msg00014.html
The first patch implements the proc fs write to set the audit container
ID of a process, emitting an AUDIT_CONTAINER record to announce the
registration of that container ID on that process. This patch requires
userspace support for record acceptance and proper type display.
The second checks for children or co-threads and refuses to set the
container ID if either are present. (This policy could be changed to
set both with the same container ID provided they meet the rest of the
requirements.)
The third implements the auxiliary record AUDIT_CONTAINER_INFO if a
container ID is identifiable with an event. This patch requires
userspace support for proper type display.
The fourth adds container ID filtering to the exit, exclude and user
lists. This patch requires auditctil userspace support for the
--containerid option.
The 5th adds signal and ptrace support.
The 6th creates a local audit context to be able to bind a standalone
record with a locally created auxiliary record.
The 7th, 8th, 9th, 10th patches add container ID records to standalone
records. Some of these may end up being syscall auxiliary records and
won't need this specific support since they'll be supported via
syscalls.
The 11th adds network namespace container ID labelling based on member
tasks' container ID labels.
The 12th adds container ID support to standalone netfilter records that
don't have a task context and lists each container to which that net
namespace belongs.
The 13th implements reading the container ID from the proc filesystem
for debugging. This patch isn't planned for upstream inclusion.
Feedback please!
Example: Set a container ID of 123456 to the "sleep" task:
sleep 2&
child=$!
echo 123456 > /proc/$child/containerid; echo $?
ausearch -ts recent -m container
echo child:$child contid:$( cat /proc/$child/containerid)
This should produce a record such as:
type=CONTAINER msg=audit(1521122590.315:222): op=set pid=689 uid=0 subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 auid=0 tty=pts0 ses=3 opid=707 old-contid=18446744073709551615 contid=123456 res=1
Example: Set a filter on a container ID 123459 on /tmp/tmpcontainerid:
containerid=123459
key=tmpcontainerid
auditctl -a exit,always -F dir=/tmp -F perm=wa -F containerid=$containerid -F key=$key
perl -e "sleep 1; open(my \$tmpfile, '>', \"/tmp/$key\"); close(\$tmpfile);" &
child=$!
echo $containerid > /proc/$child/containerid
sleep 2
ausearch -i -ts recent -k $key
auditctl -d exit,always -F dir=/tmp -F perm=wa -F containerid=$containerid -F key=$key
rm -f /tmp/$key
This should produce an event such as:
type=CONTAINER_INFO msg=audit(1521122591.614:227): op=task contid=123459
type=PROCTITLE msg=audit(1521122591.614:227): proctitle=7065726C002D6500736C65657020313B206F70656E286D792024746D7066696C652C20273E272C20222F746D702F746D70636F6E7461696E6572696422293B20636C6F73652824746D7066696C65293B
type=PATH msg=audit(1521122591.614:227): item=1 name="/tmp/tmpcontainerid" inode=18427 dev=00:26 mode=0100644 ouid=0 ogid=0 rdev=00:00 obj=unconfined_u:object_r:user_tmp_t:s0 nametype=CREATE cap_fp=0000000000000000 cap_fi=0000000000000000 cap_fe=0 cap_fver=0
type=PATH msg=audit(1521122591.614:227): item=0 name="/tmp/" inode=13513 dev=00:26 mode=041777 ouid=0 ogid=0 rdev=00:00 obj=system_u:object_r:tmp_t:s0 nametype=PARENT cap_fp=0000000000000000 cap_fi=0000000000000000 cap_fe=0 cap_fver=0
type=CWD msg=audit(1521122591.614:227): cwd="/root"
type=SYSCALL msg=audit(1521122591.614:227): arch=c000003e syscall=257 success=yes exit=3 a0=ffffffffffffff9c a1=55db90a28900 a2=241 a3=1b6 items=2 ppid=689 pid=724 auid=0 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=pts0 ses=3 comm="perl" exe="/usr/bin/perl" subj=unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023 key="tmpcontainerid"
See:
https://github.com/linux-audit/audit-kernel/issues/32
https://github.com/linux-audit/audit-userspace/issues/40
https://github.com/linux-audit/audit-testsuite/issues/64
Richard Guy Briggs (13):
audit: add container id
audit: check children and threading before allowing containerid
audit: log container info of syscalls
audit: add containerid filtering
audit: add containerid support for ptrace and signals
audit: add support for non-syscall auxiliary records
audit: add container aux record to watch/tree/mark
audit: add containerid support for tty_audit
audit: add containerid support for config/feature/user records
audit: add containerid support for seccomp and anom_abend records
audit: add support for containerid to network namespaces
audit: NETFILTER_PKT: record each container ID associated with a netNS
debug audit: read container ID of a process
drivers/tty/tty_audit.c | 5 +-
fs/proc/base.c | 53 ++++++++++++++++
include/linux/audit.h | 43 +++++++++++++
include/linux/init_task.h | 4 +-
include/linux/sched.h | 1 +
include/net/net_namespace.h | 12 ++++
include/uapi/linux/audit.h | 8 ++-
kernel/audit.c | 75 ++++++++++++++++++++---
kernel/audit.h | 3 +
kernel/audit_fsnotify.c | 5 +-
kernel/audit_tree.c | 5 +-
kernel/audit_watch.c | 33 +++++-----
kernel/auditfilter.c | 52 +++++++++++++++-
kernel/auditsc.c | 145 ++++++++++++++++++++++++++++++++++++++++++--
kernel/nsproxy.c | 6 ++
net/core/net_namespace.c | 45 ++++++++++++++
net/netfilter/xt_AUDIT.c | 15 ++++-
17 files changed, 473 insertions(+), 37 deletions(-)
--
1.8.3.1
^ permalink raw reply
* [PATCH net-next] cxgb4: notify fatal error to uld drivers
From: Ganesh Goudar @ 2018-03-16 8:52 UTC (permalink / raw)
To: netdev, davem
Cc: nirranjan, dledford, jgg, linux-rdma, swise, indranil, venkatesh,
Ganesh Goudar
notify uld drivers if the adapter encounters fatal
error.
Signed-off-by: Ganesh Goudar <ganeshgr@chelsio.com>
---
drivers/infiniband/hw/cxgb4/device.c | 1 +
drivers/net/ethernet/chelsio/cxgb4/cxgb4.h | 1 +
drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 10 ++++++++++
drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h | 3 ++-
4 files changed, 14 insertions(+), 1 deletion(-)
diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c
index 7a9d0de..e96771d 100644
--- a/drivers/infiniband/hw/cxgb4/device.c
+++ b/drivers/infiniband/hw/cxgb4/device.c
@@ -1217,6 +1217,7 @@ static int c4iw_uld_state_change(void *handle, enum cxgb4_state new_state)
if (ctx->dev)
c4iw_remove(ctx);
break;
+ case CXGB4_STATE_FATAL_ERROR:
case CXGB4_STATE_START_RECOVERY:
pr_info("%s: Fatal Error\n", pci_name(ctx->lldi.pdev));
if (ctx->dev) {
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index b2df0ff..a5c0a64 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -917,6 +917,7 @@ struct adapter {
struct work_struct tid_release_task;
struct work_struct db_full_task;
struct work_struct db_drop_task;
+ struct work_struct fatal_err_notify_task;
bool tid_release_task_busy;
/* lock for mailbox cmd list */
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 5a349e15..72ec3f7 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -3255,6 +3255,14 @@ static const struct ethtool_ops cxgb4_mgmt_ethtool_ops = {
.get_drvinfo = cxgb4_mgmt_get_drvinfo,
};
+static void notify_fatal_err(struct work_struct *work)
+{
+ struct adapter *adap;
+
+ adap = container_of(work, struct adapter, fatal_err_notify_task);
+ notify_ulds(adap, CXGB4_STATE_FATAL_ERROR);
+}
+
void t4_fatal_err(struct adapter *adap)
{
int port;
@@ -3279,6 +3287,7 @@ void t4_fatal_err(struct adapter *adap)
netif_carrier_off(dev);
}
dev_alert(adap->pdev_dev, "encountered fatal error, adapter stopped\n");
+ queue_work(adap->workq, &adap->fatal_err_notify_task);
}
static void setup_memwin(struct adapter *adap)
@@ -5479,6 +5488,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
INIT_WORK(&adapter->tid_release_task, process_tid_release_list);
INIT_WORK(&adapter->db_full_task, process_db_full);
INIT_WORK(&adapter->db_drop_task, process_db_drop);
+ INIT_WORK(&adapter->fatal_err_notify_task, notify_fatal_err);
err = t4_prep_adapter(adapter);
if (err)
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
index a14e8db..788146c 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h
@@ -257,7 +257,8 @@ enum cxgb4_state {
CXGB4_STATE_UP,
CXGB4_STATE_START_RECOVERY,
CXGB4_STATE_DOWN,
- CXGB4_STATE_DETACH
+ CXGB4_STATE_DETACH,
+ CXGB4_STATE_FATAL_ERROR
};
enum cxgb4_control {
--
2.1.0
^ permalink raw reply related
* Re: [bpf-next V2 PATCH 10/15] xdp: rhashtable with allocator ID to pointer mapping
From: Jason Wang @ 2018-03-16 8:45 UTC (permalink / raw)
To: Jesper Dangaard Brouer
Cc: netdev, BjörnTöpel, magnus.karlsson, eugenia,
John Fastabend, Eran Ben Elisha, Saeed Mahameed, galp,
Daniel Borkmann, Alexei Starovoitov, Tariq Toukan
In-Reply-To: <20180309170735.5573f4e0@redhat.com>
On 2018年03月10日 00:07, Jesper Dangaard Brouer wrote:
> On Fri, 9 Mar 2018 21:07:36 +0800
> Jason Wang <jasowang@redhat.com> wrote:
>
>>>>> Use the IDA infrastructure for getting a cyclic increasing ID number,
>>>>> that is used for keeping track of each registered allocator per
>>>>> RX-queue xdp_rxq_info.
>>>>>
>>>>> Signed-off-by: Jesper Dangaard Brouer<brouer@redhat.com>
>>>> A stupid question is, can we manage to unify this ID with NAPI id?
>>> Sorry I don't understand the question?
>> I mean can we associate page poll pointer to napi_struct, record NAPI id
>> in xdp_mem_info and do lookup through NAPI id?
> No. The driver can unreg/reg a new XDP memory model,
Is there an actual use case for this?
> without reloading
> the NAPI and generate a new NAPI id.
>
We could still do this by e.g a allocator pointer protected by RCU I think?
Thanks
^ permalink raw reply
* Re: [bpf-next V2 PATCH 02/15] xdp: introduce xdp_return_frame API and use in cpumap
From: Jason Wang @ 2018-03-16 8:43 UTC (permalink / raw)
To: Jesper Dangaard Brouer
Cc: netdev, BjörnTöpel, magnus.karlsson, eugenia,
John Fastabend, Eran Ben Elisha, Saeed Mahameed, galp,
Daniel Borkmann, Alexei Starovoitov, Tariq Toukan
In-Reply-To: <20180309170520.11967b11@redhat.com>
On 2018年03月10日 00:05, Jesper Dangaard Brouer wrote:
> On Fri, 9 Mar 2018 21:04:23 +0800
> Jason Wang<jasowang@redhat.com> wrote:
>
>> On 2018年03月09日 17:35, Jesper Dangaard Brouer wrote:
>>> On Fri, 9 Mar 2018 15:24:10 +0800
>>> Jason Wang<jasowang@redhat.com> wrote:
>>>
>>>> On 2018年03月08日 21:07, Jesper Dangaard Brouer wrote:
>>>>> Introduce an xdp_return_frame API, and convert over cpumap as
>>>>> the first user, given it have queued XDP frame structure to leverage.
>>>>>
>>>>> Signed-off-by: Jesper Dangaard Brouer<brouer@redhat.com>
>>>>> ---
>>>>> include/net/xdp.h | 32 +++++++++++++++++++++++++++
>>>>> kernel/bpf/cpumap.c | 60 +++++++++++++++++++++++++++++++--------------------
>>>>> net/core/xdp.c | 18 +++++++++++++++
>>>>> 3 files changed, 86 insertions(+), 24 deletions(-)
>>>>>
>>>>> diff --git a/include/net/xdp.h b/include/net/xdp.h
>>>>> index b2362ddfa694..3cb726a6dc5b 100644
>>>>> --- a/include/net/xdp.h
>>>>> +++ b/include/net/xdp.h
>>>>> @@ -33,16 +33,48 @@
>>>>> * also mandatory during RX-ring setup.
>>>>> */
>>>>>
>>>>> +enum mem_type {
>>>>> + MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */
>>>>> + MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */
>>>>> + // Possible new ideas for types:
>>>>> + // MEM_TYPE_PAGE_POOL, /* Will be added later */
>>>>> + // MEM_TYPE_AF_XDP,
>>>>> + // MEM_TYPE_DEVICE_OWNED -- invoking an dev->ndo?
>>>>> + MEM_TYPE_MAX,
>>>>> +};
>>>> So if we plan to support dev->ndo, it looks to me two types AF_XDP and
>>>> DEVICE_OWNED are sufficient? Driver can do what it wants (e.g page pool
>>>> or ordinary page allocator) in ndo or what ever other callbacks.
>>> So, the design is open to go both ways, we can figure out later.
>>>
>>> The reason I'm not calling page_pool from a driver level dev->ndo, is
>>> that I'm trying to avoid invoking code in a module, as the driver
>>> module code can be (in the process of being) unloaded from the kernel,
>>> while another driver is calling xdp_return_frame. The current design
>>> in patch 10, uses the RCU period to guarantee that the allocator
>>> pointer is valid as long as the ID lookup succeeded.
>>>
>>> To do what you propose, we also need to guarantee that the net_device
>>> cannot disappear so it is safe to invoke dev->ndo. To do so, the
>>> driver likely have to add a rcu_barrier before unloading. I'm also
>>> thinking that for MEM_TYPE_DEVICE_OWNED the allocator pointer ("under
>>> protection") could be the net_device pointer, and we could take a
>>> refcnt on dev (dev_hold/dev_put).
>> Then it looks like a device (e.g tun) can lock another device for
>> indefinite time.
>>
>>> Thus, I think it is doable, but lets
>>> figure this out later.
>>>
>>> Another important design consideration, is that the xdp core need to
>>> know how to release memory in case the ID lookup failed.
>> Can we simply use put_page() in this case?
> For now, yes. The (future) use case is when pages are kept DMA mapped,
> then the page also need to be DMA unmapped.
This probably needs careful thought. E.g current virtio-net driver does
not do mapping by itself, instead it depends on the virtio core to do
map and unmap.
> To DMA unmap we need to
> know the struct device (not net_device). (And then the discussion
> start around how to know struct device is still valid).
So still need a safe fallback if I understand this correctly.
Thanks
^ permalink raw reply
* Re: [PATCH RFC 2/2] virtio_ring: support packed ring
From: Jason Wang @ 2018-03-16 8:34 UTC (permalink / raw)
To: Tiwei Bie; +Cc: mst, netdev, linux-kernel, virtualization, wexu
In-Reply-To: <20180316074028.lun2jy45qqnfeymw@debian>
On 2018年03月16日 15:40, Tiwei Bie wrote:
> On Fri, Mar 16, 2018 at 02:44:12PM +0800, Jason Wang wrote:
>> On 2018年03月16日 14:10, Tiwei Bie wrote:
>>> On Fri, Mar 16, 2018 at 12:03:25PM +0800, Jason Wang wrote:
>>>> On 2018年02月23日 19:18, Tiwei Bie wrote:
>>>>> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
>>>>> ---
>>>>> drivers/virtio/virtio_ring.c | 699 +++++++++++++++++++++++++++++++++++++------
>>>>> include/linux/virtio_ring.h | 8 +-
>>>>> 2 files changed, 618 insertions(+), 89 deletions(-)
> [...]
>>>>> cpu_addr, size, direction);
>>>>> }
>>>>> -static void vring_unmap_one(const struct vring_virtqueue *vq,
>>>>> - struct vring_desc *desc)
>>>>> +static void vring_unmap_one(const struct vring_virtqueue *vq, void *_desc)
>>>>> {
>>>> Let's split the helpers to packed/split version like other helpers?
>>>> (Consider the caller has already known the type of vq).
>>> Okay.
>>>
>> [...]
>>
>>>>> + desc[i].flags = flags;
>>>>> +
>>>>> + desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
>>>>> + desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
>>>>> + desc[i].id = cpu_to_virtio32(_vq->vdev, head);
>>>> If it's a part of chain, we only need to do this for last buffer I think.
>>> I'm not sure I've got your point about the "last buffer".
>>> But, yes, id just needs to be set for the last desc.
>> Right, I think I meant "last descriptor" :)
>>
>>>>> + prev = i;
>>>>> + i++;
>>>> It looks to me prev is always i - 1?
>>> No. prev will be (vq->vring_packed.num - 1) when i becomes 0.
>> Right, so prev = i ? i - 1 : vq->vring_packed.num - 1.
> Yes, i wraps together with vq->wrap_counter in following code:
>
>>>>> + if (!indirect && i >= vq->vring_packed.num) {
>>>>> + i = 0;
>>>>> + vq->wrap_counter ^= 1;
>>>>> + }
>
>>>>> + }
>>>>> + }
>>>>> + for (; n < (out_sgs + in_sgs); n++) {
>>>>> + for (sg = sgs[n]; sg; sg = sg_next(sg)) {
>>>>> + dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_FROM_DEVICE);
>>>>> + if (vring_mapping_error(vq, addr))
>>>>> + goto unmap_release;
>>>>> +
>>>>> + flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT |
>>>>> + VRING_DESC_F_WRITE |
>>>>> + VRING_DESC_F_AVAIL(vq->wrap_counter) |
>>>>> + VRING_DESC_F_USED(!vq->wrap_counter));
>>>>> + if (!indirect && i == head)
>>>>> + head_flags = flags;
>>>>> + else
>>>>> + desc[i].flags = flags;
>>>>> +
>>>>> + desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
>>>>> + desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
>>>>> + desc[i].id = cpu_to_virtio32(_vq->vdev, head);
>>>>> + prev = i;
>>>>> + i++;
>>>>> + if (!indirect && i >= vq->vring_packed.num) {
>>>>> + i = 0;
>>>>> + vq->wrap_counter ^= 1;
>>>>> + }
>>>>> + }
>>>>> + }
>>>>> + /* Last one doesn't continue. */
>>>>> + if (!indirect && (head + 1) % vq->vring_packed.num == i)
>>>>> + head_flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
>>>> I can't get the why we need this here.
>>> If only one desc is used, we will need to clear the
>>> VRING_DESC_F_NEXT flag from the head_flags.
>> Yes, I meant why following desc[prev].flags won't work for this?
> Because the update of desc[head].flags (in above case,
> prev == head) has been delayed. The flags is saved in
> head_flags.
Ok, but let's try to avoid modular here e.g tracking the number of sgs
in a counter.
And I see lots of duplication in the above two loops, I believe we can
unify them with a a single loop. the only difference is dma direction
and write flag.
>
>>>>> + else
>>>>> + desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
>>>>> +
>>>>> + if (indirect) {
>>>>> + /* FIXME: to be implemented */
>>>>> +
>>>>> + /* Now that the indirect table is filled in, map it. */
>>>>> + dma_addr_t addr = vring_map_single(
>>>>> + vq, desc, total_sg * sizeof(struct vring_packed_desc),
>>>>> + DMA_TO_DEVICE);
>>>>> + if (vring_mapping_error(vq, addr))
>>>>> + goto unmap_release;
>>>>> +
>>>>> + head_flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT |
>>>>> + VRING_DESC_F_AVAIL(wrap_counter) |
>>>>> + VRING_DESC_F_USED(!wrap_counter));
>>>>> + vq->vring_packed.desc[head].addr = cpu_to_virtio64(_vq->vdev, addr);
>>>>> + vq->vring_packed.desc[head].len = cpu_to_virtio32(_vq->vdev,
>>>>> + total_sg * sizeof(struct vring_packed_desc));
>>>>> + vq->vring_packed.desc[head].id = cpu_to_virtio32(_vq->vdev, head);
>>>>> + }
>>>>> +
>>>>> + /* We're using some buffers from the free list. */
>>>>> + vq->vq.num_free -= descs_used;
>>>>> +
>>>>> + /* Update free pointer */
>>>>> + if (indirect) {
>>>>> + n = head + 1;
>>>>> + if (n >= vq->vring_packed.num) {
>>>>> + n = 0;
>>>>> + vq->wrap_counter ^= 1;
>>>>> + }
>>>>> + vq->free_head = n;
>>>> detach_buf_packed() does not even touch free_head here, so need to explain
>>>> its meaning for packed ring.
>>> Above code is for indirect support which isn't really
>>> implemented in this patch yet.
>>>
>>> For your question, free_head stores the index of the
>>> next avail desc. I'll add a comment for it or move it
>>> to union and give it a better name in next version.
>> Yes, something like avail_idx might be better.
>>
>>>>> + } else
>>>>> + vq->free_head = i;
>>>> ID is only valid in the last descriptor in the list, so head + 1 should be
>>>> ok too?
>>> I don't really get your point. The vq->free_head stores
>>> the index of the next avail desc.
>> I think I get your idea now, free_head has two meanings:
>>
>> - next avail index
>> - buffer id
> In my design, free_head is just the index of the next
> avail desc.
>
> Driver can set anything to buffer ID.
Then you need another method to track id to context e.g hashing.
> And in my design,
> I save desc index in buffer ID.
>
> I'll add comments for them.
>
>> If I'm correct, let's better add a comment for this.
>>
>>>>> +
>>>>> + /* Store token and indirect buffer state. */
>>>>> + vq->desc_state[head].num = descs_used;
>>>>> + vq->desc_state[head].data = data;
>>>>> + if (indirect)
>>>>> + vq->desc_state[head].indir_desc = desc;
>>>>> + else
>>>>> + vq->desc_state[head].indir_desc = ctx;
>>>>> +
>>>>> + virtio_wmb(vq->weak_barriers);
>>>> Let's add a comment to explain the barrier here.
>>> Okay.
>>>
>>>>> + vq->vring_packed.desc[head].flags = head_flags;
>>>>> + vq->num_added++;
>>>>> +
>>>>> + pr_debug("Added buffer head %i to %p\n", head, vq);
>>>>> + END_USE(vq);
>>>>> +
>>>>> + return 0;
>>>>> +
>>>>> +unmap_release:
>>>>> + err_idx = i;
>>>>> + i = head;
>>>>> +
>>>>> + for (n = 0; n < total_sg; n++) {
>>>>> + if (i == err_idx)
>>>>> + break;
>>>>> + vring_unmap_one(vq, &desc[i]);
>>>>> + i++;
>>>>> + if (!indirect && i >= vq->vring_packed.num)
>>>>> + i = 0;
>>>>> + }
>>>>> +
>>>>> + vq->wrap_counter = wrap_counter;
>>>>> +
>>>>> + if (indirect)
>>>>> + kfree(desc);
>>>>> +
>>>>> + END_USE(vq);
>>>>> + return -EIO;
>>>>> +}
> [...]
>>>>> @@ -1096,17 +1599,21 @@ struct virtqueue *vring_create_virtqueue(
>>>>> if (!queue) {
>>>>> /* Try to get a single page. You are my only hope! */
>>>>> - queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
>>>>> + queue = vring_alloc_queue(vdev, __vring_size(num, vring_align,
>>>>> + packed),
>>>>> &dma_addr, GFP_KERNEL|__GFP_ZERO);
>>>>> }
>>>>> if (!queue)
>>>>> return NULL;
>>>>> - queue_size_in_bytes = vring_size(num, vring_align);
>>>>> - vring_init(&vring, num, queue, vring_align);
>>>>> + queue_size_in_bytes = __vring_size(num, vring_align, packed);
>>>>> + if (packed)
>>>>> + vring_packed_init(&vring.vring_packed, num, queue, vring_align);
>>>>> + else
>>>>> + vring_init(&vring.vring_split, num, queue, vring_align);
>>>> Let's rename vring_init to vring_init_split() like other helpers?
>>> The vring_init() is a public API in include/uapi/linux/virtio_ring.h.
>>> I don't think we can rename it.
>> I see, then this need more thoughts to unify the API.
> My thought is to keep the old API as is, and introduce
> new types and helpers for packed ring.
I admit it's not a fault of this patch. But we'd better think of this in
the future, consider we may have new kinds of ring.
>
> More details can be found in this patch:
> https://lkml.org/lkml/2018/2/23/243
> (PS. The type which has bit fields is just for reference,
> and will be changed in next version.)
>
> Do you have any other suggestions?
No.
Thanks
>
> Best regards,
> Tiwei Bie
>
>>>>> - vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
>>>>> - notify, callback, name);
>>>>> + vq = __vring_new_virtqueue(index, vring, packed, vdev, weak_barriers,
>>>>> + context, notify, callback, name);
>>>>> if (!vq) {
>>>>> vring_free_queue(vdev, queue_size_in_bytes, queue,
>>>>> dma_addr);
> [...]
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply
* Re: [PATCH bpf-next 0/4] tools: bpf: minor build improvements
From: Daniel Borkmann @ 2018-03-16 8:29 UTC (permalink / raw)
To: Jakub Kicinski, alexei.starovoitov; +Cc: oss-drivers, netdev
In-Reply-To: <20180316062617.10254-1-jakub.kicinski@netronome.com>
On 03/16/2018 07:26 AM, Jakub Kicinski wrote:
> Hi!
>
> As promised this series addresses nits and minor issues in tools/bpf
> build infra. One GCC-7 warning which is nice to get rid of. Dependencies
> when built with OUTPUT are fixed. make clean will now remove the
> FEATURE-DUMP.* files. PHONY target is also updated to match reality.
>
> Jakub Kicinski (4):
> tools: bpftool: fix dependency file path
> tools: bpftool: fix potential format truncation
> tools: bpf: cleanup PHONY target
> tools: bpf: remove feature detection output
>
> tools/bpf/Makefile | 4 +++-
> tools/bpf/bpftool/Makefile | 4 +++-
> tools/bpf/bpftool/xlated_dumper.h | 2 +-
> 3 files changed, 7 insertions(+), 3 deletions(-)
Applied to bpf-next, thanks Jakub!
^ permalink raw reply
* Re: [PATCH net 5/5] net/sched: fix NULL dereference on the error path of tcf_skbmod_init()
From: Jiri Pirko @ 2018-03-16 8:27 UTC (permalink / raw)
To: Davide Caratti
Cc: Cong Wang, David S. Miller, Roman Mashak, Manish Kurup, netdev
In-Reply-To: <19f092b6f400a8bcc135bb45dc0dd4f4b60add7b.1521154629.git.dcaratti@redhat.com>
Fri, Mar 16, 2018 at 12:00:57AM CET, dcaratti@redhat.com wrote:
>when the following command
>
> # tc action replace action skbmod swap mac index 100
>
>is run for the first time, and tcf_skbmod_init() fails to allocate struct
>tcf_skbmod_params, tcf_skbmod_cleanup() calls kfree_rcu(NULL), thus
>causing the following error:
>
> BUG: unable to handle kernel NULL pointer dereference at 0000000000000008
> IP: __call_rcu+0x23/0x2b0
> PGD 8000000034057067 P4D 8000000034057067 PUD 74937067 PMD 0
> Oops: 0002 [#1] SMP PTI
> Modules linked in: act_skbmod(E) psample ip6table_filter ip6_tables iptable_filter binfmt_misc ext4 snd_hda_codec_generic snd_hda_intel snd_hda_codec crct10dif_pclmul mbcache jbd2 crc32_pclmul snd_hda_core ghash_clmulni_intel snd_hwdep pcbc snd_seq snd_seq_device snd_pcm aesni_intel snd_timer crypto_simd glue_helper snd cryptd virtio_balloon joydev soundcore pcspkr i2c_piix4 nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c ata_generic pata_acpi qxl drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm drm virtio_console virtio_net virtio_blk ata_piix libata crc32c_intel virtio_pci serio_raw virtio_ring virtio i2c_core floppy dm_mirror dm_region_hash dm_log dm_mod [last unloaded: act_skbmod]
> CPU: 3 PID: 3144 Comm: tc Tainted: G E 4.16.0-rc4.act_vlan.orig+ #403
> Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
> RIP: 0010:__call_rcu+0x23/0x2b0
> RSP: 0018:ffffbd2e403e7798 EFLAGS: 00010246
> RAX: ffffffffc0872080 RBX: ffff981d34bff780 RCX: 00000000ffffffff
> RDX: ffffffff922a5f00 RSI: 0000000000000000 RDI: 0000000000000000
> RBP: 0000000000000000 R08: 0000000000000001 R09: 000000000000021f
> R10: 000000003d003000 R11: 0000000000aaaaaa R12: 0000000000000000
> R13: ffffffff922a5f00 R14: 0000000000000001 R15: ffff981d3b698c2c
> FS: 00007f3678292740(0000) GS:ffff981d3fd80000(0000) knlGS:0000000000000000
> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 0000000000000008 CR3: 000000007c57a006 CR4: 00000000001606e0
> Call Trace:
> __tcf_idr_release+0x79/0xf0
> tcf_skbmod_init+0x1d1/0x210 [act_skbmod]
> tcf_action_init_1+0x2cc/0x430
> tcf_action_init+0xd3/0x1b0
> tc_ctl_action+0x18b/0x240
> rtnetlink_rcv_msg+0x29c/0x310
> ? _cond_resched+0x15/0x30
> ? __kmalloc_node_track_caller+0x1b9/0x270
> ? rtnl_calcit.isra.28+0x100/0x100
> netlink_rcv_skb+0xd2/0x110
> netlink_unicast+0x17c/0x230
> netlink_sendmsg+0x2cd/0x3c0
> sock_sendmsg+0x30/0x40
> ___sys_sendmsg+0x27a/0x290
> ? filemap_map_pages+0x34a/0x3a0
> ? __handle_mm_fault+0xbfd/0xe20
> __sys_sendmsg+0x51/0x90
> do_syscall_64+0x6e/0x1a0
> entry_SYSCALL_64_after_hwframe+0x3d/0xa2
> RIP: 0033:0x7f36776a3ba0
> RSP: 002b:00007fff4703b618 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
> RAX: ffffffffffffffda RBX: 00007fff4703b740 RCX: 00007f36776a3ba0
> RDX: 0000000000000000 RSI: 00007fff4703b690 RDI: 0000000000000003
> RBP: 000000005aaaba36 R08: 0000000000000002 R09: 0000000000000000
> R10: 00007fff4703b0a0 R11: 0000000000000246 R12: 0000000000000000
> R13: 00007fff4703b754 R14: 0000000000000001 R15: 0000000000669f60
> Code: 5d e9 42 da ff ff 66 90 0f 1f 44 00 00 41 57 41 56 41 55 49 89 d5 41 54 55 48 89 fd 53 48 83 ec 08 40 f6 c7 07 0f 85 19 02 00 00 <48> 89 75 08 48 c7 45 00 00 00 00 00 9c 58 0f 1f 44 00 00 49 89
> RIP: __call_rcu+0x23/0x2b0 RSP: ffffbd2e403e7798
> CR2: 0000000000000008
>
>Fix it in tcf_skbmod_cleanup(), ensuring that kfree_rcu(p, ...) is called
>only when p is not NULL.
>
>Fixes: 86da71b57383 ("net_sched: Introduce skbmod action")
>Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
^ permalink raw reply
* Re: [PATCH net 4/5] net/sched: fix NULL dereference in the error path of tcf_sample_init()
From: Jiri Pirko @ 2018-03-16 8:27 UTC (permalink / raw)
To: Davide Caratti
Cc: Cong Wang, David S. Miller, Roman Mashak, Manish Kurup, netdev
In-Reply-To: <b271a299b1a3693b472b74cd48874b151392c66b.1521154629.git.dcaratti@redhat.com>
Fri, Mar 16, 2018 at 12:00:56AM CET, dcaratti@redhat.com wrote:
>when the following command
>
> # tc action add action sample rate 100 group 100 index 100
>
>is run for the first time, and psample_group_get(100) fails to create a
>new group, tcf_sample_cleanup() calls psample_group_put(NULL), thus
>causing the following error:
>
> BUG: unable to handle kernel NULL pointer dereference at 000000000000001c
> IP: psample_group_put+0x15/0x71 [psample]
> PGD 8000000075775067 P4D 8000000075775067 PUD 7453c067 PMD 0
> Oops: 0002 [#1] SMP PTI
> Modules linked in: act_sample(E) psample ip6table_filter ip6_tables iptable_filter binfmt_misc ext4 snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hda_core mbcache jbd2 crct10dif_pclmul snd_hwdep crc32_pclmul snd_seq ghash_clmulni_intel pcbc snd_seq_device snd_pcm aesni_intel crypto_simd snd_timer glue_helper snd cryptd joydev pcspkr i2c_piix4 soundcore virtio_balloon nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c ata_generic pata_acpi qxl drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm drm virtio_net ata_piix virtio_console virtio_blk libata serio_raw crc32c_intel virtio_pci i2c_core virtio_ring virtio floppy dm_mirror dm_region_hash dm_log dm_mod [last unloaded: act_tunnel_key]
> CPU: 2 PID: 5740 Comm: tc Tainted: G E 4.16.0-rc4.act_vlan.orig+ #403
> Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
> RIP: 0010:psample_group_put+0x15/0x71 [psample]
> RSP: 0018:ffffb8a80032f7d0 EFLAGS: 00010246
> RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000024
> RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffffffffc06d93c0
> RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000044
> R10: 00000000bd003000 R11: ffff979fba04aa59 R12: 0000000000000000
> R13: 0000000000000000 R14: 0000000000000000 R15: ffff979fbba3f22c
> FS: 00007f7638112740(0000) GS:ffff979fbfd00000(0000) knlGS:0000000000000000
> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 000000000000001c CR3: 00000000734ea001 CR4: 00000000001606e0
> Call Trace:
> __tcf_idr_release+0x79/0xf0
> tcf_sample_init+0x125/0x1d0 [act_sample]
> tcf_action_init_1+0x2cc/0x430
> tcf_action_init+0xd3/0x1b0
> tc_ctl_action+0x18b/0x240
> rtnetlink_rcv_msg+0x29c/0x310
> ? _cond_resched+0x15/0x30
> ? __kmalloc_node_track_caller+0x1b9/0x270
> ? rtnl_calcit.isra.28+0x100/0x100
> netlink_rcv_skb+0xd2/0x110
> netlink_unicast+0x17c/0x230
> netlink_sendmsg+0x2cd/0x3c0
> sock_sendmsg+0x30/0x40
> ___sys_sendmsg+0x27a/0x290
> ? filemap_map_pages+0x34a/0x3a0
> ? __handle_mm_fault+0xbfd/0xe20
> __sys_sendmsg+0x51/0x90
> do_syscall_64+0x6e/0x1a0
> entry_SYSCALL_64_after_hwframe+0x3d/0xa2
> RIP: 0033:0x7f7637523ba0
> RSP: 002b:00007fff0473ef58 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
> RAX: ffffffffffffffda RBX: 00007fff0473f080 RCX: 00007f7637523ba0
> RDX: 0000000000000000 RSI: 00007fff0473efd0 RDI: 0000000000000003
> RBP: 000000005aaaac80 R08: 0000000000000002 R09: 0000000000000000
> R10: 00007fff0473e9e0 R11: 0000000000000246 R12: 0000000000000000
> R13: 00007fff0473f094 R14: 0000000000000001 R15: 0000000000669f60
> Code: be 02 00 00 00 48 89 df e8 a9 fe ff ff e9 7c ff ff ff 0f 1f 40 00 0f 1f 44 00 00 53 48 89 fb 48 c7 c7 c0 93 6d c0 e8 db 20 8c ef <83> 6b 1c 01 74 10 48 c7 c7 c0 93 6d c0 ff 14 25 e8 83 83 b0 5b
> RIP: psample_group_put+0x15/0x71 [psample] RSP: ffffb8a80032f7d0
> CR2: 000000000000001c
>
>Fix it in tcf_sample_cleanup(), ensuring that calls to psample_group_put(p)
>are done only when p is not NULL.
>
>Fixes: cadb9c9fdbc6 ("net/sched: act_sample: Fix error path in init")
>Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
^ permalink raw reply
* Re: [PATCH net 3/5] net/sched: fix NULL dereference in the error path of tunnel_key_init()
From: Jiri Pirko @ 2018-03-16 8:26 UTC (permalink / raw)
To: Davide Caratti
Cc: Cong Wang, David S. Miller, Roman Mashak, Manish Kurup, netdev
In-Reply-To: <1e5eec19547fb24e358551c7d6b789aba4702f6b.1521154629.git.dcaratti@redhat.com>
Fri, Mar 16, 2018 at 12:00:55AM CET, dcaratti@redhat.com wrote:
>when the following command
>
> # tc action add action tunnel_key unset index 100
>
>is run for the first time, and tunnel_key_init() fails to allocate struct
>tcf_tunnel_key_params, tunnel_key_release() dereferences NULL pointers.
>This causes the following error:
>
> BUG: unable to handle kernel NULL pointer dereference at 0000000000000010
> IP: tunnel_key_release+0xd/0x40 [act_tunnel_key]
> PGD 8000000033787067 P4D 8000000033787067 PUD 74646067 PMD 0
> Oops: 0000 [#1] SMP PTI
> Modules linked in: act_tunnel_key(E) act_csum ip6table_filter ip6_tables iptable_filter binfmt_misc ext4 mbcache jbd2 crct10dif_pclmul crc32_pclmul snd_hda_codec_generic ghash_clmulni_intel snd_hda_intel pcbc snd_hda_codec snd_hda_core snd_hwdep snd_seq aesni_intel snd_seq_device crypto_simd glue_helper snd_pcm cryptd joydev snd_timer pcspkr virtio_balloon snd i2c_piix4 soundcore nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c ata_generic pata_acpi qxl drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm virtio_net virtio_blk drm virtio_console crc32c_intel ata_piix serio_raw i2c_core virtio_pci libata virtio_ring virtio floppy dm_mirror dm_region_hash dm_log dm_mod
> CPU: 2 PID: 3101 Comm: tc Tainted: G E 4.16.0-rc4.act_vlan.orig+ #403
> Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
> RIP: 0010:tunnel_key_release+0xd/0x40 [act_tunnel_key]
> RSP: 0018:ffffba46803b7768 EFLAGS: 00010286
> RAX: ffffffffc09010a0 RBX: 0000000000000000 RCX: 0000000000000024
> RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff99ee336d7480
> RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000044
> R10: 0000000000000220 R11: ffff99ee79d73131 R12: 0000000000000000
> R13: ffff99ee32d67610 R14: ffff99ee7671dc38 R15: 00000000fffffff4
> FS: 00007febcb2cd740(0000) GS:ffff99ee7fd00000(0000) knlGS:0000000000000000
> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 0000000000000010 CR3: 000000007c8e4005 CR4: 00000000001606e0
> Call Trace:
> __tcf_idr_release+0x79/0xf0
> tunnel_key_init+0xd9/0x460 [act_tunnel_key]
> tcf_action_init_1+0x2cc/0x430
> tcf_action_init+0xd3/0x1b0
> tc_ctl_action+0x18b/0x240
> rtnetlink_rcv_msg+0x29c/0x310
> ? _cond_resched+0x15/0x30
> ? __kmalloc_node_track_caller+0x1b9/0x270
> ? rtnl_calcit.isra.28+0x100/0x100
> netlink_rcv_skb+0xd2/0x110
> netlink_unicast+0x17c/0x230
> netlink_sendmsg+0x2cd/0x3c0
> sock_sendmsg+0x30/0x40
> ___sys_sendmsg+0x27a/0x290
> __sys_sendmsg+0x51/0x90
> do_syscall_64+0x6e/0x1a0
> entry_SYSCALL_64_after_hwframe+0x3d/0xa2
> RIP: 0033:0x7febca6deba0
> RSP: 002b:00007ffe7b0dd128 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
> RAX: ffffffffffffffda RBX: 00007ffe7b0dd250 RCX: 00007febca6deba0
> RDX: 0000000000000000 RSI: 00007ffe7b0dd1a0 RDI: 0000000000000003
> RBP: 000000005aaa90cb R08: 0000000000000002 R09: 0000000000000000
> R10: 00007ffe7b0dcba0 R11: 0000000000000246 R12: 0000000000000000
> R13: 00007ffe7b0dd264 R14: 0000000000000001 R15: 0000000000669f60
> Code: 44 00 00 8b 0d b5 23 00 00 48 8b 87 48 10 00 00 48 8b 3c c8 e9 a5 e5 d8 c3 0f 1f 44 00 00 0f 1f 44 00 00 53 48 8b 9f b0 00 00 00 <83> 7b 10 01 74 0b 48 89 df 31 f6 5b e9 f2 fa 7f c3 48 8b 7b 18
> RIP: tunnel_key_release+0xd/0x40 [act_tunnel_key] RSP: ffffba46803b7768
> CR2: 0000000000000010
>
>Fix this in tunnel_key_release(), ensuring 'param' is not NULL before
>dereferencing it.
>
>Fixes: d0f6dd8a914f ("net/sched: Introduce act_tunnel_key")
>Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
^ permalink raw reply
* Re: [PATCH net 2/5] net/sched: fix NULL dereference in the error path of tcf_csum_init()
From: Jiri Pirko @ 2018-03-16 8:26 UTC (permalink / raw)
To: Davide Caratti
Cc: Cong Wang, David S. Miller, Roman Mashak, Manish Kurup, netdev
In-Reply-To: <cb08146799e6e0a9660ad2a384f76795858b934b.1521154629.git.dcaratti@redhat.com>
Fri, Mar 16, 2018 at 12:00:54AM CET, dcaratti@redhat.com wrote:
>when the following command
>
> # tc action add action csum udp continue index 100
>
>is run for the first time, and tcf_csum_init() fails allocating struct
>tcf_csum, tcf_csum_cleanup() calls kfree_rcu(NULL,...). This causes the
>following error:
>
> BUG: unable to handle kernel NULL pointer dereference at 0000000000000010
> IP: __call_rcu+0x23/0x2b0
> PGD 80000000740b4067 P4D 80000000740b4067 PUD 32e7f067 PMD 0
> Oops: 0002 [#1] SMP PTI
> Modules linked in: act_csum(E) act_vlan ip6table_filter ip6_tables iptable_filter binfmt_misc ext4 mbcache jbd2 crct10dif_pclmul crc32_pclmul ghash_clmulni_intel snd_hda_codec_generic pcbc snd_hda_intel snd_hda_codec snd_hda_core snd_hwdep snd_seq snd_seq_device snd_pcm snd_timer aesni_intel crypto_simd glue_helper cryptd snd joydev pcspkr virtio_balloon i2c_piix4 soundcore nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c ata_generic pata_acpi qxl drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm virtio_blk drm virtio_net virtio_console ata_piix crc32c_intel libata virtio_pci serio_raw i2c_core virtio_ring virtio floppy dm_mirror dm_region_hash dm_log dm_mod [last unloaded: act_vlan]
> CPU: 2 PID: 5763 Comm: tc Tainted: G E 4.16.0-rc4.act_vlan.orig+ #403
> Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
> RIP: 0010:__call_rcu+0x23/0x2b0
> RSP: 0018:ffffb275803e77c0 EFLAGS: 00010246
> RAX: ffffffffc057b080 RBX: ffff9674bc6f5240 RCX: 00000000ffffffff
> RDX: ffffffff928a5f00 RSI: 0000000000000008 RDI: 0000000000000008
> RBP: 0000000000000008 R08: 0000000000000001 R09: 0000000000000044
> R10: 0000000000000220 R11: ffff9674b9ab4821 R12: 0000000000000000
> R13: ffffffff928a5f00 R14: 0000000000000000 R15: 0000000000000001
> FS: 00007fa6368d8740(0000) GS:ffff9674bfd00000(0000) knlGS:0000000000000000
> CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> CR2: 0000000000000010 CR3: 0000000073dec001 CR4: 00000000001606e0
> Call Trace:
> __tcf_idr_release+0x79/0xf0
> tcf_csum_init+0xfb/0x180 [act_csum]
> tcf_action_init_1+0x2cc/0x430
> tcf_action_init+0xd3/0x1b0
> tc_ctl_action+0x18b/0x240
> rtnetlink_rcv_msg+0x29c/0x310
> ? _cond_resched+0x15/0x30
> ? __kmalloc_node_track_caller+0x1b9/0x270
> ? rtnl_calcit.isra.28+0x100/0x100
> netlink_rcv_skb+0xd2/0x110
> netlink_unicast+0x17c/0x230
> netlink_sendmsg+0x2cd/0x3c0
> sock_sendmsg+0x30/0x40
> ___sys_sendmsg+0x27a/0x290
> ? filemap_map_pages+0x34a/0x3a0
> ? __handle_mm_fault+0xbfd/0xe20
> __sys_sendmsg+0x51/0x90
> do_syscall_64+0x6e/0x1a0
> entry_SYSCALL_64_after_hwframe+0x3d/0xa2
> RIP: 0033:0x7fa635ce9ba0
> RSP: 002b:00007ffc185b0fc8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
> RAX: ffffffffffffffda RBX: 00007ffc185b10f0 RCX: 00007fa635ce9ba0
> RDX: 0000000000000000 RSI: 00007ffc185b1040 RDI: 0000000000000003
> RBP: 000000005aaa85e0 R08: 0000000000000002 R09: 0000000000000000
> R10: 00007ffc185b0a20 R11: 0000000000000246 R12: 0000000000000000
> R13: 00007ffc185b1104 R14: 0000000000000001 R15: 0000000000669f60
> Code: 5d e9 42 da ff ff 66 90 0f 1f 44 00 00 41 57 41 56 41 55 49 89 d5 41 54 55 48 89 fd 53 48 83 ec 08 40 f6 c7 07 0f 85 19 02 00 00 <48> 89 75 08 48 c7 45 00 00 00 00 00 9c 58 0f 1f 44 00 00 49 89
> RIP: __call_rcu+0x23/0x2b0 RSP: ffffb275803e77c0
> CR2: 0000000000000010
>
>fix this in tcf_csum_cleanup(), ensuring that kfree_rcu(param, ...) is
>called only when param is not NULL.
>
>Fixes: 9c5f69bbd75a ("net/sched: act_csum: don't use spinlock in the fast path")
>Signed-off-by: Davide Caratti <dcaratti@redhat.com>
Acked-by: Jiri Pirko <jiri@mellanox.com>
^ permalink raw reply
* Re: netns: send uevent messages
From: Kirill Tkhai @ 2018-03-16 8:23 UTC (permalink / raw)
To: Christian Brauner
Cc: Christian Brauner, ebiederm, gregkh, netdev, linux-kernel, serge,
avagin
In-Reply-To: <20180315234634.GA520@gmail.com>
On 16.03.2018 02:46, Christian Brauner wrote:
> On Thu, Mar 15, 2018 at 05:14:13PM +0300, Kirill Tkhai wrote:
>> On 15.03.2018 16:39, Christian Brauner wrote:
>>> On Thu, Mar 15, 2018 at 12:47:30PM +0300, Kirill Tkhai wrote:
>>>> CC Andrey Vagin
>>>
>>> Hey Kirill,
>>>
>>> Thanks for CCing Andrey.
>>>
>>>>
>>>> On 15.03.2018 03:12, Christian Brauner wrote:
>>>>> This patch adds a receive method to NETLINK_KOBJECT_UEVENT netlink sockets
>>>>> to allow sending uevent messages into the network namespace the socket
>>>>> belongs to.
>>>>>
>>>>> Currently non-initial network namespaces are already isolated and don't
>>>>> receive uevents. There are a number of cases where it is beneficial for a
>>>>> sufficiently privileged userspace process to send a uevent into a network
>>>>> namespace.
>>>>>
>>>>> One such use case would be debugging and fuzzing of a piece of software
>>>>> which listens and reacts to uevents. By running a copy of that software
>>>>> inside a network namespace, specific uevents could then be presented to it.
>>>>> More concretely, this would allow for easy testing of udevd/ueventd.
>>>>>
>>>>> This will also allow some piece of software to run components inside a
>>>>> separate network namespace and then effectively filter what that software
>>>>> can receive. Some examples of software that do directly listen to uevents
>>>>> and that we have in the past attempted to run inside a network namespace
>>>>> are rbd (CEPH client) or the X server.
>>>>>
>>>>> Implementation:
>>>>> The implementation has been kept as simple as possible from the kernel's
>>>>> perspective. Specifically, a simple input method uevent_net_rcv() is added
>>>>> to NETLINK_KOBJECT_UEVENT sockets which completely reuses existing
>>>>> af_netlink infrastructure and does neither add an additional netlink family
>>>>> nor requires any user-visible changes.
>>>>>
>>>>> For example, by using netlink_rcv_skb() we can make use of existing netlink
>>>>> infrastructure to report back informative error messages to userspace.
>>>>>
>>>>> Furthermore, this implementation does not introduce any overhead for
>>>>> existing uevent generating codepaths. The struct netns gets a new uevent
>>>>> socket member that records the uevent socket associated with that network
>>>>> namespace. Since we record the uevent socket for each network namespace in
>>>>> struct net we don't have to walk the whole uevent socket list.
>>>>> Instead we can directly retrieve the relevant uevent socket and send the
>>>>> message. This keeps the codepath very performant without introducing
>>>>> needless overhead.
>>>>>
>>>>> Uevent sequence numbers are kept global. When a uevent message is sent to
>>>>> another network namespace the implementation will simply increment the
>>>>> global uevent sequence number and append it to the received uevent. This
>>>>> has the advantage that the kernel will never need to parse the received
>>>>> uevent message to replace any existing uevent sequence numbers. Instead it
>>>>> is up to the userspace process to remove any existing uevent sequence
>>>>> numbers in case the uevent message to be sent contains any.
>>>>>
>>>>> Security:
>>>>> In order for a caller to send uevent messages to a target network namespace
>>>>> the caller must have CAP_SYS_ADMIN in the owning user namespace of the
>>>>> target network namespace. Additionally, any received uevent message is
>>>>> verified to not exceed size UEVENT_BUFFER_SIZE. This includes the space
>>>>> needed to append the uevent sequence number.
>>>>>
>>>>> Testing:
>>>>> This patch has been tested and verified to work with the following udev
>>>>> implementations:
>>>>> 1. CentOS 6 with udevd version 147
>>>>> 2. Debian Sid with systemd-udevd version 237
>>>>> 3. Android 7.1.1 with ueventd
>>>>>
>>>>> Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
>>>>> ---
>>>>> include/net/net_namespace.h | 1 +
>>>>> lib/kobject_uevent.c | 88 ++++++++++++++++++++++++++++++++++++++++++++-
>>>>> 2 files changed, 88 insertions(+), 1 deletion(-)
>>>>>
>>>>> diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
>>>>> index f306b2aa15a4..467bde763a9b 100644
>>>>> --- a/include/net/net_namespace.h
>>>>> +++ b/include/net/net_namespace.h
>>>>> @@ -78,6 +78,7 @@ struct net {
>>>>>
>>>>> struct sock *rtnl; /* rtnetlink socket */
>>>>> struct sock *genl_sock;
>>>>> + struct sock *uevent_sock; /* uevent socket */
>>>>
>>>> Since you add this per-net uevent_sock pointer, currently existing iterations in uevent_net_exit()
>>>> become to look confusing. There are:
>>>>
>>>> mutex_lock(&uevent_sock_mutex);
>>>> list_for_each_entry(ue_sk, &uevent_sock_list, list) {
>>>> if (sock_net(ue_sk->sk) == net)
>>>> goto found;
>>>> }
>>>>
>>>> Can't we make a small cleanup in lib/kobject_uevent.c after this change
>>>> and before the main part of the patch goes?
>>>
>>> Hm, not sure. It seems it makes sense to maintain them in a separate
>>> list. Looks like this lets us keep the locking simpler. Otherwise we
>>> would have to do something like for_each_net() and it seems that this
>>> would force us to use rntl_{un}lock().
>>
>> I'm about:
>>
>> mutex_lock();
>> list_del(net->ue_sk->list);
>> mutex_unlock();
>> kfree();
>>
>> Thus we avoid iterations in uevent_net_exit().
>
> Ah right, but I only added struct sock *uevent_sock to struct net, not
> struct uevent_sock. Thus there's no list member in there. I mainly only
> added struct sock to keep it clean an aligned with the other sock
> member. We can revisit this later though.
In my opinion, we shouldn't leave the code in inconsistent state just because
of it's not interesting for this patch. There are two logical changes:
1)add fast-access pointer to struct net
2)implement single-net uevent
It's easy to do 1), and this may help further reader not to be confused
by the inconsistency, so I don't see a reason why we shouldn't do that.
>>
>>>>
>>>>>
>>>>> struct list_head dev_base_head;
>>>>> struct hlist_head *dev_name_head;
>>>>> diff --git a/lib/kobject_uevent.c b/lib/kobject_uevent.c
>>>>> index 9fe6ec8fda28..10b2144b9fc3 100644
>>>>> --- a/lib/kobject_uevent.c
>>>>> +++ b/lib/kobject_uevent.c
>>>>> @@ -25,6 +25,7 @@
>>>>> #include <linux/uuid.h>
>>>>> #include <linux/ctype.h>
>>>>> #include <net/sock.h>
>>>>> +#include <net/netlink.h>
>>>>> #include <net/net_namespace.h>
>>>>>
>>>>>
>>>>> @@ -602,12 +603,94 @@ int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...)
>>>>> EXPORT_SYMBOL_GPL(add_uevent_var);
>>>>>
>>>>> #if defined(CONFIG_NET)
>>>>> +static int uevent_net_send(struct sock *usk, struct sk_buff *skb,
>>>>> + struct netlink_ext_ack *extack)
>>>>> +{
>>>>> + int ret;
>>>>> + u64 seqnum;
>>>>> + /* u64 to chars: 2^64 - 1 = 21 chars */
>>>>
>>>> 18446744073709551615 -- 20 chars (+1 '\0'). Comment makes me think
>>>> we forgot +1 in buf declaration.
>>>
>>> sizeof("SEQNUM=") will include the '\0' pointer in contrast to
>>> strlen("SEQNUM=") so this is correct if I'm not completely mistaken.
>>
>> The code is OK, I'm worrying about comment. But I've missed this sizeof().
>> So, there is only 1 bytes excess allocated as 0xFFFFFFFFFFFFFFFF=18446744073709551615
>> Not so important...
>>
>>>>
>>>>> + char buf[sizeof("SEQNUM=") + 21];
>>>>> + struct sk_buff *skbc;
>>>>> +
>>>>> + /* bump sequence number */
>>>>> + mutex_lock(&uevent_sock_mutex);
>>>>> + seqnum = ++uevent_seqnum;
>>>>> + mutex_unlock(&uevent_sock_mutex);
>>>>
>>>> Commit 7b60a18da393 from Andrew Vagin says:
>>>>
>>>> uevent: send events in correct order according to seqnum (v3)
>>>>
>>>> The queue handling in the udev daemon assumes that the events are
>>>> ordered.
>>>>
>>>> Before this patch uevent_seqnum is incremented under sequence_lock,
>>>> than an event is send uner uevent_sock_mutex. I want to say that code
>>>> contained a window between incrementing seqnum and sending an event.
>>>>
>>>> This patch locks uevent_sock_mutex before incrementing uevent_seqnum.
>>>> Signed-off-by: Andrew Vagin <avagin@openvz.org>
>>>>
>>>> After this change the order will be lost. Also the rest of namespaces
>>>> will have holes in uevent numbers, as they won't receive a number sent
>>>> to specific namespace.
>>>
>>> Afaict from looking into udevd when I wrote the patch it only cares
>>> about numbers being ordered (which is also what Andrey's patch states)
>>> not that they are sequential so holes should be fine. udevd will use
>>> the DEVPATH to determine whether the sequence number of the current
>>> uevent should be used as "even->delaying_seqnum" number. All that
>>> matters is that it is greater than the previous one. About the ordering,
>>> if that's an issue then we should simply do what Andrey has been doing
>>> for kobject_uevent_env() and extend the lock being held until after we
>>> sent out the uevent. Since we're not serving all listeners but only
>>> ever one this is also way more lightweight then kobject_uevent_env().
>>
>> Yes, extending the lock to netlink_broadcast() should fix the problem.
>
> Yep, send another version of the patch but forgot to CC you, sorry:
> https://lkml.org/lkml/2018/3/15/1098
>
> Thanks!
> Christian
>
>>
>>>>
>>>> It seems we should made uevent_seqnum per-net to solve this problem.
>>>
>>> Yes, Eric and I have been discussing this already. The idea was to do
>>> this in a follow-up patch to keep this patch as simple as possible. If
>>> we agree that it would make sense right away then I will dig out the
>>> corresponding patch.
>>> It would basically just involve giving each struct net a uevent_seqnum
>>> member.
>>
>> pernet seqnum may have a sense if we introduce per-net locks to protect it.
>> If there is single mutex, it does not matter either they are pernet or not.
>> Per-net mutex may be useful only if we have many single-net messages like
>> you introduce in this patch, or if we are going to filter net in existing
>> kobject_uevent_net_broadcast() (by user_ns?!) in the future.
>>
>> Kirill
^ permalink raw reply
* Re: [PATCH v4 2/2] Remove false-positive VLAs when using max()
From: Nikolay Borisov @ 2018-03-16 7:52 UTC (permalink / raw)
To: Kees Cook, Andrew Morton
Cc: Linus Torvalds, Josh Poimboeuf, Rasmus Villemoes, Randy Dunlap,
Miguel Ojeda, Ingo Molnar, David Laight, Ian Abbott, linux-input,
linux-btrfs, netdev, linux-kernel, kernel-hardening
In-Reply-To: <1521143266-31350-3-git-send-email-keescook@chromium.org>
On 15.03.2018 21:47, Kees Cook wrote:
> As part of removing VLAs from the kernel[1], we want to build with -Wvla,
> but it is overly pessimistic and only accepts constant expressions for
> stack array sizes, instead of also constant values. The max() macro
> triggers the warning, so this refactors these uses of max() to use the
> new const_max() instead.
>
> [1] https://lkml.org/lkml/2018/3/7/621
For the btrfs portion :
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
>
> Signed-off-by: Kees Cook <keescook@chromium.org>
> ---
> drivers/input/touchscreen/cyttsp4_core.c | 2 +-
> fs/btrfs/tree-checker.c | 3 ++-
> lib/vsprintf.c | 4 ++--
> net/ipv4/proc.c | 8 ++++----
> net/ipv6/proc.c | 10 ++++------
> 5 files changed, 13 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/input/touchscreen/cyttsp4_core.c b/drivers/input/touchscreen/cyttsp4_core.c
> index 727c3232517c..f89497940051 100644
> --- a/drivers/input/touchscreen/cyttsp4_core.c
> +++ b/drivers/input/touchscreen/cyttsp4_core.c
> @@ -868,7 +868,7 @@ static void cyttsp4_get_mt_touches(struct cyttsp4_mt_data *md, int num_cur_tch)
> struct cyttsp4_touch tch;
> int sig;
> int i, j, t = 0;
> - int ids[max(CY_TMA1036_MAX_TCH, CY_TMA4XX_MAX_TCH)];
> + int ids[const_max(CY_TMA1036_MAX_TCH, CY_TMA4XX_MAX_TCH)];
>
> memset(ids, 0, si->si_ofs.tch_abs[CY_TCH_T].max * sizeof(int));
> for (i = 0; i < num_cur_tch; i++) {
> diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
> index c3c8d48f6618..1ddd6cc3c4fc 100644
> --- a/fs/btrfs/tree-checker.c
> +++ b/fs/btrfs/tree-checker.c
> @@ -341,7 +341,8 @@ static int check_dir_item(struct btrfs_root *root,
> */
> if (key->type == BTRFS_DIR_ITEM_KEY ||
> key->type == BTRFS_XATTR_ITEM_KEY) {
> - char namebuf[max(BTRFS_NAME_LEN, XATTR_NAME_MAX)];
> + char namebuf[const_max(BTRFS_NAME_LEN,
> + XATTR_NAME_MAX)];
>
> read_extent_buffer(leaf, namebuf,
> (unsigned long)(di + 1), name_len);
> diff --git a/lib/vsprintf.c b/lib/vsprintf.c
> index d7a708f82559..9d5610b643ce 100644
> --- a/lib/vsprintf.c
> +++ b/lib/vsprintf.c
> @@ -744,8 +744,8 @@ char *resource_string(char *buf, char *end, struct resource *res,
> #define FLAG_BUF_SIZE (2 * sizeof(res->flags))
> #define DECODED_BUF_SIZE sizeof("[mem - 64bit pref window disabled]")
> #define RAW_BUF_SIZE sizeof("[mem - flags 0x]")
> - char sym[max(2*RSRC_BUF_SIZE + DECODED_BUF_SIZE,
> - 2*RSRC_BUF_SIZE + FLAG_BUF_SIZE + RAW_BUF_SIZE)];
> + char sym[const_max(2*RSRC_BUF_SIZE + DECODED_BUF_SIZE,
> + 2*RSRC_BUF_SIZE + FLAG_BUF_SIZE + RAW_BUF_SIZE)];
>
> char *p = sym, *pend = sym + sizeof(sym);
> int decode = (fmt[0] == 'R') ? 1 : 0;
> diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
> index dc5edc8f7564..fad6f989004e 100644
> --- a/net/ipv4/proc.c
> +++ b/net/ipv4/proc.c
> @@ -46,7 +46,7 @@
> #include <net/sock.h>
> #include <net/raw.h>
>
> -#define TCPUDP_MIB_MAX max_t(u32, UDP_MIB_MAX, TCP_MIB_MAX)
> +#define TCPUDP_MIB_MAX const_max(UDP_MIB_MAX, TCP_MIB_MAX)
>
> /*
> * Report socket allocation statistics [mea@utu.fi]
> @@ -404,7 +404,7 @@ static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v)
> struct net *net = seq->private;
> int i;
>
> - memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));
> + memset(buff, 0, sizeof(buff));
>
> seq_puts(seq, "\nTcp:");
> for (i = 0; snmp4_tcp_list[i].name; i++)
> @@ -421,7 +421,7 @@ static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v)
> seq_printf(seq, " %lu", buff[i]);
> }
>
> - memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));
> + memset(buff, 0, sizeof(buff));
>
> snmp_get_cpu_field_batch(buff, snmp4_udp_list,
> net->mib.udp_statistics);
> @@ -432,7 +432,7 @@ static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v)
> for (i = 0; snmp4_udp_list[i].name; i++)
> seq_printf(seq, " %lu", buff[i]);
>
> - memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));
> + memset(buff, 0, sizeof(buff));
>
> /* the UDP and UDP-Lite MIBs are the same */
> seq_puts(seq, "\nUdpLite:");
> diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
> index b67814242f78..58bbfc4fa7fa 100644
> --- a/net/ipv6/proc.c
> +++ b/net/ipv6/proc.c
> @@ -30,10 +30,8 @@
> #include <net/transp_v6.h>
> #include <net/ipv6.h>
>
> -#define MAX4(a, b, c, d) \
> - max_t(u32, max_t(u32, a, b), max_t(u32, c, d))
> -#define SNMP_MIB_MAX MAX4(UDP_MIB_MAX, TCP_MIB_MAX, \
> - IPSTATS_MIB_MAX, ICMP_MIB_MAX)
> +#define SNMP_MIB_MAX const_max(const_max(UDP_MIB_MAX, TCP_MIB_MAX), \
> + const_max(IPSTATS_MIB_MAX, ICMP_MIB_MAX))
>
> static int sockstat6_seq_show(struct seq_file *seq, void *v)
> {
> @@ -199,7 +197,7 @@ static void snmp6_seq_show_item(struct seq_file *seq, void __percpu *pcpumib,
> int i;
>
> if (pcpumib) {
> - memset(buff, 0, sizeof(unsigned long) * SNMP_MIB_MAX);
> + memset(buff, 0, sizeof(buff));
>
> snmp_get_cpu_field_batch(buff, itemlist, pcpumib);
> for (i = 0; itemlist[i].name; i++)
> @@ -218,7 +216,7 @@ static void snmp6_seq_show_item64(struct seq_file *seq, void __percpu *mib,
> u64 buff64[SNMP_MIB_MAX];
> int i;
>
> - memset(buff64, 0, sizeof(u64) * SNMP_MIB_MAX);
> + memset(buff64, 0, sizeof(buff64));
>
> snmp_get_cpu_field64_batch(buff64, itemlist, mib, syncpoff);
> for (i = 0; itemlist[i].name; i++)
>
^ permalink raw reply
* Re: [PATCH RFC 2/2] virtio_ring: support packed ring
From: Tiwei Bie @ 2018-03-16 7:40 UTC (permalink / raw)
To: Jason Wang; +Cc: mst, virtualization, linux-kernel, netdev, wexu, jfreimann
In-Reply-To: <0a0ecf42-8f7f-9387-8ca4-cb65d0835b56@redhat.com>
On Fri, Mar 16, 2018 at 02:44:12PM +0800, Jason Wang wrote:
> On 2018年03月16日 14:10, Tiwei Bie wrote:
> > On Fri, Mar 16, 2018 at 12:03:25PM +0800, Jason Wang wrote:
> > > On 2018年02月23日 19:18, Tiwei Bie wrote:
> > > > Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
> > > > ---
> > > > drivers/virtio/virtio_ring.c | 699 +++++++++++++++++++++++++++++++++++++------
> > > > include/linux/virtio_ring.h | 8 +-
> > > > 2 files changed, 618 insertions(+), 89 deletions(-)
[...]
> > > > cpu_addr, size, direction);
> > > > }
> > > > -static void vring_unmap_one(const struct vring_virtqueue *vq,
> > > > - struct vring_desc *desc)
> > > > +static void vring_unmap_one(const struct vring_virtqueue *vq, void *_desc)
> > > > {
> > > Let's split the helpers to packed/split version like other helpers?
> > > (Consider the caller has already known the type of vq).
> > Okay.
> >
>
> [...]
>
> > > > + desc[i].flags = flags;
> > > > +
> > > > + desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
> > > > + desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
> > > > + desc[i].id = cpu_to_virtio32(_vq->vdev, head);
> > > If it's a part of chain, we only need to do this for last buffer I think.
> > I'm not sure I've got your point about the "last buffer".
> > But, yes, id just needs to be set for the last desc.
>
> Right, I think I meant "last descriptor" :)
>
> >
> > > > + prev = i;
> > > > + i++;
> > > It looks to me prev is always i - 1?
> > No. prev will be (vq->vring_packed.num - 1) when i becomes 0.
>
> Right, so prev = i ? i - 1 : vq->vring_packed.num - 1.
Yes, i wraps together with vq->wrap_counter in following code:
> > > > + if (!indirect && i >= vq->vring_packed.num) {
> > > > + i = 0;
> > > > + vq->wrap_counter ^= 1;
> > > > + }
> > > > + }
> > > > + }
> > > > + for (; n < (out_sgs + in_sgs); n++) {
> > > > + for (sg = sgs[n]; sg; sg = sg_next(sg)) {
> > > > + dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_FROM_DEVICE);
> > > > + if (vring_mapping_error(vq, addr))
> > > > + goto unmap_release;
> > > > +
> > > > + flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT |
> > > > + VRING_DESC_F_WRITE |
> > > > + VRING_DESC_F_AVAIL(vq->wrap_counter) |
> > > > + VRING_DESC_F_USED(!vq->wrap_counter));
> > > > + if (!indirect && i == head)
> > > > + head_flags = flags;
> > > > + else
> > > > + desc[i].flags = flags;
> > > > +
> > > > + desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
> > > > + desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
> > > > + desc[i].id = cpu_to_virtio32(_vq->vdev, head);
> > > > + prev = i;
> > > > + i++;
> > > > + if (!indirect && i >= vq->vring_packed.num) {
> > > > + i = 0;
> > > > + vq->wrap_counter ^= 1;
> > > > + }
> > > > + }
> > > > + }
> > > > + /* Last one doesn't continue. */
> > > > + if (!indirect && (head + 1) % vq->vring_packed.num == i)
> > > > + head_flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
> > > I can't get the why we need this here.
> > If only one desc is used, we will need to clear the
> > VRING_DESC_F_NEXT flag from the head_flags.
>
> Yes, I meant why following desc[prev].flags won't work for this?
Because the update of desc[head].flags (in above case,
prev == head) has been delayed. The flags is saved in
head_flags.
>
> >
> > > > + else
> > > > + desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
> > > > +
> > > > + if (indirect) {
> > > > + /* FIXME: to be implemented */
> > > > +
> > > > + /* Now that the indirect table is filled in, map it. */
> > > > + dma_addr_t addr = vring_map_single(
> > > > + vq, desc, total_sg * sizeof(struct vring_packed_desc),
> > > > + DMA_TO_DEVICE);
> > > > + if (vring_mapping_error(vq, addr))
> > > > + goto unmap_release;
> > > > +
> > > > + head_flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT |
> > > > + VRING_DESC_F_AVAIL(wrap_counter) |
> > > > + VRING_DESC_F_USED(!wrap_counter));
> > > > + vq->vring_packed.desc[head].addr = cpu_to_virtio64(_vq->vdev, addr);
> > > > + vq->vring_packed.desc[head].len = cpu_to_virtio32(_vq->vdev,
> > > > + total_sg * sizeof(struct vring_packed_desc));
> > > > + vq->vring_packed.desc[head].id = cpu_to_virtio32(_vq->vdev, head);
> > > > + }
> > > > +
> > > > + /* We're using some buffers from the free list. */
> > > > + vq->vq.num_free -= descs_used;
> > > > +
> > > > + /* Update free pointer */
> > > > + if (indirect) {
> > > > + n = head + 1;
> > > > + if (n >= vq->vring_packed.num) {
> > > > + n = 0;
> > > > + vq->wrap_counter ^= 1;
> > > > + }
> > > > + vq->free_head = n;
> > > detach_buf_packed() does not even touch free_head here, so need to explain
> > > its meaning for packed ring.
> > Above code is for indirect support which isn't really
> > implemented in this patch yet.
> >
> > For your question, free_head stores the index of the
> > next avail desc. I'll add a comment for it or move it
> > to union and give it a better name in next version.
>
> Yes, something like avail_idx might be better.
>
> >
> > > > + } else
> > > > + vq->free_head = i;
> > > ID is only valid in the last descriptor in the list, so head + 1 should be
> > > ok too?
> > I don't really get your point. The vq->free_head stores
> > the index of the next avail desc.
>
> I think I get your idea now, free_head has two meanings:
>
> - next avail index
> - buffer id
In my design, free_head is just the index of the next
avail desc.
Driver can set anything to buffer ID. And in my design,
I save desc index in buffer ID.
I'll add comments for them.
>
> If I'm correct, let's better add a comment for this.
>
> >
> > > > +
> > > > + /* Store token and indirect buffer state. */
> > > > + vq->desc_state[head].num = descs_used;
> > > > + vq->desc_state[head].data = data;
> > > > + if (indirect)
> > > > + vq->desc_state[head].indir_desc = desc;
> > > > + else
> > > > + vq->desc_state[head].indir_desc = ctx;
> > > > +
> > > > + virtio_wmb(vq->weak_barriers);
> > > Let's add a comment to explain the barrier here.
> > Okay.
> >
> > > > + vq->vring_packed.desc[head].flags = head_flags;
> > > > + vq->num_added++;
> > > > +
> > > > + pr_debug("Added buffer head %i to %p\n", head, vq);
> > > > + END_USE(vq);
> > > > +
> > > > + return 0;
> > > > +
> > > > +unmap_release:
> > > > + err_idx = i;
> > > > + i = head;
> > > > +
> > > > + for (n = 0; n < total_sg; n++) {
> > > > + if (i == err_idx)
> > > > + break;
> > > > + vring_unmap_one(vq, &desc[i]);
> > > > + i++;
> > > > + if (!indirect && i >= vq->vring_packed.num)
> > > > + i = 0;
> > > > + }
> > > > +
> > > > + vq->wrap_counter = wrap_counter;
> > > > +
> > > > + if (indirect)
> > > > + kfree(desc);
> > > > +
> > > > + END_USE(vq);
> > > > + return -EIO;
> > > > +}
[...]
> > > > @@ -1096,17 +1599,21 @@ struct virtqueue *vring_create_virtqueue(
> > > > if (!queue) {
> > > > /* Try to get a single page. You are my only hope! */
> > > > - queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
> > > > + queue = vring_alloc_queue(vdev, __vring_size(num, vring_align,
> > > > + packed),
> > > > &dma_addr, GFP_KERNEL|__GFP_ZERO);
> > > > }
> > > > if (!queue)
> > > > return NULL;
> > > > - queue_size_in_bytes = vring_size(num, vring_align);
> > > > - vring_init(&vring, num, queue, vring_align);
> > > > + queue_size_in_bytes = __vring_size(num, vring_align, packed);
> > > > + if (packed)
> > > > + vring_packed_init(&vring.vring_packed, num, queue, vring_align);
> > > > + else
> > > > + vring_init(&vring.vring_split, num, queue, vring_align);
> > > Let's rename vring_init to vring_init_split() like other helpers?
> > The vring_init() is a public API in include/uapi/linux/virtio_ring.h.
> > I don't think we can rename it.
>
> I see, then this need more thoughts to unify the API.
My thought is to keep the old API as is, and introduce
new types and helpers for packed ring.
More details can be found in this patch:
https://lkml.org/lkml/2018/2/23/243
(PS. The type which has bit fields is just for reference,
and will be changed in next version.)
Do you have any other suggestions?
Best regards,
Tiwei Bie
>
> >
> > > > - vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
> > > > - notify, callback, name);
> > > > + vq = __vring_new_virtqueue(index, vring, packed, vdev, weak_barriers,
> > > > + context, notify, callback, name);
> > > > if (!vq) {
> > > > vring_free_queue(vdev, queue_size_in_bytes, queue,
> > > > dma_addr);
[...]
^ permalink raw reply
* Re: Fw: [Bug 199121] New: Packet header is incorrect when following through an IPsec tunnel after upgrade kernel to 4.15
From: Steffen Klassert @ 2018-03-16 6:58 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: Herbert Xu, David S. Miller, netdev, posonsky
In-Reply-To: <20180315075951.2cee5ea0@xeon-e3>
Ccing the reporter of this bug.
On Thu, Mar 15, 2018 at 07:59:51AM -0700, Stephen Hemminger wrote:
>
>
> Begin forwarded message:
>
> Date: Thu, 15 Mar 2018 06:37:27 +0000
> From: bugzilla-daemon@bugzilla.kernel.org
> To: stephen@networkplumber.org
> Subject: [Bug 199121] New: Packet header is incorrect when following through an IPsec tunnel after upgrade kernel to 4.15
>
>
> https://bugzilla.kernel.org/show_bug.cgi?id=199121
>
> Bug ID: 199121
> Summary: Packet header is incorrect when following through an
> IPsec tunnel after upgrade kernel to 4.15
> Product: Networking
> Version: 2.5
> Kernel Version: 4.15.9
> Hardware: All
> OS: Linux
> Tree: Mainline
> Status: NEW
> Severity: normal
> Priority: P1
> Component: Other
> Assignee: stephen@networkplumber.org
> Reporter: posonsky@yandex.ru
> Regression: No
>
> I have been using IPsec tunnel for a while. StrongSwan is used for management:
> ```
> # swanctl -l
> pfsense2: #1, ESTABLISHED, IKEv2, cc04d3c5b34b4bda_i* f150c78e4fc042ef_r
> local '90.188.239.175' @ 90.188.239.175[500]
> remote '62.152.54.102' @ 62.152.54.102[500]
> 3DES_CBC/HMAC_SHA1_96/PRF_HMAC_SHA1/MODP_2048
> established 649s ago, reauth in 2746s
> pfsense2: #1, reqid 1, INSTALLED, TUNNEL, ESP:AES_CBC-256/HMAC_SHA1_96
> installed 649s ago, rekeying in 286s, expires in 551s
> in c41e18d6, 588 bytes, 7 packets, 643s ago
> out cfad3c32, 588 bytes, 7 packets, 643s ago
> local 192.168.8.0/24
> remote 10.10.1.0/24
> ```
> And everything worked fine. But after updating to 4.15 traffic stopped passing.
>
> I created [issue](https://wiki.strongswan.org/issues/2571) on
> wiki.strongswan.org. During the analysis of the situation it was found, when I
> try to send ICMP request to 10.10.1.248, for example,
> ```
> $ ping 10.10.1.248
> PING 10.10.1.248 (10.10.1.248) 56(84) bytes of data.
> ^C
> --- 10.10.1.248 ping statistics ---
> 4 packets transmitted, 0 received, 100% packet loss, time 3068ms
> ```
> the response is returned as if from 8.0.1.248.
Can you please try to revert the following patch:
5efec5c655dd ("xfrm: Fix eth_hdr(skb)->h_proto to reflect inner IP version")
In case of IPv4, this writes 0x0800 to the h_proto field
of the ethernet header. The h_proto field has an offset
of 12 bytes in the ethernet header. I bet that the MAC
header is not set in your case and we write this to
the IPv4 header instead. The src address in the IPv4
header has 12 bytes offset too, so I think that's why
the src address is reported with 8.0.1.248.
Please also try thre current 'net' tree, it has a fix
for this issue. It should be fixed with:
87cdf3148b11 ("xfrm: Verify MAC header exists before overwriting
eth_hdr(skb)->h_proto")
^ permalink raw reply
* Re: linux-next: manual merge of the net-next tree with the rdma-fixes tree
From: Saeed Mahameed @ 2018-03-16 6:54 UTC (permalink / raw)
To: sfr@canb.auug.org.au, davem@davemloft.net, netdev@vger.kernel.org,
Jason Gunthorpe, dledford@redhat.com
Cc: linux-kernel@vger.kernel.org, Leon Romanovsky,
linux-next@vger.kernel.org, Mark Bloch
In-Reply-To: <1521163082.18703.191.camel@redhat.com>
On Thu, 2018-03-15 at 21:18 -0400, Doug Ledford wrote:
> On Fri, 2018-03-16 at 11:56 +1100, Stephen Rothwell wrote:
> > Hi all,
> >
> > Today's linux-next merge of the net-next tree got a conflict in:
> >
> > drivers/infiniband/hw/mlx5/main.c
> >
> > between commit:
> >
> > 42cea83f9524 ("IB/mlx5: Fix cleanup order on unload")
> >
> > from the rdma-fixes tree and commit:
> >
> > b5ca15ad7e61 ("IB/mlx5: Add proper representors support")
> >
> > from the net-next tree.
>
> We are aware of the merge conflict. This is a result of the fact
> that
> code had been submitted to the for-next area (the representors
> support)
> and after that an issue was found by the syzkaller bot that deserved
> rc
> fix status and which conflicted. The fixup you list below is
> insufficient to fix the merge conflict. The full fixup can be found
> in
> the rdma tree from where I merged the for-rc branch into the for-next
> branch and created a complete fixup of the merge conflict. The
> problem
> is that one patch change the device init stage flow, while the other
> patch duplicates the normal device init stage flow to the representor
> device stage flow. To resolve the fix, you not only have to resolve
> the
> contextual diffs, but you have to duplicate the changes to the normal
> device stage flow into the representor device stage flow. It is very
> far from a trivial merge. We were planning on talking to Dave about
> this issue tomorrow, but you beat us to raising the issue ;-).
>
> Here's the commit (from the rdma git repo) with the proper merge fix
> (although it also has other minor merge stuff that needs to be
> ignored):
>
> 2d873449a202 (Merge branch 'k.o/wip/dl-for-rc' into k.o/wip/dl-for-
> next)
>
Dave, Will you be able to take care of this? you will see this once the
rdma fix gets submitted to linus and you back merge rc into net-next.
after that we need to verify that net-next merges cleanly with rdma-
next (from mlx5 point of view) to make sure we won't run into trouble
in the next merge window.
I would like also to avoid such conflicts in the future, since now
rdma-rc is active we are more likely to run into such issues, to solve
this maybe the right way to go is to avoid touching rdma/netdev from
mlx5 shared code pull requests and keep mlx5 core stuff clean, the idea
is to run a pure mlx5 core incremental branch in parallel to netdev and
rdma, this branch will include all pure updates to mlx5 core, mlx5
netdev or rdma patches will be sent to netdev and rdma branches
separately they can be based on latest mlx5 core branch or not.
This way you don't need to pull in rdma stuff from shared code and
Doug/Jason won't need to pull in netdev stuff. it is more work for me,
but for you and rdma, you will see only clean mlx5 core stuff from
shared code, the stack part (netdev or rdma) will go only to the
specific maintainer respectively.
What do you think ? I can do a sample run in one of my next pull
requests and see how it goes.
Thanks,
Saeed.
^ permalink raw reply
* Re: [PATCH net-next] tuntap: XDP_TX can use native XDP
From: Jason Wang @ 2018-03-16 6:46 UTC (permalink / raw)
To: Michael S. Tsirkin; +Cc: netdev, linux-kernel
In-Reply-To: <20180315153150-mutt-send-email-mst@kernel.org>
On 2018年03月15日 21:32, Michael S. Tsirkin wrote:
> On Thu, Mar 15, 2018 at 04:39:25PM +0800, Jason Wang wrote:
>>
>> On 2018年03月14日 11:37, Michael S. Tsirkin wrote:
>>>> return NULL;
>>>> case XDP_TX:
>>>> - xdp_xmit = true;
>>>> - /* fall through */
>>>> + get_page(alloc_frag->page);
>>>> + alloc_frag->offset += buflen;
>>>> + if (tun_xdp_xmit(tun->dev, &xdp))
>>>> + goto err_redirect;
>>>> + tun_xdp_flush(tun->dev);
>>> Why do we have to flush here though?
>>> It might be a good idea to document the reason in a code comment.
>>>
>> ndo_xdp_xmit() does not touch doorbell, so we need a ndo_xdp_flush() here.
>> It's the assumption of XDP API I think, so not sure it's worth to mention it
>> here.
>>
>> Thanks
> Can't one flush we called after multiple xmit calls?
We can and could be another patch on top. But I want to unify it with
the batching of XDP_REDIRECT by e.g let vhost submit more than one
packets through msg.control in sendmsg().
Thanks
^ permalink raw reply
* Re: [PATCH RFC 2/2] virtio_ring: support packed ring
From: Jason Wang @ 2018-03-16 6:44 UTC (permalink / raw)
To: Tiwei Bie; +Cc: mst, virtualization, linux-kernel, netdev, wexu, jfreimann
In-Reply-To: <20180316061047.o2xdyuqhak3mzjyw@debian>
On 2018年03月16日 14:10, Tiwei Bie wrote:
> On Fri, Mar 16, 2018 at 12:03:25PM +0800, Jason Wang wrote:
>> On 2018年02月23日 19:18, Tiwei Bie wrote:
>>> Signed-off-by: Tiwei Bie <tiwei.bie@intel.com>
>>> ---
>>> drivers/virtio/virtio_ring.c | 699 +++++++++++++++++++++++++++++++++++++------
>>> include/linux/virtio_ring.h | 8 +-
>>> 2 files changed, 618 insertions(+), 89 deletions(-)
>>>
>>> diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
>>> index eb30f3e09a47..393778a2f809 100644
>>> --- a/drivers/virtio/virtio_ring.c
>>> +++ b/drivers/virtio/virtio_ring.c
>>> @@ -58,14 +58,14 @@
>>> struct vring_desc_state {
>>> void *data; /* Data for callback. */
>>> - struct vring_desc *indir_desc; /* Indirect descriptor, if any. */
>>> + void *indir_desc; /* Indirect descriptor, if any. */
>>> + int num; /* Descriptor list length. */
>>> };
>>> struct vring_virtqueue {
>>> struct virtqueue vq;
>>> - /* Actual memory layout for this queue */
>>> - struct vring vring;
>>> + bool packed;
>>> /* Can we use weak barriers? */
>>> bool weak_barriers;
>>> @@ -87,11 +87,28 @@ struct vring_virtqueue {
>>> /* Last used index we've seen. */
>>> u16 last_used_idx;
>>> - /* Last written value to avail->flags */
>>> - u16 avail_flags_shadow;
>>> -
>>> - /* Last written value to avail->idx in guest byte order */
>>> - u16 avail_idx_shadow;
>>> + union {
>>> + /* Available for split ring */
>>> + struct {
>>> + /* Actual memory layout for this queue */
>>> + struct vring vring;
>>> +
>>> + /* Last written value to avail->flags */
>>> + u16 avail_flags_shadow;
>>> +
>>> + /* Last written value to avail->idx in
>>> + * guest byte order */
>>> + u16 avail_idx_shadow;
>>> + };
>>> +
>>> + /* Available for packed ring */
>>> + struct {
>>> + /* Actual memory layout for this queue */
>>> + struct vring_packed vring_packed;
>>> + u8 wrap_counter : 1;
>>> + bool chaining;
>>> + };
>>> + };
>>> /* How to notify other side. FIXME: commonalize hcalls! */
>>> bool (*notify)(struct virtqueue *vq);
>>> @@ -201,26 +218,37 @@ static dma_addr_t vring_map_single(const struct vring_virtqueue *vq,
>>> cpu_addr, size, direction);
>>> }
>>> -static void vring_unmap_one(const struct vring_virtqueue *vq,
>>> - struct vring_desc *desc)
>>> +static void vring_unmap_one(const struct vring_virtqueue *vq, void *_desc)
>>> {
>> Let's split the helpers to packed/split version like other helpers?
>> (Consider the caller has already known the type of vq).
> Okay.
>
[...]
>>> + desc[i].flags = flags;
>>> +
>>> + desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
>>> + desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
>>> + desc[i].id = cpu_to_virtio32(_vq->vdev, head);
>> If it's a part of chain, we only need to do this for last buffer I think.
> I'm not sure I've got your point about the "last buffer".
> But, yes, id just needs to be set for the last desc.
Right, I think I meant "last descriptor" :)
>
>>> + prev = i;
>>> + i++;
>> It looks to me prev is always i - 1?
> No. prev will be (vq->vring_packed.num - 1) when i becomes 0.
Right, so prev = i ? i - 1 : vq->vring_packed.num - 1.
>
>>> + if (!indirect && i >= vq->vring_packed.num) {
>>> + i = 0;
>>> + vq->wrap_counter ^= 1;
>>> + }
>>> + }
>>> + }
>>> + for (; n < (out_sgs + in_sgs); n++) {
>>> + for (sg = sgs[n]; sg; sg = sg_next(sg)) {
>>> + dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_FROM_DEVICE);
>>> + if (vring_mapping_error(vq, addr))
>>> + goto unmap_release;
>>> +
>>> + flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT |
>>> + VRING_DESC_F_WRITE |
>>> + VRING_DESC_F_AVAIL(vq->wrap_counter) |
>>> + VRING_DESC_F_USED(!vq->wrap_counter));
>>> + if (!indirect && i == head)
>>> + head_flags = flags;
>>> + else
>>> + desc[i].flags = flags;
>>> +
>>> + desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
>>> + desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
>>> + desc[i].id = cpu_to_virtio32(_vq->vdev, head);
>>> + prev = i;
>>> + i++;
>>> + if (!indirect && i >= vq->vring_packed.num) {
>>> + i = 0;
>>> + vq->wrap_counter ^= 1;
>>> + }
>>> + }
>>> + }
>>> + /* Last one doesn't continue. */
>>> + if (!indirect && (head + 1) % vq->vring_packed.num == i)
>>> + head_flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
>> I can't get the why we need this here.
> If only one desc is used, we will need to clear the
> VRING_DESC_F_NEXT flag from the head_flags.
Yes, I meant why following desc[prev].flags won't work for this?
>
>>> + else
>>> + desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
>>> +
>>> + if (indirect) {
>>> + /* FIXME: to be implemented */
>>> +
>>> + /* Now that the indirect table is filled in, map it. */
>>> + dma_addr_t addr = vring_map_single(
>>> + vq, desc, total_sg * sizeof(struct vring_packed_desc),
>>> + DMA_TO_DEVICE);
>>> + if (vring_mapping_error(vq, addr))
>>> + goto unmap_release;
>>> +
>>> + head_flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT |
>>> + VRING_DESC_F_AVAIL(wrap_counter) |
>>> + VRING_DESC_F_USED(!wrap_counter));
>>> + vq->vring_packed.desc[head].addr = cpu_to_virtio64(_vq->vdev, addr);
>>> + vq->vring_packed.desc[head].len = cpu_to_virtio32(_vq->vdev,
>>> + total_sg * sizeof(struct vring_packed_desc));
>>> + vq->vring_packed.desc[head].id = cpu_to_virtio32(_vq->vdev, head);
>>> + }
>>> +
>>> + /* We're using some buffers from the free list. */
>>> + vq->vq.num_free -= descs_used;
>>> +
>>> + /* Update free pointer */
>>> + if (indirect) {
>>> + n = head + 1;
>>> + if (n >= vq->vring_packed.num) {
>>> + n = 0;
>>> + vq->wrap_counter ^= 1;
>>> + }
>>> + vq->free_head = n;
>> detach_buf_packed() does not even touch free_head here, so need to explain
>> its meaning for packed ring.
> Above code is for indirect support which isn't really
> implemented in this patch yet.
>
> For your question, free_head stores the index of the
> next avail desc. I'll add a comment for it or move it
> to union and give it a better name in next version.
Yes, something like avail_idx might be better.
>
>>> + } else
>>> + vq->free_head = i;
>> ID is only valid in the last descriptor in the list, so head + 1 should be
>> ok too?
> I don't really get your point. The vq->free_head stores
> the index of the next avail desc.
I think I get your idea now, free_head has two meanings:
- next avail index
- buffer id
If I'm correct, let's better add a comment for this.
>
>>> +
>>> + /* Store token and indirect buffer state. */
>>> + vq->desc_state[head].num = descs_used;
>>> + vq->desc_state[head].data = data;
>>> + if (indirect)
>>> + vq->desc_state[head].indir_desc = desc;
>>> + else
>>> + vq->desc_state[head].indir_desc = ctx;
>>> +
>>> + virtio_wmb(vq->weak_barriers);
>> Let's add a comment to explain the barrier here.
> Okay.
>
>>> + vq->vring_packed.desc[head].flags = head_flags;
>>> + vq->num_added++;
>>> +
>>> + pr_debug("Added buffer head %i to %p\n", head, vq);
>>> + END_USE(vq);
>>> +
>>> + return 0;
>>> +
>>> +unmap_release:
>>> + err_idx = i;
>>> + i = head;
>>> +
>>> + for (n = 0; n < total_sg; n++) {
>>> + if (i == err_idx)
>>> + break;
>>> + vring_unmap_one(vq, &desc[i]);
>>> + i++;
>>> + if (!indirect && i >= vq->vring_packed.num)
>>> + i = 0;
>>> + }
>>> +
>>> + vq->wrap_counter = wrap_counter;
>>> +
>>> + if (indirect)
>>> + kfree(desc);
>>> +
>>> + END_USE(vq);
>>> + return -EIO;
>>> +}
>>> +
>>> +static inline int virtqueue_add(struct virtqueue *_vq,
>>> + struct scatterlist *sgs[],
>>> + unsigned int total_sg,
>>> + unsigned int out_sgs,
>>> + unsigned int in_sgs,
>>> + void *data,
>>> + void *ctx,
>>> + gfp_t gfp)
>>> +{
>>> + struct vring_virtqueue *vq = to_vvq(_vq);
>>> +
>>> + return vq->packed ? virtqueue_add_packed(_vq, sgs, total_sg, out_sgs,
>>> + in_sgs, data, ctx, gfp) :
>>> + virtqueue_add_split(_vq, sgs, total_sg, out_sgs,
>>> + in_sgs, data, ctx, gfp);
>>> +}
>>> +
>>> /**
>>> * virtqueue_add_sgs - expose buffers to other end
>>> * @vq: the struct virtqueue we're talking about.
>>> @@ -561,6 +845,12 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq)
>>> * event. */
>>> virtio_mb(vq->weak_barriers);
>>> + if (vq->packed) {
>>> + /* FIXME: to be implemented */
>>> + needs_kick = true;
>>> + goto out;
>>> + }
>>> +
>>> old = vq->avail_idx_shadow - vq->num_added;
>>> new = vq->avail_idx_shadow;
>>> vq->num_added = 0;
>>> @@ -579,6 +869,8 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq)
>>> } else {
>>> needs_kick = !(vq->vring.used->flags & cpu_to_virtio16(_vq->vdev, VRING_USED_F_NO_NOTIFY));
>>> }
>>> +
>>> +out:
>>> END_USE(vq);
>>> return needs_kick;
>>> }
>>> @@ -628,8 +920,8 @@ bool virtqueue_kick(struct virtqueue *vq)
>>> }
>>> EXPORT_SYMBOL_GPL(virtqueue_kick);
>>> -static void detach_buf(struct vring_virtqueue *vq, unsigned int head,
>>> - void **ctx)
>>> +static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head,
>>> + void **ctx)
>>> {
>>> unsigned int i, j;
>>> __virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);
>>> @@ -677,29 +969,81 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head,
>>> }
>>> }
>>> -static inline bool more_used(const struct vring_virtqueue *vq)
>>> +static void detach_buf_packed(struct vring_virtqueue *vq, unsigned int head,
>>> + void **ctx)
>>> +{
>>> + struct vring_packed_desc *desc;
>>> + unsigned int i, j;
>>> +
>>> + /* Clear data ptr. */
>>> + vq->desc_state[head].data = NULL;
>>> +
>>> + i = head;
>>> +
>>> + for (j = 0; j < vq->desc_state[head].num; j++) {
>>> + desc = &vq->vring_packed.desc[i];
>>> + vring_unmap_one(vq, desc);
>>> + i++;
>>> + if (i >= vq->vring_packed.num)
>>> + i = 0;
>>> + }
>>> +
>>> + vq->vq.num_free += vq->desc_state[head].num;
>> It looks to me vq->free_head grows always, how can we make sure it does not
>> exceeds vq.num here?
> The vq->free_head stores the index of the next avail
> desc. You can find it wraps together with vq->wrap_counter
> in virtqueue_add_packed().
>
I see, thanks.
[...]
>>> +
>>> + /* detach_buf_packed clears data, so grab it now. */
>>> + ret = vq->desc_state[i].data;
>>> + detach_buf_packed(vq, i, ctx);
>>> +
>>> + vq->last_used_idx += vq->desc_state[i].num;
>>> + if (vq->last_used_idx >= vq->vring_packed.num)
>>> + vq->last_used_idx %= vq->vring_packed.num;
>> '-=' should be sufficient here?
> Good suggestion. I think so.
>
>>> +
>>> + // FIXME: implement the desc event support
>>> +
>>> +#ifdef DEBUG
>>> + vq->last_add_time_valid = false;
>>> +#endif
>>> +
>>> + END_USE(vq);
>>> + return ret;
>>> +}
>>> +
[...]
>>> +
>>> +#if 0
>>> + vq->chaining = virtio_has_feature(vdev,
>>> + VIRTIO_RING_F_LIST_DESC);
>>> +#else
>>> + vq->chaining = true;
>> Looks like in V10 there's no F_LIST_DESC.
> Yes. I kept this in this patch just because the
> desc chaining is optional in the old spec draft
> when sending out this patch set. I'll remove it
> in next version.
>
>>> +#endif
>>> + } else {
>>> + vq->vring = vring.vring_split;
>>> + vq->avail_flags_shadow = 0;
>>> + vq->avail_idx_shadow = 0;
>>> +
>>> + /* Put everything in free lists. */
>>> + vq->free_head = 0;
>>> + for (i = 0; i < num-1; i++)
>>> + vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
>>> + }
>>> +
>>> /* No callback? Tell other side not to bother us. */
>>> if (!callback) {
>>> - vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
>>> - if (!vq->event)
>>> - vq->vring.avail->flags = cpu_to_virtio16(vdev, vq->avail_flags_shadow);
>>> + if (packed) {
>>> + // FIXME: to be implemented
>>> + } else {
>>> + vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
>>> + if (!vq->event)
>>> + vq->vring.avail->flags = cpu_to_virtio16(vdev,
>>> + vq->avail_flags_shadow);
>>> + }
>>> }
>>> - /* Put everything in free lists. */
>>> - vq->free_head = 0;
>>> - for (i = 0; i < vring.num-1; i++)
>>> - vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
>>> - memset(vq->desc_state, 0, vring.num * sizeof(struct vring_desc_state));
>>> + memset(vq->desc_state, 0, num * sizeof(struct vring_desc_state));
>>> return &vq->vq;
>>> }
>>> @@ -1058,6 +1548,14 @@ static void vring_free_queue(struct virtio_device *vdev, size_t size,
>>> }
>>> }
>>> +static inline int
>>> +__vring_size(unsigned int num, unsigned long align, bool packed)
>>> +{
>>> + if (packed)
>>> + return vring_packed_size(num, align);
>>> + return vring_size(num, align);
>>> +}
>>> +
>>> struct virtqueue *vring_create_virtqueue(
>>> unsigned int index,
>>> unsigned int num,
>>> @@ -1074,7 +1572,8 @@ struct virtqueue *vring_create_virtqueue(
>>> void *queue = NULL;
>>> dma_addr_t dma_addr;
>>> size_t queue_size_in_bytes;
>>> - struct vring vring;
>>> + union vring_union vring;
>>> + bool packed;
>>> /* We assume num is a power of 2. */
>>> if (num & (num - 1)) {
>>> @@ -1082,9 +1581,13 @@ struct virtqueue *vring_create_virtqueue(
>>> return NULL;
>>> }
>>> + packed = virtio_has_feature(vdev, VIRTIO_F_RING_PACKED);
>>> +
>>> /* TODO: allocate each queue chunk individually */
>>> - for (; num && vring_size(num, vring_align) > PAGE_SIZE; num /= 2) {
>>> - queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
>>> + for (; num && __vring_size(num, vring_align, packed) > PAGE_SIZE;
>>> + num /= 2) {
>>> + queue = vring_alloc_queue(vdev, __vring_size(num, vring_align,
>>> + packed),
>>> &dma_addr,
>>> GFP_KERNEL|__GFP_NOWARN|__GFP_ZERO);
>>> if (queue)
>>> @@ -1096,17 +1599,21 @@ struct virtqueue *vring_create_virtqueue(
>>> if (!queue) {
>>> /* Try to get a single page. You are my only hope! */
>>> - queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
>>> + queue = vring_alloc_queue(vdev, __vring_size(num, vring_align,
>>> + packed),
>>> &dma_addr, GFP_KERNEL|__GFP_ZERO);
>>> }
>>> if (!queue)
>>> return NULL;
>>> - queue_size_in_bytes = vring_size(num, vring_align);
>>> - vring_init(&vring, num, queue, vring_align);
>>> + queue_size_in_bytes = __vring_size(num, vring_align, packed);
>>> + if (packed)
>>> + vring_packed_init(&vring.vring_packed, num, queue, vring_align);
>>> + else
>>> + vring_init(&vring.vring_split, num, queue, vring_align);
>> Let's rename vring_init to vring_init_split() like other helpers?
> The vring_init() is a public API in include/uapi/linux/virtio_ring.h.
> I don't think we can rename it.
I see, then this need more thoughts to unify the API.
>
>>> - vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
>>> - notify, callback, name);
>>> + vq = __vring_new_virtqueue(index, vring, packed, vdev, weak_barriers,
>>> + context, notify, callback, name);
>>> if (!vq) {
>>> vring_free_queue(vdev, queue_size_in_bytes, queue,
>>> dma_addr);
>>> @@ -1132,10 +1639,17 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
>>> void (*callback)(struct virtqueue *vq),
>>> const char *name)
>>> {
>>> - struct vring vring;
>>> - vring_init(&vring, num, pages, vring_align);
>>> - return __vring_new_virtqueue(index, vring, vdev, weak_barriers, context,
>>> - notify, callback, name);
>>> + union vring_union vring;
>>> + bool packed;
>>> +
>>> + packed = virtio_has_feature(vdev, VIRTIO_F_RING_PACKED);
>>> + if (packed)
>>> + vring_packed_init(&vring.vring_packed, num, pages, vring_align);
>>> + else
>>> + vring_init(&vring.vring_split, num, pages, vring_align);
>>> +
>>> + return __vring_new_virtqueue(index, vring, packed, vdev, weak_barriers,
>>> + context, notify, callback, name);
>>> }
>>> EXPORT_SYMBOL_GPL(vring_new_virtqueue);
>>> @@ -1145,7 +1659,9 @@ void vring_del_virtqueue(struct virtqueue *_vq)
>>> if (vq->we_own_ring) {
>>> vring_free_queue(vq->vq.vdev, vq->queue_size_in_bytes,
>>> - vq->vring.desc, vq->queue_dma_addr);
>>> + vq->packed ? (void *)vq->vring_packed.desc :
>>> + (void *)vq->vring.desc,
>>> + vq->queue_dma_addr);
>>> }
>>> list_del(&_vq->list);
>>> kfree(vq);
>>> @@ -1159,14 +1675,18 @@ void vring_transport_features(struct virtio_device *vdev)
>>> for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) {
>>> switch (i) {
>>> +#if 0 // FIXME: to be implemented
>>> case VIRTIO_RING_F_INDIRECT_DESC:
>>> break;
>>> +#endif
>>> case VIRTIO_RING_F_EVENT_IDX:
>>> break;
>>> case VIRTIO_F_VERSION_1:
>>> break;
>>> case VIRTIO_F_IOMMU_PLATFORM:
>>> break;
>>> + case VIRTIO_F_RING_PACKED:
>>> + break;
>>> default:
>>> /* We don't understand this bit. */
>>> __virtio_clear_bit(vdev, i);
>>> @@ -1187,7 +1707,7 @@ unsigned int virtqueue_get_vring_size(struct virtqueue *_vq)
>>> struct vring_virtqueue *vq = to_vvq(_vq);
>>> - return vq->vring.num;
>>> + return vq->packed ? vq->vring_packed.num : vq->vring.num;
>>> }
>>> EXPORT_SYMBOL_GPL(virtqueue_get_vring_size);
>>> @@ -1224,6 +1744,7 @@ dma_addr_t virtqueue_get_desc_addr(struct virtqueue *_vq)
>>> }
>>> EXPORT_SYMBOL_GPL(virtqueue_get_desc_addr);
>>> +/* Only available for split ring */
>> Interesting, I think we need this for correctly configure pci. e.g in
>> setup_vq()?
> Yes. The setup_vq() should be updated. But it requires
> QEMU change, so I just kept it as is in this RFC patch.
Ok.
>
>>> dma_addr_t virtqueue_get_avail_addr(struct virtqueue *_vq)
>>> {
>>> struct vring_virtqueue *vq = to_vvq(_vq);
>>> @@ -1235,6 +1756,7 @@ dma_addr_t virtqueue_get_avail_addr(struct virtqueue *_vq)
>>> }
>>> EXPORT_SYMBOL_GPL(virtqueue_get_avail_addr);
>>> +/* Only available for split ring */
>>> dma_addr_t virtqueue_get_used_addr(struct virtqueue *_vq)
>> Maybe it's better to rename this to get_device_addr().
> It's a kernel API which has been exported by EXPORT_SYMBOL_GPL(),
> I'm not sure whether it's a good idea to rename it.
If it's not a part of uapi, I think we can.
Thanks
>
> Best regards,
> Tiwei Bie
>
>> Thanks
>>
>>> {
>>> struct vring_virtqueue *vq = to_vvq(_vq);
>>> @@ -1246,6 +1768,7 @@ dma_addr_t virtqueue_get_used_addr(struct virtqueue *_vq)
>>> }
>>> EXPORT_SYMBOL_GPL(virtqueue_get_used_addr);
>>> +/* Only available for split ring */
>>> const struct vring *virtqueue_get_vring(struct virtqueue *vq)
>>> {
>>> return &to_vvq(vq)->vring;
>>> diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
>>> index bbf32524ab27..a0075894ad16 100644
>>> --- a/include/linux/virtio_ring.h
>>> +++ b/include/linux/virtio_ring.h
>>> @@ -60,6 +60,11 @@ static inline void virtio_store_mb(bool weak_barriers,
>>> struct virtio_device;
>>> struct virtqueue;
>>> +union vring_union {
>>> + struct vring vring_split;
>>> + struct vring_packed vring_packed;
>>> +};
>>> +
>>> /*
>>> * Creates a virtqueue and allocates the descriptor ring. If
>>> * may_reduce_num is set, then this may allocate a smaller ring than
>>> @@ -79,7 +84,8 @@ struct virtqueue *vring_create_virtqueue(unsigned int index,
>>> /* Creates a virtqueue with a custom layout. */
>>> struct virtqueue *__vring_new_virtqueue(unsigned int index,
>>> - struct vring vring,
>>> + union vring_union vring,
>>> + bool packed,
>>> struct virtio_device *vdev,
>>> bool weak_barriers,
>>> bool ctx,
^ permalink raw reply
* [PATCH bpf-next 3/4] tools: bpf: cleanup PHONY target
From: Jakub Kicinski @ 2018-03-16 6:26 UTC (permalink / raw)
To: alexei.starovoitov, daniel; +Cc: oss-drivers, netdev, Jakub Kicinski
In-Reply-To: <20180316062617.10254-1-jakub.kicinski@netronome.com>
There is no FORCE target in the Makefile and some of the PHONY
targets are missing, update the list.
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
tools/bpf/Makefile | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/bpf/Makefile b/tools/bpf/Makefile
index c07b4e718eeb..acfaf4c8cc32 100644
--- a/tools/bpf/Makefile
+++ b/tools/bpf/Makefile
@@ -100,4 +100,4 @@ install: $(PROGS) bpftool_install
bpftool_clean:
$(call descend,bpftool,clean)
-.PHONY: bpftool FORCE
+.PHONY: all install clean bpftool bpftool_install bpftool_clean
--
2.15.1
^ permalink raw reply related
* [PATCH bpf-next 2/4] tools: bpftool: fix potential format truncation
From: Jakub Kicinski @ 2018-03-16 6:26 UTC (permalink / raw)
To: alexei.starovoitov, daniel; +Cc: oss-drivers, netdev, Jakub Kicinski
In-Reply-To: <20180316062617.10254-1-jakub.kicinski@netronome.com>
GCC 7 complains:
xlated_dumper.c: In function ‘print_call’:
xlated_dumper.c:179:10: warning: ‘%s’ directive output may be truncated writing up to 255 bytes into a region of size between 249 and 253 [-Wformat-truncation=]
"%+d#%s", insn->off, sym->name);
Add a bit more space to the buffer so it can handle the entire
string and integer without truncation.
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
tools/bpf/bpftool/xlated_dumper.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/bpf/bpftool/xlated_dumper.h b/tools/bpf/bpftool/xlated_dumper.h
index 51c935d38ae2..b34affa7ef2d 100644
--- a/tools/bpf/bpftool/xlated_dumper.h
+++ b/tools/bpf/bpftool/xlated_dumper.h
@@ -49,7 +49,7 @@ struct dump_data {
unsigned long address_call_base;
struct kernel_sym *sym_mapping;
__u32 sym_count;
- char scratch_buff[SYM_MAX_NAME];
+ char scratch_buff[SYM_MAX_NAME + 8];
};
void kernel_syms_load(struct dump_data *dd);
--
2.15.1
^ permalink raw reply related
* [PATCH bpf-next 4/4] tools: bpf: remove feature detection output
From: Jakub Kicinski @ 2018-03-16 6:26 UTC (permalink / raw)
To: alexei.starovoitov, daniel; +Cc: oss-drivers, netdev, Jakub Kicinski
In-Reply-To: <20180316062617.10254-1-jakub.kicinski@netronome.com>
bpf tools use feature detection for libbfd dependency, clean up
the output files on make clean.
Signed-off-by: Jakub Kicinski <jakub.kicinski@netronome.com>
---
tools/bpf/Makefile | 2 ++
tools/bpf/bpftool/Makefile | 2 ++
2 files changed, 4 insertions(+)
diff --git a/tools/bpf/Makefile b/tools/bpf/Makefile
index acfaf4c8cc32..1ea545965ee3 100644
--- a/tools/bpf/Makefile
+++ b/tools/bpf/Makefile
@@ -81,6 +81,8 @@ clean: bpftool_clean
$(call QUIET_CLEAN, bpf-progs)
$(Q)rm -rf $(OUTPUT)*.o $(OUTPUT)bpf_jit_disasm $(OUTPUT)bpf_dbg \
$(OUTPUT)bpf_asm $(OUTPUT)bpf_exp.yacc.* $(OUTPUT)bpf_exp.lex.*
+ $(call QUIET_CLEAN, core-gen)
+ $(Q)rm -f $(OUTPUT)FEATURE-DUMP.bpf
install: $(PROGS) bpftool_install
$(call QUIET_INSTALL, bpf_jit_disasm)
diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
index 50a715ac9e8b..4e69782c4a79 100644
--- a/tools/bpf/bpftool/Makefile
+++ b/tools/bpf/bpftool/Makefile
@@ -89,6 +89,8 @@ $(OUTPUT)%.o: %.c
clean: $(LIBBPF)-clean
$(call QUIET_CLEAN, bpftool)
$(Q)$(RM) $(OUTPUT)bpftool $(OUTPUT)*.o $(OUTPUT)*.d
+ $(call QUIET_CLEAN, core-gen)
+ $(Q)$(RM) $(OUTPUT)FEATURE-DUMP.bpftool
install: $(OUTPUT)bpftool
$(call QUIET_INSTALL, bpftool)
--
2.15.1
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox