Netdev List

Netdev List
 help / color / mirror / Atom feed

* Re: [RFC PATCHv2 bridge 7/7] bridge: Add the ability to show dump the vlan map from a bridge port
From: Stephen Hemminger @ 2012-09-22 20:05 UTC (permalink / raw)
  To: David Miller; +Cc: bhutchings, vyasevic, netdev
In-Reply-To: <20120922.132747.1937291269030903700.davem@davemloft.net>

On Sat, 22 Sep 2012 13:27:47 -0400 (EDT)
David Miller <davem@davemloft.net> wrote:

> From: Ben Hutchings <bhutchings@solarflare.com>
> Date: Sat, 22 Sep 2012 18:15:32 +0100
> 
> > On Wed, 2012-09-19 at 08:42 -0400, Vlad Yasevich wrote:
> >> Using the RTM_GETLINK dump the vlan map of a given bridge port.
> > [...]
> > 
> > This enlarges the RTM_GETLINK response quite a bit.  I think perhaps
> > this should be optional, like IFLA_VFINFO_LIST is now.
> 
> Completely agreed.

Since most users won't use it, it should not be necessary to include
it if the map is blank.

^ permalink raw reply

* Re: [patch net] sky2: fix rx filter setup on link up
From: Stephen Hemminger @ 2012-09-22 20:14 UTC (permalink / raw)
  To: Jiri Pirko; +Cc: netdev, davem, mlindner, linux-kernel
In-Reply-To: <1347894617-13614-1-git-send-email-jiri@resnulli.us>

On Mon, 17 Sep 2012 17:10:17 +0200
Jiri Pirko <jiri@resnulli.us> wrote:

> +	sky2_write_rx_filter(sky2, filter);
> +	memcpy(sky2->rx_filter, filter, sizeof(sky2->rx_filter));

This isn't safe since rx_filter might be accessed from interrupt
context (i.e. the link-up PHY interrupt). Maybe using atomic64 would be
best.

^ permalink raw reply

* Re: [PATCH 2/7][RFC] netfilter: add xt_qtaguid matching module
From: richard -rw- weinberger @ 2012-09-22 21:18 UTC (permalink / raw)
  To: John Stultz
  Cc: LKML, JP Abgrall, netdev, Ashish Sharma, Peter P Waskiewicz Jr
In-Reply-To: <1348279853-44499-3-git-send-email-john.stultz@linaro.org>

Just a few comments, please see below.
In general, I'd also send this module to netfilter-devel@... and get
rid of most of the debugging stuff.

On Sat, Sep 22, 2012 at 4:10 AM, John Stultz <john.stultz@linaro.org> wrote:
> Put procfs dirs in /proc/net/xt_qtaguid/
>   ctrl
>   stats
>   iface_stat/<iface>/...
> The uid stats are obtainable in ./stats.

Do we really want new files in /proc?

> +static const char *module_procdirname = "xt_qtaguid";

Why not char[]?

> +static struct proc_dir_entry *xt_qtaguid_procdir;
> +
> +static unsigned int proc_iface_perms = S_IRUGO;
> +module_param_named(iface_perms, proc_iface_perms, uint, S_IRUGO | S_IWUSR);
> +
> +static struct proc_dir_entry *xt_qtaguid_stats_file;
> +static unsigned int proc_stats_perms = S_IRUGO;
> +module_param_named(stats_perms, proc_stats_perms, uint, S_IRUGO | S_IWUSR);
> +
> +static struct proc_dir_entry *xt_qtaguid_ctrl_file;
> +static unsigned int proc_ctrl_perms = S_IRUGO | S_IWUSR;
> +module_param_named(ctrl_perms, proc_ctrl_perms, uint, S_IRUGO | S_IWUSR);
> +
> +/* 0 means, don't limit anybody */
> +static gid_t proc_stats_readall_gid;
> +static gid_t proc_ctrl_write_gid;
> +module_param_named(stats_readall_gid, proc_stats_readall_gid, uint,
> +                  S_IRUGO | S_IWUSR);
> +module_param_named(ctrl_write_gid, proc_ctrl_write_gid, uint,
> +                  S_IRUGO | S_IWUSR);
> +
> +/*
> + * Limit the number of active tags (via socket tags) for a given UID.
> + * Multiple processes could share the UID.
> + */
> +static int max_sock_tags = DEFAULT_MAX_SOCK_TAGS;
> +module_param(max_sock_tags, int, S_IRUGO | S_IWUSR);
> +
> +/*
> + * After the kernel has initiallized this module, it is still possible
> + * to make it passive.
> + * Setting passive to Y:
> + *  - the iface stats handling will not act on notifications.
> + *  - iptables matches will never match.
> + *  - ctrl commands silently succeed.
> + *  - stats are always empty.
> + * This is mostly usefull when a bug is suspected.
> + */
> +static bool module_passive;
> +module_param_named(passive, module_passive, bool, S_IRUGO | S_IWUSR);
> +
> +/*
> + * Control how qtaguid data is tracked per proc/uid.
> + * Setting tag_tracking_passive to Y:
> + *  - don't create proc specific structs to track tags
> + *  - don't check that active tag stats exceed some limits.
> + *  - don't clean up socket tags on process exits.
> + * This is mostly usefull when a bug is suspected.
> + */
> +static bool qtu_proc_handling_passive;
> +module_param_named(tag_tracking_passive, qtu_proc_handling_passive, bool,
> +                  S_IRUGO | S_IWUSR);
> +
> +#define QTU_DEV_NAME "xt_qtaguid"

We have this string already in module_procdirname.

> +static LIST_HEAD(iface_stat_list);
> +static DEFINE_SPINLOCK(iface_stat_list_lock);
> +
> +static struct rb_root sock_tag_tree = RB_ROOT;
> +static DEFINE_SPINLOCK(sock_tag_list_lock);
> +
> +static struct rb_root tag_counter_set_tree = RB_ROOT;
> +static DEFINE_SPINLOCK(tag_counter_set_list_lock);
> +
> +static struct rb_root uid_tag_data_tree = RB_ROOT;
> +static DEFINE_SPINLOCK(uid_tag_data_tree_lock);
> +
> +static struct rb_root proc_qtu_data_tree = RB_ROOT;
> +/* No proc_qtu_data_tree_lock; use uid_tag_data_tree_lock */
> +
> +static struct qtaguid_event_counts qtu_events;
> +/*----------------------------------------------*/
> +static bool can_manipulate_uids(void)
> +{
> +       /* root pwnd */
> +       return unlikely(!current_fsuid()) || unlikely(!proc_ctrl_write_gid)
> +               || in_egroup_p(proc_ctrl_write_gid);
> +}
> +
> +static bool can_impersonate_uid(uid_t uid)
> +{
> +       return uid == current_fsuid() || can_manipulate_uids();
> +}
> +
> +static bool can_read_other_uid_stats(uid_t uid)
> +{
> +       /* root pwnd */
> +       return unlikely(!current_fsuid()) || uid == current_fsuid()
> +               || unlikely(!proc_stats_readall_gid)
> +               || in_egroup_p(proc_stats_readall_gid);
> +}
> +
> +static inline void dc_add_byte_packets(struct data_counters *counters, int set,
> +                                 enum ifs_tx_rx direction,
> +                                 enum ifs_proto ifs_proto,
> +                                 int bytes,
> +                                 int packets)
> +{
> +       counters->bpc[set][direction][ifs_proto].bytes += bytes;
> +       counters->bpc[set][direction][ifs_proto].packets += packets;
> +}
> +
> +static inline uint64_t dc_sum_bytes(struct data_counters *counters,
> +                                   int set,
> +                                   enum ifs_tx_rx direction)
> +{
> +       return counters->bpc[set][direction][IFS_TCP].bytes
> +               + counters->bpc[set][direction][IFS_UDP].bytes
> +               + counters->bpc[set][direction][IFS_PROTO_OTHER].bytes;
> +}
> +
> +static inline uint64_t dc_sum_packets(struct data_counters *counters,
> +                                     int set,
> +                                     enum ifs_tx_rx direction)
> +{
> +       return counters->bpc[set][direction][IFS_TCP].packets
> +               + counters->bpc[set][direction][IFS_UDP].packets
> +               + counters->bpc[set][direction][IFS_PROTO_OTHER].packets;
> +}
> +
> +static struct tag_node *tag_node_tree_search(struct rb_root *root, tag_t tag)
> +{
> +       struct rb_node *node = root->rb_node;
> +
> +       while (node) {
> +               struct tag_node *data = rb_entry(node, struct tag_node, node);
> +               int result;
> +               RB_DEBUG("qtaguid: tag_node_tree_search(0x%llx): "
> +                        " node=%p data=%p\n", tag, node, data);
> +               result = tag_compare(tag, data->tag);
> +               RB_DEBUG("qtaguid: tag_node_tree_search(0x%llx): "
> +                        " data.tag=0x%llx (uid=%u) res=%d\n",
> +                        tag, data->tag, get_uid_from_tag(data->tag), result);
> +               if (result < 0)
> +                       node = node->rb_left;
> +               else if (result > 0)
> +                       node = node->rb_right;
> +               else
> +                       return data;
> +       }
> +       return NULL;
> +}
> +
> +static void tag_node_tree_insert(struct tag_node *data, struct rb_root *root)
> +{
> +       struct rb_node **new = &(root->rb_node), *parent = NULL;
> +
> +       /* Figure out where to put new node */
> +       while (*new) {
> +               struct tag_node *this = rb_entry(*new, struct tag_node,
> +                                                node);
> +               int result = tag_compare(data->tag, this->tag);
> +               RB_DEBUG("qtaguid: %s(): tag=0x%llx"
> +                        " (uid=%u)\n", __func__,
> +                        this->tag,
> +                        get_uid_from_tag(this->tag));
> +               parent = *new;
> +               if (result < 0)
> +                       new = &((*new)->rb_left);
> +               else if (result > 0)
> +                       new = &((*new)->rb_right);
> +               else
> +                       BUG();

WARN_ONCE(), please.
Otherwise an attacker may trigger the BUG() remotely in case of an
implementation error...

> +       }
> +
> +       /* Add new node and rebalance tree. */
> +       rb_link_node(&data->node, parent, new);
> +       rb_insert_color(&data->node, root);
> +}
> +
> +static void tag_stat_tree_insert(struct tag_stat *data, struct rb_root *root)
> +{
> +       tag_node_tree_insert(&data->tn, root);
> +}
> +
> +static struct tag_stat *tag_stat_tree_search(struct rb_root *root, tag_t tag)
> +{
> +       struct tag_node *node = tag_node_tree_search(root, tag);
> +       if (!node)
> +               return NULL;
> +       return rb_entry(&node->node, struct tag_stat, tn.node);
> +}
> +
> +static void tag_counter_set_tree_insert(struct tag_counter_set *data,
> +                                       struct rb_root *root)
> +{
> +       tag_node_tree_insert(&data->tn, root);
> +}
> +
> +static struct tag_counter_set *tag_counter_set_tree_search(struct rb_root *root,
> +                                                          tag_t tag)
> +{
> +       struct tag_node *node = tag_node_tree_search(root, tag);
> +       if (!node)
> +               return NULL;
> +       return rb_entry(&node->node, struct tag_counter_set, tn.node);
> +
> +}
> +
> +static void tag_ref_tree_insert(struct tag_ref *data, struct rb_root *root)
> +{
> +       tag_node_tree_insert(&data->tn, root);
> +}
> +
> +static struct tag_ref *tag_ref_tree_search(struct rb_root *root, tag_t tag)
> +{
> +       struct tag_node *node = tag_node_tree_search(root, tag);
> +       if (!node)
> +               return NULL;
> +       return rb_entry(&node->node, struct tag_ref, tn.node);
> +}
> +
> +static struct sock_tag *sock_tag_tree_search(struct rb_root *root,
> +                                            const struct sock *sk)
> +{
> +       struct rb_node *node = root->rb_node;
> +
> +       while (node) {
> +               struct sock_tag *data = rb_entry(node, struct sock_tag,
> +                                                sock_node);
> +               if (sk < data->sk)
> +                       node = node->rb_left;
> +               else if (sk > data->sk)
> +                       node = node->rb_right;
> +               else
> +                       return data;
> +       }
> +       return NULL;
> +}
> +
> +static void sock_tag_tree_insert(struct sock_tag *data, struct rb_root *root)
> +{
> +       struct rb_node **new = &(root->rb_node), *parent = NULL;
> +
> +       /* Figure out where to put new node */
> +       while (*new) {
> +               struct sock_tag *this = rb_entry(*new, struct sock_tag,
> +                                                sock_node);
> +               parent = *new;
> +               if (data->sk < this->sk)
> +                       new = &((*new)->rb_left);
> +               else if (data->sk > this->sk)
> +                       new = &((*new)->rb_right);
> +               else
> +                       BUG();

Same here: WARN_ONCE() instead of BUG(), please.

> +       }
> +
> +       /* Add new node and rebalance tree. */
> +       rb_link_node(&data->sock_node, parent, new);
> +       rb_insert_color(&data->sock_node, root);
> +}
> +
> +static void sock_tag_tree_erase(struct rb_root *st_to_free_tree)
> +{
> +       struct rb_node *node;
> +       struct sock_tag *st_entry;
> +
> +       node = rb_first(st_to_free_tree);
> +       while (node) {
> +               st_entry = rb_entry(node, struct sock_tag, sock_node);
> +               node = rb_next(node);
> +               CT_DEBUG("qtaguid: %s(): "
> +                        "erase st: sk=%p tag=0x%llx (uid=%u)\n", __func__,
> +                        st_entry->sk,
> +                        st_entry->tag,
> +                        get_uid_from_tag(st_entry->tag));
> +               rb_erase(&st_entry->sock_node, st_to_free_tree);
> +               sockfd_put(st_entry->socket);
> +               kfree(st_entry);
> +       }
> +}
> +
> +static struct proc_qtu_data *proc_qtu_data_tree_search(struct rb_root *root,
> +                                                      const pid_t pid)
> +{
> +       struct rb_node *node = root->rb_node;
> +
> +       while (node) {
> +               struct proc_qtu_data *data = rb_entry(node,
> +                                                     struct proc_qtu_data,
> +                                                     node);
> +               if (pid < data->pid)
> +                       node = node->rb_left;
> +               else if (pid > data->pid)
> +                       node = node->rb_right;
> +               else
> +                       return data;
> +       }
> +       return NULL;
> +}
> +
> +static void proc_qtu_data_tree_insert(struct proc_qtu_data *data,
> +                                     struct rb_root *root)
> +{
> +       struct rb_node **new = &(root->rb_node), *parent = NULL;
> +
> +       /* Figure out where to put new node */
> +       while (*new) {
> +               struct proc_qtu_data *this = rb_entry(*new,
> +                                                     struct proc_qtu_data,
> +                                                     node);
> +               parent = *new;
> +               if (data->pid < this->pid)
> +                       new = &((*new)->rb_left);
> +               else if (data->pid > this->pid)
> +                       new = &((*new)->rb_right);
> +               else
> +                       BUG();

Same here: WARN_ONCE() instead of BUG(), please.

> +       }
> +
> +       /* Add new node and rebalance tree. */
> +       rb_link_node(&data->node, parent, new);
> +       rb_insert_color(&data->node, root);
> +}
> +
> +static void uid_tag_data_tree_insert(struct uid_tag_data *data,
> +                                    struct rb_root *root)
> +{
> +       struct rb_node **new = &(root->rb_node), *parent = NULL;
> +
> +       /* Figure out where to put new node */
> +       while (*new) {
> +               struct uid_tag_data *this = rb_entry(*new,
> +                                                    struct uid_tag_data,
> +                                                    node);
> +               parent = *new;
> +               if (data->uid < this->uid)
> +                       new = &((*new)->rb_left);
> +               else if (data->uid > this->uid)
> +                       new = &((*new)->rb_right);
> +               else
> +                       BUG();

Same here: WARN_ONCE() instead of BUG(), please.

> +       }
> +
> +       /* Add new node and rebalance tree. */
> +       rb_link_node(&data->node, parent, new);
> +       rb_insert_color(&data->node, root);
> +}
> +
> +static struct uid_tag_data *uid_tag_data_tree_search(struct rb_root *root,
> +                                                    uid_t uid)
> +{
> +       struct rb_node *node = root->rb_node;
> +
> +       while (node) {
> +               struct uid_tag_data *data = rb_entry(node,
> +                                                    struct uid_tag_data,
> +                                                    node);
> +               if (uid < data->uid)
> +                       node = node->rb_left;
> +               else if (uid > data->uid)
> +                       node = node->rb_right;
> +               else
> +                       return data;
> +       }
> +       return NULL;
> +}
> +
> +/*
> + * Allocates a new uid_tag_data struct if needed.
> + * Returns a pointer to the found or allocated uid_tag_data.
> + * Returns a PTR_ERR on failures, and lock is not held.
> + * If found is not NULL:
> + *   sets *found to true if not allocated.
> + *   sets *found to false if allocated.
> + */
> +struct uid_tag_data *get_uid_data(uid_t uid, bool *found_res)
> +{
> +       struct uid_tag_data *utd_entry;
> +
> +       /* Look for top level uid_tag_data for the UID */
> +       utd_entry = uid_tag_data_tree_search(&uid_tag_data_tree, uid);
> +       DR_DEBUG("qtaguid: get_uid_data(%u) utd=%p\n", uid, utd_entry);
> +
> +       if (found_res)
> +               *found_res = utd_entry;
> +       if (utd_entry)
> +               return utd_entry;
> +
> +       utd_entry = kzalloc(sizeof(*utd_entry), GFP_ATOMIC);
> +       if (!utd_entry) {
> +               pr_err("qtaguid: get_uid_data(%u): "
> +                      "tag data alloc failed\n", uid);
> +               return ERR_PTR(-ENOMEM);
> +       }
> +
> +       utd_entry->uid = uid;
> +       utd_entry->tag_ref_tree = RB_ROOT;
> +       uid_tag_data_tree_insert(utd_entry, &uid_tag_data_tree);
> +       DR_DEBUG("qtaguid: get_uid_data(%u) new utd=%p\n", uid, utd_entry);
> +       return utd_entry;
> +}
> +
> +/* Never returns NULL. Either PTR_ERR or a valid ptr. */
> +static struct tag_ref *new_tag_ref(tag_t new_tag,
> +                                  struct uid_tag_data *utd_entry)
> +{
> +       struct tag_ref *tr_entry;
> +       int res;
> +
> +       if (utd_entry->num_active_tags + 1 > max_sock_tags) {
> +               pr_info("qtaguid: new_tag_ref(0x%llx): "
> +                       "tag ref alloc quota exceeded. max=%d\n",
> +                       new_tag, max_sock_tags);
> +               res = -EMFILE;
> +               goto err_res;
> +
> +       }
> +
> +       tr_entry = kzalloc(sizeof(*tr_entry), GFP_ATOMIC);
> +       if (!tr_entry) {
> +               pr_err("qtaguid: new_tag_ref(0x%llx): "
> +                      "tag ref alloc failed\n",
> +                      new_tag);
> +               res = -ENOMEM;
> +               goto err_res;
> +       }
> +       tr_entry->tn.tag = new_tag;
> +       /* tr_entry->num_sock_tags  handled by caller */
> +       utd_entry->num_active_tags++;
> +       tag_ref_tree_insert(tr_entry, &utd_entry->tag_ref_tree);
> +       DR_DEBUG("qtaguid: new_tag_ref(0x%llx): "
> +                " inserted new tag ref %p\n",
> +                new_tag, tr_entry);
> +       return tr_entry;
> +
> +err_res:
> +       return ERR_PTR(res);
> +}
> +
> +static struct tag_ref *lookup_tag_ref(tag_t full_tag,
> +                                     struct uid_tag_data **utd_res)
> +{
> +       struct uid_tag_data *utd_entry;
> +       struct tag_ref *tr_entry;
> +       bool found_utd;
> +       uid_t uid = get_uid_from_tag(full_tag);
> +
> +       DR_DEBUG("qtaguid: lookup_tag_ref(tag=0x%llx (uid=%u))\n",
> +                full_tag, uid);
> +
> +       utd_entry = get_uid_data(uid, &found_utd);
> +       if (IS_ERR_OR_NULL(utd_entry)) {
> +               if (utd_res)
> +                       *utd_res = utd_entry;
> +               return NULL;
> +       }
> +
> +       tr_entry = tag_ref_tree_search(&utd_entry->tag_ref_tree, full_tag);
> +       if (utd_res)
> +               *utd_res = utd_entry;
> +       DR_DEBUG("qtaguid: lookup_tag_ref(0x%llx) utd_entry=%p tr_entry=%p\n",
> +                full_tag, utd_entry, tr_entry);
> +       return tr_entry;
> +}
> +
> +/* Never returns NULL. Either PTR_ERR or a valid ptr. */
> +static struct tag_ref *get_tag_ref(tag_t full_tag,
> +                                  struct uid_tag_data **utd_res)
> +{
> +       struct uid_tag_data *utd_entry;
> +       struct tag_ref *tr_entry;
> +
> +       DR_DEBUG("qtaguid: get_tag_ref(0x%llx)\n",
> +                full_tag);
> +       spin_lock_bh(&uid_tag_data_tree_lock);
> +       tr_entry = lookup_tag_ref(full_tag, &utd_entry);
> +       BUG_ON(IS_ERR_OR_NULL(utd_entry));
> +       if (!tr_entry)
> +               tr_entry = new_tag_ref(full_tag, utd_entry);
> +
> +       spin_unlock_bh(&uid_tag_data_tree_lock);
> +       if (utd_res)
> +               *utd_res = utd_entry;
> +       DR_DEBUG("qtaguid: get_tag_ref(0x%llx) utd=%p tr=%p\n",
> +                full_tag, utd_entry, tr_entry);
> +       return tr_entry;
> +}
> +
> +/* Checks and maybe frees the UID Tag Data entry */
> +static void put_utd_entry(struct uid_tag_data *utd_entry)
> +{
> +       /* Are we done with the UID tag data entry? */
> +       if (RB_EMPTY_ROOT(&utd_entry->tag_ref_tree) &&
> +               !utd_entry->num_pqd) {
> +               DR_DEBUG("qtaguid: %s(): "
> +                        "erase utd_entry=%p uid=%u "
> +                        "by pid=%u tgid=%u uid=%u\n", __func__,
> +                        utd_entry, utd_entry->uid,
> +                        current->pid, current->tgid, current_fsuid());
> +               BUG_ON(utd_entry->num_active_tags);
> +               rb_erase(&utd_entry->node, &uid_tag_data_tree);
> +               kfree(utd_entry);
> +       } else {
> +               DR_DEBUG("qtaguid: %s(): "
> +                        "utd_entry=%p still has %d tags %d proc_qtu_data\n",
> +                        __func__, utd_entry, utd_entry->num_active_tags,
> +                        utd_entry->num_pqd);
> +               BUG_ON(!(utd_entry->num_active_tags ||
> +                        utd_entry->num_pqd));
> +       }
> +}
> +
> +/*
> + * If no sock_tags are using this tag_ref,
> + * decrements refcount of utd_entry, removes tr_entry
> + * from utd_entry->tag_ref_tree and frees.
> + */
> +static void free_tag_ref_from_utd_entry(struct tag_ref *tr_entry,
> +                                       struct uid_tag_data *utd_entry)
> +{
> +       DR_DEBUG("qtaguid: %s(): %p tag=0x%llx (uid=%u)\n", __func__,
> +                tr_entry, tr_entry->tn.tag,
> +                get_uid_from_tag(tr_entry->tn.tag));
> +       if (!tr_entry->num_sock_tags) {
> +               BUG_ON(!utd_entry->num_active_tags);
> +               utd_entry->num_active_tags--;
> +               rb_erase(&tr_entry->tn.node, &utd_entry->tag_ref_tree);
> +               DR_DEBUG("qtaguid: %s(): erased %p\n", __func__, tr_entry);
> +               kfree(tr_entry);
> +       }
> +}
> +
> +static void put_tag_ref_tree(tag_t full_tag, struct uid_tag_data *utd_entry)
> +{
> +       struct rb_node *node;
> +       struct tag_ref *tr_entry;
> +       tag_t acct_tag;
> +
> +       DR_DEBUG("qtaguid: %s(tag=0x%llx (uid=%u))\n", __func__,
> +                full_tag, get_uid_from_tag(full_tag));
> +       acct_tag = get_atag_from_tag(full_tag);
> +       node = rb_first(&utd_entry->tag_ref_tree);
> +       while (node) {
> +               tr_entry = rb_entry(node, struct tag_ref, tn.node);
> +               node = rb_next(node);
> +               if (!acct_tag || tr_entry->tn.tag == full_tag)
> +                       free_tag_ref_from_utd_entry(tr_entry, utd_entry);
> +       }
> +}
> +
> +static int read_proc_u64(char *page, char **start, off_t off,
> +                       int count, int *eof, void *data)
> +{
> +       int len;
> +       uint64_t value;
> +       char *p = page;
> +       uint64_t *iface_entry = data;
> +
> +       if (!data)
> +               return 0;
> +
> +       value = *iface_entry;
> +       p += sprintf(p, "%llu\n", value);
> +       len = (p - page) - off;
> +       *eof = (len <= count) ? 1 : 0;
> +       *start = page + off;
> +       return len;
> +}
> +
> +static int read_proc_bool(char *page, char **start, off_t off,
> +                       int count, int *eof, void *data)
> +{
> +       int len;
> +       bool value;
> +       char *p = page;
> +       bool *bool_entry = data;
> +
> +       if (!data)
> +               return 0;
> +
> +       value = *bool_entry;
> +       p += sprintf(p, "%u\n", value);
> +       len = (p - page) - off;
> +       *eof = (len <= count) ? 1 : 0;
> +       *start = page + off;
> +       return len;
> +}
> +
> +static int get_active_counter_set(tag_t tag)
> +{
> +       int active_set = 0;
> +       struct tag_counter_set *tcs;
> +
> +       MT_DEBUG("qtaguid: get_active_counter_set(tag=0x%llx)"
> +                " (uid=%u)\n",
> +                tag, get_uid_from_tag(tag));
> +       /* For now we only handle UID tags for active sets */
> +       tag = get_utag_from_tag(tag);
> +       spin_lock_bh(&tag_counter_set_list_lock);
> +       tcs = tag_counter_set_tree_search(&tag_counter_set_tree, tag);
> +       if (tcs)
> +               active_set = tcs->active_set;
> +       spin_unlock_bh(&tag_counter_set_list_lock);
> +       return active_set;
> +}
> +
> +/*
> + * Find the entry for tracking the specified interface.
> + * Caller must hold iface_stat_list_lock
> + */
> +static struct iface_stat *get_iface_entry(const char *ifname)
> +{
> +       struct iface_stat *iface_entry;
> +
> +       /* Find the entry for tracking the specified tag within the interface */
> +       if (ifname == NULL) {
> +               pr_info("qtaguid: iface_stat: get() NULL device name\n");
> +               return NULL;
> +       }

Can ifname really become NULL?

> +       /* Iterate over interfaces */
> +       list_for_each_entry(iface_entry, &iface_stat_list, list) {
> +               if (!strcmp(ifname, iface_entry->ifname))
> +                       goto done;
> +       }
> +       iface_entry = NULL;
> +done:
> +       return iface_entry;
> +}
> +
> +static int iface_stat_all_proc_read(char *page, char **num_items_returned,
> +                                   off_t items_to_skip, int char_count,
> +                                   int *eof, void *data)
> +{
> +       char *outp = page;
> +       int item_index = 0;
> +       int len;
> +       struct iface_stat *iface_entry;
> +       struct rtnl_link_stats64 dev_stats, *stats;
> +       struct rtnl_link_stats64 no_dev_stats = {0};
> +
> +       if (unlikely(module_passive)) {
> +               *eof = 1;
> +               return 0;
> +       }
> +
> +       CT_DEBUG("qtaguid:proc iface_stat_all "
> +                "page=%p *num_items_returned=%p off=%ld "
> +                "char_count=%d *eof=%d\n", page, *num_items_returned,
> +                items_to_skip, char_count, *eof);
> +
> +       if (*eof)
> +               return 0;
> +
> +       /*
> +        * This lock will prevent iface_stat_update() from changing active,
> +        * and in turn prevent an interface from unregistering itself.
> +        */
> +       spin_lock_bh(&iface_stat_list_lock);
> +       list_for_each_entry(iface_entry, &iface_stat_list, list) {
> +               if (item_index++ < items_to_skip)
> +                       continue;
> +
> +               if (iface_entry->active) {
> +                       stats = dev_get_stats(iface_entry->net_dev,
> +                                             &dev_stats);
> +               } else {
> +                       stats = &no_dev_stats;
> +               }
> +               len = snprintf(outp, char_count,
> +                              "%s %d "
> +                              "%llu %llu %llu %llu "
> +                              "%llu %llu %llu %llu\n",
> +                              iface_entry->ifname,
> +                              iface_entry->active,
> +                              iface_entry->totals[IFS_RX].bytes,
> +                              iface_entry->totals[IFS_RX].packets,
> +                              iface_entry->totals[IFS_TX].bytes,
> +                              iface_entry->totals[IFS_TX].packets,
> +                              stats->rx_bytes, stats->rx_packets,
> +                              stats->tx_bytes, stats->tx_packets);
> +               if (len >= char_count) {
> +                       spin_unlock_bh(&iface_stat_list_lock);
> +                       *outp = '\0';
> +                       return outp - page;
> +               }
> +               outp += len;
> +               char_count -= len;
> +               (*num_items_returned)++;
> +       }
> +       spin_unlock_bh(&iface_stat_list_lock);
> +
> +       *eof = 1;
> +       return outp - page;
> +}
> +
> +static void iface_create_proc_worker(struct work_struct *work)
> +{
> +       struct proc_dir_entry *proc_entry;
> +       struct iface_stat_work *isw = container_of(work, struct iface_stat_work,
> +                                                  iface_work);
> +       struct iface_stat *new_iface  = isw->iface_entry;
> +
> +       /* iface_entries are not deleted, so safe to manipulate. */
> +       proc_entry = proc_mkdir(new_iface->ifname, iface_stat_procdir);
> +       if (IS_ERR_OR_NULL(proc_entry)) {
> +               pr_err("qtaguid: iface_stat: create_proc(): alloc failed.\n");
> +               kfree(isw);
> +               return;
> +       }
> +
> +       new_iface->proc_ptr = proc_entry;
> +
> +       create_proc_read_entry("tx_bytes", proc_iface_perms, proc_entry,
> +                       read_proc_u64, &new_iface->totals[IFS_TX].bytes);
> +       create_proc_read_entry("rx_bytes", proc_iface_perms, proc_entry,
> +                       read_proc_u64, &new_iface->totals[IFS_RX].bytes);
> +       create_proc_read_entry("tx_packets", proc_iface_perms, proc_entry,
> +                       read_proc_u64, &new_iface->totals[IFS_TX].packets);
> +       create_proc_read_entry("rx_packets", proc_iface_perms, proc_entry,
> +                       read_proc_u64, &new_iface->totals[IFS_RX].packets);
> +       create_proc_read_entry("active", proc_iface_perms, proc_entry,
> +                       read_proc_bool, &new_iface->active);
> +
> +       IF_DEBUG("qtaguid: iface_stat: create_proc(): done "
> +                "entry=%p dev=%s\n", new_iface, new_iface->ifname);
> +       kfree(isw);
> +}
> +
> +/*
> + * Will set the entry's active state, and
> + * update the net_dev accordingly also.
> + */
> +static void _iface_stat_set_active(struct iface_stat *entry,
> +                                  struct net_device *net_dev,
> +                                  bool activate)
> +{
> +       if (activate) {
> +               entry->net_dev = net_dev;
> +               entry->active = true;
> +               IF_DEBUG("qtaguid: %s(%s): "
> +                        "enable tracking. rfcnt=%d\n", __func__,
> +                        entry->ifname,
> +                        this_cpu_read(*net_dev->pcpu_refcnt));
> +       } else {
> +               entry->active = false;
> +               entry->net_dev = NULL;
> +               IF_DEBUG("qtaguid: %s(%s): "
> +                        "disable tracking. rfcnt=%d\n", __func__,
> +                        entry->ifname,
> +                        this_cpu_read(*net_dev->pcpu_refcnt));
> +
> +       }
> +}
> +
> +/* Caller must hold iface_stat_list_lock */
> +static struct iface_stat *iface_alloc(struct net_device *net_dev)
> +{
> +       struct iface_stat *new_iface;
> +       struct iface_stat_work *isw;
> +
> +       new_iface = kzalloc(sizeof(*new_iface), GFP_ATOMIC);
> +       if (new_iface == NULL) {
> +               pr_err("qtaguid: iface_stat: create(%s): "
> +                      "iface_stat alloc failed\n", net_dev->name);
> +               return NULL;
> +       }
> +       new_iface->ifname = kstrdup(net_dev->name, GFP_ATOMIC);
> +       if (new_iface->ifname == NULL) {
> +               pr_err("qtaguid: iface_stat: create(%s): "
> +                      "ifname alloc failed\n", net_dev->name);
> +               kfree(new_iface);
> +               return NULL;
> +       }
> +       spin_lock_init(&new_iface->tag_stat_list_lock);
> +       new_iface->tag_stat_tree = RB_ROOT;
> +       _iface_stat_set_active(new_iface, net_dev, true);
> +
> +       /*
> +        * ipv6 notifier chains are atomic :(
> +        * No create_proc_read_entry() for you!
> +        */
> +       isw = kmalloc(sizeof(*isw), GFP_ATOMIC);
> +       if (!isw) {
> +               pr_err("qtaguid: iface_stat: create(%s): "
> +                      "work alloc failed\n", new_iface->ifname);
> +               _iface_stat_set_active(new_iface, net_dev, false);
> +               kfree(new_iface->ifname);
> +               kfree(new_iface);
> +               return NULL;
> +       }
> +       isw->iface_entry = new_iface;
> +       INIT_WORK(&isw->iface_work, iface_create_proc_worker);
> +       schedule_work(&isw->iface_work);
> +       list_add(&new_iface->list, &iface_stat_list);
> +       return new_iface;
> +}
> +
> +static void iface_check_stats_reset_and_adjust(struct net_device *net_dev,
> +                                              struct iface_stat *iface)
> +{
> +       struct rtnl_link_stats64 dev_stats, *stats;
> +       bool stats_rewound;
> +
> +       stats = dev_get_stats(net_dev, &dev_stats);
> +       /* No empty packets */
> +       stats_rewound =
> +               (stats->rx_bytes < iface->last_known[IFS_RX].bytes)
> +               || (stats->tx_bytes < iface->last_known[IFS_TX].bytes);
> +
> +       IF_DEBUG("qtaguid: %s(%s): iface=%p netdev=%p "
> +                "bytes rx/tx=%llu/%llu "
> +                "active=%d last_known=%d "
> +                "stats_rewound=%d\n", __func__,
> +                net_dev ? net_dev->name : "?",
> +                iface, net_dev,
> +                stats->rx_bytes, stats->tx_bytes,
> +                iface->active, iface->last_known_valid, stats_rewound);
> +
> +       if (iface->active && iface->last_known_valid && stats_rewound) {
> +               pr_warn_once("qtaguid: iface_stat: %s(%s): "
> +                            "iface reset its stats unexpectedly\n", __func__,
> +                            net_dev->name);
> +
> +               iface->totals[IFS_TX].bytes += iface->last_known[IFS_TX].bytes;
> +               iface->totals[IFS_TX].packets +=
> +                       iface->last_known[IFS_TX].packets;
> +               iface->totals[IFS_RX].bytes += iface->last_known[IFS_RX].bytes;
> +               iface->totals[IFS_RX].packets +=
> +                       iface->last_known[IFS_RX].packets;
> +               iface->last_known_valid = false;
> +               IF_DEBUG("qtaguid: %s(%s): iface=%p "
> +                        "used last known bytes rx/tx=%llu/%llu\n", __func__,
> +                        iface->ifname, iface, iface->last_known[IFS_RX].bytes,
> +                        iface->last_known[IFS_TX].bytes);
> +       }
> +}
> +
> +/*
> + * Create a new entry for tracking the specified interface.
> + * Do nothing if the entry already exists.
> + * Called when an interface is configured with a valid IP address.
> + */
> +static void iface_stat_create(struct net_device *net_dev,
> +                             struct in_ifaddr *ifa)
> +{
> +       struct in_device *in_dev = NULL;
> +       const char *ifname;
> +       struct iface_stat *entry;
> +       __be32 ipaddr = 0;
> +       struct iface_stat *new_iface;
> +
> +       IF_DEBUG("qtaguid: iface_stat: create(%s): ifa=%p netdev=%p\n",
> +                net_dev ? net_dev->name : "?",
> +                ifa, net_dev);
> +       if (!net_dev) {
> +               pr_err("qtaguid: iface_stat: create(): no net dev\n");
> +               return;
> +       }
> +
> +       ifname = net_dev->name;
> +       if (!ifa) {
> +               in_dev = in_dev_get(net_dev);
> +               if (!in_dev) {
> +                       pr_err("qtaguid: iface_stat: create(%s): no inet dev\n",
> +                              ifname);
> +                       return;
> +               }
> +               IF_DEBUG("qtaguid: iface_stat: create(%s): in_dev=%p\n",
> +                        ifname, in_dev);
> +               for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
> +                       IF_DEBUG("qtaguid: iface_stat: create(%s): "
> +                                "ifa=%p ifa_label=%s\n",
> +                                ifname, ifa,
> +                                ifa->ifa_label ? ifa->ifa_label : "(null)");
> +                       if (ifa->ifa_label && !strcmp(ifname, ifa->ifa_label))
> +                               break;
> +               }
> +       }
> +
> +       if (!ifa) {
> +               IF_DEBUG("qtaguid: iface_stat: create(%s): no matching IP\n",
> +                        ifname);
> +               goto done_put;
> +       }
> +       ipaddr = ifa->ifa_local;
> +
> +       spin_lock_bh(&iface_stat_list_lock);
> +       entry = get_iface_entry(ifname);
> +       if (entry != NULL) {
> +               bool activate = !ipv4_is_loopback(ipaddr);
> +               IF_DEBUG("qtaguid: iface_stat: create(%s): entry=%p\n",
> +                        ifname, entry);
> +               iface_check_stats_reset_and_adjust(net_dev, entry);
> +               _iface_stat_set_active(entry, net_dev, activate);
> +               IF_DEBUG("qtaguid: %s(%s): "
> +                        "tracking now %d on ip=%pI4\n", __func__,
> +                        entry->ifname, activate, &ipaddr);
> +               goto done_unlock_put;
> +       } else if (ipv4_is_loopback(ipaddr)) {
> +               IF_DEBUG("qtaguid: iface_stat: create(%s): "
> +                        "ignore loopback dev. ip=%pI4\n", ifname, &ipaddr);
> +               goto done_unlock_put;
> +       }
> +
> +       new_iface = iface_alloc(net_dev);
> +       IF_DEBUG("qtaguid: iface_stat: create(%s): done "
> +                "entry=%p ip=%pI4\n", ifname, new_iface, &ipaddr);
> +done_unlock_put:
> +       spin_unlock_bh(&iface_stat_list_lock);
> +done_put:
> +       if (in_dev)
> +               in_dev_put(in_dev);
> +}
> +
> +static void iface_stat_create_ipv6(struct net_device *net_dev,
> +                                  struct inet6_ifaddr *ifa)
> +{
> +       struct in_device *in_dev;
> +       const char *ifname;
> +       struct iface_stat *entry;
> +       struct iface_stat *new_iface;
> +       int addr_type;
> +
> +       IF_DEBUG("qtaguid: iface_stat: create6(): ifa=%p netdev=%p->name=%s\n",
> +                ifa, net_dev, net_dev ? net_dev->name : "");
> +       if (!net_dev) {
> +               pr_err("qtaguid: iface_stat: create6(): no net dev!\n");
> +               return;
> +       }
> +       ifname = net_dev->name;
> +
> +       in_dev = in_dev_get(net_dev);
> +       if (!in_dev) {
> +               pr_err("qtaguid: iface_stat: create6(%s): no inet dev\n",
> +                      ifname);
> +               return;
> +       }
> +
> +       IF_DEBUG("qtaguid: iface_stat: create6(%s): in_dev=%p\n",
> +                ifname, in_dev);
> +
> +       if (!ifa) {
> +               IF_DEBUG("qtaguid: iface_stat: create6(%s): no matching IP\n",
> +                        ifname);
> +               goto done_put;
> +       }
> +       addr_type = ipv6_addr_type(&ifa->addr);
> +
> +       spin_lock_bh(&iface_stat_list_lock);
> +       entry = get_iface_entry(ifname);
> +       if (entry != NULL) {
> +               bool activate = !(addr_type & IPV6_ADDR_LOOPBACK);
> +               IF_DEBUG("qtaguid: %s(%s): entry=%p\n", __func__,
> +                        ifname, entry);
> +               iface_check_stats_reset_and_adjust(net_dev, entry);
> +               _iface_stat_set_active(entry, net_dev, activate);
> +               IF_DEBUG("qtaguid: %s(%s): "
> +                        "tracking now %d on ip=%pI6c\n", __func__,
> +                        entry->ifname, activate, &ifa->addr);
> +               goto done_unlock_put;
> +       } else if (addr_type & IPV6_ADDR_LOOPBACK) {
> +               IF_DEBUG("qtaguid: %s(%s): "
> +                        "ignore loopback dev. ip=%pI6c\n", __func__,
> +                        ifname, &ifa->addr);
> +               goto done_unlock_put;
> +       }
> +
> +       new_iface = iface_alloc(net_dev);
> +       IF_DEBUG("qtaguid: iface_stat: create6(%s): done "
> +                "entry=%p ip=%pI6c\n", ifname, new_iface, &ifa->addr);
> +
> +done_unlock_put:
> +       spin_unlock_bh(&iface_stat_list_lock);
> +done_put:
> +       in_dev_put(in_dev);
> +}
> +
> +static struct sock_tag *get_sock_stat_nl(const struct sock *sk)
> +{
> +       MT_DEBUG("qtaguid: get_sock_stat_nl(sk=%p)\n", sk);
> +       return sock_tag_tree_search(&sock_tag_tree, sk);
> +}
> +
> +static struct sock_tag *get_sock_stat(const struct sock *sk)
> +{
> +       struct sock_tag *sock_tag_entry;
> +       MT_DEBUG("qtaguid: get_sock_stat(sk=%p)\n", sk);
> +       if (!sk)
> +               return NULL;
> +       spin_lock_bh(&sock_tag_list_lock);
> +       sock_tag_entry = get_sock_stat_nl(sk);
> +       spin_unlock_bh(&sock_tag_list_lock);
> +       return sock_tag_entry;
> +}
> +
> +static void
> +data_counters_update(struct data_counters *dc, int set,
> +                    enum ifs_tx_rx direction, int proto, int bytes)
> +{
> +       switch (proto) {
> +       case IPPROTO_TCP:
> +               dc_add_byte_packets(dc, set, direction, IFS_TCP, bytes, 1);
> +               break;
> +       case IPPROTO_UDP:
> +               dc_add_byte_packets(dc, set, direction, IFS_UDP, bytes, 1);
> +               break;
> +       case IPPROTO_IP:
> +       default:
> +               dc_add_byte_packets(dc, set, direction, IFS_PROTO_OTHER, bytes,
> +                                   1);
> +               break;
> +       }
> +}
> +
> +/*
> + * Update stats for the specified interface. Do nothing if the entry
> + * does not exist (when a device was never configured with an IP address).
> + * Called when a device is being unregistered.
> + */
> +static void iface_stat_update(struct net_device *net_dev, bool stash_only)
> +{
> +       struct rtnl_link_stats64 dev_stats, *stats;
> +       struct iface_stat *entry;
> +
> +       stats = dev_get_stats(net_dev, &dev_stats);
> +       spin_lock_bh(&iface_stat_list_lock);
> +       entry = get_iface_entry(net_dev->name);
> +       if (entry == NULL) {
> +               IF_DEBUG("qtaguid: iface_stat: update(%s): not tracked\n",
> +                        net_dev->name);
> +               spin_unlock_bh(&iface_stat_list_lock);
> +               return;
> +       }
> +
> +       IF_DEBUG("qtaguid: %s(%s): entry=%p\n", __func__,
> +                net_dev->name, entry);
> +       if (!entry->active) {
> +               IF_DEBUG("qtaguid: %s(%s): already disabled\n", __func__,
> +                        net_dev->name);
> +               spin_unlock_bh(&iface_stat_list_lock);
> +               return;
> +       }
> +
> +       if (stash_only) {
> +               entry->last_known[IFS_TX].bytes = stats->tx_bytes;
> +               entry->last_known[IFS_TX].packets = stats->tx_packets;
> +               entry->last_known[IFS_RX].bytes = stats->rx_bytes;
> +               entry->last_known[IFS_RX].packets = stats->rx_packets;
> +               entry->last_known_valid = true;
> +               IF_DEBUG("qtaguid: %s(%s): "
> +                        "dev stats stashed rx/tx=%llu/%llu\n", __func__,
> +                        net_dev->name, stats->rx_bytes, stats->tx_bytes);
> +               spin_unlock_bh(&iface_stat_list_lock);
> +               return;
> +       }
> +       entry->totals[IFS_TX].bytes += stats->tx_bytes;
> +       entry->totals[IFS_TX].packets += stats->tx_packets;
> +       entry->totals[IFS_RX].bytes += stats->rx_bytes;
> +       entry->totals[IFS_RX].packets += stats->rx_packets;
> +       /* We don't need the last_known[] anymore */
> +       entry->last_known_valid = false;
> +       _iface_stat_set_active(entry, net_dev, false);
> +       IF_DEBUG("qtaguid: %s(%s): "
> +                "disable tracking. rx/tx=%llu/%llu\n", __func__,
> +                net_dev->name, stats->rx_bytes, stats->tx_bytes);
> +       spin_unlock_bh(&iface_stat_list_lock);
> +}
> +
> +static void tag_stat_update(struct tag_stat *tag_entry,
> +                       enum ifs_tx_rx direction, int proto, int bytes)
> +{
> +       int active_set;
> +       active_set = get_active_counter_set(tag_entry->tn.tag);
> +       MT_DEBUG("qtaguid: tag_stat_update(tag=0x%llx (uid=%u) set=%d "
> +                "dir=%d proto=%d bytes=%d)\n",
> +                tag_entry->tn.tag, get_uid_from_tag(tag_entry->tn.tag),
> +                active_set, direction, proto, bytes);
> +       data_counters_update(&tag_entry->counters, active_set, direction,
> +                            proto, bytes);
> +       if (tag_entry->parent_counters)
> +               data_counters_update(tag_entry->parent_counters, active_set,
> +                                    direction, proto, bytes);
> +}
> +
> +/*
> + * Create a new entry for tracking the specified {acct_tag,uid_tag} within
> + * the interface.
> + * iface_entry->tag_stat_list_lock should be held.
> + */
> +static struct tag_stat *create_if_tag_stat(struct iface_stat *iface_entry,
> +                                          tag_t tag)
> +{
> +       struct tag_stat *new_tag_stat_entry = NULL;
> +       IF_DEBUG("qtaguid: iface_stat: %s(): ife=%p tag=0x%llx"
> +                " (uid=%u)\n", __func__,
> +                iface_entry, tag, get_uid_from_tag(tag));
> +       new_tag_stat_entry = kzalloc(sizeof(*new_tag_stat_entry), GFP_ATOMIC);
> +       if (!new_tag_stat_entry) {
> +               pr_err("qtaguid: iface_stat: tag stat alloc failed\n");
> +               goto done;
> +       }
> +       new_tag_stat_entry->tn.tag = tag;
> +       tag_stat_tree_insert(new_tag_stat_entry, &iface_entry->tag_stat_tree);
> +done:
> +       return new_tag_stat_entry;
> +}
> +
> +static void if_tag_stat_update(const char *ifname, uid_t uid,
> +                              const struct sock *sk, enum ifs_tx_rx direction,
> +                              int proto, int bytes)
> +{
> +       struct tag_stat *tag_stat_entry;
> +       tag_t tag, acct_tag;
> +       tag_t uid_tag;
> +       struct data_counters *uid_tag_counters;
> +       struct sock_tag *sock_tag_entry;
> +       struct iface_stat *iface_entry;
> +       struct tag_stat *new_tag_stat;
> +       MT_DEBUG("qtaguid: if_tag_stat_update(ifname=%s "
> +               "uid=%u sk=%p dir=%d proto=%d bytes=%d)\n",
> +                ifname, uid, sk, direction, proto, bytes);
> +
> +
> +       iface_entry = get_iface_entry(ifname);
> +       if (!iface_entry) {
> +               pr_err("qtaguid: iface_stat: stat_update() %s not found\n",
> +                      ifname);
> +               return;
> +       }
> +       /* It is ok to process data when an iface_entry is inactive */
> +
> +       MT_DEBUG("qtaguid: iface_stat: stat_update() dev=%s entry=%p\n",
> +                ifname, iface_entry);
> +
> +       /*
> +        * Look for a tagged sock.
> +        * It will have an acct_uid.
> +        */
> +       sock_tag_entry = get_sock_stat(sk);
> +       if (sock_tag_entry) {
> +               tag = sock_tag_entry->tag;
> +               acct_tag = get_atag_from_tag(tag);
> +               uid_tag = get_utag_from_tag(tag);
> +       } else {
> +               acct_tag = make_atag_from_value(0);
> +               tag = combine_atag_with_uid(acct_tag, uid);
> +               uid_tag = make_tag_from_uid(uid);
> +       }
> +       MT_DEBUG("qtaguid: iface_stat: stat_update(): "
> +                " looking for tag=0x%llx (uid=%u) in ife=%p\n",
> +                tag, get_uid_from_tag(tag), iface_entry);
> +       /* Loop over tag list under this interface for {acct_tag,uid_tag} */
> +       spin_lock_bh(&iface_entry->tag_stat_list_lock);
> +
> +       tag_stat_entry = tag_stat_tree_search(&iface_entry->tag_stat_tree,
> +                                             tag);
> +       if (tag_stat_entry) {
> +               /*
> +                * Updating the {acct_tag, uid_tag} entry handles both stats:
> +                * {0, uid_tag} will also get updated.
> +                */
> +               tag_stat_update(tag_stat_entry, direction, proto, bytes);
> +               spin_unlock_bh(&iface_entry->tag_stat_list_lock);
> +               return;
> +       }
> +
> +       /* Loop over tag list under this interface for {0,uid_tag} */
> +       tag_stat_entry = tag_stat_tree_search(&iface_entry->tag_stat_tree,
> +                                             uid_tag);
> +       if (!tag_stat_entry) {
> +               /* Here: the base uid_tag did not exist */
> +               /*
> +                * No parent counters. So
> +                *  - No {0, uid_tag} stats and no {acc_tag, uid_tag} stats.
> +                */
> +               new_tag_stat = create_if_tag_stat(iface_entry, uid_tag);
> +               uid_tag_counters = &new_tag_stat->counters;
> +       } else {
> +               uid_tag_counters = &tag_stat_entry->counters;
> +       }
> +
> +       if (acct_tag) {
> +               new_tag_stat = create_if_tag_stat(iface_entry, tag);
> +               new_tag_stat->parent_counters = uid_tag_counters;
> +       }
> +       tag_stat_update(new_tag_stat, direction, proto, bytes);
> +       spin_unlock_bh(&iface_entry->tag_stat_list_lock);
> +}
> +
> +static int iface_netdev_event_handler(struct notifier_block *nb,
> +                                     unsigned long event, void *ptr) {
> +       struct net_device *dev = ptr;
> +
> +       if (unlikely(module_passive))
> +               return NOTIFY_DONE;
> +
> +       IF_DEBUG("qtaguid: iface_stat: netdev_event(): "
> +                "ev=0x%lx/%s netdev=%p->name=%s\n",
> +                event, netdev_evt_str(event), dev, dev ? dev->name : "");
> +
> +       switch (event) {
> +       case NETDEV_UP:
> +               iface_stat_create(dev, NULL);
> +               atomic64_inc(&qtu_events.iface_events);
> +               break;
> +       case NETDEV_DOWN:
> +       case NETDEV_UNREGISTER:
> +               iface_stat_update(dev, event == NETDEV_DOWN);
> +               atomic64_inc(&qtu_events.iface_events);
> +               break;
> +       }
> +       return NOTIFY_DONE;
> +}
> +
> +static int iface_inet6addr_event_handler(struct notifier_block *nb,
> +                                        unsigned long event, void *ptr)
> +{
> +       struct inet6_ifaddr *ifa = ptr;
> +       struct net_device *dev;
> +
> +       if (unlikely(module_passive))
> +               return NOTIFY_DONE;
> +
> +       IF_DEBUG("qtaguid: iface_stat: inet6addr_event(): "
> +                "ev=0x%lx/%s ifa=%p\n",
> +                event, netdev_evt_str(event), ifa);
> +
> +       switch (event) {
> +       case NETDEV_UP:
> +               BUG_ON(!ifa || !ifa->idev);
> +               dev = (struct net_device *)ifa->idev->dev;
> +               iface_stat_create_ipv6(dev, ifa);
> +               atomic64_inc(&qtu_events.iface_events);
> +               break;
> +       case NETDEV_DOWN:
> +       case NETDEV_UNREGISTER:
> +               BUG_ON(!ifa || !ifa->idev);
> +               dev = (struct net_device *)ifa->idev->dev;
> +               iface_stat_update(dev, event == NETDEV_DOWN);
> +               atomic64_inc(&qtu_events.iface_events);
> +               break;
> +       }
> +       return NOTIFY_DONE;
> +}
> +
> +static int iface_inetaddr_event_handler(struct notifier_block *nb,
> +                                       unsigned long event, void *ptr)
> +{
> +       struct in_ifaddr *ifa = ptr;
> +       struct net_device *dev;
> +
> +       if (unlikely(module_passive))
> +               return NOTIFY_DONE;
> +
> +       IF_DEBUG("qtaguid: iface_stat: inetaddr_event(): "
> +                "ev=0x%lx/%s ifa=%p\n",
> +                event, netdev_evt_str(event), ifa);
> +
> +       switch (event) {
> +       case NETDEV_UP:
> +               BUG_ON(!ifa || !ifa->ifa_dev);
> +               dev = ifa->ifa_dev->dev;
> +               iface_stat_create(dev, ifa);
> +               atomic64_inc(&qtu_events.iface_events);
> +               break;
> +       case NETDEV_DOWN:
> +       case NETDEV_UNREGISTER:
> +               BUG_ON(!ifa || !ifa->ifa_dev);
> +               dev = ifa->ifa_dev->dev;
> +               iface_stat_update(dev, event == NETDEV_DOWN);
> +               atomic64_inc(&qtu_events.iface_events);
> +               break;
> +       }
> +       return NOTIFY_DONE;
> +}
> +
> +static struct notifier_block iface_netdev_notifier_blk = {
> +       .notifier_call = iface_netdev_event_handler,
> +};
> +
> +static struct notifier_block iface_inetaddr_notifier_blk = {
> +       .notifier_call = iface_inetaddr_event_handler,
> +};
> +
> +static struct notifier_block iface_inet6addr_notifier_blk = {
> +       .notifier_call = iface_inet6addr_event_handler,
> +};
> +
> +static int __init iface_stat_init(struct proc_dir_entry *parent_procdir)
> +{
> +       int err;
> +
> +       iface_stat_procdir = proc_mkdir(iface_stat_procdirname, parent_procdir);
> +       if (!iface_stat_procdir) {
> +               pr_err("qtaguid: iface_stat: init failed to create proc entry\n");
> +               err = -1;
> +               goto err;
> +       }
> +
> +       iface_stat_all_procfile = create_proc_entry(iface_stat_all_procfilename,
> +                                                   proc_iface_perms,
> +                                                   parent_procdir);
> +       if (!iface_stat_all_procfile) {
> +               pr_err("qtaguid: iface_stat: init "
> +                      " failed to create stat_all proc entry\n");
> +               err = -1;
> +               goto err_zap_entry;
> +       }
> +       iface_stat_all_procfile->read_proc = iface_stat_all_proc_read;
> +
> +
> +       err = register_netdevice_notifier(&iface_netdev_notifier_blk);
> +       if (err) {
> +               pr_err("qtaguid: iface_stat: init "
> +                      "failed to register dev event handler\n");
> +               goto err_zap_all_stats_entry;
> +       }
> +       err = register_inetaddr_notifier(&iface_inetaddr_notifier_blk);
> +       if (err) {
> +               pr_err("qtaguid: iface_stat: init "
> +                      "failed to register ipv4 dev event handler\n");
> +               goto err_unreg_nd;
> +       }
> +
> +       err = register_inet6addr_notifier(&iface_inet6addr_notifier_blk);
> +       if (err) {
> +               pr_err("qtaguid: iface_stat: init "
> +                      "failed to register ipv6 dev event handler\n");
> +               goto err_unreg_ip4_addr;
> +       }
> +       return 0;
> +
> +err_unreg_ip4_addr:
> +       unregister_inetaddr_notifier(&iface_inetaddr_notifier_blk);
> +err_unreg_nd:
> +       unregister_netdevice_notifier(&iface_netdev_notifier_blk);
> +err_zap_all_stats_entry:
> +       remove_proc_entry(iface_stat_all_procfilename, parent_procdir);
> +err_zap_entry:
> +       remove_proc_entry(iface_stat_procdirname, parent_procdir);
> +err:
> +       return err;
> +}
> +
> +static struct sock *qtaguid_find_sk(const struct sk_buff *skb,
> +                                   struct xt_action_param *par)
> +{
> +       struct sock *sk;
> +       unsigned int hook_mask = (1 << par->hooknum);
> +
> +       MT_DEBUG("qtaguid: find_sk(skb=%p) hooknum=%d family=%d\n", skb,
> +                par->hooknum, par->family);
> +
> +       /*
> +        * Let's not abuse the xt_socket_get*_sk(), or else it will
> +        * return garbage SKs.
> +        */
> +       if (!(hook_mask & XT_SOCKET_SUPPORTED_HOOKS))
> +               return NULL;
> +
> +       switch (par->family) {
> +       case NFPROTO_IPV6:
> +               sk = xt_socket_get6_sk(skb, par);
> +               break;
> +       case NFPROTO_IPV4:
> +               sk = xt_socket_get4_sk(skb, par);
> +               break;
> +       default:
> +               return NULL;
> +       }
> +
> +       /*
> +        * Seems to be issues on the file ptr for TCP_TIME_WAIT SKs.
> +        * http://kerneltrap.org/mailarchive/linux-netdev/2010/10/21/6287959
> +        * Not fixed in 3.0-r3 :(
> +        */

Is it fixed now?

> +       if (sk) {
> +               MT_DEBUG("qtaguid: %p->sk_proto=%u "
> +                        "->sk_state=%d\n", sk, sk->sk_protocol, sk->sk_state);
> +               if (sk->sk_state  == TCP_TIME_WAIT) {
> +                       xt_socket_put_sk(sk);
> +                       sk = NULL;
> +               }
> +       }
> +       return sk;
> +}
> +
> +static void account_for_uid(const struct sk_buff *skb,
> +                           const struct sock *alternate_sk, uid_t uid,
> +                           struct xt_action_param *par)
> +{
> +       const struct net_device *el_dev;
> +
> +       if (!skb->dev) {
> +               MT_DEBUG("qtaguid[%d]: no skb->dev\n", par->hooknum);
> +               el_dev = par->in ? : par->out;
> +       } else {
> +               const struct net_device *other_dev;
> +               el_dev = skb->dev;
> +               other_dev = par->in ? : par->out;
> +               if (el_dev != other_dev) {
> +                       MT_DEBUG("qtaguid[%d]: skb->dev=%p %s vs "
> +                               "par->(in/out)=%p %s\n",
> +                               par->hooknum, el_dev, el_dev->name, other_dev,
> +                               other_dev->name);
> +               }
> +       }
> +
> +       if (unlikely(!el_dev)) {
> +               pr_info("qtaguid[%d]: no par->in/out?!!\n", par->hooknum);
> +       } else if (unlikely(!el_dev->name)) {
> +               pr_info("qtaguid[%d]: no dev->name?!!\n", par->hooknum);
> +       } else {
> +               MT_DEBUG("qtaguid[%d]: dev name=%s type=%d\n",
> +                        par->hooknum,
> +                        el_dev->name,
> +                        el_dev->type);
> +
> +               if_tag_stat_update(el_dev->name, uid,
> +                               skb->sk ? skb->sk : alternate_sk,
> +                               par->in ? IFS_RX : IFS_TX,
> +                               ip_hdr(skb)->protocol, skb->len);
> +       }
> +}
> +
> +static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par)
> +{
> +       const struct xt_qtaguid_match_info *info = par->matchinfo;
> +       const struct file *filp;
> +       bool got_sock = false;
> +       struct sock *sk;
> +       uid_t sock_uid;
> +       bool res;
> +
> +       if (unlikely(module_passive))
> +               return (info->match ^ info->invert) == 0;
> +
> +       MT_DEBUG("qtaguid[%d]: entered skb=%p par->in=%p/out=%p fam=%d\n",
> +                par->hooknum, skb, par->in, par->out, par->family);
> +
> +       atomic64_inc(&qtu_events.match_calls);
> +       if (skb == NULL) {
> +               res = (info->match ^ info->invert) == 0;
> +               goto ret_res;
> +       }
> +
> +       sk = skb->sk;
> +
> +       if (sk == NULL) {
> +               /*
> +                * A missing sk->sk_socket happens when packets are in-flight
> +                * and the matching socket is already closed and gone.
> +                */
> +               sk = qtaguid_find_sk(skb, par);
> +               /*
> +                * If we got the socket from the find_sk(), we will need to put
> +                * it back, as nf_tproxy_get_sock_v4() got it.
> +                */
> +               got_sock = sk;
> +               if (sk)
> +                       atomic64_inc(&qtu_events.match_found_sk_in_ct);
> +               else
> +                       atomic64_inc(&qtu_events.match_found_no_sk_in_ct);
> +       } else {
> +               atomic64_inc(&qtu_events.match_found_sk);
> +       }
> +       MT_DEBUG("qtaguid[%d]: sk=%p got_sock=%d proto=%d\n",
> +               par->hooknum, sk, got_sock, ip_hdr(skb)->protocol);
> +       if (sk != NULL) {
> +               MT_DEBUG("qtaguid[%d]: sk=%p->sk_socket=%p->file=%p\n",
> +                       par->hooknum, sk, sk->sk_socket,
> +                       sk->sk_socket ? sk->sk_socket->file : (void *)-1LL);
> +               filp = sk->sk_socket ? sk->sk_socket->file : NULL;
> +               MT_DEBUG("qtaguid[%d]: filp...uid=%u\n",
> +                       par->hooknum, filp ? filp->f_cred->fsuid : -1);
> +       }
> +
> +       if (sk == NULL || sk->sk_socket == NULL) {
> +               /*
> +                * Here, the qtaguid_find_sk() using connection tracking
> +                * couldn't find the owner, so for now we just count them
> +                * against the system.
> +                */
> +               /*
> +                * TODO: unhack how to force just accounting.
> +                * For now we only do iface stats when the uid-owner is not
> +                * requested.
> +                */
> +               if (!(info->match & XT_QTAGUID_UID))
> +                       account_for_uid(skb, sk, 0, par);
> +               MT_DEBUG("qtaguid[%d]: leaving (sk?sk->sk_socket)=%p\n",
> +                       par->hooknum,
> +                       sk ? sk->sk_socket : NULL);
> +               res = (info->match ^ info->invert) == 0;
> +               atomic64_inc(&qtu_events.match_no_sk);
> +               goto put_sock_ret_res;
> +       } else if (info->match & info->invert & XT_QTAGUID_SOCKET) {
> +               res = false;
> +               goto put_sock_ret_res;
> +       }
> +       filp = sk->sk_socket->file;
> +       if (filp == NULL) {
> +               MT_DEBUG("qtaguid[%d]: leaving filp=NULL\n", par->hooknum);
> +               account_for_uid(skb, sk, 0, par);
> +               res = ((info->match ^ info->invert) &
> +                       (XT_QTAGUID_UID | XT_QTAGUID_GID)) == 0;
> +               atomic64_inc(&qtu_events.match_no_sk_file);
> +               goto put_sock_ret_res;
> +       }
> +       sock_uid = filp->f_cred->fsuid;
> +       /*
> +        * TODO: unhack how to force just accounting.
> +        * For now we only do iface stats when the uid-owner is not requested
> +        */
> +       if (!(info->match & XT_QTAGUID_UID))
> +               account_for_uid(skb, sk, sock_uid, par);
> +
> +       /*
> +        * The following two tests fail the match when:
> +        *    id not in range AND no inverted condition requested
> +        * or id     in range AND    inverted condition requested
> +        * Thus (!a && b) || (a && !b) == a ^ b
> +        */
> +       if (info->match & XT_QTAGUID_UID)
> +               if ((filp->f_cred->fsuid >= info->uid_min &&
> +                    filp->f_cred->fsuid <= info->uid_max) ^
> +                   !(info->invert & XT_QTAGUID_UID)) {
> +                       MT_DEBUG("qtaguid[%d]: leaving uid not matching\n",
> +                                par->hooknum);
> +                       res = false;
> +                       goto put_sock_ret_res;
> +               }
> +       if (info->match & XT_QTAGUID_GID)
> +               if ((filp->f_cred->fsgid >= info->gid_min &&
> +                               filp->f_cred->fsgid <= info->gid_max) ^
> +                       !(info->invert & XT_QTAGUID_GID)) {
> +                       MT_DEBUG("qtaguid[%d]: leaving gid not matching\n",
> +                               par->hooknum);
> +                       res = false;
> +                       goto put_sock_ret_res;
> +               }
> +
> +       MT_DEBUG("qtaguid[%d]: leaving matched\n", par->hooknum);
> +       res = true;
> +
> +put_sock_ret_res:
> +       if (got_sock)
> +               xt_socket_put_sk(sk);
> +ret_res:
> +       MT_DEBUG("qtaguid[%d]: left %d\n", par->hooknum, res);
> +       return res;
> +}
> +
> +#ifdef DDEBUG
> +/* This function is not in xt_qtaguid_print.c because of locks visibility */
> +static void prdebug_full_state(int indent_level, const char *fmt, ...)
> +{
> +       va_list args;
> +       char *fmt_buff;
> +       char *buff;
> +
> +       if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
> +               return;
> +
> +       fmt_buff = kasprintf(GFP_ATOMIC,
> +                            "qtaguid: %s(): %s {\n", __func__, fmt);
> +       BUG_ON(!fmt_buff);
> +       va_start(args, fmt);
> +       buff = kvasprintf(GFP_ATOMIC,
> +                         fmt_buff, args);
> +       BUG_ON(!buff);
> +       pr_debug("%s", buff);
> +       kfree(fmt_buff);
> +       kfree(buff);
> +       va_end(args);
> +
> +       spin_lock_bh(&sock_tag_list_lock);
> +       prdebug_sock_tag_tree(indent_level, &sock_tag_tree);
> +       spin_unlock_bh(&sock_tag_list_lock);
> +
> +       spin_lock_bh(&sock_tag_list_lock);
> +       spin_lock_bh(&uid_tag_data_tree_lock);
> +       prdebug_uid_tag_data_tree(indent_level, &uid_tag_data_tree);
> +       prdebug_proc_qtu_data_tree(indent_level, &proc_qtu_data_tree);
> +       spin_unlock_bh(&uid_tag_data_tree_lock);
> +       spin_unlock_bh(&sock_tag_list_lock);
> +
> +       spin_lock_bh(&iface_stat_list_lock);
> +       prdebug_iface_stat_list(indent_level, &iface_stat_list);
> +       spin_unlock_bh(&iface_stat_list_lock);
> +
> +       pr_debug("qtaguid: %s(): }\n", __func__);
> +}
> +#else
> +static void prdebug_full_state(int indent_level, const char *fmt, ...) {}
> +#endif
> +
> +/*
> + * Procfs reader to get all active socket tags using style "1)" as described in
> + * fs/proc/generic.c
> + */
> +static int qtaguid_ctrl_proc_read(char *page, char **num_items_returned,
> +                                 off_t items_to_skip, int char_count, int *eof,
> +                                 void *data)
> +{
> +       char *outp = page;
> +       int len;
> +       uid_t uid;
> +       struct rb_node *node;
> +       struct sock_tag *sock_tag_entry;
> +       int item_index = 0;
> +       int indent_level = 0;
> +       long f_count;
> +
> +       if (unlikely(module_passive)) {
> +               *eof = 1;
> +               return 0;
> +       }
> +
> +       if (*eof)
> +               return 0;
> +
> +       CT_DEBUG("qtaguid: proc ctrl page=%p off=%ld char_count=%d *eof=%d\n",
> +               page, items_to_skip, char_count, *eof);
> +
> +       spin_lock_bh(&sock_tag_list_lock);
> +       for (node = rb_first(&sock_tag_tree);
> +            node;
> +            node = rb_next(node)) {
> +               if (item_index++ < items_to_skip)
> +                       continue;
> +               sock_tag_entry = rb_entry(node, struct sock_tag, sock_node);
> +               uid = get_uid_from_tag(sock_tag_entry->tag);
> +               CT_DEBUG("qtaguid: proc_read(): sk=%p tag=0x%llx (uid=%u) "
> +                        "pid=%u\n",
> +                        sock_tag_entry->sk,
> +                        sock_tag_entry->tag,
> +                        uid,
> +                        sock_tag_entry->pid
> +                       );
> +               f_count = atomic_long_read(
> +                       &sock_tag_entry->socket->file->f_count);
> +               len = snprintf(outp, char_count,
> +                              "sock=%p tag=0x%llx (uid=%u) pid=%u "
> +                              "f_count=%lu\n",
> +                              sock_tag_entry->sk,
> +                              sock_tag_entry->tag, uid,
> +                              sock_tag_entry->pid, f_count);
> +               if (len >= char_count) {
> +                       spin_unlock_bh(&sock_tag_list_lock);
> +                       *outp = '\0';
> +                       return outp - page;
> +               }
> +               outp += len;
> +               char_count -= len;
> +               (*num_items_returned)++;
> +       }
> +       spin_unlock_bh(&sock_tag_list_lock);
> +
> +       if (item_index++ >= items_to_skip) {
> +               len = snprintf(outp, char_count,
> +                              "events: sockets_tagged=%llu "
> +                              "sockets_untagged=%llu "
> +                              "counter_set_changes=%llu "
> +                              "delete_cmds=%llu "
> +                              "iface_events=%llu "
> +                              "match_calls=%llu "
> +                              "match_found_sk=%llu "
> +                              "match_found_sk_in_ct=%llu "
> +                              "match_found_no_sk_in_ct=%llu "
> +                              "match_no_sk=%llu "
> +                              "match_no_sk_file=%llu\n",
> +                              atomic64_read(&qtu_events.sockets_tagged),
> +                              atomic64_read(&qtu_events.sockets_untagged),
> +                              atomic64_read(&qtu_events.counter_set_changes),
> +                              atomic64_read(&qtu_events.delete_cmds),
> +                              atomic64_read(&qtu_events.iface_events),
> +                              atomic64_read(&qtu_events.match_calls),
> +                              atomic64_read(&qtu_events.match_found_sk),
> +                              atomic64_read(&qtu_events.match_found_sk_in_ct),
> +                              atomic64_read(
> +                                      &qtu_events.match_found_no_sk_in_ct),
> +                              atomic64_read(&qtu_events.match_no_sk),
> +                              atomic64_read(&qtu_events.match_no_sk_file));
> +               if (len >= char_count) {
> +                       *outp = '\0';
> +                       return outp - page;
> +               }
> +               outp += len;
> +               char_count -= len;
> +               (*num_items_returned)++;
> +       }
> +
> +       /* Count the following as part of the last item_index */
> +       if (item_index > items_to_skip) {
> +               prdebug_full_state(indent_level, "proc ctrl");
> +       }
> +
> +       *eof = 1;
> +       return outp - page;
> +}
> +
> +/*
> + * Delete socket tags, and stat tags associated with a given
> + * accounting tag and uid.
> + */
> +static int ctrl_cmd_delete(const char *input)
> +{
> +       char cmd;
> +       uid_t uid;
> +       uid_t entry_uid;
> +       tag_t acct_tag;
> +       tag_t tag;
> +       int res, argc;
> +       struct iface_stat *iface_entry;
> +       struct rb_node *node;
> +       struct sock_tag *st_entry;
> +       struct rb_root st_to_free_tree = RB_ROOT;
> +       struct tag_stat *ts_entry;
> +       struct tag_counter_set *tcs_entry;
> +       struct tag_ref *tr_entry;
> +       struct uid_tag_data *utd_entry;
> +
> +       argc = sscanf(input, "%c %llu %u", &cmd, &acct_tag, &uid);
> +       CT_DEBUG("qtaguid: ctrl_delete(%s): argc=%d cmd=%c "
> +                "user_tag=0x%llx uid=%u\n", input, argc, cmd,
> +                acct_tag, uid);
> +       if (argc < 2) {
> +               res = -EINVAL;
> +               goto err;
> +       }
> +       if (!valid_atag(acct_tag)) {
> +               pr_info("qtaguid: ctrl_delete(%s): invalid tag\n", input);
> +               res = -EINVAL;
> +               goto err;
> +       }
> +       if (argc < 3) {
> +               uid = current_fsuid();
> +       } else if (!can_impersonate_uid(uid)) {
> +               pr_info("qtaguid: ctrl_delete(%s): "
> +                       "insufficient priv from pid=%u tgid=%u uid=%u\n",
> +                       input, current->pid, current->tgid, current_fsuid());
> +               res = -EPERM;
> +               goto err;
> +       }
> +
> +       tag = combine_atag_with_uid(acct_tag, uid);
> +       CT_DEBUG("qtaguid: ctrl_delete(%s): "
> +                "looking for tag=0x%llx (uid=%u)\n",
> +                input, tag, uid);
> +
> +       /* Delete socket tags */
> +       spin_lock_bh(&sock_tag_list_lock);
> +       node = rb_first(&sock_tag_tree);
> +       while (node) {
> +               st_entry = rb_entry(node, struct sock_tag, sock_node);
> +               entry_uid = get_uid_from_tag(st_entry->tag);
> +               node = rb_next(node);
> +               if (entry_uid != uid)
> +                       continue;
> +
> +               CT_DEBUG("qtaguid: ctrl_delete(%s): st tag=0x%llx (uid=%u)\n",
> +                        input, st_entry->tag, entry_uid);
> +
> +               if (!acct_tag || st_entry->tag == tag) {
> +                       rb_erase(&st_entry->sock_node, &sock_tag_tree);
> +                       /* Can't sockfd_put() within spinlock, do it later. */
> +                       sock_tag_tree_insert(st_entry, &st_to_free_tree);
> +                       tr_entry = lookup_tag_ref(st_entry->tag, NULL);
> +                       BUG_ON(tr_entry->num_sock_tags <= 0);
> +                       tr_entry->num_sock_tags--;
> +                       /*
> +                        * TODO: remove if, and start failing.
> +                        * This is a hack to work around the fact that in some
> +                        * places we have "if (IS_ERR_OR_NULL(pqd_entry))"
> +                        * and are trying to work around apps
> +                        * that didn't open the /dev/xt_qtaguid.
> +                        */
> +                       if (st_entry->list.next && st_entry->list.prev)
> +                               list_del(&st_entry->list);
> +               }
> +       }
> +       spin_unlock_bh(&sock_tag_list_lock);
> +
> +       sock_tag_tree_erase(&st_to_free_tree);
> +
> +       /* Delete tag counter-sets */
> +       spin_lock_bh(&tag_counter_set_list_lock);
> +       /* Counter sets are only on the uid tag, not full tag */
> +       tcs_entry = tag_counter_set_tree_search(&tag_counter_set_tree, tag);
> +       if (tcs_entry) {
> +               CT_DEBUG("qtaguid: ctrl_delete(%s): "
> +                        "erase tcs: tag=0x%llx (uid=%u) set=%d\n",
> +                        input,
> +                        tcs_entry->tn.tag,
> +                        get_uid_from_tag(tcs_entry->tn.tag),
> +                        tcs_entry->active_set);
> +               rb_erase(&tcs_entry->tn.node, &tag_counter_set_tree);
> +               kfree(tcs_entry);
> +       }
> +       spin_unlock_bh(&tag_counter_set_list_lock);
> +
> +       /*
> +        * If acct_tag is 0, then all entries belonging to uid are
> +        * erased.
> +        */
> +       spin_lock_bh(&iface_stat_list_lock);
> +       list_for_each_entry(iface_entry, &iface_stat_list, list) {
> +               spin_lock_bh(&iface_entry->tag_stat_list_lock);
> +               node = rb_first(&iface_entry->tag_stat_tree);
> +               while (node) {
> +                       ts_entry = rb_entry(node, struct tag_stat, tn.node);
> +                       entry_uid = get_uid_from_tag(ts_entry->tn.tag);
> +                       node = rb_next(node);
> +
> +                       CT_DEBUG("qtaguid: ctrl_delete(%s): "
> +                                "ts tag=0x%llx (uid=%u)\n",
> +                                input, ts_entry->tn.tag, entry_uid);
> +
> +                       if (entry_uid != uid)
> +                               continue;
> +                       if (!acct_tag || ts_entry->tn.tag == tag) {
> +                               CT_DEBUG("qtaguid: ctrl_delete(%s): "
> +                                        "erase ts: %s 0x%llx %u\n",
> +                                        input, iface_entry->ifname,
> +                                        get_atag_from_tag(ts_entry->tn.tag),
> +                                        entry_uid);
> +                               rb_erase(&ts_entry->tn.node,
> +                                        &iface_entry->tag_stat_tree);
> +                               kfree(ts_entry);
> +                       }
> +               }
> +               spin_unlock_bh(&iface_entry->tag_stat_list_lock);
> +       }
> +       spin_unlock_bh(&iface_stat_list_lock);
> +
> +       /* Cleanup the uid_tag_data */
> +       spin_lock_bh(&uid_tag_data_tree_lock);
> +       node = rb_first(&uid_tag_data_tree);
> +       while (node) {
> +               utd_entry = rb_entry(node, struct uid_tag_data, node);
> +               entry_uid = utd_entry->uid;
> +               node = rb_next(node);
> +
> +               CT_DEBUG("qtaguid: ctrl_delete(%s): "
> +                        "utd uid=%u\n",
> +                        input, entry_uid);
> +
> +               if (entry_uid != uid)
> +                       continue;
> +               /*
> +                * Go over the tag_refs, and those that don't have
> +                * sock_tags using them are freed.
> +                */
> +               put_tag_ref_tree(tag, utd_entry);
> +               put_utd_entry(utd_entry);
> +       }
> +       spin_unlock_bh(&uid_tag_data_tree_lock);
> +
> +       atomic64_inc(&qtu_events.delete_cmds);
> +       res = 0;
> +
> +err:
> +       return res;
> +}
> +
> +static int ctrl_cmd_counter_set(const char *input)
> +{
> +       char cmd;
> +       uid_t uid = 0;
> +       tag_t tag;
> +       int res, argc;
> +       struct tag_counter_set *tcs;
> +       int counter_set;
> +
> +       argc = sscanf(input, "%c %d %u", &cmd, &counter_set, &uid);
> +       CT_DEBUG("qtaguid: ctrl_counterset(%s): argc=%d cmd=%c "
> +                "set=%d uid=%u\n", input, argc, cmd,
> +                counter_set, uid);
> +       if (argc != 3) {
> +               res = -EINVAL;
> +               goto err;
> +       }
> +       if (counter_set < 0 || counter_set >= IFS_MAX_COUNTER_SETS) {
> +               pr_info("qtaguid: ctrl_counterset(%s): invalid counter_set range\n",
> +                       input);
> +               res = -EINVAL;
> +               goto err;
> +       }
> +       if (!can_manipulate_uids()) {
> +               pr_info("qtaguid: ctrl_counterset(%s): "
> +                       "insufficient priv from pid=%u tgid=%u uid=%u\n",
> +                       input, current->pid, current->tgid, current_fsuid());
> +               res = -EPERM;
> +               goto err;
> +       }
> +
> +       tag = make_tag_from_uid(uid);
> +       spin_lock_bh(&tag_counter_set_list_lock);
> +       tcs = tag_counter_set_tree_search(&tag_counter_set_tree, tag);
> +       if (!tcs) {
> +               tcs = kzalloc(sizeof(*tcs), GFP_ATOMIC);
> +               if (!tcs) {
> +                       spin_unlock_bh(&tag_counter_set_list_lock);
> +                       pr_err("qtaguid: ctrl_counterset(%s): "
> +                              "failed to alloc counter set\n",
> +                              input);
> +                       res = -ENOMEM;
> +                       goto err;
> +               }
> +               tcs->tn.tag = tag;
> +               tag_counter_set_tree_insert(tcs, &tag_counter_set_tree);
> +               CT_DEBUG("qtaguid: ctrl_counterset(%s): added tcs tag=0x%llx "
> +                        "(uid=%u) set=%d\n",
> +                        input, tag, get_uid_from_tag(tag), counter_set);
> +       }
> +       tcs->active_set = counter_set;
> +       spin_unlock_bh(&tag_counter_set_list_lock);
> +       atomic64_inc(&qtu_events.counter_set_changes);
> +       res = 0;
> +
> +err:
> +       return res;
> +}
> +
> +static int ctrl_cmd_tag(const char *input)
> +{
> +       char cmd;
> +       int sock_fd = 0;
> +       uid_t uid = 0;
> +       tag_t acct_tag = make_atag_from_value(0);
> +       tag_t full_tag;
> +       struct socket *el_socket;
> +       int res, argc;
> +       struct sock_tag *sock_tag_entry;
> +       struct tag_ref *tag_ref_entry;
> +       struct uid_tag_data *uid_tag_data_entry;
> +       struct proc_qtu_data *pqd_entry;
> +
> +       /* Unassigned args will get defaulted later. */
> +       argc = sscanf(input, "%c %d %llu %u", &cmd, &sock_fd, &acct_tag, &uid);
> +       CT_DEBUG("qtaguid: ctrl_tag(%s): argc=%d cmd=%c sock_fd=%d "
> +                "acct_tag=0x%llx uid=%u\n", input, argc, cmd, sock_fd,
> +                acct_tag, uid);
> +       if (argc < 2) {
> +               res = -EINVAL;
> +               goto err;
> +       }
> +       el_socket = sockfd_lookup(sock_fd, &res);  /* This locks the file */
> +       if (!el_socket) {
> +               pr_info("qtaguid: ctrl_tag(%s): failed to lookup"
> +                       " sock_fd=%d err=%d\n", input, sock_fd, res);
> +               goto err;
> +       }
> +       CT_DEBUG("qtaguid: ctrl_tag(%s): socket->...->f_count=%ld ->sk=%p\n",
> +                input, atomic_long_read(&el_socket->file->f_count),
> +                el_socket->sk);
> +       if (argc < 3) {
> +               acct_tag = make_atag_from_value(0);
> +       } else if (!valid_atag(acct_tag)) {
> +               pr_info("qtaguid: ctrl_tag(%s): invalid tag\n", input);
> +               res = -EINVAL;
> +               goto err_put;
> +       }
> +       CT_DEBUG("qtaguid: ctrl_tag(%s): "
> +                "pid=%u tgid=%u uid=%u euid=%u fsuid=%u "
> +                "in_group=%d in_egroup=%d\n",
> +                input, current->pid, current->tgid, current_uid(),
> +                current_euid(), current_fsuid(),
> +                in_group_p(proc_ctrl_write_gid),
> +                in_egroup_p(proc_ctrl_write_gid));
> +       if (argc < 4) {
> +               uid = current_fsuid();
> +       } else if (!can_impersonate_uid(uid)) {
> +               pr_info("qtaguid: ctrl_tag(%s): "
> +                       "insufficient priv from pid=%u tgid=%u uid=%u\n",
> +                       input, current->pid, current->tgid, current_fsuid());
> +               res = -EPERM;
> +               goto err_put;
> +       }
> +       full_tag = combine_atag_with_uid(acct_tag, uid);
> +
> +       spin_lock_bh(&sock_tag_list_lock);
> +       sock_tag_entry = get_sock_stat_nl(el_socket->sk);
> +       tag_ref_entry = get_tag_ref(full_tag, &uid_tag_data_entry);
> +       if (IS_ERR(tag_ref_entry)) {
> +               res = PTR_ERR(tag_ref_entry);
> +               spin_unlock_bh(&sock_tag_list_lock);
> +               goto err_put;
> +       }
> +       tag_ref_entry->num_sock_tags++;
> +       if (sock_tag_entry) {
> +               struct tag_ref *prev_tag_ref_entry;
> +
> +               CT_DEBUG("qtaguid: ctrl_tag(%s): retag for sk=%p "
> +                        "st@%p ...->f_count=%ld\n",
> +                        input, el_socket->sk, sock_tag_entry,
> +                        atomic_long_read(&el_socket->file->f_count));
> +               /*
> +                * This is a re-tagging, so release the sock_fd that was
> +                * locked at the time of the 1st tagging.
> +                * There is still the ref from this call's sockfd_lookup() so
> +                * it can be done within the spinlock.
> +                */
> +               sockfd_put(sock_tag_entry->socket);
> +               prev_tag_ref_entry = lookup_tag_ref(sock_tag_entry->tag,
> +                                                   &uid_tag_data_entry);
> +               BUG_ON(IS_ERR_OR_NULL(prev_tag_ref_entry));
> +               BUG_ON(prev_tag_ref_entry->num_sock_tags <= 0);
> +               prev_tag_ref_entry->num_sock_tags--;
> +               sock_tag_entry->tag = full_tag;
> +       } else {
> +               CT_DEBUG("qtaguid: ctrl_tag(%s): newtag for sk=%p\n",
> +                        input, el_socket->sk);
> +               sock_tag_entry = kzalloc(sizeof(*sock_tag_entry),
> +                                        GFP_ATOMIC);
> +               if (!sock_tag_entry) {
> +                       pr_err("qtaguid: ctrl_tag(%s): "
> +                              "socket tag alloc failed\n",
> +                              input);
> +                       spin_unlock_bh(&sock_tag_list_lock);
> +                       res = -ENOMEM;
> +                       goto err_tag_unref_put;
> +               }
> +               sock_tag_entry->sk = el_socket->sk;
> +               sock_tag_entry->socket = el_socket;
> +               sock_tag_entry->pid = current->tgid;
> +               sock_tag_entry->tag = combine_atag_with_uid(acct_tag,
> +                                                           uid);
> +               spin_lock_bh(&uid_tag_data_tree_lock);
> +               pqd_entry = proc_qtu_data_tree_search(
> +                       &proc_qtu_data_tree, current->tgid);
> +               /*
> +                * TODO: remove if, and start failing.
> +                * At first, we want to catch user-space code that is not
> +                * opening the /dev/xt_qtaguid.
> +                */
> +               if (IS_ERR_OR_NULL(pqd_entry))
> +                       pr_warn_once(
> +                               "qtaguid: %s(): "
> +                               "User space forgot to open /dev/xt_qtaguid? "
> +                               "pid=%u tgid=%u uid=%u\n", __func__,
> +                               current->pid, current->tgid,
> +                               current_fsuid());
> +               else
> +                       list_add(&sock_tag_entry->list,
> +                                &pqd_entry->sock_tag_list);
> +               spin_unlock_bh(&uid_tag_data_tree_lock);
> +
> +               sock_tag_tree_insert(sock_tag_entry, &sock_tag_tree);
> +               atomic64_inc(&qtu_events.sockets_tagged);
> +       }
> +       spin_unlock_bh(&sock_tag_list_lock);
> +       /* We keep the ref to the socket (file) until it is untagged */
> +       CT_DEBUG("qtaguid: ctrl_tag(%s): done st@%p ...->f_count=%ld\n",
> +                input, sock_tag_entry,
> +                atomic_long_read(&el_socket->file->f_count));
> +       return 0;
> +
> +err_tag_unref_put:
> +       BUG_ON(tag_ref_entry->num_sock_tags <= 0);
> +       tag_ref_entry->num_sock_tags--;
> +       free_tag_ref_from_utd_entry(tag_ref_entry, uid_tag_data_entry);
> +err_put:
> +       CT_DEBUG("qtaguid: ctrl_tag(%s): done. ...->f_count=%ld\n",
> +                input, atomic_long_read(&el_socket->file->f_count) - 1);
> +       /* Release the sock_fd that was grabbed by sockfd_lookup(). */
> +       sockfd_put(el_socket);
> +       return res;
> +
> +err:
> +       CT_DEBUG("qtaguid: ctrl_tag(%s): done.\n", input);
> +       return res;
> +}
> +
> +static int ctrl_cmd_untag(const char *input)
> +{
> +       char cmd;
> +       int sock_fd = 0;
> +       struct socket *el_socket;
> +       int res, argc;
> +       struct sock_tag *sock_tag_entry;
> +       struct tag_ref *tag_ref_entry;
> +       struct uid_tag_data *utd_entry;
> +       struct proc_qtu_data *pqd_entry;
> +
> +       argc = sscanf(input, "%c %d", &cmd, &sock_fd);
> +       CT_DEBUG("qtaguid: ctrl_untag(%s): argc=%d cmd=%c sock_fd=%d\n",
> +                input, argc, cmd, sock_fd);
> +       if (argc < 2) {
> +               res = -EINVAL;
> +               goto err;
> +       }
> +       el_socket = sockfd_lookup(sock_fd, &res);  /* This locks the file */
> +       if (!el_socket) {
> +               pr_info("qtaguid: ctrl_untag(%s): failed to lookup"
> +                       " sock_fd=%d err=%d\n", input, sock_fd, res);
> +               goto err;
> +       }
> +       CT_DEBUG("qtaguid: ctrl_untag(%s): socket->...->f_count=%ld ->sk=%p\n",
> +                input, atomic_long_read(&el_socket->file->f_count),
> +                el_socket->sk);
> +       spin_lock_bh(&sock_tag_list_lock);
> +       sock_tag_entry = get_sock_stat_nl(el_socket->sk);
> +       if (!sock_tag_entry) {
> +               spin_unlock_bh(&sock_tag_list_lock);
> +               res = -EINVAL;
> +               goto err_put;
> +       }
> +       /*
> +        * The socket already belongs to the current process
> +        * so it can do whatever it wants to it.
> +        */
> +       rb_erase(&sock_tag_entry->sock_node, &sock_tag_tree);
> +
> +       tag_ref_entry = lookup_tag_ref(sock_tag_entry->tag, &utd_entry);
> +       BUG_ON(!tag_ref_entry);
> +       BUG_ON(tag_ref_entry->num_sock_tags <= 0);
> +       spin_lock_bh(&uid_tag_data_tree_lock);
> +       pqd_entry = proc_qtu_data_tree_search(
> +               &proc_qtu_data_tree, current->tgid);
> +       /*
> +        * TODO: remove if, and start failing.
> +        * At first, we want to catch user-space code that is not
> +        * opening the /dev/xt_qtaguid.
> +        */
> +       if (IS_ERR_OR_NULL(pqd_entry))
> +               pr_warn_once("qtaguid: %s(): "
> +                            "User space forgot to open /dev/xt_qtaguid? "
> +                            "pid=%u tgid=%u uid=%u\n", __func__,
> +                            current->pid, current->tgid, current_fsuid());
> +       else
> +               list_del(&sock_tag_entry->list);
> +       spin_unlock_bh(&uid_tag_data_tree_lock);
> +       /*
> +        * We don't free tag_ref from the utd_entry here,
> +        * only during a cmd_delete().
> +        */
> +       tag_ref_entry->num_sock_tags--;
> +       spin_unlock_bh(&sock_tag_list_lock);
> +       /*
> +        * Release the sock_fd that was grabbed at tag time,
> +        * and once more for the sockfd_lookup() here.
> +        */
> +       sockfd_put(sock_tag_entry->socket);
> +       CT_DEBUG("qtaguid: ctrl_untag(%s): done. st@%p ...->f_count=%ld\n",
> +                input, sock_tag_entry,
> +                atomic_long_read(&el_socket->file->f_count) - 1);
> +       sockfd_put(el_socket);
> +
> +       kfree(sock_tag_entry);
> +       atomic64_inc(&qtu_events.sockets_untagged);
> +
> +       return 0;
> +
> +err_put:
> +       CT_DEBUG("qtaguid: ctrl_untag(%s): done. socket->...->f_count=%ld\n",
> +                input, atomic_long_read(&el_socket->file->f_count) - 1);
> +       /* Release the sock_fd that was grabbed by sockfd_lookup(). */
> +       sockfd_put(el_socket);
> +       return res;
> +
> +err:
> +       CT_DEBUG("qtaguid: ctrl_untag(%s): done.\n", input);
> +       return res;
> +}
> +
> +static int qtaguid_ctrl_parse(const char *input, int count)
> +{
> +       char cmd;
> +       int res;
> +
> +       cmd = input[0];
> +       /* Collect params for commands */
> +       switch (cmd) {
> +       case 'd':
> +               res = ctrl_cmd_delete(input);
> +               break;
> +
> +       case 's':
> +               res = ctrl_cmd_counter_set(input);
> +               break;
> +
> +       case 't':
> +               res = ctrl_cmd_tag(input);
> +               break;
> +
> +       case 'u':
> +               res = ctrl_cmd_untag(input);
> +               break;
> +
> +       default:
> +               res = -EINVAL;
> +               goto err;
> +       }
> +       if (!res)
> +               res = count;
> +err:
> +       CT_DEBUG("qtaguid: ctrl(%s): res=%d\n", input, res);
> +       return res;
> +}
> +
> +#define MAX_QTAGUID_CTRL_INPUT_LEN 255
> +static int qtaguid_ctrl_proc_write(struct file *file, const char __user *buffer,
> +                       unsigned long count, void *data)
> +{
> +       char input_buf[MAX_QTAGUID_CTRL_INPUT_LEN];
> +
> +       if (unlikely(module_passive))
> +               return count;
> +
> +       if (count >= MAX_QTAGUID_CTRL_INPUT_LEN)
> +               return -EINVAL;
> +
> +       if (copy_from_user(input_buf, buffer, count))
> +               return -EFAULT;
> +
> +       input_buf[count] = '\0';
> +       return qtaguid_ctrl_parse(input_buf, count);
> +}
> +
> +struct proc_print_info {
> +       char *outp;
> +       char **num_items_returned;
> +       struct iface_stat *iface_entry;
> +       struct tag_stat *ts_entry;
> +       int item_index;
> +       int items_to_skip;
> +       int char_count;
> +};
> +
> +static int pp_stats_line(struct proc_print_info *ppi, int cnt_set)
> +{
> +       int len;
> +       struct data_counters *cnts;
> +
> +       if (!ppi->item_index) {
> +               if (ppi->item_index++ < ppi->items_to_skip)
> +                       return 0;
> +               len = snprintf(ppi->outp, ppi->char_count,
> +                              "idx iface acct_tag_hex uid_tag_int cnt_set "
> +                              "rx_bytes rx_packets "
> +                              "tx_bytes tx_packets "
> +                              "rx_tcp_bytes rx_tcp_packets "
> +                              "rx_udp_bytes rx_udp_packets "
> +                              "rx_other_bytes rx_other_packets "
> +                              "tx_tcp_bytes tx_tcp_packets "
> +                              "tx_udp_bytes tx_udp_packets "
> +                              "tx_other_bytes tx_other_packets\n");
> +       } else {
> +               tag_t tag = ppi->ts_entry->tn.tag;
> +               uid_t stat_uid = get_uid_from_tag(tag);
> +
> +               if (!can_read_other_uid_stats(stat_uid)) {
> +                       CT_DEBUG("qtaguid: stats line: "
> +                                "%s 0x%llx %u: insufficient priv "
> +                                "from pid=%u tgid=%u uid=%u\n",
> +                                ppi->iface_entry->ifname,
> +                                get_atag_from_tag(tag), stat_uid,
> +                                current->pid, current->tgid, current_fsuid());
> +                       return 0;
> +               }
> +               if (ppi->item_index++ < ppi->items_to_skip)
> +                       return 0;
> +               cnts = &ppi->ts_entry->counters;
> +               len = snprintf(
> +                       ppi->outp, ppi->char_count,
> +                       "%d %s 0x%llx %u %u "
> +                       "%llu %llu "
> +                       "%llu %llu "
> +                       "%llu %llu "
> +                       "%llu %llu "
> +                       "%llu %llu "
> +                       "%llu %llu "
> +                       "%llu %llu "
> +                       "%llu %llu\n",
> +                       ppi->item_index,
> +                       ppi->iface_entry->ifname,
> +                       get_atag_from_tag(tag),
> +                       stat_uid,
> +                       cnt_set,
> +                       dc_sum_bytes(cnts, cnt_set, IFS_RX),
> +                       dc_sum_packets(cnts, cnt_set, IFS_RX),
> +                       dc_sum_bytes(cnts, cnt_set, IFS_TX),
> +                       dc_sum_packets(cnts, cnt_set, IFS_TX),
> +                       cnts->bpc[cnt_set][IFS_RX][IFS_TCP].bytes,
> +                       cnts->bpc[cnt_set][IFS_RX][IFS_TCP].packets,
> +                       cnts->bpc[cnt_set][IFS_RX][IFS_UDP].bytes,
> +                       cnts->bpc[cnt_set][IFS_RX][IFS_UDP].packets,
> +                       cnts->bpc[cnt_set][IFS_RX][IFS_PROTO_OTHER].bytes,
> +                       cnts->bpc[cnt_set][IFS_RX][IFS_PROTO_OTHER].packets,
> +                       cnts->bpc[cnt_set][IFS_TX][IFS_TCP].bytes,
> +                       cnts->bpc[cnt_set][IFS_TX][IFS_TCP].packets,
> +                       cnts->bpc[cnt_set][IFS_TX][IFS_UDP].bytes,
> +                       cnts->bpc[cnt_set][IFS_TX][IFS_UDP].packets,
> +                       cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].bytes,
> +                       cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].packets);
> +       }
> +       return len;
> +}
> +
> +static bool pp_sets(struct proc_print_info *ppi)
> +{
> +       int len;
> +       int counter_set;
> +       for (counter_set = 0; counter_set < IFS_MAX_COUNTER_SETS;
> +            counter_set++) {
> +               len = pp_stats_line(ppi, counter_set);
> +               if (len >= ppi->char_count) {
> +                       *ppi->outp = '\0';
> +                       return false;
> +               }
> +               if (len) {
> +                       ppi->outp += len;
> +                       ppi->char_count -= len;
> +                       (*ppi->num_items_returned)++;
> +               }
> +       }
> +       return true;
> +}
> +
> +/*
> + * Procfs reader to get all tag stats using style "1)" as described in
> + * fs/proc/generic.c
> + * Groups all protocols tx/rx bytes.
> + */
> +static int qtaguid_stats_proc_read(char *page, char **num_items_returned,
> +                               off_t items_to_skip, int char_count, int *eof,
> +                               void *data)
> +{
> +       struct proc_print_info ppi;
> +       int len;
> +
> +       ppi.outp = page;
> +       ppi.item_index = 0;
> +       ppi.char_count = char_count;
> +       ppi.num_items_returned = num_items_returned;
> +       ppi.items_to_skip = items_to_skip;
> +
> +       if (unlikely(module_passive)) {
> +               len = pp_stats_line(&ppi, 0);
> +               /* The header should always be shorter than the buffer. */
> +               BUG_ON(len >= ppi.char_count);
> +               (*num_items_returned)++;
> +               *eof = 1;
> +               return len;
> +       }
> +
> +       CT_DEBUG("qtaguid:proc stats page=%p *num_items_returned=%p off=%ld "
> +               "char_count=%d *eof=%d\n", page, *num_items_returned,
> +               items_to_skip, char_count, *eof);
> +
> +       if (*eof)
> +               return 0;
> +
> +       /* The idx is there to help debug when things go belly up. */
> +       len = pp_stats_line(&ppi, 0);
> +       /* Don't advance the outp unless the whole line was printed */
> +       if (len >= ppi.char_count) {
> +               *ppi.outp = '\0';
> +               return ppi.outp - page;
> +       }
> +       if (len) {
> +               ppi.outp += len;
> +               ppi.char_count -= len;
> +               (*num_items_returned)++;
> +       }
> +
> +       spin_lock_bh(&iface_stat_list_lock);
> +       list_for_each_entry(ppi.iface_entry, &iface_stat_list, list) {
> +               struct rb_node *node;
> +               spin_lock_bh(&ppi.iface_entry->tag_stat_list_lock);
> +               for (node = rb_first(&ppi.iface_entry->tag_stat_tree);
> +                    node;
> +                    node = rb_next(node)) {
> +                       ppi.ts_entry = rb_entry(node, struct tag_stat, tn.node);
> +                       if (!pp_sets(&ppi)) {
> +                               spin_unlock_bh(
> +                                       &ppi.iface_entry->tag_stat_list_lock);
> +                               spin_unlock_bh(&iface_stat_list_lock);
> +                               return ppi.outp - page;
> +                       }
> +               }
> +               spin_unlock_bh(&ppi.iface_entry->tag_stat_list_lock);
> +       }
> +       spin_unlock_bh(&iface_stat_list_lock);
> +
> +       *eof = 1;
> +       return ppi.outp - page;
> +}
> +
> +/*------------------------------------------*/
> +static int qtudev_open(struct inode *inode, struct file *file)
> +{
> +       struct uid_tag_data *utd_entry;
> +       struct proc_qtu_data  *pqd_entry;
> +       struct proc_qtu_data  *new_pqd_entry;
> +       int res;
> +       bool utd_entry_found;
> +
> +       if (unlikely(qtu_proc_handling_passive))
> +               return 0;
> +
> +       DR_DEBUG("qtaguid: qtudev_open(): pid=%u tgid=%u uid=%u\n",
> +                current->pid, current->tgid, current_fsuid());
> +
> +       spin_lock_bh(&uid_tag_data_tree_lock);
> +
> +       /* Look for existing uid data, or alloc one. */
> +       utd_entry = get_uid_data(current_fsuid(), &utd_entry_found);
> +       if (IS_ERR_OR_NULL(utd_entry)) {
> +               res = PTR_ERR(utd_entry);
> +               goto err;
> +       }
> +
> +       /* Look for existing PID based proc_data */
> +       pqd_entry = proc_qtu_data_tree_search(&proc_qtu_data_tree,
> +                                             current->tgid);
> +       if (pqd_entry) {
> +               pr_err("qtaguid: qtudev_open(): %u/%u %u "
> +                      "%s already opened\n",
> +                      current->pid, current->tgid, current_fsuid(),
> +                      QTU_DEV_NAME);
> +               res = -EBUSY;
> +               goto err_unlock_free_utd;
> +       }
> +
> +       new_pqd_entry = kzalloc(sizeof(*new_pqd_entry), GFP_ATOMIC);
> +       if (!new_pqd_entry) {
> +               pr_err("qtaguid: qtudev_open(): %u/%u %u: "
> +                      "proc data alloc failed\n",
> +                      current->pid, current->tgid, current_fsuid());
> +               res = -ENOMEM;
> +               goto err_unlock_free_utd;
> +       }
> +       new_pqd_entry->pid = current->tgid;
> +       INIT_LIST_HEAD(&new_pqd_entry->sock_tag_list);
> +       new_pqd_entry->parent_tag_data = utd_entry;
> +       utd_entry->num_pqd++;
> +
> +       proc_qtu_data_tree_insert(new_pqd_entry,
> +                                 &proc_qtu_data_tree);
> +
> +       spin_unlock_bh(&uid_tag_data_tree_lock);
> +       DR_DEBUG("qtaguid: tracking data for uid=%u in pqd=%p\n",
> +                current_fsuid(), new_pqd_entry);
> +       file->private_data = new_pqd_entry;
> +       return 0;
> +
> +err_unlock_free_utd:
> +       if (!utd_entry_found) {
> +               rb_erase(&utd_entry->node, &uid_tag_data_tree);
> +               kfree(utd_entry);
> +       }
> +       spin_unlock_bh(&uid_tag_data_tree_lock);
> +err:
> +       return res;
> +}
> +
> +static int qtudev_release(struct inode *inode, struct file *file)
> +{
> +       struct proc_qtu_data  *pqd_entry = file->private_data;
> +       struct uid_tag_data  *utd_entry = pqd_entry->parent_tag_data;
> +       struct sock_tag *st_entry;
> +       struct rb_root st_to_free_tree = RB_ROOT;
> +       struct list_head *entry, *next;
> +       struct tag_ref *tr;
> +
> +       if (unlikely(qtu_proc_handling_passive))
> +               return 0;
> +
> +       /*
> +        * Do not trust the current->pid, it might just be a kworker cleaning
> +        * up after a dead proc.
> +        */
> +       DR_DEBUG("qtaguid: qtudev_release(): "
> +                "pid=%u tgid=%u uid=%u "
> +                "pqd_entry=%p->pid=%u utd_entry=%p->active_tags=%d\n",
> +                current->pid, current->tgid, pqd_entry->parent_tag_data->uid,
> +                pqd_entry, pqd_entry->pid, utd_entry,
> +                utd_entry->num_active_tags);
> +
> +       spin_lock_bh(&sock_tag_list_lock);
> +       spin_lock_bh(&uid_tag_data_tree_lock);
> +
> +       list_for_each_safe(entry, next, &pqd_entry->sock_tag_list) {
> +               st_entry = list_entry(entry, struct sock_tag, list);
> +               DR_DEBUG("qtaguid: %s(): "
> +                        "erase sock_tag=%p->sk=%p pid=%u tgid=%u uid=%u\n",
> +                        __func__,
> +                        st_entry, st_entry->sk,
> +                        current->pid, current->tgid,
> +                        pqd_entry->parent_tag_data->uid);
> +
> +               utd_entry = uid_tag_data_tree_search(
> +                       &uid_tag_data_tree,
> +                       get_uid_from_tag(st_entry->tag));
> +               BUG_ON(IS_ERR_OR_NULL(utd_entry));
> +               DR_DEBUG("qtaguid: %s(): "
> +                        "looking for tag=0x%llx in utd_entry=%p\n", __func__,
> +                        st_entry->tag, utd_entry);
> +               tr = tag_ref_tree_search(&utd_entry->tag_ref_tree,
> +                                        st_entry->tag);
> +               BUG_ON(!tr);
> +               BUG_ON(tr->num_sock_tags <= 0);
> +               tr->num_sock_tags--;
> +               free_tag_ref_from_utd_entry(tr, utd_entry);
> +
> +               rb_erase(&st_entry->sock_node, &sock_tag_tree);
> +               list_del(&st_entry->list);
> +               /* Can't sockfd_put() within spinlock, do it later. */
> +               sock_tag_tree_insert(st_entry, &st_to_free_tree);
> +
> +               /*
> +                * Try to free the utd_entry if no other proc_qtu_data is
> +                * using it (num_pqd is 0) and it doesn't have active tags
> +                * (num_active_tags is 0).
> +                */
> +               put_utd_entry(utd_entry);
> +       }
> +
> +       rb_erase(&pqd_entry->node, &proc_qtu_data_tree);
> +       BUG_ON(pqd_entry->parent_tag_data->num_pqd < 1);
> +       pqd_entry->parent_tag_data->num_pqd--;
> +       put_utd_entry(pqd_entry->parent_tag_data);
> +       kfree(pqd_entry);
> +       file->private_data = NULL;
> +
> +       spin_unlock_bh(&uid_tag_data_tree_lock);
> +       spin_unlock_bh(&sock_tag_list_lock);
> +
> +
> +       sock_tag_tree_erase(&st_to_free_tree);
> +
> +       prdebug_full_state(0, "%s(): pid=%u tgid=%u", __func__,
> +                          current->pid, current->tgid);
> +       return 0;
> +}
> +
> +/*------------------------------------------*/
> +static const struct file_operations qtudev_fops = {
> +       .owner = THIS_MODULE,
> +       .open = qtudev_open,
> +       .release = qtudev_release,
> +};
> +
> +static struct miscdevice qtu_device = {
> +       .minor = MISC_DYNAMIC_MINOR,
> +       .name = QTU_DEV_NAME,
> +       .fops = &qtudev_fops,
> +       /* How sad it doesn't allow for defaults: .mode = S_IRUGO | S_IWUSR */
> +};
> +
> +/*------------------------------------------*/
> +static int __init qtaguid_proc_register(struct proc_dir_entry **res_procdir)
> +{
> +       int ret;
> +       *res_procdir = proc_mkdir(module_procdirname, init_net.proc_net);
> +       if (!*res_procdir) {
> +               pr_err("qtaguid: failed to create proc/.../xt_qtaguid\n");
> +               ret = -ENOMEM;
> +               goto no_dir;
> +       }
> +
> +       xt_qtaguid_ctrl_file = create_proc_entry("ctrl", proc_ctrl_perms,
> +                                               *res_procdir);
> +       if (!xt_qtaguid_ctrl_file) {
> +               pr_err("qtaguid: failed to create xt_qtaguid/ctrl "
> +                       " file\n");
> +               ret = -ENOMEM;
> +               goto no_ctrl_entry;
> +       }
> +       xt_qtaguid_ctrl_file->read_proc = qtaguid_ctrl_proc_read;
> +       xt_qtaguid_ctrl_file->write_proc = qtaguid_ctrl_proc_write;
> +
> +       xt_qtaguid_stats_file = create_proc_entry("stats", proc_stats_perms,
> +                                               *res_procdir);
> +       if (!xt_qtaguid_stats_file) {
> +               pr_err("qtaguid: failed to create xt_qtaguid/stats "
> +                       "file\n");
> +               ret = -ENOMEM;
> +               goto no_stats_entry;
> +       }
> +       xt_qtaguid_stats_file->read_proc = qtaguid_stats_proc_read;
> +       /*
> +        * TODO: add support for counter hacking
> +        * xt_qtaguid_stats_file->write_proc = qtaguid_stats_proc_write;
> +        */
> +       return 0;
> +
> +no_stats_entry:
> +       remove_proc_entry("ctrl", *res_procdir);
> +no_ctrl_entry:
> +       remove_proc_entry("xt_qtaguid", NULL);
> +no_dir:
> +       return ret;
> +}
> +
> +static struct xt_match qtaguid_mt_reg __read_mostly = {
> +       /*
> +        * This module masquerades as the "owner" module so that iptables
> +        * tools can deal with it.
> +        */
> +       .name       = "owner",
> +       .revision   = 1,
> +       .family     = NFPROTO_UNSPEC,
> +       .match      = qtaguid_mt,
> +       .matchsize  = sizeof(struct xt_qtaguid_match_info),
> +       .me         = THIS_MODULE,
> +};
> +
> +static int __init qtaguid_mt_init(void)
> +{
> +       if (qtaguid_proc_register(&xt_qtaguid_procdir)
> +           || iface_stat_init(xt_qtaguid_procdir)
> +           || xt_register_match(&qtaguid_mt_reg)
> +           || misc_register(&qtu_device))
> +               return -1;
> +       return 0;
> +}
> +
> +/*
> + * TODO: allow unloading of the module.
> + * For now stats are permanent.
> + * Kconfig forces 'y/n' and never an 'm'.
> + */
> +
> +module_init(qtaguid_mt_init);
> +MODULE_AUTHOR("jpa <jpa@google.com>");
> +MODULE_DESCRIPTION("Xtables: socket owner+tag matching and associated stats");
> +MODULE_LICENSE("GPL");
> +MODULE_ALIAS("ipt_owner");
> +MODULE_ALIAS("ip6t_owner");
> +MODULE_ALIAS("ipt_qtaguid");
> +MODULE_ALIAS("ip6t_qtaguid");
> diff --git a/net/netfilter/xt_qtaguid_internal.h b/net/netfilter/xt_qtaguid_internal.h
> new file mode 100644
> index 0000000..02479d6
> --- /dev/null
> +++ b/net/netfilter/xt_qtaguid_internal.h
> @@ -0,0 +1,330 @@
> +/*
> + * Kernel iptables module to track stats for packets based on user tags.
> + *
> + * (C) 2011 Google, Inc
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +#ifndef __XT_QTAGUID_INTERNAL_H__
> +#define __XT_QTAGUID_INTERNAL_H__
> +
> +#include <linux/types.h>
> +#include <linux/rbtree.h>
> +#include <linux/spinlock_types.h>
> +#include <linux/workqueue.h>
> +
> +/* Iface handling */
> +#define IDEBUG_MASK (1<<0)
> +/* Iptable Matching. Per packet. */
> +#define MDEBUG_MASK (1<<1)
> +/* Red-black tree handling. Per packet. */
> +#define RDEBUG_MASK (1<<2)
> +/* procfs ctrl/stats handling */
> +#define CDEBUG_MASK (1<<3)
> +/* dev and resource tracking */
> +#define DDEBUG_MASK (1<<4)
> +
> +/* E.g (IDEBUG_MASK | CDEBUG_MASK | DDEBUG_MASK) */
> +#define DEFAULT_DEBUG_MASK 0
> +
> +/*
> + * (Un)Define these *DEBUG to compile out/in the pr_debug calls.
> + * All undef: text size ~ 0x3030; all def: ~ 0x4404.
> + */
> +#define IDEBUG
> +#define MDEBUG
> +#define RDEBUG
> +#define CDEBUG
> +#define DDEBUG
> +
> +#define MSK_DEBUG(mask, ...) do {                           \
> +               if (unlikely(qtaguid_debug_mask & (mask)))  \
> +                       pr_debug(__VA_ARGS__);              \
> +       } while (0)
> +#ifdef IDEBUG
> +#define IF_DEBUG(...) MSK_DEBUG(IDEBUG_MASK, __VA_ARGS__)
> +#else
> +#define IF_DEBUG(...) no_printk(__VA_ARGS__)
> +#endif
> +#ifdef MDEBUG
> +#define MT_DEBUG(...) MSK_DEBUG(MDEBUG_MASK, __VA_ARGS__)
> +#else
> +#define MT_DEBUG(...) no_printk(__VA_ARGS__)
> +#endif
> +#ifdef RDEBUG
> +#define RB_DEBUG(...) MSK_DEBUG(RDEBUG_MASK, __VA_ARGS__)
> +#else
> +#define RB_DEBUG(...) no_printk(__VA_ARGS__)
> +#endif
> +#ifdef CDEBUG
> +#define CT_DEBUG(...) MSK_DEBUG(CDEBUG_MASK, __VA_ARGS__)
> +#else
> +#define CT_DEBUG(...) no_printk(__VA_ARGS__)
> +#endif
> +#ifdef DDEBUG
> +#define DR_DEBUG(...) MSK_DEBUG(DDEBUG_MASK, __VA_ARGS__)
> +#else
> +#define DR_DEBUG(...) no_printk(__VA_ARGS__)
> +#endif
> +
> +extern uint qtaguid_debug_mask;
> +
> +/*---------------------------------------------------------------------------*/
> +/*
> + * Tags:
> + *
> + * They represent what the data usage counters will be tracked against.
> + * By default a tag is just based on the UID.
> + * The UID is used as the base for policing, and can not be ignored.
> + * So a tag will always at least represent a UID (uid_tag).
> + *
> + * A tag can be augmented with an "accounting tag" which is associated
> + * with a UID.
> + * User space can set the acct_tag portion of the tag which is then used
> + * with sockets: all data belonging to that socket will be counted against the
> + * tag. The policing is then based on the tag's uid_tag portion,
> + * and stats are collected for the acct_tag portion separately.
> + *
> + * There could be
> + * a:  {acct_tag=1, uid_tag=10003}
> + * b:  {acct_tag=2, uid_tag=10003}
> + * c:  {acct_tag=3, uid_tag=10003}
> + * d:  {acct_tag=0, uid_tag=10003}
> + * a, b, and c represent tags associated with specific sockets.
> + * d is for the totals for that uid, including all untagged traffic.
> + * Typically d is used with policing/quota rules.
> + *
> + * We want tag_t big enough to distinguish uid_t and acct_tag.
> + * It might become a struct if needed.
> + * Nothing should be using it as an int.
> + */
> +typedef uint64_t tag_t;  /* Only used via accessors */
> +
> +#define TAG_UID_MASK 0xFFFFFFFFULL
> +#define TAG_ACCT_MASK (~0xFFFFFFFFULL)
> +
> +static inline int tag_compare(tag_t t1, tag_t t2)
> +{
> +       return t1 < t2 ? -1 : t1 == t2 ? 0 : 1;
> +}
> +
> +static inline tag_t combine_atag_with_uid(tag_t acct_tag, uid_t uid)
> +{
> +       return acct_tag | uid;
> +}
> +static inline tag_t make_tag_from_uid(uid_t uid)
> +{
> +       return uid;
> +}
> +static inline uid_t get_uid_from_tag(tag_t tag)
> +{
> +       return tag & TAG_UID_MASK;
> +}
> +static inline tag_t get_utag_from_tag(tag_t tag)
> +{
> +       return tag & TAG_UID_MASK;
> +}
> +static inline tag_t get_atag_from_tag(tag_t tag)
> +{
> +       return tag & TAG_ACCT_MASK;
> +}
> +
> +static inline bool valid_atag(tag_t tag)
> +{
> +       return !(tag & TAG_UID_MASK);
> +}
> +static inline tag_t make_atag_from_value(uint32_t value)
> +{
> +       return (uint64_t)value << 32;
> +}
> +/*---------------------------------------------------------------------------*/
> +
> +/*
> + * Maximum number of socket tags that a UID is allowed to have active.
> + * Multiple processes belonging to the same UID contribute towards this limit.
> + * Special UIDs that can impersonate a UID also contribute (e.g. download
> + * manager, ...)
> + */
> +#define DEFAULT_MAX_SOCK_TAGS 1024
> +
> +/*
> + * For now we only track 2 sets of counters.
> + * The default set is 0.
> + * Userspace can activate another set for a given uid being tracked.
> + */
> +#define IFS_MAX_COUNTER_SETS 2
> +
> +enum ifs_tx_rx {
> +       IFS_TX,
> +       IFS_RX,
> +       IFS_MAX_DIRECTIONS
> +};
> +
> +/* For now, TCP, UDP, the rest */
> +enum ifs_proto {
> +       IFS_TCP,
> +       IFS_UDP,
> +       IFS_PROTO_OTHER,
> +       IFS_MAX_PROTOS
> +};
> +
> +struct byte_packet_counters {
> +       uint64_t bytes;
> +       uint64_t packets;
> +};
> +
> +struct data_counters {
> +       struct byte_packet_counters bpc[IFS_MAX_COUNTER_SETS][IFS_MAX_DIRECTIONS][IFS_MAX_PROTOS];
> +};
> +
> +/* Generic X based nodes used as a base for rb_tree ops */
> +struct tag_node {
> +       struct rb_node node;
> +       tag_t tag;
> +};
> +
> +struct tag_stat {
> +       struct tag_node tn;
> +       struct data_counters counters;
> +       /*
> +        * If this tag is acct_tag based, we need to count against the
> +        * matching parent uid_tag.
> +        */
> +       struct data_counters *parent_counters;
> +};
> +
> +struct iface_stat {
> +       struct list_head list;  /* in iface_stat_list */
> +       char *ifname;
> +       bool active;
> +       /* net_dev is only valid for active iface_stat */
> +       struct net_device *net_dev;
> +
> +       struct byte_packet_counters totals[IFS_MAX_DIRECTIONS];
> +       /*
> +        * We keep the last_known, because some devices reset their counters
> +        * just before NETDEV_UP, while some will reset just before
> +        * NETDEV_REGISTER (which is more normal).
> +        * So now, if the device didn't do a NETDEV_UNREGISTER and we see
> +        * its current dev stats smaller than what was previously known, we
> +        * assume an UNREGISTER and just use the last_known.
> +        */
> +       struct byte_packet_counters last_known[IFS_MAX_DIRECTIONS];
> +       /* last_known is usable when last_known_valid is true */
> +       bool last_known_valid;
> +
> +       struct proc_dir_entry *proc_ptr;
> +
> +       struct rb_root tag_stat_tree;
> +       spinlock_t tag_stat_list_lock;
> +};
> +
> +/* This is needed to create proc_dir_entries from atomic context. */
> +struct iface_stat_work {
> +       struct work_struct iface_work;
> +       struct iface_stat *iface_entry;
> +};
> +
> +/*
> + * Track tag that this socket is transferring data for, and not necessarily
> + * the uid that owns the socket.
> + * This is the tag against which tag_stat.counters will be billed.
> + * These structs need to be looked up by sock and pid.
> + */
> +struct sock_tag {
> +       struct rb_node sock_node;
> +       struct sock *sk;  /* Only used as a number, never dereferenced */
> +       /* The socket is needed for sockfd_put() */
> +       struct socket *socket;
> +       /* Used to associate with a given pid */
> +       struct list_head list;   /* in proc_qtu_data.sock_tag_list */
> +       pid_t pid;
> +
> +       tag_t tag;
> +};
> +
> +struct qtaguid_event_counts {
> +       /* Various successful events */
> +       atomic64_t sockets_tagged;
> +       atomic64_t sockets_untagged;
> +       atomic64_t counter_set_changes;
> +       atomic64_t delete_cmds;
> +       atomic64_t iface_events;  /* Number of NETDEV_* events handled */
> +
> +       atomic64_t match_calls;   /* Number of times iptables called mt */
> +       /*
> +        * match_found_sk_*: numbers related to the netfilter matching
> +        * function finding a sock for the sk_buff.
> +        * Total skbs processed is sum(match_found*).
> +        */
> +       atomic64_t match_found_sk;   /* An sk was already in the sk_buff. */
> +       /* The connection tracker had or didn't have the sk. */
> +       atomic64_t match_found_sk_in_ct;
> +       atomic64_t match_found_no_sk_in_ct;
> +       /*
> +        * No sk could be found. No apparent owner. Could happen with
> +        * unsolicited traffic.
> +        */
> +       atomic64_t match_no_sk;
> +       /*
> +        * The file ptr in the sk_socket wasn't there.
> +        * This might happen for traffic while the socket is being closed.
> +        */
> +       atomic64_t match_no_sk_file;
> +};
> +
> +/* Track which counter set (active_set) is active for the given tag. */
> +struct tag_counter_set {
> +       struct tag_node tn;
> +       int active_set;
> +};
> +
> +/*----------------------------------------------*/
> +/*
> + * The qtu uid data is used to track resources that are created directly or
> + * indirectly by processes (uid tracked).
> + * It is shared by the processes with the same uid.
> + * Some of the resources will be counted to prevent further rogue allocations,
> + * some will need freeing once the owner process (uid) exits.
> + */
> +struct uid_tag_data {
> +       struct rb_node node;
> +       uid_t uid;
> +
> +       /*
> +        * For the uid, how many accounting tags have been set.
> +        */
> +       int num_active_tags;
> +       /* Track the number of proc_qtu_data that reference it */
> +       int num_pqd;
> +       struct rb_root tag_ref_tree;
> +       /* No tag_node_tree_lock; use uid_tag_data_tree_lock */
> +};
> +
> +struct tag_ref {
> +       struct tag_node tn;
> +
> +       /*
> +        * This tracks the number of active sockets that have a tag on them
> +        * which matches this tag_ref.tn.tag.
> +        * A tag ref can live on after the sockets are untagged.
> +        * A tag ref can only be removed during a tag delete command.
> +        */
> +       int num_sock_tags;
> +};
> +
> +struct proc_qtu_data {
> +       struct rb_node node;
> +       pid_t pid;
> +
> +       struct uid_tag_data *parent_tag_data;
> +
> +       /* Tracks the sock_tags that need freeing upon this proc's death */
> +       struct list_head sock_tag_list;
> +       /* No spinlock_t sock_tag_list_lock; use the global one. */
> +};
> +
> +/*----------------------------------------------*/
> +#endif  /* ifndef __XT_QTAGUID_INTERNAL_H__ */
> diff --git a/net/netfilter/xt_qtaguid_print.c b/net/netfilter/xt_qtaguid_print.c
> new file mode 100644
> index 0000000..3917678
> --- /dev/null
> +++ b/net/netfilter/xt_qtaguid_print.c
> @@ -0,0 +1,556 @@
> +/*
> + * Pretty printing Support for iptables xt_qtaguid module.
> + *
> + * (C) 2011 Google, Inc
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +/*
> + * Most of the functions in this file just waste time if DEBUG is not defined.
> + * The matching xt_qtaguid_print.h will static inline empty funcs if the needed
> + * debug flags are not defined.
> + * Those funcs that fail to allocate memory will panic as there is no need to
> + * hobble along just pretending to do the requested work.
> + */
> +
> +#define DEBUG
> +
> +#include <linux/fs.h>
> +#include <linux/gfp.h>
> +#include <linux/net.h>
> +#include <linux/rbtree.h>
> +#include <linux/slab.h>
> +#include <linux/spinlock_types.h>
> +
> +
> +#include "xt_qtaguid_internal.h"
> +#include "xt_qtaguid_print.h"
> +
> +#ifdef DDEBUG
> +
> +static void _bug_on_err_or_null(void *ptr)
> +{
> +       if (IS_ERR_OR_NULL(ptr)) {
> +               pr_err("qtaguid: kmalloc failed\n");
> +               BUG();
> +       }
> +}
> +
> +char *pp_tag_t(tag_t *tag)
> +{
> +       char *res;
> +
> +       if (!tag)
> +               res = kasprintf(GFP_ATOMIC, "tag_t@null{}");
> +       else
> +               res = kasprintf(GFP_ATOMIC,
> +                               "tag_t@%p{tag=0x%llx, uid=%u}",
> +                               tag, *tag, get_uid_from_tag(*tag));
> +       _bug_on_err_or_null(res);
> +       return res;
> +}
> +
> +char *pp_data_counters(struct data_counters *dc, bool showValues)
> +{
> +       char *res;
> +
> +       if (!dc)
> +               res = kasprintf(GFP_ATOMIC, "data_counters@null{}");
> +       else if (showValues)
> +               res = kasprintf(
> +                       GFP_ATOMIC, "data_counters@%p{"
> +                       "set0{"
> +                       "rx{"
> +                       "tcp{b=%llu, p=%llu}, "
> +                       "udp{b=%llu, p=%llu},"
> +                       "other{b=%llu, p=%llu}}, "
> +                       "tx{"
> +                       "tcp{b=%llu, p=%llu}, "
> +                       "udp{b=%llu, p=%llu},"
> +                       "other{b=%llu, p=%llu}}}, "
> +                       "set1{"
> +                       "rx{"
> +                       "tcp{b=%llu, p=%llu}, "
> +                       "udp{b=%llu, p=%llu},"
> +                       "other{b=%llu, p=%llu}}, "
> +                       "tx{"
> +                       "tcp{b=%llu, p=%llu}, "
> +                       "udp{b=%llu, p=%llu},"
> +                       "other{b=%llu, p=%llu}}}}",
> +                       dc,
> +                       dc->bpc[0][IFS_RX][IFS_TCP].bytes,
> +                       dc->bpc[0][IFS_RX][IFS_TCP].packets,
> +                       dc->bpc[0][IFS_RX][IFS_UDP].bytes,
> +                       dc->bpc[0][IFS_RX][IFS_UDP].packets,
> +                       dc->bpc[0][IFS_RX][IFS_PROTO_OTHER].bytes,
> +                       dc->bpc[0][IFS_RX][IFS_PROTO_OTHER].packets,
> +                       dc->bpc[0][IFS_TX][IFS_TCP].bytes,
> +                       dc->bpc[0][IFS_TX][IFS_TCP].packets,
> +                       dc->bpc[0][IFS_TX][IFS_UDP].bytes,
> +                       dc->bpc[0][IFS_TX][IFS_UDP].packets,
> +                       dc->bpc[0][IFS_TX][IFS_PROTO_OTHER].bytes,
> +                       dc->bpc[0][IFS_TX][IFS_PROTO_OTHER].packets,
> +                       dc->bpc[1][IFS_RX][IFS_TCP].bytes,
> +                       dc->bpc[1][IFS_RX][IFS_TCP].packets,
> +                       dc->bpc[1][IFS_RX][IFS_UDP].bytes,
> +                       dc->bpc[1][IFS_RX][IFS_UDP].packets,
> +                       dc->bpc[1][IFS_RX][IFS_PROTO_OTHER].bytes,
> +                       dc->bpc[1][IFS_RX][IFS_PROTO_OTHER].packets,
> +                       dc->bpc[1][IFS_TX][IFS_TCP].bytes,
> +                       dc->bpc[1][IFS_TX][IFS_TCP].packets,
> +                       dc->bpc[1][IFS_TX][IFS_UDP].bytes,
> +                       dc->bpc[1][IFS_TX][IFS_UDP].packets,
> +                       dc->bpc[1][IFS_TX][IFS_PROTO_OTHER].bytes,
> +                       dc->bpc[1][IFS_TX][IFS_PROTO_OTHER].packets);
> +       else
> +               res = kasprintf(GFP_ATOMIC, "data_counters@%p{...}", dc);
> +       _bug_on_err_or_null(res);
> +       return res;
> +}
> +
> +char *pp_tag_node(struct tag_node *tn)
> +{
> +       char *tag_str;
> +       char *res;
> +
> +       if (!tn) {
> +               res = kasprintf(GFP_ATOMIC, "tag_node@null{}");
> +               _bug_on_err_or_null(res);
> +               return res;
> +       }
> +       tag_str = pp_tag_t(&tn->tag);
> +       res = kasprintf(GFP_ATOMIC,
> +                       "tag_node@%p{tag=%s}",
> +                       tn, tag_str);
> +       _bug_on_err_or_null(res);
> +       kfree(tag_str);
> +       return res;
> +}
> +
> +char *pp_tag_ref(struct tag_ref *tr)
> +{
> +       char *tn_str;
> +       char *res;
> +
> +       if (!tr) {
> +               res = kasprintf(GFP_ATOMIC, "tag_ref@null{}");
> +               _bug_on_err_or_null(res);
> +               return res;
> +       }
> +       tn_str = pp_tag_node(&tr->tn);
> +       res = kasprintf(GFP_ATOMIC,
> +                       "tag_ref@%p{%s, num_sock_tags=%d}",
> +                       tr, tn_str, tr->num_sock_tags);
> +       _bug_on_err_or_null(res);
> +       kfree(tn_str);
> +       return res;
> +}
> +
> +char *pp_tag_stat(struct tag_stat *ts)
> +{
> +       char *tn_str;
> +       char *counters_str;
> +       char *parent_counters_str;
> +       char *res;
> +
> +       if (!ts) {
> +               res = kasprintf(GFP_ATOMIC, "tag_stat@null{}");
> +               _bug_on_err_or_null(res);
> +               return res;
> +       }
> +       tn_str = pp_tag_node(&ts->tn);
> +       counters_str = pp_data_counters(&ts->counters, true);
> +       parent_counters_str = pp_data_counters(ts->parent_counters, false);
> +       res = kasprintf(GFP_ATOMIC,
> +                       "tag_stat@%p{%s, counters=%s, parent_counters=%s}",
> +                       ts, tn_str, counters_str, parent_counters_str);
> +       _bug_on_err_or_null(res);
> +       kfree(tn_str);
> +       kfree(counters_str);
> +       kfree(parent_counters_str);
> +       return res;
> +}
> +
> +char *pp_iface_stat(struct iface_stat *is)
> +{
> +       char *res;
> +       if (!is)
> +               res = kasprintf(GFP_ATOMIC, "iface_stat@null{}");
> +       else
> +               res = kasprintf(GFP_ATOMIC, "iface_stat@%p{"
> +                               "list=list_head{...}, "
> +                               "ifname=%s, "
> +                               "total={rx={bytes=%llu, "
> +                               "packets=%llu}, "
> +                               "tx={bytes=%llu, "
> +                               "packets=%llu}}, "
> +                               "last_known_valid=%d, "
> +                               "last_known={rx={bytes=%llu, "
> +                               "packets=%llu}, "
> +                               "tx={bytes=%llu, "
> +                               "packets=%llu}}, "
> +                               "active=%d, "
> +                               "net_dev=%p, "
> +                               "proc_ptr=%p, "
> +                               "tag_stat_tree=rb_root{...}}",
> +                               is,
> +                               is->ifname,
> +                               is->totals[IFS_RX].bytes,
> +                               is->totals[IFS_RX].packets,
> +                               is->totals[IFS_TX].bytes,
> +                               is->totals[IFS_TX].packets,
> +                               is->last_known_valid,
> +                               is->last_known[IFS_RX].bytes,
> +                               is->last_known[IFS_RX].packets,
> +                               is->last_known[IFS_TX].bytes,
> +                               is->last_known[IFS_TX].packets,
> +                               is->active,
> +                               is->net_dev,
> +                               is->proc_ptr);
> +       _bug_on_err_or_null(res);
> +       return res;
> +}
> +
> +char *pp_sock_tag(struct sock_tag *st)
> +{
> +       char *tag_str;
> +       char *res;
> +
> +       if (!st) {
> +               res = kasprintf(GFP_ATOMIC, "sock_tag@null{}");
> +               _bug_on_err_or_null(res);
> +               return res;
> +       }
> +       tag_str = pp_tag_t(&st->tag);
> +       res = kasprintf(GFP_ATOMIC, "sock_tag@%p{"
> +                       "sock_node=rb_node{...}, "
> +                       "sk=%p socket=%p (f_count=%lu), list=list_head{...}, "
> +                       "pid=%u, tag=%s}",
> +                       st, st->sk, st->socket, atomic_long_read(
> +                               &st->socket->file->f_count),
> +                       st->pid, tag_str);
> +       _bug_on_err_or_null(res);
> +       kfree(tag_str);
> +       return res;
> +}
> +
> +char *pp_uid_tag_data(struct uid_tag_data *utd)
> +{
> +       char *res;
> +
> +       if (!utd)
> +               res = kasprintf(GFP_ATOMIC, "uid_tag_data@null{}");
> +       else
> +               res = kasprintf(GFP_ATOMIC, "uid_tag_data@%p{"
> +                               "uid=%u, num_active_acct_tags=%d, "
> +                               "num_pqd=%d, "
> +                               "tag_node_tree=rb_root{...}, "
> +                               "proc_qtu_data_tree=rb_root{...}}",
> +                               utd, utd->uid,
> +                               utd->num_active_tags, utd->num_pqd);
> +       _bug_on_err_or_null(res);
> +       return res;
> +}
> +
> +char *pp_proc_qtu_data(struct proc_qtu_data *pqd)
> +{
> +       char *parent_tag_data_str;
> +       char *res;
> +
> +       if (!pqd) {
> +               res = kasprintf(GFP_ATOMIC, "proc_qtu_data@null{}");
> +               _bug_on_err_or_null(res);
> +               return res;
> +       }
> +       parent_tag_data_str = pp_uid_tag_data(pqd->parent_tag_data);
> +       res = kasprintf(GFP_ATOMIC, "proc_qtu_data@%p{"
> +                       "node=rb_node{...}, pid=%u, "
> +                       "parent_tag_data=%s, "
> +                       "sock_tag_list=list_head{...}}",
> +                       pqd, pqd->pid, parent_tag_data_str
> +               );
> +       _bug_on_err_or_null(res);
> +       kfree(parent_tag_data_str);
> +       return res;
> +}
> +
> +/*------------------------------------------*/
> +void prdebug_sock_tag_tree(int indent_level,
> +                          struct rb_root *sock_tag_tree)
> +{
> +       struct rb_node *node;
> +       struct sock_tag *sock_tag_entry;
> +       char *str;
> +
> +       if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
> +               return;
> +
> +       if (RB_EMPTY_ROOT(sock_tag_tree)) {
> +               str = "sock_tag_tree=rb_root{}";
> +               pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
> +               return;
> +       }
> +
> +       str = "sock_tag_tree=rb_root{";
> +       pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
> +       indent_level++;
> +       for (node = rb_first(sock_tag_tree);
> +            node;
> +            node = rb_next(node)) {
> +               sock_tag_entry = rb_entry(node, struct sock_tag, sock_node);
> +               str = pp_sock_tag(sock_tag_entry);
> +               pr_debug("%*d: %s,\n", indent_level*2, indent_level, str);
> +               kfree(str);
> +       }
> +       indent_level--;
> +       str = "}";
> +       pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
> +}
> +
> +void prdebug_sock_tag_list(int indent_level,
> +                          struct list_head *sock_tag_list)
> +{
> +       struct sock_tag *sock_tag_entry;
> +       char *str;
> +
> +       if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
> +               return;
> +
> +       if (list_empty(sock_tag_list)) {
> +               str = "sock_tag_list=list_head{}";
> +               pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
> +               return;
> +       }
> +
> +       str = "sock_tag_list=list_head{";
> +       pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
> +       indent_level++;
> +       list_for_each_entry(sock_tag_entry, sock_tag_list, list) {
> +               str = pp_sock_tag(sock_tag_entry);
> +               pr_debug("%*d: %s,\n", indent_level*2, indent_level, str);
> +               kfree(str);
> +       }
> +       indent_level--;
> +       str = "}";
> +       pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
> +}
> +
> +void prdebug_proc_qtu_data_tree(int indent_level,
> +                               struct rb_root *proc_qtu_data_tree)
> +{
> +       char *str;
> +       struct rb_node *node;
> +       struct proc_qtu_data *proc_qtu_data_entry;
> +
> +       if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
> +               return;
> +
> +       if (RB_EMPTY_ROOT(proc_qtu_data_tree)) {
> +               str = "proc_qtu_data_tree=rb_root{}";
> +               pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
> +               return;
> +       }
> +
> +       str = "proc_qtu_data_tree=rb_root{";
> +       pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
> +       indent_level++;
> +       for (node = rb_first(proc_qtu_data_tree);
> +            node;
> +            node = rb_next(node)) {
> +               proc_qtu_data_entry = rb_entry(node,
> +                                              struct proc_qtu_data,
> +                                              node);
> +               str = pp_proc_qtu_data(proc_qtu_data_entry);
> +               pr_debug("%*d: %s,\n", indent_level*2, indent_level,
> +                        str);
> +               kfree(str);
> +               indent_level++;
> +               prdebug_sock_tag_list(indent_level,
> +                                     &proc_qtu_data_entry->sock_tag_list);
> +               indent_level--;
> +
> +       }
> +       indent_level--;
> +       str = "}";
> +       pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
> +}
> +
> +void prdebug_tag_ref_tree(int indent_level, struct rb_root *tag_ref_tree)
> +{
> +       char *str;
> +       struct rb_node *node;
> +       struct tag_ref *tag_ref_entry;
> +
> +       if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
> +               return;
> +
> +       if (RB_EMPTY_ROOT(tag_ref_tree)) {
> +               str = "tag_ref_tree{}";
> +               pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
> +               return;
> +       }
> +
> +       str = "tag_ref_tree{";
> +       pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
> +       indent_level++;
> +       for (node = rb_first(tag_ref_tree);
> +            node;
> +            node = rb_next(node)) {
> +               tag_ref_entry = rb_entry(node,
> +                                        struct tag_ref,
> +                                        tn.node);
> +               str = pp_tag_ref(tag_ref_entry);
> +               pr_debug("%*d: %s,\n", indent_level*2, indent_level,
> +                        str);
> +               kfree(str);
> +       }
> +       indent_level--;
> +       str = "}";
> +       pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
> +}
> +
> +void prdebug_uid_tag_data_tree(int indent_level,
> +                              struct rb_root *uid_tag_data_tree)
> +{
> +       char *str;
> +       struct rb_node *node;
> +       struct uid_tag_data *uid_tag_data_entry;
> +
> +       if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
> +               return;
> +
> +       if (RB_EMPTY_ROOT(uid_tag_data_tree)) {
> +               str = "uid_tag_data_tree=rb_root{}";
> +               pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
> +               return;
> +       }
> +
> +       str = "uid_tag_data_tree=rb_root{";
> +       pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
> +       indent_level++;
> +       for (node = rb_first(uid_tag_data_tree);
> +            node;
> +            node = rb_next(node)) {
> +               uid_tag_data_entry = rb_entry(node, struct uid_tag_data,
> +                                             node);
> +               str = pp_uid_tag_data(uid_tag_data_entry);
> +               pr_debug("%*d: %s,\n", indent_level*2, indent_level, str);
> +               kfree(str);
> +               if (!RB_EMPTY_ROOT(&uid_tag_data_entry->tag_ref_tree)) {
> +                       indent_level++;
> +                       prdebug_tag_ref_tree(indent_level,
> +                                            &uid_tag_data_entry->tag_ref_tree);
> +                       indent_level--;
> +               }
> +       }
> +       indent_level--;
> +       str = "}";
> +       pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
> +}
> +
> +void prdebug_tag_stat_tree(int indent_level,
> +                                 struct rb_root *tag_stat_tree)
> +{
> +       char *str;
> +       struct rb_node *node;
> +       struct tag_stat *ts_entry;
> +
> +       if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
> +               return;
> +
> +       if (RB_EMPTY_ROOT(tag_stat_tree)) {
> +               str = "tag_stat_tree{}";
> +               pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
> +               return;
> +       }
> +
> +       str = "tag_stat_tree{";
> +       pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
> +       indent_level++;
> +       for (node = rb_first(tag_stat_tree);
> +            node;
> +            node = rb_next(node)) {
> +               ts_entry = rb_entry(node, struct tag_stat, tn.node);
> +               str = pp_tag_stat(ts_entry);
> +               pr_debug("%*d: %s\n", indent_level*2, indent_level,
> +                        str);
> +               kfree(str);
> +       }
> +       indent_level--;
> +       str = "}";
> +       pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
> +}
> +
> +void prdebug_iface_stat_list(int indent_level,
> +                            struct list_head *iface_stat_list)
> +{
> +       char *str;
> +       struct iface_stat *iface_entry;
> +
> +       if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
> +               return;
> +
> +       if (list_empty(iface_stat_list)) {
> +               str = "iface_stat_list=list_head{}";
> +               pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
> +               return;
> +       }
> +
> +       str = "iface_stat_list=list_head{";
> +       pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
> +       indent_level++;
> +       list_for_each_entry(iface_entry, iface_stat_list, list) {
> +               str = pp_iface_stat(iface_entry);
> +               pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
> +               kfree(str);
> +
> +               spin_lock_bh(&iface_entry->tag_stat_list_lock);
> +               if (!RB_EMPTY_ROOT(&iface_entry->tag_stat_tree)) {
> +                       indent_level++;
> +                       prdebug_tag_stat_tree(indent_level,
> +                                             &iface_entry->tag_stat_tree);
> +                       indent_level--;
> +               }
> +               spin_unlock_bh(&iface_entry->tag_stat_list_lock);
> +       }
> +       indent_level--;
> +       str = "}";
> +       pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
> +}
> +
> +#endif  /* ifdef DDEBUG */
> +/*------------------------------------------*/
> +static const char * const netdev_event_strings[] = {
> +       "netdev_unknown",
> +       "NETDEV_UP",
> +       "NETDEV_DOWN",
> +       "NETDEV_REBOOT",
> +       "NETDEV_CHANGE",
> +       "NETDEV_REGISTER",
> +       "NETDEV_UNREGISTER",
> +       "NETDEV_CHANGEMTU",
> +       "NETDEV_CHANGEADDR",
> +       "NETDEV_GOING_DOWN",
> +       "NETDEV_CHANGENAME",
> +       "NETDEV_FEAT_CHANGE",
> +       "NETDEV_BONDING_FAILOVER",
> +       "NETDEV_PRE_UP",
> +       "NETDEV_PRE_TYPE_CHANGE",
> +       "NETDEV_POST_TYPE_CHANGE",
> +       "NETDEV_POST_INIT",
> +       "NETDEV_UNREGISTER_BATCH",
> +       "NETDEV_RELEASE",
> +       "NETDEV_NOTIFY_PEERS",
> +       "NETDEV_JOIN",
> +};
> +
> +const char *netdev_evt_str(int netdev_event)
> +{
> +       if (netdev_event < 0
> +           || netdev_event >= ARRAY_SIZE(netdev_event_strings))
> +               return "bad event num";
> +       return netdev_event_strings[netdev_event];
> +}
> diff --git a/net/netfilter/xt_qtaguid_print.h b/net/netfilter/xt_qtaguid_print.h
> new file mode 100644
> index 0000000..b63871a
> --- /dev/null
> +++ b/net/netfilter/xt_qtaguid_print.h
> @@ -0,0 +1,120 @@
> +/*
> + * Pretty printing Support for iptables xt_qtaguid module.
> + *
> + * (C) 2011 Google, Inc
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +#ifndef __XT_QTAGUID_PRINT_H__
> +#define __XT_QTAGUID_PRINT_H__
> +
> +#include "xt_qtaguid_internal.h"
> +
> +#ifdef DDEBUG
> +
> +char *pp_tag_t(tag_t *tag);
> +char *pp_data_counters(struct data_counters *dc, bool showValues);
> +char *pp_tag_node(struct tag_node *tn);
> +char *pp_tag_ref(struct tag_ref *tr);
> +char *pp_tag_stat(struct tag_stat *ts);
> +char *pp_iface_stat(struct iface_stat *is);
> +char *pp_sock_tag(struct sock_tag *st);
> +char *pp_uid_tag_data(struct uid_tag_data *qtd);
> +char *pp_proc_qtu_data(struct proc_qtu_data *pqd);
> +
> +/*------------------------------------------*/
> +void prdebug_sock_tag_list(int indent_level,
> +                          struct list_head *sock_tag_list);
> +void prdebug_sock_tag_tree(int indent_level,
> +                          struct rb_root *sock_tag_tree);
> +void prdebug_proc_qtu_data_tree(int indent_level,
> +                               struct rb_root *proc_qtu_data_tree);
> +void prdebug_tag_ref_tree(int indent_level, struct rb_root *tag_ref_tree);
> +void prdebug_uid_tag_data_tree(int indent_level,
> +                              struct rb_root *uid_tag_data_tree);
> +void prdebug_tag_stat_tree(int indent_level,
> +                          struct rb_root *tag_stat_tree);
> +void prdebug_iface_stat_list(int indent_level,
> +                            struct list_head *iface_stat_list);
> +
> +#else
> +
> +/*------------------------------------------*/
> +static inline char *pp_tag_t(tag_t *tag)
> +{
> +       return NULL;
> +}
> +static inline char *pp_data_counters(struct data_counters *dc, bool showValues)
> +{
> +       return NULL;
> +}
> +static inline char *pp_tag_node(struct tag_node *tn)
> +{
> +       return NULL;
> +}
> +static inline char *pp_tag_ref(struct tag_ref *tr)
> +{
> +       return NULL;
> +}
> +static inline char *pp_tag_stat(struct tag_stat *ts)
> +{
> +       return NULL;
> +}
> +static inline char *pp_iface_stat(struct iface_stat *is)
> +{
> +       return NULL;
> +}
> +static inline char *pp_sock_tag(struct sock_tag *st)
> +{
> +       return NULL;
> +}
> +static inline char *pp_uid_tag_data(struct uid_tag_data *qtd)
> +{
> +       return NULL;
> +}
> +static inline char *pp_proc_qtu_data(struct proc_qtu_data *pqd)
> +{
> +       return NULL;
> +}
> +
> +/*------------------------------------------*/
> +static inline
> +void prdebug_sock_tag_list(int indent_level,
> +                          struct list_head *sock_tag_list)
> +{
> +}
> +static inline
> +void prdebug_sock_tag_tree(int indent_level,
> +                          struct rb_root *sock_tag_tree)
> +{
> +}
> +static inline
> +void prdebug_proc_qtu_data_tree(int indent_level,
> +                               struct rb_root *proc_qtu_data_tree)
> +{
> +}
> +static inline
> +void prdebug_tag_ref_tree(int indent_level, struct rb_root *tag_ref_tree)
> +{
> +}
> +static inline
> +void prdebug_uid_tag_data_tree(int indent_level,
> +                              struct rb_root *uid_tag_data_tree)
> +{
> +}
> +static inline
> +void prdebug_tag_stat_tree(int indent_level,
> +                          struct rb_root *tag_stat_tree)
> +{
> +}
> +static inline
> +void prdebug_iface_stat_list(int indent_level,
> +                            struct list_head *iface_stat_list)
> +{
> +}
> +#endif
> +/*------------------------------------------*/
> +const char *netdev_evt_str(int netdev_event);
> +#endif  /* ifndef __XT_QTAGUID_PRINT_H__ */
> --
> 1.7.9.5

-- 
Thanks,
//richard

^ permalink raw reply

* Re: linux-next: build failure after merge of the final tree (net-next tree related)
From: Benjamin Herrenschmidt @ 2012-09-22 21:20 UTC (permalink / raw)
  To: David Miller
  Cc: sfr, paulus, linuxppc-dev, mika.westerberg, netdev, linux-next,
	linux-kernel
In-Reply-To: <20120922.160027.1528491047223302539.davem@davemloft.net>

On Sat, 2012-09-22 at 16:00 -0400, David Miller wrote:
> 
> I think I'm going to just end up restricting this driver to X86
> as was originally suggested.

Probably the easiest fix indeed.

> There seems to be no real consistent Kconfig protection for users
> of isa_virt_to_bus() and friends.
> 
> We know that ISA_DMA_API doesn't do it, and ISA doesn't do it either
> as powerpc also allows that to bet set for CHRP and friends.

It's an old mess :-( And not an easy one to untangle...

Cheers,
Ben.

^ permalink raw reply

* [PATCH net-next] tcp: TCP Fast Open Server - record retransmits after 3WHS
From: Neal Cardwell @ 2012-09-23  3:03 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, Eric Dumazet, Yuchung Cheng, Jerry Chu, Neal Cardwell

When recording the number of SYNACK retransmits for servers using TCP
Fast Open, fix the code to ensure that we copy over the retransmit
count from the request_sock after we receive the ACK that completes
the 3-way handshake.

The story here is similar to that of SYNACK RTT
measurements. Previously we were always doing this in
tcp_v4_syn_recv_sock(). However, for TCP Fast Open connections
tcp_v4_conn_req_fastopen() calls tcp_v4_syn_recv_sock() at the time we
receive the SYN. So for TFO we must copy the final SYNACK retransmit
count in tcp_rcv_state_process().

Note that copying over the SYNACK retransmit count will give us the
correct count since, as is mentioned in a comment in
tcp_retransmit_timer(), before we receive an ACK for our SYN-ACK a TFO
passive connection does not retransmit anything else (e.g., data or
FIN segments).

Signed-off-by: Neal Cardwell <ncardwell@google.com>
---
 net/ipv4/tcp_input.c |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 36e069a1..e037697 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5985,6 +5985,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 				 */
 				if (req) {
 					tcp_synack_rtt_meas(sk, req);
+					tp->total_retrans = req->retrans;
+
 					reqsk_fastopen_remove(sk, req, false);
 				} else {
 					/* Make sure socket is routed, for
-- 
1.7.7.3

^ permalink raw reply related

* Re: [PATCH net-next] tcp: TCP Fast Open Server - record retransmits after 3WHS
From: David Miller @ 2012-09-23  3:15 UTC (permalink / raw)
  To: ncardwell; +Cc: netdev, edumazet, ycheng, hkchu
In-Reply-To: <1348369427-29563-1-git-send-email-ncardwell@google.com>

From: Neal Cardwell <ncardwell@google.com>
Date: Sat, 22 Sep 2012 23:03:47 -0400

> When recording the number of SYNACK retransmits for servers using TCP
> Fast Open, fix the code to ensure that we copy over the retransmit
> count from the request_sock after we receive the ACK that completes
> the 3-way handshake.
> 
> The story here is similar to that of SYNACK RTT
> measurements. Previously we were always doing this in
> tcp_v4_syn_recv_sock(). However, for TCP Fast Open connections
> tcp_v4_conn_req_fastopen() calls tcp_v4_syn_recv_sock() at the time we
> receive the SYN. So for TFO we must copy the final SYNACK retransmit
> count in tcp_rcv_state_process().
> 
> Note that copying over the SYNACK retransmit count will give us the
> correct count since, as is mentioned in a comment in
> tcp_retransmit_timer(), before we receive an ACK for our SYN-ACK a TFO
> passive connection does not retransmit anything else (e.g., data or
> FIN segments).
> 
> Signed-off-by: Neal Cardwell <ncardwell@google.com>

Applied, thanks Neal.

^ permalink raw reply

* [PATCH] inet_diag: make config INET_DIAG bool
From: Gao feng @ 2012-09-23  3:49 UTC (permalink / raw)
  To: davem, kuznet; +Cc: netdev, Gao feng

When inet_diag is compiled as a module, inet_diag_handler_dump
sets netlink_dump_control.dump to inet_diag_dump, so if the module
inet_diag is unloaded, netlink will still try to call this function
in netlink_dump. This will cause a kernel panic.

This patch makes config INET_DIAG a bool to prevent inet_diag from
being compiled as a module.

Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com>
---
 net/ipv4/Kconfig |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 5a19aeb..da56f84 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -403,7 +403,7 @@ config INET_LRO
 	  If unsure, say Y.

 config INET_DIAG
-	tristate "INET: socket monitoring interface"
+	bool "INET: socket monitoring interface"
 	default y
 	---help---
 	  Support for INET (TCP, DCCP, etc) socket monitoring interface used by
-- 
1.7.7.6

^ permalink raw reply related

* Re: [PATCH] inet_diag: make config INET_DIAG bool
From: Stephen Hemminger @ 2012-09-23  4:36 UTC (permalink / raw)
  To: Gao feng; +Cc: netdev, davem, kuznet
In-Reply-To: <1348372154-20663-1-git-send-email-gaofeng@cn.fujitsu.com>


> when inet_diag being compiled as module, inet_diag_handler_dump
> set netlink_dump_control.dump to inet_diag_dump,so if module
> inet_diag is unloaded,netlink will still try to call this function
> in netlink_dump. this will cause kernel panic.
>

Another way to handle this is to just get rid of inet_diag_exit
so that the module can be loaded but not unloaded.

^ permalink raw reply

* Re: [PATCH 1/2] netlink: kill netlink_set_nonroot
From: Jan Engelhardt @ 2012-09-23  5:00 UTC (permalink / raw)
  To: pablo; +Cc: netdev, davem
In-Reply-To: <1346934712-3056-1-git-send-email-pablo@netfilter.org>


On Thursday 2012-09-06 14:31, pablo@netfilter.org wrote:

> /* optional Netlink kernel configuration parameters */
> struct netlink_kernel_cfg {
> 	unsigned int	groups;
> 	void		(*input)(struct sk_buff *skb);
> 	struct mutex	*cb_mutex;
> 	void		(*bind)(int group);
>+	unsigned int	flags;
> };

Putting flags next to groups would reduce the hole there.

^ permalink raw reply

* Re: [PATCH 1/2] netlink: kill netlink_set_nonroot
From: David Miller @ 2012-09-23  6:09 UTC (permalink / raw)
  To: jengelh; +Cc: pablo, netdev
In-Reply-To: <alpine.LNX.2.01.1209230700120.2498@nerf07.vanv.qr>

From: Jan Engelhardt <jengelh@inai.de>
Date: Sun, 23 Sep 2012 07:00:42 +0200 (CEST)

> 
> On Thursday 2012-09-06 14:31, pablo@netfilter.org wrote:
> 
>> /* optional Netlink kernel configuration parameters */
>> struct netlink_kernel_cfg {
>> 	unsigned int	groups;
>> 	void		(*input)(struct sk_buff *skb);
>> 	struct mutex	*cb_mutex;
>> 	void		(*bind)(int group);
>>+	unsigned int	flags;
>> };
> 
> Putting flags next to groups would reduce the hole there.

Works for me:

--------------------
[PATCH] netlink: Rearrange netlink_kernel_cfg to save space on 64-bit.

Suggested by Jan Engelhardt.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netlink.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index b3dc992..f80c56a 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -183,10 +183,10 @@ extern void netlink_table_ungrab(void);
 /* optional Netlink kernel configuration parameters */
 struct netlink_kernel_cfg {
 	unsigned int	groups;
+	unsigned int	flags;
 	void		(*input)(struct sk_buff *skb);
 	struct mutex	*cb_mutex;
 	void		(*bind)(int group);
-	unsigned int	flags;
 };
 
 extern struct sock *__netlink_kernel_create(struct net *net, int unit,
-- 
1.7.11.4


 

^ permalink raw reply related

* Re: [PATCH v3] ucc_geth: Lockless xmit
From: Joakim Tjernlund @ 2012-09-23  9:39 UTC (permalink / raw)
  To: Francois Romieu; +Cc: netdev
In-Reply-To: <20120921173535.GA27934@electric-eye.fr.zoreil.com>

Francois Romieu <romieu@fr.zoreil.com> wrote on 2012/09/21 19:35:35:
>
> Please keep netdev in the Cc:.

Sorry, wrong reply button. Now added.

>
> Joakim Tjernlund <joakim.tjernlund@transmode.se> :
> [...]
> > This is what I could come up with, what do you think?
>
> In its current state the driver implicitely relies on the Tx xmit vs Tx
> completion exclusion to work. As long as they exclude each other the
> ugeth->tx_skbuff[txQ][...] and the "bd_status & T_R" states are consistent.
>
> This scheme can not work without locking. The skb alone won't provide a
> synchronization point.

I don't get it. The skb test is there just for one special case: when
the BD ring is empty, (bd_status & T_R) == 0 will be true as well, so
one needs something more than the bd_status test.

>
> An usual solution would be some skb dirtytx vs curtx comparison + smp
> barriers (Documentation/memory-barriers.txt).

You mean adding an extra test in addition to !skb? Something like:
if (!skb && ugeth->skb_dirtytx[txQ] == ugeth->skb_curtx[txQ])
	break;

>
> Sidenote: is there some reason why modulo (%) operations should be
> avoided on this platform ?

Not that I know. The original driver author did it this way.

>
> [...]
> > @@ -3380,8 +3380,12 @@ static int ucc_geth_tx(struct net_device *dev, u8 txQ)
> >                      1) & TX_RING_MOD_MASK(ugeth->ug_info->bdRingLenTx[txQ]);
> >
> >                 /* We freed a buffer, so now we can restart transmission */
> > -               if (netif_queue_stopped(dev))
> > -                       netif_wake_queue(dev);
> > +               if (netif_queue_stopped(dev)) {
> > +                       netif_tx_lock(dev);
> > +                       if (netif_queue_stopped(dev))
> > +                               netif_wake_queue(dev);
> > +                       netif_tx_unlock(dev);
> > +               }
>
> Without exclusion we don't know if it was stopped before or after the packet
> was freed. There must be some "are there available Tx slots ?" test in the
> locked section.

This makes sense, but stopping relies on ugeth->confBd[txQ] which is updated
last in ucc_geth_tx(). Looks a bit suboptimal though so it should probably
be changed.

>
> Btw the second netif_queue_stopped test can be removed if tx queueing can
> only be stopped in a single place (I did not check).

There is a netif_stop_queue in xmit and one in ucc_geth_close() so I guess
one can say there is only one place.

^ permalink raw reply

* [net-next patch v2] bnx2x: Improve code around bnx2x_tests_str_arr
From: Merav Sicron @ 2012-09-23 13:12 UTC (permalink / raw)
  To: davem, netdev, eilong; +Cc: Merav Sicron

This patch changes the definition of bnx2x_tests_str_arr from an array of
static char pointers to a static const two-dimensional char array. Also the
bnx2x_get_strings function is simplified.

Reported-by: Joe Perches <joe@perches.com>
Reported-by: David Laight <David.Laight@ACULAB.COM>
Signed-off-by: Merav Sicron <meravs@broadcom.com>
Signed-off-by: Eilon Greenstein <eilong@broadcom.com>
---
Hi,
Sorry for the late response.

Changes compared to v1: define the content of the array as const, use a
two-dimensional array instead of an array of string pointers, and simplify
the bnx2x_get_strings function by using memcpy.

Dave, please consider applying this patch to net-next.

Thanks,
Merav

 drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c |   13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
index f923125..a19c9e0 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_ethtool.c
@@ -1607,7 +1607,7 @@ static int bnx2x_set_pauseparam(struct net_device *dev,
 	return 0;
 }

-static char *bnx2x_tests_str_arr[BNX2X_NUM_TESTS_SF] = {
+static const char bnx2x_tests_str_arr[BNX2X_NUM_TESTS_SF][ETH_GSTRING_LEN] = {
 	"register_test (offline)    ",
 	"memory_test (offline)      ",
 	"int_loopback_test (offline)",
@@ -2536,7 +2536,7 @@ static int bnx2x_get_sset_count(struct net_device *dev, int stringset)
 static void bnx2x_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
 {
 	struct bnx2x *bp = netdev_priv(dev);
-	int i, j, k, offset, start;
+	int i, j, k, start;
 	char queue_name[MAX_QUEUE_NAME_LEN+1];

 	switch (stringset) {
@@ -2572,13 +2572,8 @@ static void bnx2x_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
 			start = 0;
 		else
 			start = 4;
-		for (i = 0, j = start; j < (start + BNX2X_NUM_TESTS(bp));
-		     i++, j++) {
-			offset = sprintf(buf+32*i, "%s",
-					 bnx2x_tests_str_arr[j]);
-			*(buf+offset) = '\0';
-		}
-		break;
+		memcpy(buf, bnx2x_tests_str_arr + start,
+		       ETH_GSTRING_LEN * BNX2X_NUM_TESTS(bp));
 	}
 }

--
1.7.10

^ permalink raw reply related

* Re: loop back address question
From: Jan Engelhardt @ 2012-09-23 12:37 UTC (permalink / raw)
  To: ratheesh kannoth; +Cc: netdev
In-Reply-To: <CAGZFCEE=Txa4=meB7CTuWvcgFe4teiXG3LWbOua21TqXXYY1SQ@mail.gmail.com>

On Tuesday 2012-09-18 15:42, ratheesh kannoth wrote:

>Hi ,
>
>i have two  linux machines(2.6.29 ) A & B, both  connected  to same lan network
>
>1)  if  i assign 127.0.2.1 and 127.0.2.2 to interfaces , can we ping
>each other ?  is there any way to  send packets out of interface ?

1. The route "local 127.0.0.0/8 dev lo" in the local table has a higher 
priority than the main table, so you would need to reorder that first.

2. There are a handful of places in linux where ipv4_is_loopback() is 
used on addresses to take a different path, so the chances to 
successfully get 127.0.2.0/24 packets out an interface may not be 100%.

Just in case you tried selling blocks out of your 127 block to someone..

^ permalink raw reply

* Re: [RFC] tcp: use order-3 pages in tcp_sendmsg()
From: Jan Engelhardt @ 2012-09-23 12:47 UTC (permalink / raw)
  To: David Miller; +Cc: eric.dumazet, rick.jones2, subramanian.vijay, netdev
In-Reply-To: <20120921.122705.1344923255084267186.davem@davemloft.net>

On Friday 2012-09-21 18:27, David Miller wrote:

>From: Eric Dumazet <eric.dumazet@gmail.com>
>Date: Fri, 21 Sep 2012 17:48:31 +0200
>
>> There is probably a reason why lo default MTU is 16436 ?
>
>That's what fit into L1 caches back in 1999

Would it make sense to automatically set lo's MTU on device registration 
to the actual size of the L1 that the $running system has?

^ permalink raw reply

* Re: [PATCH] inet_diag: make config INET_DIAG bool
From: Jan Engelhardt @ 2012-09-23 13:40 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: Gao feng, netdev, davem, kuznet
In-Reply-To: <b133d67a-cb5f-4d46-b3d8-b1ac9448725e@tahiti.vyatta.com>

On Sunday 2012-09-23 06:36, Stephen Hemminger wrote:

>> when inet_diag being compiled as module, inet_diag_handler_dump
>> set netlink_dump_control.dump to inet_diag_dump,so if module
>> inet_diag is unloaded,netlink will still try to call this function
>> in netlink_dump. this will cause kernel panic.
>
>Another way to handle this to just get rid of inet_diag_exit
>so that module can be loaded but not unloaded.

Why don't we unset netlink_dump_control.dump on exit?

^ permalink raw reply

* Re: New commands to configure IOV features
From: Yuval Mintz @ 2012-09-23 15:49 UTC (permalink / raw)
  To: Don Dutile
  Cc: Yinghai Lu, Rose, Gregory V, Ben Hutchings, Bjorn Helgaas,
	davem@davemloft.net, netdev@vger.kernel.org, Ariel Elior,
	Eilon Greenstein, linux-pci
In-Reply-To: <505CC930.5070807@redhat.com>

>>> It provides 3 sysfs files (enable, disable, num_max_vfs);
>>> callbacks for enable&  disable.
>>
>> why using three? only pass max_vfs should be enough...
>>
>> aka pass 0 mean disabling, pass other value mean enabling.
>>
> could do that.  but I wouldn't use 'max_vfs'; I would recommend
> 'num_vfs', as max implies, er, um, the maximum, and what one
> wants is to be able to enable a number from 1->max_vfs.
> 'max_vfs' will be provided by another file.
> 

If we're at it, how do you suggest the configurations of the VFs
resources/properties should be made?
Notice I'm asking about properties which relate to all PCI devices,
E.g., number of interrupts assigned to each VF (for VF RSS).

(For properties which relate only to networking devices, we could
use standard network API such as netlink).

Perhaps given a configurable property that applies to all PCI devices,
once SRIOV is enabled and the VFs are created, there would be an
additional sysfs node under the VFs through which the user could
configure that property.

^ permalink raw reply

* Re: [RFC] tcp: use order-3 pages in tcp_sendmsg()
From: David Miller @ 2012-09-23 16:16 UTC (permalink / raw)
  To: jengelh; +Cc: eric.dumazet, rick.jones2, subramanian.vijay, netdev
In-Reply-To: <alpine.LNX.2.01.1209231446170.9625@nerf07.vanv.qr>

From: Jan Engelhardt <jengelh@inai.de>
Date: Sun, 23 Sep 2012 14:47:28 +0200 (CEST)

> On Friday 2012-09-21 18:27, David Miller wrote:
> 
>>From: Eric Dumazet <eric.dumazet@gmail.com>
>>Date: Fri, 21 Sep 2012 17:48:31 +0200
>>
>>> There is probably a reason why lo default MTU is 16436 ?
>>
>>That's what fit into L1 caches back in 1999
> 
> Would it make sense to automatically set lo's MTU on device
> registration to the actual size of the L1 that the $running system
> has?

I think a fixed value of 64K would be easiest, because another issue
is that back in 1999 we didn't have GRO/GSO/etc.

^ permalink raw reply

* [PATCH] net: scm: moving dereference after NULL check
From: Maxin B. John @ 2012-09-23 17:03 UTC (permalink / raw)
  To: David S. Miller; +Cc: Catherine Zhang, netdev, linux-kernel

scm_destroy_cred() dereferences 'scm' before null check

Signed-off-by: Maxin B. John <maxin.john@gmail.com>
---
diff --git a/include/net/scm.h b/include/net/scm.h
index 7dc0854..ed25aa1 100644
--- a/include/net/scm.h
+++ b/include/net/scm.h
@@ -64,9 +64,11 @@ static __inline__ void scm_destroy_cred(struct scm_cookie *scm)
 
 static __inline__ void scm_destroy(struct scm_cookie *scm)
 {
-	scm_destroy_cred(scm);
-	if (scm && scm->fp)
-		__scm_destroy(scm);
+	if (scm) {
+		scm_destroy_cred(scm);
+		if (scm->fp)
+			__scm_destroy(scm);
+	}
 }
 
 static __inline__ int scm_send(struct socket *sock, struct msghdr *msg,

^ permalink raw reply related

* Re: [RFC] tcp: use order-3 pages in tcp_sendmsg()
From: Jan Engelhardt @ 2012-09-23 17:40 UTC (permalink / raw)
  To: David Miller; +Cc: eric.dumazet, rick.jones2, subramanian.vijay, netdev
In-Reply-To: <20120923.121601.2244503178301624422.davem@davemloft.net>

On Sunday 2012-09-23 18:16, David Miller wrote:
>> On Friday 2012-09-21 18:27, David Miller wrote:
>> 
>>>From: Eric Dumazet <eric.dumazet@gmail.com>
>>>Date: Fri, 21 Sep 2012 17:48:31 +0200
>>>
>>>> There is probably a reason why lo default MTU is 16436 ?
>>>
>>>That's what fit into L1 caches back in 1999
>> 
>> Would it make sense to automatically set lo's MTU on device
>> registration to the actual size of the L1 that the $running system
>> has?
>
>I think a fixed value of 64K would be easiest, because another issue
>is that back in 1999 we didn't have GRO/GSO/etc.

Cache sizes, and an oddity.

L1 cache sizes have not increased ever since (the 2011 Intel i7-2600
has the same amount of L1 as a 1998ish AMD K6-2), and the Atom N450
even has less, namely 24d+32i, meaning a 13000ish MTU might be more
accurate for netbooks of this kind.

What would a MTU of 64K buy? Offloading seems pointless for the lo
device. There is no hardware to offload it to - well, the
machine already has the packet too.

^ permalink raw reply

* Re: [RFC] tcp: use order-3 pages in tcp_sendmsg()
From: Eric Dumazet @ 2012-09-23 18:13 UTC (permalink / raw)
  To: Jan Engelhardt; +Cc: David Miller, rick.jones2, subramanian.vijay, netdev
In-Reply-To: <alpine.LNX.2.01.1209231930520.3904@nerf07.vanv.qr>

On Sun, 2012-09-23 at 19:40 +0200, Jan Engelhardt wrote:

> 
> Cache sizes, and an oddity.
> 
> L1 cache sizes have not increased ever since (the 2011 Intel i7-2600
> has the same amount of L1 as a 1998ish AMD K6-2), and the Atom N450
> even has less, namely 24d+32i, meaning a 13000ish MTU might be more
> accurate for netbooks of this kind.
> 

MTU above 64K is not really useful, at least for IPv4, I think.

> What would a MTU of 64K buy? Offloading seems pointless for the lo
> device. There is no hardware to offload it to - well, the
> machine already has the packet too.

It buys 25% increase of performance. Not too bad...

I have yet to make sure no adjustment is needed, in the TCP stack for
example.

If you try to change lo mtu on an old kernel (eg 2.6.38, on my laptop),
you can notice some packet drops (TCPBacklogDrop)

^ permalink raw reply

* Re: [RFC] tcp: use order-3 pages in tcp_sendmsg()
From: David Miller @ 2012-09-23 18:27 UTC (permalink / raw)
  To: jengelh; +Cc: eric.dumazet, rick.jones2, subramanian.vijay, netdev
In-Reply-To: <alpine.LNX.2.01.1209231930520.3904@nerf07.vanv.qr>

From: Jan Engelhardt <jengelh@inai.de>
Date: Sun, 23 Sep 2012 19:40:33 +0200 (CEST)

> What would a MTU of 64K buy? Offloading seems pointless for the lo
> device. There is no hardware to offload it to - well, the
> machine already has the packet too.

Traversing the stack once instead of N times.

^ permalink raw reply

* Re: [PATCH v2] lxt PHY: Support for the buggy LXT973 rev A2
From: Lutz Jaenicke @ 2012-09-23 18:44 UTC (permalink / raw)
  To: leroy christophe; +Cc: David S Miller, netdev, linux-kernel
In-Reply-To: <505DE592.8060103@c-s.fr>

On Sat, Sep 22, 2012 at 06:21:38PM +0200, leroy christophe wrote:
> 
> Le 10/09/2012 20:17, Lutz Jaenicke a écrit :
> >On Mon, Sep 10, 2012 at 05:45:49PM +0200, Christophe Leroy wrote:
> >>This patch adds proper handling of the buggy revision A2 of LXT973 phy, adding
> >>precautions linked to ERRATA Item 4:
> >>
> >>Item 4: MDIO Interface and Repeated Polling
> >>Problem: Repeated polling of odd-numbered registers via the MDIO interface
> >>	randomly returns the contents of the previous even register.
> >>Implication: Managed applications may not obtain the correct register contents
> >>	when a particular register is monitored for device status.
> >>Workaround: None.
> >>Status: This erratum has been previously fixed (in rev A3)
> >Hmm. Where did you get the hardware from? We have been using LXT973 in
> >our products and the A2 was replaced by A3 in 2003.
> >
> >Best regards,
> >	Lutz
> They are custom boards that my company manufactures. Most boards
> manufactured before 2006 or 2007 have this buggy chip.

Ok, so this problem indeed is related to (very) old hardware.
I would not be sure whether the hack needed to work around
this bug should be added one decade after the chip has
been fixed.

Best regards,
	Lutz
PS.Yes, we have used the same workaround with such hardware, fortunately
we only had a few first boards using A2 and the series production
started just with A3 at that time.
-- 
Dr.-Ing. Lutz Jänicke
CTO
Innominate Security Technologies AG  /protecting industrial networks/
tel: +49.30.921028-200
fax: +49.30.921028-020
Rudower Chaussee 13
D-12489 Berlin, Germany
www.innominate.com

Register Court: AG Charlottenburg, HR B 81603
Management Board: Dirk Seewald
Chairman of the Supervisory Board: Christoph Leifer

^ permalink raw reply

* Re: [PATCH 2/7][RFC] netfilter: add xt_qtaguid matching module
From: Pablo Neira Ayuso @ 2012-09-23 21:26 UTC (permalink / raw)
  To: John Stultz
  Cc: LKML, JP Abgrall, netdev, Ashish Sharma, Peter P Waskiewicz Jr,
	netfilter-devel
In-Reply-To: <1348279853-44499-3-git-send-email-john.stultz@linaro.org>

Hi John,

Cc'ing netfilter-devel (better than only netdev, to attract the
attention from other Netfilter hacker fellows).

Some comments on this:

On Fri, Sep 21, 2012 at 10:10:48PM -0400, John Stultz wrote:
> From: JP Abgrall <jpa@google.com>
> 
> This module allows tracking stats at the socket level for given UIDs.
> It replaces xt_owner.
> If the --uid-owner is not specified, it will just count stats based on
> who the skb belongs to. This will even happen on incoming skbs as it
> looks into the skb via xt_socket magic to see who owns it.
> If an skb is lost, it will be assigned to uid=0.
> 
> To control what sockets of what UIDs are tagged by what, one uses:
>   echo t $sock_fd $accounting_tag $the_billed_uid \
>      > /proc/net/xt_qtaguid/ctrl
>  So whenever an skb belongs to a sock_fd, it will be accounted against
>    $the_billed_uid
>   and matching stats will show up under the uid with the given
>    $accounting_tag.
> 
> Because the number of allocations for the stats structs is not that big:
>   ~500 apps * 32 per app
> we'll just do it atomic. This avoids walking lists many times, and
> the fancy worker thread handling. Slabs will grow when needed later.
> 
> It uses netdevice and inetaddr notifications instead of hooks in the core dev
> code to track when a device comes and goes. This removes the need for
> exposed iface_stat.h.
> 
> Put procfs dirs in /proc/net/xt_qtaguid/
>   ctrl
>   stats
>   iface_stat/<iface>/...
> The uid stats are obtainable in ./stats.

Unless I'm missing anything worth in this patch, this seems to me like
a combo match of owner + nfacct infrastructure.

I guess you can probably get all done with one single rule, but that
is not enough to justify its inclusion in mainline.

In case you are not familiar with the nfacct infrastructure:

http://lwn.net/Articles/472094/

I'd be happy anyway if you provide more examples on how you use this, so I
can assure you we can do this with the existing infrastructure in
mainstream.

Thanks!

^ permalink raw reply

* Re: [PATCH 6/7][RFC] netfilter: xt_IDLETIMER: Add new netlink msg type
From: Pablo Neira Ayuso @ 2012-09-23 21:41 UTC (permalink / raw)
  To: John Stultz
  Cc: LKML, JP Abgrall, netdev, Ashish Sharma, Peter P Waskiewicz Jr
In-Reply-To: <1348279853-44499-7-git-send-email-john.stultz@linaro.org>

On Fri, Sep 21, 2012 at 10:10:52PM -0400, John Stultz wrote:
> From: JP Abgrall <jpa@google.com>
> 
> Send notifications when the label becomes active after an idle period.
> Send netlink message notifications in addition to sysfs notifications.
> Using a uevent with
>   subsystem=xt_idletimer
>   INTERFACE=...
>   STATE={active,inactive}
> 
> This is a backport from common android-3.0
> commit: beb914e987cbbd368988d2b94a6661cb907c4d5a
> with uevent support instead of a new netlink message type.

I think this can be a candidate for inclusion once some cosmetic
issues are fixed.

> Cc: netdev@vger.kernel.org
> Cc: JP Abgrall <jpa@google.com>
> Cc: Ashish Sharma <ashishsharma@google.com>
> Cc: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
> Signed-off-by: Ashish Sharma <ashishsharma@google.com>
> Signed-off-by: JP Abgrall <jpa@google.com>
> Signed-off-by: John Stultz <john.stultz@linaro.org>
> ---
>  include/linux/netfilter/xt_IDLETIMER.h |    8 ++++
>  net/netfilter/xt_IDLETIMER.c           |   78 +++++++++++++++++++++++++++++---
>  2 files changed, 79 insertions(+), 7 deletions(-)
> 
> diff --git a/include/linux/netfilter/xt_IDLETIMER.h b/include/linux/netfilter/xt_IDLETIMER.h
> index 208ae93..faaa28b 100644
> --- a/include/linux/netfilter/xt_IDLETIMER.h
> +++ b/include/linux/netfilter/xt_IDLETIMER.h
> @@ -4,6 +4,7 @@
>   * Header file for Xtables timer target module.
>   *
>   * Copyright (C) 2004, 2010 Nokia Corporation
> + *

we can live without that extra line above.

>   * Written by Timo Teras <ext-timo.teras@nokia.com>
>   *
>   * Converted to x_tables and forward-ported to 2.6.34
> @@ -32,12 +33,19 @@
>  #include <linux/types.h>
>  
>  #define MAX_IDLETIMER_LABEL_SIZE 28
> +#define NLMSG_MAX_SIZE 64

better define this NLMSG_MAX_SIZE inside xt_IDLETIMER.c, this header
is exported to user-space and that definition is not useful.

> +
> +#define NL_EVENT_TYPE_INACTIVE 0
> +#define NL_EVENT_TYPE_ACTIVE 1
>  
>  struct idletimer_tg_info {
>  	__u32 timeout;
>  
>  	char label[MAX_IDLETIMER_LABEL_SIZE];
>  
> +	/* Use netlink messages for notification in addition to sysfs */
> +	__u8 send_nl_msg;
> +
>  	/* for kernel module internal use only */
>  	struct idletimer_tg *timer __attribute__((aligned(8)));
>  };

you have to define two structures:

struct idletimer_tg_info
struct idletimer_tg_info_v1

and register both revisions. This (rudimentary) mechanism allows us to
keep backward compatibility.

See net/netfilter/xt_socket.c for instance.

> diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
> index f407ebc1..df91e26 100644
> --- a/net/netfilter/xt_IDLETIMER.c
> +++ b/net/netfilter/xt_IDLETIMER.c
> @@ -5,6 +5,7 @@
>   * After timer expires a kevent will be sent.
>   *
>   * Copyright (C) 2004, 2010 Nokia Corporation
> + *
>   * Written by Timo Teras <ext-timo.teras@nokia.com>
>   *
>   * Converted to x_tables and reworked for upstream inclusion
> @@ -38,8 +39,10 @@
>  #include <linux/netfilter/xt_IDLETIMER.h>
>  #include <linux/kdev_t.h>
>  #include <linux/kobject.h>
> +#include <linux/skbuff.h>
>  #include <linux/workqueue.h>
>  #include <linux/sysfs.h>
> +#include <net/net_namespace.h>
>  
>  struct idletimer_tg_attr {
>  	struct attribute attr;
> @@ -56,6 +59,8 @@ struct idletimer_tg {
>  	struct idletimer_tg_attr attr;
>  
>  	unsigned int refcnt;
> +	bool send_nl_msg;
> +	bool active;
>  };
>  
>  static LIST_HEAD(idletimer_tg_list);
> @@ -63,6 +68,32 @@ static DEFINE_MUTEX(list_mutex);
>  
>  static struct kobject *idletimer_tg_kobj;
>  
> +static void notify_netlink_uevent(const char *iface, struct idletimer_tg *timer)
> +{
> +	char iface_msg[NLMSG_MAX_SIZE];
> +	char state_msg[NLMSG_MAX_SIZE];
> +	char *envp[] = { iface_msg, state_msg, NULL };
> +	int res;
> +
> +	res = snprintf(iface_msg, NLMSG_MAX_SIZE, "INTERFACE=%s",
> +		       iface);
> +	if (NLMSG_MAX_SIZE <= res) {
> +		pr_err("message too long (%d)", res);
> +		return;
> +	}
> +	res = snprintf(state_msg, NLMSG_MAX_SIZE, "STATE=%s",
> +		       timer->active ? "active" : "inactive");
> +	if (NLMSG_MAX_SIZE <= res) {
> +		pr_err("message too long (%d)", res);
> +		return;
> +	}
> +	pr_debug("putting nlmsg: <%s> <%s>\n", iface_msg, state_msg);
> +	kobject_uevent_env(idletimer_tg_kobj, KOBJ_CHANGE, envp);
> +	return;
> +
> +

extra lines.

> +}
> +
>  static
>  struct idletimer_tg *__idletimer_tg_find_by_label(const char *label)
>  {
> @@ -83,6 +114,7 @@ static ssize_t idletimer_tg_show(struct kobject *kobj, struct attribute *attr,
>  {
>  	struct idletimer_tg *timer;
>  	unsigned long expires = 0;
> +	unsigned long now = jiffies;
>  
>  	mutex_lock(&list_mutex);
>  
> @@ -92,11 +124,15 @@ static ssize_t idletimer_tg_show(struct kobject *kobj, struct attribute *attr,
>  
>  	mutex_unlock(&list_mutex);
>  
> -	if (time_after(expires, jiffies))
> +	if (time_after(expires, now))
>  		return sprintf(buf, "%u\n",
> -			       jiffies_to_msecs(expires - jiffies) / 1000);
> +			       jiffies_to_msecs(expires - now) / 1000);
>  
> -	return sprintf(buf, "0\n");
> +	if (timer->send_nl_msg)
> +		return sprintf(buf, "0 %d\n",
> +			jiffies_to_msecs(now - expires) / 1000);
> +	else
> +		return sprintf(buf, "0\n");
>  }
>  
>  static void idletimer_tg_work(struct work_struct *work)
> @@ -105,6 +141,9 @@ static void idletimer_tg_work(struct work_struct *work)
>  						  work);
>  
>  	sysfs_notify(idletimer_tg_kobj, NULL, timer->attr.attr.name);
> +
> +	if (timer->send_nl_msg)
> +		notify_netlink_uevent(timer->attr.attr.name, timer);
>  }
>  
>  static void idletimer_tg_expired(unsigned long data)
> @@ -113,6 +152,7 @@ static void idletimer_tg_expired(unsigned long data)
>  
>  	pr_debug("timer %s expired\n", timer->attr.attr.name);
>  
> +	timer->active = false;
>  	schedule_work(&timer->work);
>  }
>  
> @@ -145,6 +185,8 @@ static int idletimer_tg_create(struct idletimer_tg_info *info)
>  	setup_timer(&info->timer->timer, idletimer_tg_expired,
>  		    (unsigned long) info->timer);
>  	info->timer->refcnt = 1;
> +	info->timer->send_nl_msg = (info->send_nl_msg == 0) ? false : true;
> +	info->timer->active = true;
>  
>  	mod_timer(&info->timer->timer,
>  		  msecs_to_jiffies(info->timeout * 1000) + jiffies);
> @@ -168,14 +210,24 @@ static unsigned int idletimer_tg_target(struct sk_buff *skb,
>  					 const struct xt_action_param *par)
>  {
>  	const struct idletimer_tg_info *info = par->targinfo;
> +	unsigned long now = jiffies;
>  
>  	pr_debug("resetting timer %s, timeout period %u\n",
>  		 info->label, info->timeout);
>  
>  	BUG_ON(!info->timer);
>  
> +	info->timer->active = true;
> +
> +	if (time_before(info->timer->timer.expires, now)) {
> +		schedule_work(&info->timer->work);
> +		pr_debug("Starting timer %s (Expired, Jiffies): %lu, %lu\n",
> +			 info->label, info->timer->timer.expires, now);
> +	}

AFAICS, this seems to fix the case in which timer has expired and we
refresh it without sending the notification.

Should go in a different patch.

> +
> +	/* TODO: Avoid modifying timers on each packet */
>  	mod_timer(&info->timer->timer,
> -		  msecs_to_jiffies(info->timeout * 1000) + jiffies);
> +		  msecs_to_jiffies(info->timeout * 1000) + now);
>  
>  	return XT_CONTINUE;
>  }
> @@ -184,8 +236,9 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
>  {
>  	struct idletimer_tg_info *info = par->targinfo;
>  	int ret;
> +	unsigned long now = jiffies;
>  
> -	pr_debug("checkentry targinfo%s\n", info->label);
> +	pr_debug("checkentry targinfo %s\n", info->label);
>  
>  	if (info->timeout == 0) {
>  		pr_debug("timeout value is zero\n");
> @@ -204,8 +257,16 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
>  	info->timer = __idletimer_tg_find_by_label(info->label);
>  	if (info->timer) {
>  		info->timer->refcnt++;
> +		info->timer->active = true;
> +
> +		if (time_before(info->timer->timer.expires, now)) {
> +			schedule_work(&info->timer->work);
> +			pr_debug("Starting Checkentry timer (Expired, Jiffies): %lu, %lu\n",
> +				info->timer->timer.expires, now);
> +		}
> +
>  		mod_timer(&info->timer->timer,
> -			  msecs_to_jiffies(info->timeout * 1000) + jiffies);
> +			  msecs_to_jiffies(info->timeout * 1000) + now);
>  
>  		pr_debug("increased refcnt of timer %s to %u\n",
>  			 info->label, info->timer->refcnt);
> @@ -219,6 +280,7 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
>  	}
>  
>  	mutex_unlock(&list_mutex);
> +

we don't need this.

>  	return 0;
>  }
>  
> @@ -240,7 +302,7 @@ static void idletimer_tg_destroy(const struct xt_tgdtor_param *par)
>  		kfree(info->timer);
>  	} else {
>  		pr_debug("decreased refcnt of timer %s to %u\n",
> -			 info->label, info->timer->refcnt);
> +		info->label, info->timer->refcnt);

remove this change.

>  	}
>  
>  	mutex_unlock(&list_mutex);
> @@ -248,6 +310,7 @@ static void idletimer_tg_destroy(const struct xt_tgdtor_param *par)
>  
>  static struct xt_target idletimer_tg __read_mostly = {
>  	.name		= "IDLETIMER",
> +	.revision	= 1,
>  	.family		= NFPROTO_UNSPEC,
>  	.target		= idletimer_tg_target,
>  	.targetsize     = sizeof(struct idletimer_tg_info),
> @@ -313,3 +376,4 @@ MODULE_DESCRIPTION("Xtables: idle time monitor");
>  MODULE_LICENSE("GPL v2");
>  MODULE_ALIAS("ipt_IDLETIMER");
>  MODULE_ALIAS("ip6t_IDLETIMER");
> +MODULE_ALIAS("arpt_IDLETIMER");

Interesting, they are really using this with arptables?

> -- 
> 1.7.9.5
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply

* [PATCH 1/3 V4] phy/micrel: Implement support for KSZ8021
From: Marek Vasut @ 2012-09-24  1:29 UTC (permalink / raw)
  To: netdev; +Cc: Marek Vasut, David J. Choi, David S. Miller, Nobuhiro Iwamatsu
In-Reply-To: <20120922.153122.1337768859179326955.davem@davemloft.net>

The KSZ8021 PHY was previously caught by KS8051, which is not correct.
This PHY needs additional setup if it is strapped for address 0. In such
case a reserved bit must be written in the 0x16, "Operation Mode Strap
Override" register. According to the KS8051 datasheet, that bit means
"PHY Address 0 in non-broadcast" and it indeed behaves as such on KSZ8021.
The issue where the ethernet controller (Freescale FEC) did not communicate
with network is fixed by writing this bit as 1.

Signed-off-by: Marek Vasut <marex@denx.de>
Cc: David J. Choi <david.choi@micrel.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Nobuhiro Iwamatsu <nobuhiro.iwamatsu.yj@renesas.com>
---
 drivers/net/phy/micrel.c   |   27 +++++++++++++++++++++++++++
 include/linux/micrel_phy.h |    1 +
 2 files changed, 28 insertions(+)

V2: Also add entry into micrel_tbl
V3: - Rectify the indent problem in ksz8021_config_init() by introducing
      const int value and using it in phy_write()
    - Fix checkpatch issue in ksphy_driver array, put | at the end of line
V4: Also align SUPPORTED_Asym_Pause to previous line

diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index cf287e0..e8e00dc 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -21,6 +21,12 @@
 #include <linux/phy.h>
 #include <linux/micrel_phy.h>
 
+/* Operation Mode Strap Override */
+#define MII_KSZPHY_OMSO				0x16
+#define KSZPHY_OMSO_B_CAST_OFF			(1 << 9)
+#define KSZPHY_OMSO_RMII_OVERRIDE		(1 << 1)
+#define KSZPHY_OMSO_MII_OVERRIDE		(1 << 0)
+
 /* general Interrupt control/status reg in vendor specific block. */
 #define MII_KSZPHY_INTCS			0x1B
 #define	KSZPHY_INTCS_JABBER			(1 << 15)
@@ -101,6 +107,13 @@ static int kszphy_config_init(struct phy_device *phydev)
 	return 0;
 }
 
+static int ksz8021_config_init(struct phy_device *phydev)
+{
+	const u16 val = KSZPHY_OMSO_B_CAST_OFF | KSZPHY_OMSO_RMII_OVERRIDE;
+	phy_write(phydev, MII_KSZPHY_OMSO, val);
+	return 0;
+}
+
 static int ks8051_config_init(struct phy_device *phydev)
 {
 	int regval;
@@ -128,6 +141,19 @@ static struct phy_driver ksphy_driver[] = {
 	.config_intr	= ks8737_config_intr,
 	.driver		= { .owner = THIS_MODULE,},
 }, {
+	.phy_id		= PHY_ID_KSZ8021,
+	.phy_id_mask	= 0x00ffffff,
+	.name		= "Micrel KSZ8021",
+	.features	= (PHY_BASIC_FEATURES | SUPPORTED_Pause |
+			   SUPPORTED_Asym_Pause),
+	.flags		= PHY_HAS_MAGICANEG | PHY_HAS_INTERRUPT,
+	.config_init	= ksz8021_config_init,
+	.config_aneg	= genphy_config_aneg,
+	.read_status	= genphy_read_status,
+	.ack_interrupt	= kszphy_ack_interrupt,
+	.config_intr	= kszphy_config_intr,
+	.driver		= { .owner = THIS_MODULE,},
+}, {
 	.phy_id		= PHY_ID_KS8041,
 	.phy_id_mask	= 0x00fffff0,
 	.name		= "Micrel KS8041",
@@ -203,6 +229,7 @@ static struct mdio_device_id __maybe_unused micrel_tbl[] = {
 	{ PHY_ID_KSZ9021, 0x000ffffe },
 	{ PHY_ID_KS8001, 0x00ffffff },
 	{ PHY_ID_KS8737, 0x00fffff0 },
+	{ PHY_ID_KSZ8021, 0x00ffffff },
 	{ PHY_ID_KS8041, 0x00fffff0 },
 	{ PHY_ID_KS8051, 0x00fffff0 },
 	{ }
diff --git a/include/linux/micrel_phy.h b/include/linux/micrel_phy.h
index 61f0905..be7f366 100644
--- a/include/linux/micrel_phy.h
+++ b/include/linux/micrel_phy.h
@@ -5,6 +5,7 @@
 
 #define PHY_ID_KSZ9021		0x00221610
 #define PHY_ID_KS8737		0x00221720
+#define PHY_ID_KSZ8021		0x00221555
 #define PHY_ID_KS8041		0x00221510
 #define PHY_ID_KS8051		0x00221550
 /* both for ks8001 Rev. A/B, and for ks8721 Rev 3. */
-- 
1.7.10.4

^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox