All of lore.kernel.org
 help / color / mirror / Atom feed
From: Alan Jenkins <alan-jenkins@tuffmail.co.uk>
To: linux-hotplug@vger.kernel.org
Subject: [PATCH] udevd: de-duplicate strings in rules
Date: Tue, 11 Nov 2008 20:20:11 +0000	[thread overview]
Message-ID: <4919E8FB.2050809@tuffmail.co.uk> (raw)

On my Ubuntu installation this removes 15k of duplicate strings,
using a temporary index of about 25k.

Signed-off-by: Alan Jenkins <alan-jenkins@tuffmail.co.uk>

diff --git a/udev/udev-rules.c b/udev/udev-rules.c
index 810a863..df5ed53 100644
--- a/udev/udev-rules.c
+++ b/udev/udev-rules.c
@@ -16,6 +16,7 @@
  */
 
 #include <stddef.h>
+#include <limits.h>
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
@@ -30,6 +31,7 @@
 
 #define PREALLOC_TOKEN			2048
 #define PREALLOC_STRBUF			32 * 1024
+#define PREALLOC_TRIE			256
 
 struct uid_gid {
 	unsigned int name_off;
@@ -39,7 +41,16 @@ struct uid_gid {
 	};
 };
 
-/* KEY="", KEY!="", KEY+="", KEY="", KEY:="" */
+#define TRIE_CHILD_MAX 10
+
+struct trie_node {
+	unsigned int value_off;
+	size_t value_len;
+	unsigned char child_cur;
+	unsigned char child_key[TRIE_CHILD_MAX];
+	unsigned short child[TRIE_CHILD_MAX];
+};
+
 struct udev_rules {
 	struct udev *udev;
 	int resolve_names;
@@ -55,6 +66,12 @@ struct udev_rules {
 	size_t buf_max;
 	unsigned int buf_count;
 
+	/* key strings are indexed to avoid wasting space with duplicates */
+	struct trie_node *trie;
+	unsigned short trie_cur;
+	unsigned short trie_max;
+	unsigned short *trie_root;
+
 	/* during rule parsing, we cache uid/gid lookup results */
 	struct uid_gid *uids;
 	unsigned int uids_cur;
@@ -64,6 +81,7 @@ struct udev_rules {
 	unsigned int gids_max;
 };
 
+/* KEY="", KEY!="", KEY+="", KEY="", KEY:="" */
 enum operation_type {
 	OP_UNSET,
 
@@ -392,25 +410,19 @@ static inline void dump_token(struct udev_rules *rules, struct token *token) {}
 static inline void dump_rules(struct udev_rules *rules) {}
 #endif /* DEBUG */
 
-/* we could lookup and return existing strings, or tails of strings */
-static int add_string(struct udev_rules *rules, const char *str)
+static int add_new_string(struct udev_rules *rules, const char *str, size_t bytes)
 {
-	size_t len = strlen(str)+1;
 	int off;
 
-	/* offset 0 is always '\0' */
-	if (str[0] = '\0')
-		return 0;
-
 	/* grow buffer if needed */
-	if (rules->buf_cur + len+1 >= rules->buf_max) {
+	if (rules->buf_cur + bytes+1 >= rules->buf_max) {
 		char *buf;
 		unsigned int add;
 
 		/* double the buffer size */
 		add = rules->buf_max;
-		if (add < len * 8)
-			add = len * 8;
+		if (add < bytes * 8)
+			add = bytes * 8;
 
 		buf = realloc(rules->buf, rules->buf_max + add);
 		if (buf = NULL)
@@ -420,15 +432,128 @@ static int add_string(struct udev_rules *rules, const char *str)
 		rules->buf_max += add;
 	}
 	off = rules->buf_cur;
-	memcpy(&rules->buf[rules->buf_cur], str, len);
-	rules->buf_cur += len;
+	memcpy(&rules->buf[rules->buf_cur], str, bytes);
+	rules->buf_cur += bytes;
 	rules->buf_count++;
 	return off;
 }
 
-static int add_token(struct udev_rules *rules, struct token *token)
+static unsigned char trie_child_slot(struct trie_node *node, char key)
 {
+	unsigned char child_slot;
+
+	for (child_slot = 0; child_slot < node->child_cur; child_slot++) {
+		if (node->child_key[child_slot] = key)
+			break;
+	}
+
+	return child_slot;
+}
+
+static int add_string(struct udev_rules *rules, const char *str)
+{
+	struct trie_node *child;
+	unsigned short child_off;
+	unsigned short node_off;
+	unsigned char key;
+	size_t len;
+	int depth;
+	unsigned int off;
+
+	len = strlen(str);
+
+	/* offset 0 is always '\0' */
+	if (len = 0)
+		return 0;
+
+	/* strings with spaces are probably commands e.g. modprobe,
+	   with unique arguments. */
+	if (strchr(str, ' ') != NULL)
+		return add_new_string(rules, str, len + 1);
+
+	/* descend root - start from last character of str */
+	key = str[len - 1];
+	node_off = rules->trie_root[key];
+	depth = 0;
+
+	/* descend suffix trie */
+	if (node_off != 0) {
+		while (1) {
+			struct trie_node *node = &rules->trie[node_off];
+			unsigned char child_slot;
+
+			depth++;
+			off = node->value_off + node->value_len - len;
+
+			/* match against current node */
+			if (depth = len ||
+			    (node->value_len >= len &&
+			    memcmp(&rules->buf[off], str, len) = 0))
+			{
+				return off;
+			}
 
+			/* lookup child node */
+			key = str[len - 1 - depth];
+			child_slot = trie_child_slot(node, key);
+
+			if(child_slot = node->child_cur)
+				break;
+
+			node_off = node->child[child_slot];
+		}
+	}
+
+	/* string not found, add it */
+	off = add_new_string(rules, str, len + 1);
+
+	/* grow trie storage if needed */
+	if (rules->trie_cur >= rules->trie_max) {
+		struct trie_node *trie;
+		unsigned short add;
+
+		/* double the buffer size */
+		add = rules->trie_max;
+		if (add < 8)
+			add = 8;
+
+		trie = realloc(rules->trie, (rules->trie_max + add) * sizeof(struct trie_node));
+		if (trie = NULL)
+			return -1;
+		dbg(rules->udev, "extend string index nodes from %u to %u\n", rules->trie_max, rules->trie_max + add);
+		rules->trie = trie;
+		rules->trie_max += add;
+	}
+
+	/* insert new child node */
+	child_off = rules->trie_cur;
+	if (depth = 0) {
+		rules->trie_root[key] = child_off;
+	} else {
+		struct trie_node *parent = &rules->trie[node_off];
+		unsigned char child_slot = parent->child_cur;
+
+		/* no space in parent, we can't index this string, nevermind */
+		if (child_slot = TRIE_CHILD_MAX)
+			return off;
+
+		parent->child[child_slot] = child_off;
+		parent->child_key[child_slot] = key;
+		parent->child_cur = child_slot + 1;
+	}
+
+	/* allocate and construct the child node */
+	rules->trie_cur++;
+	child = &rules->trie[child_off];
+	memset(child, 0x00, sizeof(struct trie_node));
+	child->value_off = off;
+	child->value_len = len;
+
+	return off;
+}
+
+static int add_token(struct udev_rules *rules, struct token *token)
+{
 	/* grow buffer if needed */
 	if (rules->token_cur+1 >= rules->token_max) {
 		struct token *tokens;
@@ -1607,9 +1732,16 @@ struct udev_rules *udev_rules_new(struct udev *udev, int resolve_names)
 	if (rules->buf = NULL)
 		return NULL;
 	rules->buf_max = PREALLOC_STRBUF;
+	rules->trie = malloc(PREALLOC_TRIE * sizeof(struct trie_node));
+	if (rules->trie = NULL)
+		return NULL;
+	rules->trie_max = PREALLOC_TRIE;
+	rules->trie_root = calloc(UCHAR_MAX + 1, sizeof(unsigned short));
 	/* offset 0 is always '\0' */
 	rules->buf[0] = '\0';
 	rules->buf_cur = 1;
+	/* offset 0 is reserved for the null trie node */
+	rules->trie_cur = 1;
 	dbg(udev, "prealloc %zu bytes tokens (%u * %zu bytes), %zu bytes buffer\n",
 	    rules->token_max * sizeof(struct token), rules->token_max, sizeof(struct token), rules->buf_max);
 
@@ -1725,6 +1857,16 @@ struct udev_rules *udev_rules_new(struct udev *udev, int resolve_names)
 	}
 	info(udev, "shrunk to %zu bytes tokens (%u * %zu bytes), %zu bytes buffer\n",
 	     rules->token_max * sizeof(struct token), rules->token_max, sizeof(struct token), rules->buf_max);
+	info(udev, "used %zu bytes of string index nodes (%hu * %zu bytes)\n",
+	     rules->trie_cur * sizeof(struct trie_node), rules->trie_cur, sizeof(struct trie_node));
+
+	/* cleanup trie */
+	free(rules->trie);
+	rules->trie = NULL;
+	rules->trie_cur = 0;
+	rules->trie_max = 0;
+	free(rules->trie_root);
+	rules->trie_root = NULL;
 
 	/* cleanup uid/gid cache */
 	free(rules->uids);
@@ -1746,6 +1888,8 @@ void udev_rules_unref(struct udev_rules *rules)
 		return;
 	free(rules->tokens);
 	free(rules->buf);
+	free(rules->trie);
+	free(rules->trie_root);
 	free(rules->uids);
 	free(rules->gids);
 	free(rules);



             reply	other threads:[~2008-11-11 20:20 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-11-11 20:20 Alan Jenkins [this message]
2008-11-11 21:23 ` [PATCH] udevd: de-duplicate strings in rules Kay Sievers
2008-11-12  5:11 ` Kay Sievers
2008-11-12 16:50 ` Alan Jenkins
2008-11-12 18:00 ` Kay Sievers
2008-11-12 18:05 ` Alan Jenkins
2008-11-12 18:20 ` Kay Sievers
2008-11-12 20:48 ` Alan Jenkins
2008-11-12 21:37 ` Kay Sievers
2008-11-13 10:12 ` Alan Jenkins
2008-11-13 19:03 ` Kay Sievers

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4919E8FB.2050809@tuffmail.co.uk \
    --to=alan-jenkins@tuffmail.co.uk \
    --cc=linux-hotplug@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.