linux-hotplug.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Kay Sievers <kay.sievers@vrfy.org>
To: linux-hotplug@vger.kernel.org
Subject: Re: Hanging udev process on nfs-mounted /dev
Date: Wed, 06 Oct 2004 12:00:46 +0000	[thread overview]
Message-ID: <20041006120046.GA9768@vrfy.org> (raw)
In-Reply-To: <415980BF.1020401@bio.ifi.lmu.de>

[-- Attachment #1: Type: text/plain, Size: 1162 bytes --]

On Wed, Oct 06, 2004 at 08:06:29AM +0200, Frank Steiner wrote:
> Frank Steiner wrote
> 
> >Kay Sievers wrote
> >
> >
> >>All here known failure paths are now covered in the attached patch by
> >>limiting the iteration count for loops over data read from disk. Let's
> >>see what happens next :)
> >
> >
> >Ok :-) I'm running this patch in parallel on hosts using tmpfs for /dev
> >as well as "old" ones still using NFS. I will report success or failures!
> 
> Well, we are not done yet :-) Another udev hanging with 100% CPU, with
> the latest deadlock patch applied. Here's the trace (I removed the "(gdb) s"
> lines):

Funny, this is not inside the tdb, it's caused by getting garbage from the
udev database.

You don't use {all_partitions}, right?

I expect, that the payload-data from the tdb is corrupt and the record
claims, that the udev_add has added > 2.000.000 partition-nodes for the
device :) You should have something like this in the syslog:
  "removing partitions 'sg2[1-24532432]'"

Here is a patch for it. It's against the current tree after our patch
session yesterday. But it should successfully apply to your tree with
offsets.

Thanks,
Kay


[-- Attachment #2: udev-deadlock-debug-04.patch --]
[-- Type: text/plain, Size: 11731 bytes --]

===== namedev.c 1.147 vs edited =====
--- 1.147/namedev.c	2004-09-20 16:01:58 +02:00
+++ edited/namedev.c	2004-10-06 13:09:20 +02:00
@@ -29,7 +29,6 @@
 #include <ctype.h>
 #include <unistd.h>
 #include <errno.h>
-#include <time.h>
 #include <sys/wait.h>
 #include <sys/stat.h>
 #include <sys/sysinfo.h>
@@ -353,7 +352,6 @@ static struct bus_file {
 	{}
 };
 
-#define SECONDS_TO_WAIT_FOR_FILE	10
 static void wait_for_device_to_initialize(struct sysfs_device *sysfs_device)
 {
 	/* sleep until we see the file for this specific bus type show up this
@@ -367,14 +365,14 @@ static void wait_for_device_to_initializ
 	struct bus_file *b = &bus_files[0];
 	struct sysfs_attribute *tmpattr;
 	int found = 0;
-	int loop = SECONDS_TO_WAIT_FOR_FILE;
+	int loop = WAIT_FOR_FILE_SECONDS * WAIT_FOR_FILE_RETRY_FREQ;
 
 	while (1) {
 		if (b->bus == NULL) {
 			if (!found)
 				break;
-			/* sleep to give the kernel a chance to create the file */
-			sleep(1);
+			/* give the kernel a chance to create the file */
+			usleep(1000 * 1000 / WAIT_FOR_FILE_RETRY_FREQ);
 			--loop;
 			if (loop == 0)
 				break;
@@ -394,7 +392,8 @@ static void wait_for_device_to_initializ
 	}
 	if (!found)
 		dbg("did not find bus type '%s' on list of bus_id_files, "
-		    "contact greg@kroah.com", sysfs_device->bus);
+		    "please report to <linux-hotplug-devel@lists.sourceforge.net>",
+		    sysfs_device->bus);
 exit:
 	return; /* here to prevent compiler warning... */
 }
@@ -680,7 +679,6 @@ static struct sysfs_device *get_sysfs_de
 {
 	struct sysfs_device *sysfs_device;
 	struct sysfs_class_device *class_dev_parent;
-	struct timespec tspec;
 	int loop;
 
 	/* Figure out where the device symlink is at.  For char devices this will
@@ -696,16 +694,14 @@ static struct sysfs_device *get_sysfs_de
 	if (class_dev_parent != NULL) 
 		dbg("given class device has a parent, use this instead");
 
-	tspec.tv_sec = 0;
-	tspec.tv_nsec = 10000000;  /* sleep 10 millisec */
-	loop = 10;
+	loop = WAIT_FOR_FILE_SECONDS * WAIT_FOR_FILE_RETRY_FREQ;
 	while (loop--) {
 		if (udev_sleep) {
 			if (whitelist_search(class_dev)) {
 				sysfs_device = NULL;
 				goto exit;
 			}
-			nanosleep(&tspec, NULL);
+			usleep(1000 * 1000 / WAIT_FOR_FILE_RETRY_FREQ);
 		}
 
 		if (class_dev_parent)
@@ -727,11 +723,9 @@ device_found:
 		if (sysfs_device->bus[0] != '\0')
 			goto bus_found;
 
-		loop = 10;
-		tspec.tv_nsec = 10000000;
 		while (loop--) {
 			if (udev_sleep)
-				nanosleep(&tspec, NULL);
+				usleep(1000 * 1000 / WAIT_FOR_FILE_RETRY_FREQ);
 			sysfs_get_device_bus(sysfs_device);
 			
 			if (sysfs_device->bus[0] != '\0')
===== udev-add.c 1.74 vs edited =====
--- 1.74/udev-add.c	2004-10-06 00:22:36 +02:00
+++ edited/udev-add.c	2004-10-06 13:09:20 +02:00
@@ -348,11 +348,10 @@ exit:
 /* wait for the "dev" file to show up in the directory in sysfs.
  * If it doesn't happen in about 10 seconds, give up.
  */
-#define SECONDS_TO_WAIT_FOR_FILE	10
 static int sleep_for_file(const char *path, char* file)
 {
 	char filename[SYSFS_PATH_MAX + 6];
-	int loop = SECONDS_TO_WAIT_FOR_FILE;
+	int loop = WAIT_FOR_FILE_SECONDS * WAIT_FOR_FILE_RETRY_FREQ;
 	int retval;
 
 	strfieldcpy(filename, sysfs_path);
@@ -368,7 +367,7 @@ static int sleep_for_file(const char *pa
 			goto exit;
 
 		/* sleep to give the kernel a chance to create the dev file */
-		sleep(1);
+		usleep(1000 * 1000 / WAIT_FOR_FILE_RETRY_FREQ);
 	}
 	retval = -ENODEV;
 exit:
===== udev-remove.c 1.33 vs edited =====
--- 1.33/udev-remove.c	2004-08-05 00:40:11 +02:00
+++ edited/udev-remove.c	2004-10-06 13:24:28 +02:00
@@ -109,6 +109,7 @@ static int delete_node(struct udevice *d
 	int i;
 	char *pos;
 	int len;
+	int num;
 
 	strfieldcpy(filename, udev_root);
 	strfieldcat(filename, dev->name);
@@ -118,10 +119,15 @@ static int delete_node(struct udevice *d
 	if (retval)
 		return retval;
 
-	/* remove partition nodes */
-	if (dev->partitions > 0) {
-		info("removing partitions '%s[1-%i]'", filename, dev->partitions);
-		for (i = 1; i <= dev->partitions; i++) {
+	/* remove all_partitions nodes */
+	num = dev->partitions;
+	if (num > 0) {
+		info("removing all_partitions '%s[1-%i]'", filename, num);
+		if (num > PARTITIONS_COUNT) {
+			info("garbage from udev database, skip all_partitions removal");
+			return -1;
+		}
+		for (i = 1; i <= num; i++) {
 			strfieldcpy(partitionname, filename);
 			strintcat(partitionname, i);
 			secure_unlink(partitionname);
===== udev.c 1.62 vs edited =====
--- 1.62/udev.c	2004-09-14 02:25:32 +02:00
+++ edited/udev.c	2004-10-06 13:09:20 +02:00
@@ -36,6 +36,9 @@
 #include "namedev.h"
 #include "udevdb.h"
 
+/* timeout flag for udevdb */
+extern sig_atomic_t gotalarm;
+
 /* global variables */
 char **main_argv;
 char **main_envp;
@@ -58,6 +61,11 @@ void log_message(int level, const char *
 asmlinkage static void sig_handler(int signum)
 {
 	switch (signum) {
+		case SIGALRM:
+			gotalarm = 1;
+			info("error: timeout reached, event probably not handled correctly, "
+			     "please report to <linux-hotplug-devel@lists.sourceforge.net> ");
+			break;
 		case SIGINT:
 		case SIGTERM:
 			udevdb_exit();
@@ -94,7 +102,8 @@ int main(int argc, char *argv[], char *e
 
 	dbg("version %s", UDEV_VERSION);
 
-	/* initialize our configuration */
+	init_logging("udev");
+
 	udev_init_config();
 
 	if (strstr(argv[0], "udevstart")) {
@@ -147,15 +156,20 @@ int main(int argc, char *argv[], char *e
 	/* set signal handlers */
 	act.sa_handler = sig_handler;
 	sigemptyset (&act.sa_mask);
+
+	/* alarm should interrupt */
+	sigaction(SIGALRM, &act, NULL);
+
 	act.sa_flags = SA_RESTART;
 	sigaction(SIGINT, &act, NULL);
 	sigaction(SIGTERM, &act, NULL);
 
+	/* trigger timout to interrupt blocking syscalls */
+	alarm(ALARM_TIMEOUT);
+
 	/* initialize udev database */
-	if (udevdb_init(UDEVDB_DEFAULT) != 0) {
-		dbg("unable to initialize database");
-		goto exit;
-	}
+	if (udevdb_init(UDEVDB_DEFAULT) != 0)
+		info("error: unable to initialize database, continuing without database");
 
 	switch(act_type) {
 	case UDEVSTART:
===== udev.h 1.62 vs edited =====
--- 1.62/udev.h	2004-09-14 14:29:10 +02:00
+++ edited/udev.h	2004-10-06 13:09:21 +02:00
@@ -26,6 +26,9 @@
 #include <sys/param.h>
 #include "libsysfs/sysfs/libsysfs.h"
 
+#define ALARM_TIMEOUT			20
+#define WAIT_FOR_FILE_SECONDS		10
+#define WAIT_FOR_FILE_RETRY_FREQ	10
 #define COMMENT_CHARACTER		'#'
 
 #define NAME_SIZE			256
===== udevdb.c 1.30 vs edited =====
--- 1.30/udevdb.c	2004-06-29 14:51:35 +02:00
+++ edited/udevdb.c	2004-10-06 13:09:21 +02:00
@@ -42,13 +42,28 @@
 #include "tdb/tdb.h"
 
 static TDB_CONTEXT *udevdb;
+sig_atomic_t gotalarm;
 
+static void tdb_log(TDB_CONTEXT *tdb, int level, const char *format, ...)
+{
+	va_list args;
+
+	if (!udev_log)
+		return;
+
+	va_start(args, format);
+	vsyslog(level, format, args);
+	va_end(args);
+}
 
 int udevdb_add_dev(const char *path, const struct udevice *dev)
 {
 	TDB_DATA key, data;
 	char keystr[SYSFS_PATH_MAX];
 
+	if (udevdb == NULL)
+		return -1;
+
 	if ((path == NULL) || (dev == NULL))
 		return -ENODEV;
 
@@ -68,6 +83,9 @@ int udevdb_get_dev(const char *path, str
 {
 	TDB_DATA key, data;
 
+	if (udevdb == NULL)
+		return -1;
+
 	if (path == NULL)
 		return -ENODEV;
 
@@ -88,6 +106,9 @@ int udevdb_delete_dev(const char *path)
 	TDB_DATA key;
 	char keystr[SYSFS_PATH_MAX];
 
+	if (udevdb == NULL)
+		return -1;
+
 	if (path == NULL)
 		return -EINVAL;
 
@@ -121,7 +142,9 @@ int udevdb_init(int init_flag)
 	if (init_flag != UDEVDB_DEFAULT && init_flag != UDEVDB_INTERNAL)
 		return -EINVAL;
 
-	udevdb = tdb_open(udev_db_filename, 0, init_flag, O_RDWR | O_CREAT, 0644);
+	tdb_set_lock_alarm(&gotalarm);
+
+	udevdb = tdb_open_ex(udev_db_filename, 0, init_flag, O_RDWR | O_CREAT, 0644, tdb_log);
 	if (udevdb == NULL) {
 		if (init_flag == UDEVDB_INTERNAL)
 			dbg("unable to initialize in-memory database");
@@ -137,7 +160,7 @@ int udevdb_init(int init_flag)
  */
 int udevdb_open_ro(void)
 {
-	udevdb = tdb_open(udev_db_filename, 0, 0, O_RDONLY, 0);
+	udevdb = tdb_open_ex(udev_db_filename, 0, 0, O_RDONLY, 0, tdb_log);
 	if (udevdb == NULL) {
 		dbg("unable to open database at '%s'", udev_db_filename);
 		return -EACCES;
@@ -159,6 +182,9 @@ static int traverse_callback(TDB_CONTEXT
 int udevdb_call_foreach(int (*user_record_handler) (char *path, struct udevice *dev))
 {
 	int retval = 0;
+
+	if (udevdb == NULL)
+		return -1;
 
 	if (user_record_handler == NULL) {
 		dbg("invalid user record handling function");
===== tdb/tdb.c 1.4 vs edited =====
--- 1.4/tdb/tdb.c	2004-09-20 16:01:58 +02:00
+++ edited/tdb/tdb.c	2004-10-06 13:09:21 +02:00
@@ -617,8 +617,10 @@ int tdb_printfreelist(TDB_CONTEXT *tdb)
 static int remove_from_freelist(TDB_CONTEXT *tdb, tdb_off off, tdb_off next)
 {
 	tdb_off last_ptr, i;
+	int maxloop;
 
 	/* read in the freelist top */
+	maxloop = 100000;
 	last_ptr = FREELIST_TOP;
 	while (ofs_read(tdb, last_ptr, &i) != -1 && i != 0) {
 		if (i == off) {
@@ -627,6 +629,12 @@ static int remove_from_freelist(TDB_CONT
 		}
 		/* Follow chain (next offset is at start of record) */
 		last_ptr = i;
+
+		maxloop--;
+		if (maxloop == 0) {
+			TDB_LOG((tdb, 0, "remove_from_freelist: maxloop reached; corrupt database!\n"));
+			return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
+		}
 	}
 	TDB_LOG((tdb, 0,"remove_from_freelist: not on list at off=%d\n", off));
 	return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
@@ -853,6 +861,7 @@ static tdb_off tdb_allocate(TDB_CONTEXT 
 {
 	tdb_off rec_ptr, last_ptr, newrec_ptr;
 	struct list_struct newrec;
+	int maxloop;
 
 	if (tdb_lock(tdb, -1, F_WRLCK) == -1)
 		return 0;
@@ -868,6 +877,7 @@ static tdb_off tdb_allocate(TDB_CONTEXT 
 		goto fail;
 
 	/* keep looking until we find a freelist record big enough */
+	maxloop = 100000;
 	while (rec_ptr) {
 		if (rec_free_read(tdb, rec_ptr, rec) == -1)
 			goto fail;
@@ -919,6 +929,12 @@ static tdb_off tdb_allocate(TDB_CONTEXT 
 		/* move to the next record */
 		last_ptr = rec_ptr;
 		rec_ptr = rec->next;
+
+		maxloop--;
+		if (maxloop == 0) {
+			TDB_LOG((tdb, 0, "tdb_allocate: maxloop reached; corrupt database!\n"));
+			return TDB_ERRCODE(TDB_ERR_CORRUPT, 0);
+		}
 	}
 	/* we didn't find enough space. See if we can expand the
 	   database and if we can then try again */
@@ -981,12 +997,14 @@ static tdb_off tdb_find(TDB_CONTEXT *tdb
 			struct list_struct *r)
 {
 	tdb_off rec_ptr;
-	
+	int maxloop;
+
 	/* read in the hash top */
 	if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
 		return 0;
 
 	/* keep looking until we find the right record */
+	maxloop = 100000;
 	while (rec_ptr) {
 		if (rec_read(tdb, rec_ptr, r) == -1)
 			return 0;
@@ -1006,6 +1024,12 @@ static tdb_off tdb_find(TDB_CONTEXT *tdb
 			SAFE_FREE(k);
 		}
 		rec_ptr = r->next;
+
+		maxloop--;
+		if (maxloop == 0) {
+			TDB_LOG((tdb, 0, "tdb_find maxloop reached; corrupt database!\n"));
+			return TDB_ERRCODE(TDB_ERR_CORRUPT, 0);
+		}
 	}
 	return TDB_ERRCODE(TDB_ERR_NOEXIST, 0);
 }
@@ -1188,6 +1212,7 @@ static int do_delete(TDB_CONTEXT *tdb, t
 {
 	tdb_off last_ptr, i;
 	struct list_struct lastrec;
+	int maxloop;
 
 	if (tdb->read_only) return -1;
 
@@ -1202,9 +1227,18 @@ static int do_delete(TDB_CONTEXT *tdb, t
 	/* find previous record in hash chain */
 	if (ofs_read(tdb, TDB_HASH_TOP(rec->full_hash), &i) == -1)
 		return -1;
-	for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
+
+	maxloop = 100000;
+	for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next) {
 		if (rec_read(tdb, i, &lastrec) == -1)
 			return -1;
+
+		maxloop--;
+		if (maxloop == 0) {
+			TDB_LOG((tdb, 0, "(tdb)do_delete: maxloop reached; corrupt database!\n"));
+			return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
+		}
+	}
 
 	/* unlink it: next ptr is at start of record. */
 	if (last_ptr == 0)

  parent reply	other threads:[~2004-10-06 12:00 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2004-09-28 15:18 Hanging udev process on nfs-mounted /dev Frank Steiner
2004-09-29 17:18 ` Greg KH
2004-09-29 23:39 ` Kay Sievers
2004-09-30  2:11 ` Kay Sievers
2004-09-30  6:18 ` Frank Steiner
2004-09-30  6:21 ` Frank Steiner
2004-09-30 14:07 ` Kay Sievers
2004-10-01  6:25 ` Frank Steiner
2004-10-01  7:36 ` Kay Sievers
2004-10-01  7:38 ` Frank Steiner
2004-10-01  7:55 ` Frank Steiner
2004-10-01  8:08 ` Kay Sievers
2004-10-01  9:43 ` Frank Steiner
2004-10-01  9:57 ` Kay Sievers
2004-10-01 10:43 ` Kay Sievers
2004-10-01 22:18 ` Kay Sievers
2004-10-03 21:10 ` Frank Steiner
2004-10-03 23:07 ` Kay Sievers
2004-10-04  6:15 ` Frank Steiner
2004-10-04 14:19 ` Kay Sievers
2004-10-04 14:53 ` Frank Steiner
2004-10-05 15:37 ` Kay Sievers
2004-10-06  6:06 ` Frank Steiner
2004-10-06 12:00 ` Kay Sievers [this message]
2004-10-06 12:29 ` Frank Steiner
2004-10-08  5:59 ` Frank Steiner

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20041006120046.GA9768@vrfy.org \
    --to=kay.sievers@vrfy.org \
    --cc=linux-hotplug@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).