Re: GFS

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Thu, 2005-08-11 at 09:33 -0700, Zach Brown wrote:
> I don't think this patch is the way to go at all.  It imposes an
> allocation and vma walking overhead for the vast majority of IOs that
> aren't interested.  It doesn't look like it will get a consistent
> ordering when multiple file systems are concerned.  It doesn't record
> the ranges of the mappings involved so Lustre can't properly use its
> range locks.  And finally, it doesn't prohibit mapping operations for
> the duration of the IO -- the whole reason we ended up in this thread in
> the first place :)

Hmm. So how do you propose we get rid of the mandatory vma walk? I was
thinking of making iolock a config option so when you don't have any
filesystems that need it, it can go away. I have also optimized the
extra allocation away when there are none mmap'd files that require
locking.

As for the rest of your comments, I heartily agree with them and
hopefully some interested party will take care of them :-).

			Pekka

Index: 2.6-mm/fs/iolock.c
===================================================================
--- /dev/null
+++ 2.6-mm/fs/iolock.c
@@ -0,0 +1,183 @@
+/*
+ * I/O locking for memory regions. Used by filesystems that need special
+ * locking for mmap'd files.
+ */
+
+#include <linux/iolock.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+/*
+ * TODO:
+ *
+ *  - Deadlock when two nodes acquire iolocks in reverse order for two
+ *    different filesystems. Solution: use rbtree in iolock_chain so we
+ *    can walk iolocks in order. XXX: what order is stable for two nodes
+ *    that don't know about each other?
+ */
+
+/*
+ * I/O lock contains all files that participate in locking a memory region
+ * in an address_space.
+ */
+struct iolock {
+	struct address_space	*mapping;	/* the mapping all 'files' map */
+	unsigned long		nr_files;	/* entries used in 'files' */
+	struct file		**files;	/* files mmap'ing 'mapping' in the region */
+	struct list_head	chain;		/* link in iolock_chain->list */
+};
+
+/* Per-I/O list of iolocks, one entry per distinct address_space.  */
+struct iolock_chain {
+	struct list_head	list;		/* list of struct iolock */
+};
+
+/* Allocate an iolock with room for up to 'max_files' file pointers.
+   Returns NULL on allocation failure.  */
+static struct iolock *iolock_new(unsigned long max_files)
+{
+	struct iolock *iolock;
+
+	iolock = kzalloc(sizeof(*iolock), GFP_KERNEL);
+	if (!iolock)
+		return NULL;
+
+	iolock->files = kcalloc(max_files, sizeof(struct file *), GFP_KERNEL);
+	if (!iolock->files) {
+		kfree(iolock);
+		return NULL;
+	}
+	INIT_LIST_HEAD(&iolock->chain);
+	return iolock;
+}
+
+/* Allocate an empty iolock chain.  Returns NULL on allocation failure.  */
+static struct iolock_chain *iolock_chain_new(void)
+{
+	struct iolock_chain *chain;
+
+	chain = kzalloc(sizeof(*chain), GFP_KERNEL);
+	if (!chain)
+		return NULL;
+	INIT_LIST_HEAD(&chain->list);
+	return chain;
+}
+
+/* Call ->iolock_acquire for every iolock on the chain.  Returns 0 on
+   success or the first error; on error the caller is expected to undo
+   the locks acquired so far via iolock_release().  */
+static int iolock_chain_acquire(struct iolock_chain *chain)
+{
+	struct iolock *iolock;
+	int err = 0;
+
+	list_for_each_entry(iolock, &chain->list, chain) {
+		if (iolock->mapping->a_ops->iolock_acquire) {
+			err = iolock->mapping->a_ops->iolock_acquire(
+					iolock->files, iolock->nr_files);
+			/* Stop on the first *failure* -- the original
+			   tested '!err' and bailed out on success.  */
+			if (err)
+				goto error;
+		}
+	}
+error:
+	return err;
+}
+
+/* Find the iolock for 'mapping' on 'chain', or NULL if none exists.  */
+static struct iolock *iolock_lookup(struct iolock_chain *chain,
+				    struct address_space *mapping)
+{
+	struct iolock *iolock;
+
+	list_for_each_entry(iolock, &chain->list, chain) {
+		if (iolock->mapping == mapping)
+			return iolock;
+	}
+	return NULL;
+}
+
+/**
+ * iolock_region - Lock memory region for file I/O.
+ * @buf: the buffer we want to lock.
+ * @size: size of the buffer.
+ *
+ * Walks the VMAs covering [buf, buf + size) and builds one iolock per
+ * distinct address_space whose a_ops provide iolock_acquire/release,
+ * then acquires all of them.
+ *
+ * Returns a pointer to the iolock_chain or NULL to denote an empty chain;
+ * otherwise returns ERR_PTR().
+ */
+struct iolock_chain *iolock_region(const char __user *buf, size_t size)
+{
+	struct iolock_chain *ret = NULL;
+	int err = -ENOMEM;
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	unsigned long start = (unsigned long)buf;
+	unsigned long end = start + size;
+	int max_files;
+
+	down_read(&mm->mmap_sem);
+	max_files = mm->map_count;
+
+	for (vma = find_vma(mm, start); vma; vma = vma->vm_next) {
+		struct file *file;
+		struct address_space *mapping;
+		struct iolock *iolock;
+
+		if (end <= vma->vm_start)
+			break;
+
+		file = vma->vm_file;
+		if (!file)
+			continue;
+
+		mapping = file->f_mapping;
+		if (!mapping->a_ops->iolock_acquire ||
+		    !mapping->a_ops->iolock_release)
+			continue;
+
+		/* Allocate chain lazily to avoid initialization overhead
+		   when we don't have any files that require iolock.  */
+		if (!ret) {
+			ret = iolock_chain_new();
+			if (!ret)
+				goto error;
+		}
+
+		iolock = iolock_lookup(ret, mapping);
+		if (!iolock) {
+			iolock = iolock_new(max_files);
+			if (!iolock)
+				goto error;
+			iolock->mapping = mapping;
+			/* Link only a freshly created iolock; re-adding an
+			   already linked entry on every iteration (as the
+			   original did) corrupts the list.  */
+			list_add(&iolock->chain, &ret->list);
+		}
+
+		iolock->files[iolock->nr_files++] = file;
+	}
+
+	/* ret == NULL means no mapped file needs locking -- don't hand a
+	   NULL chain to iolock_chain_acquire().  */
+	if (ret) {
+		err = iolock_chain_acquire(ret);
+		if (err)
+			goto error;
+	}
+
+out:
+	up_read(&mm->mmap_sem);
+	return ret;
+
+error:
+	iolock_release(ret);
+	ret = ERR_PTR(err);
+	goto out;
+}
+
+/**
+ * iolock_release - Release file I/O locks for a memory region.
+ * @chain: The I/O lock chain to release. Passing NULL means no-op.
+ *
+ * Calls ->iolock_release for each iolock, then frees the iolock and
+ * finally the chain itself.
+ */
+void iolock_release(struct iolock_chain *chain)
+{
+	struct iolock *iolock, *next;
+
+	if (!chain)
+		return;
+
+	/* _safe variant: we kfree() the current entry inside the loop, so
+	   plain list_for_each_entry would read iolock->chain.next from
+	   freed memory.  */
+	list_for_each_entry_safe(iolock, next, &chain->list, chain) {
+		struct address_space *mapping = iolock->mapping;
+
+		if (mapping && mapping->a_ops->iolock_release)
+			mapping->a_ops->iolock_release(iolock->files,
+						       iolock->nr_files);
+		list_del(&iolock->chain);
+		kfree(iolock->files);
+		kfree(iolock);
+	}
+	kfree(chain);
+}
Index: 2.6-mm/fs/read_write.c
===================================================================
--- 2.6-mm.orig/fs/read_write.c
+++ 2.6-mm/fs/read_write.c
@@ -14,6 +14,7 @@
 #include <linux/security.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/iolock.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -247,14 +248,21 @@ ssize_t vfs_read(struct file *file, char
 	if (!ret) {
 		ret = security_file_permission (file, MAY_READ);
 		if (!ret) {
+			struct iolock_chain * lock = iolock_region(buf, count);
+			if (IS_ERR(lock)) {
+				ret = PTR_ERR(lock);
+				goto out;
+			}
 			if (file->f_op->read)
 				ret = file->f_op->read(file, buf, count, pos);
 			else
 				ret = do_sync_read(file, buf, count, pos);
+			iolock_release(lock);
 			if (ret > 0) {
 				fsnotify_access(file->f_dentry);
 				current->rchar += ret;
 			}
+  out:
 			current->syscr++;
 		}
 	}
@@ -298,14 +306,21 @@ ssize_t vfs_write(struct file *file, con
 	if (!ret) {
 		ret = security_file_permission (file, MAY_WRITE);
 		if (!ret) {
+			struct iolock_chain * lock = iolock_region(buf, count);
+			if (IS_ERR(lock)) {
+				ret = PTR_ERR(lock);
+				goto out;
+			}
 			if (file->f_op->write)
 				ret = file->f_op->write(file, buf, count, pos);
 			else
 				ret = do_sync_write(file, buf, count, pos);
+			iolock_release(lock);
 			if (ret > 0) {
 				fsnotify_modify(file->f_dentry);
 				current->wchar += ret;
 			}
+  out:
 			current->syscw++;
 		}
 	}
Index: 2.6-mm/include/linux/iolock.h
===================================================================
--- /dev/null
+++ 2.6-mm/include/linux/iolock.h
@@ -0,0 +1,11 @@
+#ifndef __LINUX_IOLOCK_H
+#define __LINUX_IOLOCK_H
+
+#include <linux/kernel.h>
+
+struct iolock_chain;
+
+/* Lock every mmap'd file overlapping [buf, buf + size) whose a_ops need
+   I/O locking.  Returns NULL for an empty chain, an iolock_chain pointer
+   on success, or ERR_PTR() on failure.  */
+extern struct iolock_chain *iolock_region(const char __user *, size_t);
+/* Release a chain returned by iolock_region(); NULL is a no-op.  */
+extern void iolock_release(struct iolock_chain *);
+
+#endif
Index: 2.6-mm/fs/Makefile
===================================================================
--- 2.6-mm.orig/fs/Makefile
+++ 2.6-mm/fs/Makefile
@@ -10,7 +10,7 @@ obj-y :=	open.o read_write.o file_table.
 		ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
 		attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \
-		ioprio.o
+		ioprio.o iolock.o
 
 obj-$(CONFIG_INOTIFY)		+= inotify.o
 obj-$(CONFIG_EPOLL)		+= eventpoll.o
Index: 2.6-mm/include/linux/fs.h
===================================================================
--- 2.6-mm.orig/include/linux/fs.h
+++ 2.6-mm/include/linux/fs.h
@@ -334,6 +334,8 @@ struct address_space_operations {
 			loff_t offset, unsigned long nr_segs);
 	struct page* (*get_xip_page)(struct address_space *, sector_t,
 			int);
+	int (*iolock_acquire)(struct file **, unsigned long);
+	void (*iolock_release)(struct file **, unsigned long);
 };
 
 struct backing_dev_info;


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[Index of Archives]     [Kernel Newbies]     [Netfilter]     [Bugtraq]     [Photo]     [Gimp]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Video 4 Linux]     [Linux for the blind]
  Powered by Linux