[PATCH] sys_vmsplice

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi,

Here's the first implementation of sys_vmsplice(). I'm attaching a
little test app as well for playing with it, it's also committed to the
splice tools repo at:

        git://brick.kernel.dk/data/git/splice.git

Patch is against current Linus -git, it's also included in the splice
branch of the block git repo.

-----

sys_splice() moves data to/from pipes with a file input/output. sys_vmsplice()
moves data to a pipe, with the input being a user address range instead.

Signed-off-by: Jens Axboe <[email protected]>

---

 arch/ia64/kernel/entry.S     |    1 
 arch/powerpc/kernel/systbl.S |    1 
 fs/fcntl.c                   |    8 ++
 fs/splice.c                  |  181 +++++++++++++++++++++++++++++++++++-------
 include/asm-generic/fcntl.h  |    5 +
 include/asm-i386/unistd.h    |    3 -
 include/asm-ia64/unistd.h    |    1 
 include/asm-powerpc/unistd.h |    3 -
 include/asm-x86_64/unistd.h  |    4 +
 include/linux/syscalls.h     |    3 +
 10 files changed, 176 insertions(+), 34 deletions(-)

888642ed2f315862a4cc815c1a9029a328adbd33
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index e307988..bcb80ca 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -1610,5 +1610,6 @@ sys_call_table:
 	data8 sys_get_robust_list
 	data8 sys_sync_file_range		// 1300
 	data8 sys_tee
+	data8 sys_vmsplice
 
 	.org sys_call_table + 8*NR_syscalls	// guard against failures to increase NR_syscalls
diff --git a/arch/powerpc/kernel/systbl.S b/arch/powerpc/kernel/systbl.S
index a14c964..9730315 100644
--- a/arch/powerpc/kernel/systbl.S
+++ b/arch/powerpc/kernel/systbl.S
@@ -324,3 +324,4 @@ COMPAT_SYS(ppoll)
 SYSCALL(unshare)
 SYSCALL(splice)
 SYSCALL(tee)
+SYSCALL(vmsplice)
diff --git a/fs/fcntl.c b/fs/fcntl.c
index d35cbc6..56ac96e 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -18,6 +18,7 @@ #include <linux/security.h>
 #include <linux/ptrace.h>
 #include <linux/signal.h>
 #include <linux/rcupdate.h>
+#include <linux/pipe_fs_i.h>
 
 #include <asm/poll.h>
 #include <asm/siginfo.h>
@@ -345,6 +346,13 @@ static long do_fcntl(int fd, unsigned in
 	case F_NOTIFY:
 		err = fcntl_dirnotify(fd, filp, arg);
 		break;
+	case F_SETPSZ:
+		err = -EINVAL;
+		break;
+	case F_GETPSZ:
+		/* this assumes user space can reliably tell PAGE_CACHE_SIZE */
+		err = PIPE_BUFFERS;
+		break;
 	default:
 		break;
 	}
diff --git a/fs/splice.c b/fs/splice.c
index 0559e75..57d55f2 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -39,6 +39,18 @@ struct splice_desc {
 };
 
 /*
+ * Passed to move_to_pipe
+ */
+struct splice_pipe_desc {
+	struct page **pages;		/* page map */
+	int nr_pages;			/* number of pages in map */
+	unsigned long len;		/* maximum number of bytes to maps */
+	unsigned int offset;		/* offset into first page */
+	unsigned int flags;		/* splice flags */
+	struct pipe_buf_operations *ops;/* ops associated with output pipe */
+};
+
+/*
  * Attempt to steal a page from a pipe buffer. This should perhaps go into
  * a vm helper function, it's already simplified quite a bit by the
  * addition of remove_mapping(). If success is returned, the caller may
@@ -128,6 +140,19 @@ static void page_cache_pipe_buf_unmap(st
 	kunmap(buf->page);
 }
 
+static void *user_page_pipe_buf_map(struct file *file,
+				    struct pipe_inode_info *pipe,
+				    struct pipe_buffer *buf)
+{
+	return kmap(buf->page);
+}
+
+static void user_page_pipe_buf_unmap(struct pipe_inode_info *pipe,
+				     struct pipe_buffer *buf)
+{
+	kunmap(buf->page);
+}
+
 static void page_cache_pipe_buf_get(struct pipe_inode_info *info,
 				    struct pipe_buffer *buf)
 {
@@ -143,13 +168,27 @@ static struct pipe_buf_operations page_c
 	.get = page_cache_pipe_buf_get,
 };
 
+static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
+				    struct pipe_buffer *buf)
+{
+	return 1;
+}
+
+static struct pipe_buf_operations user_page_pipe_buf_ops = {
+	.can_merge = 0,
+	.map = user_page_pipe_buf_map,
+	.unmap = user_page_pipe_buf_unmap,
+	.release = page_cache_pipe_buf_release,
+	.steal = user_page_pipe_buf_steal,
+	.get = page_cache_pipe_buf_get,
+};
+
 /*
  * Pipe output worker. This sets up our pipe format with the page cache
  * pipe buffer operations. Otherwise very similar to the regular pipe_writev().
  */
-static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages,
-			    int nr_pages, unsigned long len,
-			    unsigned int offset, unsigned int flags)
+static ssize_t move_to_pipe(struct pipe_inode_info *pipe,
+			    struct splice_pipe_desc *spd)
 {
 	int ret, do_wakeup, i;
 
@@ -171,27 +210,27 @@ static ssize_t move_to_pipe(struct pipe_
 		if (pipe->nrbufs < PIPE_BUFFERS) {
 			int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
 			struct pipe_buffer *buf = pipe->bufs + newbuf;
-			struct page *page = pages[i++];
+			struct page *page = spd->pages[i++];
 			unsigned long this_len;
 
-			this_len = PAGE_CACHE_SIZE - offset;
-			if (this_len > len)
-				this_len = len;
+			this_len = PAGE_CACHE_SIZE - spd->offset;
+			if (this_len > spd->len)
+				this_len = spd->len;
 
 			buf->page = page;
-			buf->offset = offset;
+			buf->offset = spd->offset;
 			buf->len = this_len;
-			buf->ops = &page_cache_pipe_buf_ops;
+			buf->ops = spd->ops;
 			pipe->nrbufs++;
 			if (pipe->inode)
 				do_wakeup = 1;
 
 			ret += this_len;
-			len -= this_len;
-			offset = 0;
-			if (!--nr_pages)
+			spd->len -= this_len;
+			spd->offset = 0;
+			if (!--spd->nr_pages)
 				break;
-			if (!len)
+			if (!spd->len)
 				break;
 			if (pipe->nrbufs < PIPE_BUFFERS)
 				continue;
@@ -199,7 +238,7 @@ static ssize_t move_to_pipe(struct pipe_
 			break;
 		}
 
-		if (flags & SPLICE_F_NONBLOCK) {
+		if (spd->flags & SPLICE_F_NONBLOCK) {
 			if (!ret)
 				ret = -EAGAIN;
 			break;
@@ -234,8 +273,8 @@ static ssize_t move_to_pipe(struct pipe_
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 	}
 
-	while (i < nr_pages)
-		page_cache_release(pages[i++]);
+	while (i < spd->nr_pages)
+		page_cache_release(spd->pages[i++]);
 
 	return ret;
 }
@@ -246,17 +285,21 @@ __generic_file_splice_read(struct file *
 			   unsigned int flags)
 {
 	struct address_space *mapping = in->f_mapping;
-	unsigned int loff, offset, nr_pages;
+	unsigned int loff, nr_pages;
 	struct page *pages[PIPE_BUFFERS];
 	struct page *page;
 	pgoff_t index, end_index;
 	loff_t isize;
-	size_t bytes;
-	int i, error;
+	int error;
+	struct splice_pipe_desc spd = {
+		.pages = pages,
+		.flags = flags,
+		.ops = &page_cache_pipe_buf_ops,
+	};
 
 	index = *ppos >> PAGE_CACHE_SHIFT;
-	loff = offset = *ppos & ~PAGE_CACHE_MASK;
-	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	loff = spd.offset = *ppos & ~PAGE_CACHE_MASK;
+	nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 
 	if (nr_pages > PIPE_BUFFERS)
 		nr_pages = PIPE_BUFFERS;
@@ -266,15 +309,14 @@ __generic_file_splice_read(struct file *
 	 * read-ahead if this is a non-zero offset (we are likely doing small
 	 * chunk splice and the page is already there) for a single page.
 	 */
-	if (!offset || nr_pages > 1)
-		do_page_cache_readahead(mapping, in, index, nr_pages);
+	if (!spd.offset || spd.nr_pages > 1)
+		do_page_cache_readahead(mapping, in, index, spd.nr_pages);
 
 	/*
 	 * Now fill in the holes:
 	 */
 	error = 0;
-	bytes = 0;
-	for (i = 0; i < nr_pages; i++, index++) {
+	for (spd.nr_pages = 0; spd.nr_pages < nr_pages; spd.nr_pages++, index++) {
 		unsigned int this_len;
 
 		if (!len)
@@ -367,26 +409,26 @@ readpage:
 			 */
 			if (end_index == index) {
 				loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK);
-				if (bytes + loff > isize) {
+				if (spd.len + loff > isize) {
 					page_cache_release(page);
 					break;
 				}
 				/*
 				 * force quit after adding this page
 				 */
-				nr_pages = i;
+				nr_pages = spd.nr_pages;
 				this_len = min(this_len, loff);
 			}
 		}
 fill_it:
-		pages[i] = page;
-		bytes += this_len;
+		pages[spd.nr_pages] = page;
+		spd.len += this_len;
 		len -= this_len;
 		loff = 0;
 	}
 
-	if (i)
-		return move_to_pipe(pipe, pages, i, bytes, offset, flags);
+	if (spd.nr_pages)
+		return move_to_pipe(pipe, &spd);
 
 	return error;
 }
@@ -1010,6 +1052,83 @@ static long do_splice(struct file *in, l
 	return -EINVAL;
 }
 
+/*
+ * vmsplice splices a user address range into a pipe. It can be thought of
+ * as splice-from-memory, where the regular splice is splice-from-file (or
+ * to file). In both cases the output is a pipe, naturally.
+ *
+ * Note that vmsplice only supports splicing _from_ user memory to a pipe,
+ * not the other way around. Splicing from user memory is a simple operation
+ * that can be supported without any funky alignment restrictions or nasty
+ * vm tricks. We simply map in the user memory and fill them into a pipe.
+ * The reverse isn't quite as easy, though. There are two possible solutions
+ * for that:
+ *
+ *	- memcpy() the data internally, at which point we might as well just
+ *	  do a regular read() on the buffer anyway.
+ *	- Lots of nasty vm tricks, that are neither fast nor flexible (it
+ *	  has restriction limitations on both ends of the pipe).
+ *
+ * Alas, it isn't here.
+ *
+ */
+static long do_vmsplice(struct file *file, void __user *buffer, size_t len,
+			unsigned int flags)
+{
+	unsigned long uaddr = (unsigned long) buffer;
+	struct pipe_inode_info *pipe;
+	struct page *pages[PIPE_BUFFERS];
+	unsigned int nr_pages;
+	struct splice_pipe_desc spd = {
+		.pages = pages,
+		.len = len,
+		.flags = flags,
+		.ops = &user_page_pipe_buf_ops,
+	};
+
+	pipe = file->f_dentry->d_inode->i_pipe;
+	if (unlikely(!pipe))
+		return -EBADF;
+
+	spd.offset = uaddr & ~PAGE_CACHE_MASK;
+	nr_pages = (len + spd.offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+	if (nr_pages > PIPE_BUFFERS)
+		nr_pages = PIPE_BUFFERS;
+
+	down_read(&current->mm->mmap_sem);
+	spd.nr_pages = get_user_pages(current, current->mm, uaddr, nr_pages, 0,
+				      0, pages, NULL);
+	up_read(&current->mm->mmap_sem);
+
+	if (spd.nr_pages > 0)
+		return move_to_pipe(pipe, &spd);
+
+	return spd.nr_pages;
+}
+
+asmlinkage long sys_vmsplice(int fd, void __user *buffer, size_t len,
+			     unsigned int flags)
+{
+	long error;
+	struct file *file;
+	int fput;
+
+	if (unlikely(!len))
+		return 0;
+
+	error = -EBADF;
+	file = fget_light(fd, &fput);
+	if (file) {
+		if (file->f_mode & FMODE_WRITE)
+			error = do_vmsplice(file, buffer, len, flags);
+
+		fput_light(file, fput);
+	}
+
+	return error;
+}
+
 asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
 			   int fd_out, loff_t __user *off_out,
 			   size_t len, unsigned int flags)
diff --git a/include/asm-generic/fcntl.h b/include/asm-generic/fcntl.h
index b663520..1da0fba 100644
--- a/include/asm-generic/fcntl.h
+++ b/include/asm-generic/fcntl.h
@@ -146,4 +146,9 @@ struct flock64 {
 #endif
 #endif /* !CONFIG_64BIT */
 
+#ifndef F_SETPSZ
+#define F_SETPSZ	15	/* for pipes. */
+#define F_GETPSZ	16	/* for pipes. */
+#endif
+
 #endif /* _ASM_GENERIC_FCNTL_H */
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index d81d6cf..eb4b152 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -321,8 +321,9 @@ #define __NR_get_robust_list	312
 #define __NR_splice		313
 #define __NR_sync_file_range	314
 #define __NR_tee		315
+#define __NR_vmsplice		316
 
-#define NR_syscalls 316
+#define NR_syscalls 317
 
 /*
  * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h
index a40ebec..9aa3487 100644
--- a/include/asm-ia64/unistd.h
+++ b/include/asm-ia64/unistd.h
@@ -290,6 +290,7 @@ #define __NR_set_robust_list		1298
 #define __NR_get_robust_list		1299
 #define __NR_sync_file_range		1300
 #define __NR_tee			1301
+#define __NR_vmsplice			1302
 
 #ifdef __KERNEL__
 
diff --git a/include/asm-powerpc/unistd.h b/include/asm-powerpc/unistd.h
index c612f1a..34325e2 100644
--- a/include/asm-powerpc/unistd.h
+++ b/include/asm-powerpc/unistd.h
@@ -303,8 +303,9 @@ #define __NR_ppoll		281
 #define __NR_unshare		282
 #define __NR_splice		283
 #define __NR_tee		284
+#define __NR_vmsplice		285
 
-#define __NR_syscalls		285
+#define __NR_syscalls		286
 
 #ifdef __KERNEL__
 #define __NR__exit __NR_exit
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 98c36ea..feb77cb 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -615,8 +615,10 @@ #define __NR_tee		276
 __SYSCALL(__NR_tee, sys_tee)
 #define __NR_sync_file_range	277
 __SYSCALL(__NR_sync_file_range, sys_sync_file_range)
+#define __NR_vmsplice		278
+__SYSCALL(__NR_vmsplice, sys_vmsplice)
 
-#define __NR_syscall_max __NR_sync_file_range
+#define __NR_syscall_max __NR_vmsplice
 
 #ifndef __NO_STUBS
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index d3ebc0e..fc9392c 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -574,6 +574,9 @@ asmlinkage long sys_splice(int fd_in, lo
 			   int fd_out, loff_t __user *off_out,
 			   size_t len, unsigned int flags);
 
+asmlinkage long sys_vmsplice(int fd, void __user *buffer, size_t len,
+			     unsigned int flags);
+
 asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags);
 
 asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
-- 
1.3.0.g2473


-- 
Jens Axboe

/*
 * Use vmsplice to fill some user memory into a pipe. vmsplice writes
 * to stdout, so that must be a pipe.
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <limits.h>
#include <sys/poll.h>
#include <sys/stat.h>
#include <sys/types.h>

#include "splice.h"

#define ALIGN_BUF

#ifdef ALIGN_BUF
#define ALIGN_MASK	(65535)	/* 64k-1, should just be PAGE_SIZE - 1 */
#define ALIGN(buf)	(void *) (((unsigned long) (buf) + ALIGN_MASK) & ~ALIGN_MASK)
#else
#define ALIGN_MASK	(0)
#define ALIGN(buf)	(buf)
#endif

int do_vmsplice(int fd, void *buffer, int len)
{
	struct pollfd pfd = { .fd = fd, .events = POLLOUT, };
	int written;

	while (len) {
		/*
		 * in a real app you'd be more clever with poll of course,
		 * here we are basically just blocking on output room and
		 * not using the free time for anything interesting.
		 */
		if (poll(&pfd, 1, -1) < 0)
			return error("poll");

		written = vmsplice(fd, buffer, min(SPLICE_SIZE, len), 0);

		if (written <= 0)
			return error("vmsplice");

		len -= written;
	}

	return 0;
}

int main(int argc, char *argv[])
{
	unsigned char *buffer;
	struct stat sb;
	long page_size;
	int i, ret;

	if (fstat(STDOUT_FILENO, &sb) < 0)
		return error("stat");
	if (!S_ISFIFO(sb.st_mode)) {
		fprintf(stderr, "stdout must be a pipe\n");
		return 1;
	}

	ret = fcntl(STDOUT_FILENO, F_GETPSZ);
	if (ret < 0)
		return error("F_GETPSZ");

	page_size = sysconf(_SC_PAGESIZE);
	if (page_size < 0)
		return error("_SC_PAGESIZE");

	fprintf(stderr, "Pipe size: %d pages / %ld bytes\n", ret, ret * page_size);

	buffer = ALIGN(malloc(2 * SPLICE_SIZE + ALIGN_MASK));
	for (i = 0; i < 2 * SPLICE_SIZE; i++)
		buffer[i] = (i & 0xff);

	do {
		/*
		 * vmsplice the first half of the buffer into the pipe
		 */
		if (do_vmsplice(STDOUT_FILENO, buffer, SPLICE_SIZE))
			break;

		/*
		 * first half is now in pipe, but we don't quite know when
		 * we can reuse it.
		 */

		/*
		 * vmsplice second half
		 */
		if (do_vmsplice(STDOUT_FILENO, buffer + SPLICE_SIZE, SPLICE_SIZE))
			break;

		/*
		 * We still don't know when we can reuse the second half of
		 * the buffer, but we do now know that all parts of the first
		 * half have been consumed from the pipe - so we can reuse that.
		 */
	} while (0);

	return 0;
}

[Index of Archives]     [Kernel Newbies]     [Netfilter]     [Bugtraq]     [Photo]     [Stuff]     [Gimp]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Video 4 Linux]     [Linux for the blind]     [Linux Resources]
  Powered by Linux