Aubrey Li <[email protected]> wrote:
> Here, in the attachment I wrote a small test app. Please correct if
> there is anything wrong, and feel free to improve it.
Okay... I have that working... probably. I don't know what output it's
supposed to produce, but I see this:
# /packet-mmap/sample_packet_mmap
00-00-00-01-00-00-00-8a-00-00-00-8a-00-42-00-50-
38-43-13-a0-00-07-ff-3c-00-00-00-00-00-00-00-00-
00-11-08-00-00-00-00-01-00-01-00-06-00-d0-b7-de-
32-7b-00-00-00-00-00-00-00-00-00-00-00-00-00-00-
00-00-00-90-cc-a2-75-6b-00-d0-b7-de-32-7b-08-00-
45-00-00-7c-00-00-40-00-40-11-b4-13-c0-a8-02-80-
c0-a8-02-8d-08-01-03-20-00-68-8e-65-7f-5b-7e-03-
00-00-00-01-00-00-00-00-00-00-00-00-00-00-00-00-
00-00-00-00-00-00-00-00-00-00-00-01-00-00-81-a4-
00-00-00-01-00-00-00-00-00-00-00-00-00-1d-b8-86-
00-00-10-00-ff-ff-ff-ff-00-00-0e-f0-00-00-09-02-
01-cb-03-16-46-26-38-0d-00-00-00-00-46-26-38-1e-
00-00-00-00-46-26-38-1e-00-00-00-00-00-00-00-00-
00-00-00-00-00-00-00-00-00-00-00-00-00-00-00-00- [repeated]
Does that look reasonable?
I've attached the preliminary patch. Note four things about it:
(1) I've had to add the get_unmapped_area() op to the proto_ops struct, but
I've only done it for CONFIG_MMU=n as making it available for CONFIG_MMU=y
could cause problems.
(2) There's a race between packet_get_unmapped_area() being called and
packet_mmap() being called.
(3) I've added an extra check into packet_set_ring() to make sure the caller
isn't asking for a combination of buffer size and count that will exceed
ULONG_MAX. This protects a multiply done elsewhere.
(4) The entire data buffer is allocated as one contiguous lump in NOMMU-mode.
David
---
[PATCH] NOMMU: Support mmap() on AF_PACKET sockets
From: David Howells <[email protected]>
Support mmap() on AF_PACKET sockets in NOMMU-mode kernels.
Signed-off-by: David Howells <[email protected]>
---
include/linux/net.h | 7 +++
include/net/sock.h | 8 +++
net/core/sock.c | 10 ++++
net/packet/af_packet.c | 118 ++++++++++++++++++++++++++++++++++++++++++++++++
net/socket.c | 77 +++++++++++++++++++++++++++++++
5 files changed, 219 insertions(+), 1 deletions(-)
diff --git a/include/linux/net.h b/include/linux/net.h
index 4db21e6..9e77cf6 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -161,6 +161,11 @@ struct proto_ops {
int (*recvmsg) (struct kiocb *iocb, struct socket *sock,
struct msghdr *m, size_t total_len,
int flags);
+#ifndef CONFIG_MMU
+ unsigned long (*get_unmapped_area)(struct file *file, struct socket *sock,
+ unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags);
+#endif
int (*mmap) (struct file *file, struct socket *sock,
struct vm_area_struct * vma);
ssize_t (*sendpage) (struct socket *sock, struct page *page,
@@ -191,6 +196,8 @@ extern int sock_sendmsg(struct socket *sock, struct msghdr *msg,
extern int sock_recvmsg(struct socket *sock, struct msghdr *msg,
size_t size, int flags);
extern int sock_map_fd(struct socket *sock);
+extern void sock_make_mappable(struct socket *sock,
+ unsigned long prot);
extern struct socket *sockfd_lookup(int fd, int *err);
#define sockfd_put(sock) fput(sock->file)
extern int net_ratelimit(void);
diff --git a/include/net/sock.h b/include/net/sock.h
index 2c7d60c..d91edea 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -841,6 +841,14 @@ extern int sock_no_sendmsg(struct kiocb *, struct socket *,
struct msghdr *, size_t);
extern int sock_no_recvmsg(struct kiocb *, struct socket *,
struct msghdr *, size_t, int);
+#ifndef CONFIG_MMU
+extern unsigned long sock_no_get_unmapped_area(struct file *,
+ struct socket *,
+ unsigned long,
+ unsigned long,
+ unsigned long,
+ unsigned long);
+#endif
extern int sock_no_mmap(struct file *file,
struct socket *sock,
struct vm_area_struct *vma);
diff --git a/net/core/sock.c b/net/core/sock.c
index 27c4f62..b288799 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1364,6 +1364,15 @@ int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
return -EOPNOTSUPP;
}
+#ifndef CONFIG_MMU
+unsigned long sock_no_get_unmapped_area(struct file *file, struct socket *sock,
+ unsigned long addr, unsigned long len,
+ unsigned long pgoff, unsigned long flags)
+{
+ return (unsigned long) -ENODEV; /* stub: every argument is ignored */
+}
+#endif
+
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
/* Mirror missing mmap method error code */
@@ -1943,6 +1952,9 @@ EXPORT_SYMBOL(sock_no_getname);
EXPORT_SYMBOL(sock_no_getsockopt);
EXPORT_SYMBOL(sock_no_ioctl);
EXPORT_SYMBOL(sock_no_listen);
+#ifndef CONFIG_MMU	/* the stub is only declared and built for NOMMU */
+EXPORT_SYMBOL(sock_no_get_unmapped_area);
+#endif
EXPORT_SYMBOL(sock_no_mmap);
EXPORT_SYMBOL(sock_no_poll);
EXPORT_SYMBOL(sock_no_recvmsg);
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 28d47e8..12d970a 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -78,6 +78,7 @@
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
+#include <asm/mman.h>
#ifdef CONFIG_INET
#include <net/inet_common.h>
@@ -1023,6 +1024,7 @@ static int packet_create(struct socket *sock, int protocol)
sock->ops = &packet_ops_spkt;
#endif
sock_init_data(sock, sk);
+ sock_make_mappable(sock, PROT_READ | PROT_WRITE);
po = pkt_sk(sk);
sk->sk_family = PF_PACKET;
@@ -1629,6 +1631,7 @@ static inline struct page *pg_vec_endpage(char *one_pg_vec, unsigned int order)
return virt_to_page(one_pg_vec + (PAGE_SIZE << order) - 1);
}
+#ifdef CONFIG_MMU
static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
{
int i;
@@ -1671,6 +1674,45 @@ out_free_pgvec:
goto out;
}
+#else
+/*
+ * release the NOMMU ring: the buffer at entry 0, then the pointer block
+ */
+static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
+{
+	kfree(*pg_vec);
+	kfree(pg_vec);
+}
+
+/*
+ * allocate the buffer pointer block and all the buffer blocks for the mappable
+ * AF_PACKET buffers for NOMMU mode
+ * - in NOMMU mode the buffers must be contiguous with each other
+ * - returns NULL on failure so that a caller testing for NULL cannot mistake
+ *   an ERR_PTR for a ring (NOTE(review): confirm packet_set_ring() does so)
+ */
+static char **alloc_pg_vec(struct tpacket_req *req, int order)
+{
+	char *buffers, **ptrblock;
+	unsigned int loop;
+
+	buffers = kcalloc(req->tp_block_nr, req->tp_block_size, GFP_KERNEL);
+	if (!buffers)
+		return NULL;
+	ptrblock = kmalloc(sizeof(char *) * req->tp_block_nr, GFP_KERNEL);
+	if (!ptrblock) {
+		kfree(buffers);
+		return NULL;
+	}
+
+	for (loop = 0; loop < req->tp_block_nr; loop++) {
+		ptrblock[loop] = buffers;
+		buffers += req->tp_block_size;
+	}
+	return ptrblock;
+}
+#endif
+
static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
{
char **pg_vec = NULL;
@@ -1695,6 +1737,8 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing
return -EINVAL;
if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
return -EINVAL;
+ if (unlikely(ULONG_MAX / req->tp_block_size < req->tp_block_nr))
+ return -ENOMEM;
po->frames_per_block = req->tp_block_size/req->tp_frame_size;
if (unlikely(po->frames_per_block <= 0))
@@ -1783,6 +1827,7 @@ out:
return err;
}
+#ifdef CONFIG_MMU
static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
struct sock *sk = sock->sk;
@@ -1823,7 +1868,72 @@ out:
release_sock(sk);
return err;
}
-#endif
+
+#else
+/*
+ * find out where the socket buffers are so that NOMMU mmap can return the
+ * address directly
+ * - only a mapping of the whole ring at page offset 0 is accepted; a non-zero
+ *   pgoff, a missing ring or a length mismatch gives -EINVAL cast to
+ *   unsigned long
+ * - on success the address of the first buffer block (pg_vec[0]) is returned;
+ *   the addr hint passed in is overwritten and flags is not looked at
+ * - the ring state is sampled under the socket lock, which is dropped again
+ *   before returning
+ */
+static unsigned long packet_get_unmapped_area(struct file *file,
+ struct socket *sock,
+ unsigned long addr,
+ unsigned long len,
+ unsigned long pgoff,
+ unsigned long flags)
+{
+ struct sock *sk = sock->sk;
+ struct packet_sock *po = pkt_sk(sk);
+
+ /* check that they want to map the whole buffer */
+ if (pgoff)
+ return (unsigned long) -EINVAL;
+
+ lock_sock(sk);
+ if (!po->pg_vec ||
+ len != po->pg_vec_len * po->pg_vec_pages * PAGE_SIZE)
+ addr = (unsigned long) -EINVAL;
+ else
+ addr = (unsigned long) po->pg_vec[0];
+ release_sock(sk);
+ return addr;
+}
+
+/*
+ * set the mapping of a packet socket buffer for NOMMU mmap
+ * - the VMA must start at page offset 0 and span the whole ring, else -EINVAL
+ * - the VMA must also start at pg_vec[0]; if it does not, the ring is taken to
+ *   have changed since get_unmapped_area was called and -ESTALE is returned
+ * - on success po->mapped is incremented and packet_mmap_ops is installed as
+ *   the VMA's ops; the checks and the update are done under the socket lock
+ */
+static int packet_mmap(struct file *file, struct socket *sock,
+ struct vm_area_struct *vma)
+{
+ struct sock *sk = sock->sk;
+ struct packet_sock *po = pkt_sk(sk);
+ unsigned long len;
+ int ret;
+
+ if (vma->vm_pgoff)
+ return -EINVAL;
+
+ lock_sock(sk);
+
+ /* check that they want to map the whole buffer */
+ len = vma->vm_end - vma->vm_start;
+ if (!po->pg_vec ||
+ len != po->pg_vec_len * po->pg_vec_pages * PAGE_SIZE) {
+ ret = -EINVAL;
+ } else if (vma->vm_start != (unsigned long) po->pg_vec[0]) {
+ /* someone found the gap between get_unmapped_area and mmap */
+ ret = -ESTALE;
+ } else {
+ atomic_inc(&po->mapped);
+ vma->vm_ops = &packet_mmap_ops;
+ ret = 0;
+ }
+
+ release_sock(sk);
+ return ret;
+}
+
+#endif /* CONFIG_MMU */
+#endif /* CONFIG_PACKET_MMAP */
#ifdef CONFIG_SOCK_PACKET
@@ -1844,6 +1954,9 @@ static const struct proto_ops packet_ops_spkt = {
.getsockopt = sock_no_getsockopt,
.sendmsg = packet_sendmsg_spkt,
.recvmsg = packet_recvmsg,
+#ifndef CONFIG_MMU
+ .get_unmapped_area = sock_no_get_unmapped_area,
+#endif
.mmap = sock_no_mmap,
.sendpage = sock_no_sendpage,
};
@@ -1866,6 +1979,9 @@ static const struct proto_ops packet_ops = {
.getsockopt = packet_getsockopt,
.sendmsg = packet_sendmsg,
.recvmsg = packet_recvmsg,
+#ifndef CONFIG_MMU
+ .get_unmapped_area = packet_get_unmapped_area,
+#endif
.mmap = packet_mmap,
.sendpage = sock_no_sendpage,
};
diff --git a/net/socket.c b/net/socket.c
index ea8f81a..4109db6 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -87,6 +87,7 @@
#include <asm/uaccess.h>
#include <asm/unistd.h>
+#include <asm/mman.h>
#include <net/compat.h>
@@ -98,6 +99,13 @@ static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos);
static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos);
+#ifndef CONFIG_MMU
+static unsigned long sock_get_unmapped_area(struct file *file,
+ unsigned long addr,
+ unsigned long len,
+ unsigned long pgoff,
+ unsigned long flags);
+#endif
static int sock_mmap(struct file *file, struct vm_area_struct *vma);
static int sock_close(struct inode *inode, struct file *file);
@@ -127,6 +135,9 @@ static const struct file_operations socket_file_ops = {
#ifdef CONFIG_COMPAT
.compat_ioctl = compat_sock_ioctl,
#endif
+#ifndef CONFIG_MMU
+ .get_unmapped_area = sock_get_unmapped_area,
+#endif
.mmap = sock_mmap,
.open = sock_no_open, /* special open code to disallow open via /proc */
.release = sock_close,
@@ -136,6 +147,26 @@ static const struct file_operations socket_file_ops = {
};
/*
+ * capabilities for non-mappable sockets (private to this file)
+ * - does not permit mmap at all
+ * - no readahead or I/O queue unplugging required
+ */
+static struct backing_dev_info non_mappable_socket_bdi = {
+	.capabilities	= 0,
+};
+
+/*
+ * capabilities for read/write mappable sockets (private to this file)
+ * - permits shared-mmap for read and write only
+ * - does not permit private mmap
+ * - no readahead or I/O queue unplugging required
+ */
+static struct backing_dev_info rw_mappable_socket_bdi = {
+	.capabilities	= (BDI_CAP_MAP_DIRECT | BDI_CAP_READ_MAP |
+			   BDI_CAP_WRITE_MAP),
+};
+
+/*
* The protocol list. Each protocol is registered in here.
*/
@@ -415,6 +446,34 @@ static struct socket *sock_from_file(struct file *file, int *err)
}
/**
+ * sock_make_mappable - Change the mmappability of a socket
+ * @sock: The socket to alter
+ * @prot: The mmap protection to permit
+ *
+ * The nominated socket has its backing device information selection
+ * changed to indicate to mmap() what sort of mappings are available on
+ * this socket.
+ *
+ * Only two values of @prot are handled: 0 selects non_mappable_socket_bdi and
+ * PROT_READ | PROT_WRITE selects rw_mappable_socket_bdi.  Any other value,
+ * including PROT_READ or PROT_WRITE on its own, hits BUG().
+ */
+void sock_make_mappable(struct socket *sock, unsigned long prot)
+{
+ struct address_space *mapping = &SOCK_INODE(sock)->i_data;
+
+ switch (prot) {
+ case 0:
+ mapping->backing_dev_info = &non_mappable_socket_bdi;
+ break;
+ case PROT_READ | PROT_WRITE:
+ mapping->backing_dev_info = &rw_mappable_socket_bdi;
+ break;
+ case PROT_READ:
+ case PROT_WRITE:
+ default:
+ BUG();
+ break;
+ }
+}
+
+/**
* sockfd_lookup - Go from a file number to its socket slot
* @fd: file handle
* @err: pointer to an error code return
@@ -482,6 +541,7 @@ static struct socket *sock_alloc(void)
inode->i_mode = S_IFSOCK | S_IRWXUGO;
inode->i_uid = current->fsuid;
inode->i_gid = current->fsgid;
+ inode->i_data.backing_dev_info = &non_mappable_socket_bdi;
get_cpu_var(sockets_in_use)++;
put_cpu_var(sockets_in_use);
@@ -918,6 +978,22 @@ static unsigned int sock_poll(struct file *file, poll_table *wait)
return sock->ops->poll(file, sock, wait);
}
+#ifndef CONFIG_MMU
+/*
+ * NOMMU mmap address lookup for sockets: pass the request on to the protocol's
+ * get_unmapped_area op, or fail with -ENOSYS if the protocol supplies none
+ */
+static unsigned long sock_get_unmapped_area(struct file *file,
+					    unsigned long addr,
+					    unsigned long len,
+					    unsigned long pgoff,
+					    unsigned long flags)
+{
+	struct socket *sock = file->private_data;
+
+	if (sock->ops->get_unmapped_area)
+		return sock->ops->get_unmapped_area(file, sock,
+						    addr, len, pgoff, flags);
+	return -ENOSYS;
+}
+#endif
+
static int sock_mmap(struct file *file, struct vm_area_struct *vma)
{
struct socket *sock = file->private_data;
@@ -2288,6 +2364,7 @@ EXPORT_SYMBOL(sock_create);
EXPORT_SYMBOL(sock_create_kern);
EXPORT_SYMBOL(sock_create_lite);
EXPORT_SYMBOL(sock_map_fd);
+EXPORT_SYMBOL(sock_make_mappable);
EXPORT_SYMBOL(sock_recvmsg);
EXPORT_SYMBOL(sock_register);
EXPORT_SYMBOL(sock_release);