Re: [PATCH 3/7] containers (V7): Add generic multi-subsystem API to containers

Quoting [email protected] ([email protected]):
> This patch removes all cpuset-specific knowledge from the container
> system, replacing it with a generic API that can be used by multiple
> subsystems. Cpusets is adapted to be a container subsystem.
> 
> Signed-off-by: Paul Menage <[email protected]>
> 
> ---
>  Documentation/containers.txt |  415 +++++++++--
>  Documentation/cpusets.txt    |   20 
>  include/linux/container.h    |  178 ++++
>  include/linux/cpuset.h       |   16 
>  include/linux/mempolicy.h    |   12 
>  include/linux/sched.h        |    4 
>  init/Kconfig                 |   12 
>  kernel/container.c           | 1601 ++++++++++++++++++++++++++++++-------------
>  kernel/cpuset.c              |  170 ++--
>  mm/mempolicy.c               |    2 
>  10 files changed, 1808 insertions(+), 622 deletions(-)
> 
> Index: container-2.6.20/include/linux/container.h
> ===================================================================
> --- container-2.6.20.orig/include/linux/container.h
> +++ container-2.6.20/include/linux/container.h
> @@ -9,13 +9,12 @@
>   */
> 
>  #include <linux/sched.h>
> +#include <linux/kref.h>
>  #include <linux/cpumask.h>
>  #include <linux/nodemask.h>
> 
>  #ifdef CONFIG_CONTAINERS
> 
> -extern int number_of_containers;	/* How many containers are defined in system? */
> -
>  extern int container_init_early(void);
>  extern int container_init(void);
>  extern void container_init_smp(void);
> @@ -30,13 +29,105 @@ extern void container_unlock(void);
>  extern void container_manage_lock(void);
>  extern void container_manage_unlock(void);
> 
> +struct containerfs_root;
> +
> +/* Per-subsystem/per-container state maintained by the system. */
> +struct container_subsys_state {
> +	/* The container that this subsystem is attached to. Useful
> +	 * for subsystems that want to know about the container
> +	 * hierarchy structure */
> +	struct container *container;
> +
> +	/* State maintained by the container system to allow
> +	 * subsystems to be "busy". Should be accessed via css_get()
> +	 * and css_put() */
> +	spinlock_t refcnt_lock;
> +	atomic_t refcnt;
> +};
> +
> +/* A container_group is a structure holding pointers to a set of
> + * containers. This saves space in the task struct object and speeds
> + * up fork()/exit(), since a single inc/dec can bump the reference
> + * count on the entire container set for a task. */
> +
> +struct container_group {
> +
> +	/* Reference count */
> +	struct kref ref;
> +
> +	/* List running through all container groups */
> +	struct list_head list;
> +
> +	/* Set of containers, one for each hierarchy. These are
> +	 * immutable once the container group has been created */
> +	struct container *container[CONFIG_MAX_CONTAINER_HIERARCHIES];
> +
> +	/* Set of subsystem states, one for each subsystem. NULL for
> +	 * subsystems that aren't part of this hierarchy. These
> +	 * pointers reduce the number of dereferences required to get
> +	 * from a task to its state for a given container, but result
> +	 * in increased space usage if tasks are in wildly different
> +	 * groupings across different hierarchies. This array is
> +	 * mostly immutable after creation - a newly registered
> +	 * subsystem can result in a pointer in this array
> +	 * transitioning from NULL to non-NULL */
> +	struct container_subsys_state *subsys[CONFIG_MAX_CONTAINER_SUBSYS];
> +};
> +
> +/*
> + * Call css_get() to hold a reference on the container; following a
> + * return of 0, this container subsystem state object is guaranteed
> + * not to be destroyed until css_put() is called on it.  A non-zero
> + * return code indicates that a reference could not be taken.
> + *
> + */
> +
> +static inline int css_get(struct container_subsys_state *css)
> +{
> +	int retval = 0;
> +	unsigned long flags;
> +	/* Synchronize with container_rmdir() */
> +	spin_lock_irqsave(&css->refcnt_lock, flags);
> +	if (atomic_read(&css->refcnt) >= 0) {
> +		/* Container is still alive */
> +		atomic_inc(&css->refcnt);
> +	} else {
> +		/* Container removal is in progress */
> +		retval = -EINVAL;
> +	}
> +	spin_unlock_irqrestore(&css->refcnt_lock, flags);
> +	return retval;
> +}
> +
> +/*
> + * If you are holding current->alloc_lock then it's impossible for you
> + * to be moved out of your container, and hence it's impossible for
> + * your container to be destroyed. Therefore doing a simple
> + * atomic_inc() on a css is safe.
> + */
> +
> +static inline void css_get_current(struct container_subsys_state *css)
> +{
> +	atomic_inc(&css->refcnt);
> +}
> +
> +/*
> + * css_put() should be called to release a reference taken by
> + * css_get() or css_get_current()
> + */
> +
> +static inline void css_put(struct container_subsys_state *css) {
> +	atomic_dec(&css->refcnt);
> +}
> +
>  struct container {
>  	unsigned long flags;		/* "unsigned long" so bitops work */
> 
>  	/*
>  	 * Count is atomic so can incr (fork) or decr (exit) without a lock.
>  	 */
> -	atomic_t count;			/* count tasks using this container */
> +	atomic_t count;			/* count of container groups
> +					 * using this container*/
> 
>  	/*
>  	 * We link our 'sibling' struct into our parent's 'children'.
> @@ -46,11 +137,15 @@ struct container {
>  	struct list_head children;	/* my children */
> 
>  	struct container *parent;	/* my parent */
> -	struct dentry *dentry;		/* container fs entry */
> +	struct dentry *dentry;	  	/* container fs entry */
> 
> -#ifdef CONFIG_CPUSETS
> -	struct cpuset *cpuset;
> -#endif
> +	/* Private pointers for each registered subsystem */
> +	struct container_subsys_state *subsys[CONFIG_MAX_CONTAINER_SUBSYS];
> +
> +	int hierarchy;
> +
> +	struct containerfs_root *root;
> +	struct container *top_container;
>  };
> 
>  /* struct cftype:
> @@ -67,8 +162,11 @@ struct container {
>   */
> 
>  struct inode;
> +#define MAX_CFTYPE_NAME 64
>  struct cftype {
> -	char *name;
> +	/* By convention, the name should begin with the name of the
> +	 * subsystem, followed by a period */
> +	char name[MAX_CFTYPE_NAME];
>  	int private;
>  	int (*open) (struct inode *inode, struct file *file);
>  	ssize_t (*read) (struct container *cont, struct cftype *cft,
> @@ -80,10 +178,72 @@ struct cftype {
>  	int (*release) (struct inode *inode, struct file *file);
>  };
> 
> +/* Add a new file to the given container directory. Should only be
> + * called by subsystems from within a populate() method */
>  int container_add_file(struct container *cont, const struct cftype *cft);
> 
>  int container_is_removed(const struct container *cont);
> -void container_set_release_agent_path(const char *path);
> +
> +int container_path(const struct container *cont, char *buf, int buflen);
> +
> +int container_task_count(const struct container *cont);
> +
> +/* Return true if the container is a descendant of the current container */
> +int container_is_descendant(const struct container *cont);
> +
> +/* Container subsystem type. See Documentation/containers.txt for details */
> +
> +struct container_subsys {
> +	int (*create)(struct container_subsys *ss,
> +		      struct container *cont);
> +	void (*destroy)(struct container_subsys *ss, struct container *cont);
> +	int (*can_attach)(struct container_subsys *ss,
> +			  struct container *cont, struct task_struct *tsk);
> +	void (*attach)(struct container_subsys *ss, struct container *cont,
> +			struct container *old_cont, struct task_struct *tsk);
> +	void (*post_attach)(struct container_subsys *ss,
> +			    struct container *cont,
> +			    struct container *old_cont,
> +			    struct task_struct *tsk);
> +	void (*fork)(struct container_subsys *ss, struct task_struct *task);
> +	void (*exit)(struct container_subsys *ss, struct task_struct *task);
> +	int (*populate)(struct container_subsys *ss,
> +			struct container *cont);
> +	void (*bind)(struct container_subsys *ss, struct container *root);
> +	int subsys_id;
> +	int active;
> +
> +#define MAX_CONTAINER_TYPE_NAMELEN 32
> +	const char *name;
> +
> +	/* Protected by RCU */
> +	int hierarchy;
> +
> +	struct list_head sibling;
> +};
> +
> +int container_register_subsys(struct container_subsys *subsys);
> +int container_clone(struct task_struct *tsk, struct container_subsys *ss);
> +
> +static inline struct container_subsys_state *container_subsys_state(
> +	struct container *cont,
> +	struct container_subsys *ss)
> +{
> +	return cont->subsys[ss->subsys_id];
> +}
> +
> +static inline struct container* task_container(struct task_struct *task,
> +					       struct container_subsys *ss)
> +{
> +	return rcu_dereference(task->containers->container[ss->hierarchy]);
> +}
> +
> +static inline struct container_subsys_state *task_subsys_state(
> +	struct task_struct *task,
> +	struct container_subsys *ss)
> +{
> +	return rcu_dereference(task->containers->subsys[ss->subsys_id]);
> +}
> 
>  int container_path(const struct container *cont, char *buf, int buflen);
> 
> Index: container-2.6.20/include/linux/cpuset.h
> ===================================================================
> --- container-2.6.20.orig/include/linux/cpuset.h
> +++ container-2.6.20/include/linux/cpuset.h
> @@ -70,16 +70,7 @@ static inline int cpuset_do_slab_mem_spr
> 
>  extern void cpuset_track_online_nodes(void);
> 
> -extern int cpuset_can_attach_task(struct container *cont,
> -				  struct task_struct *tsk);
> -extern void cpuset_attach_task(struct container *cont,
> -				struct task_struct *tsk);
> -extern void cpuset_post_attach_task(struct container *cont,
> -				    struct container *oldcont,
> -				    struct task_struct *tsk);
> -extern int cpuset_populate_dir(struct container *cont);
> -extern int cpuset_create(struct container *cont);
> -extern void cpuset_destroy(struct container *cont);
> +extern int current_cpuset_is_being_rebound(void);
> 
>  #else /* !CONFIG_CPUSETS */
> 
> @@ -147,6 +138,11 @@ static inline int cpuset_do_slab_mem_spr
> 
>  static inline void cpuset_track_online_nodes(void) {}
> 
> +static inline int current_cpuset_is_being_rebound(void)
> +{
> +	return 0;
> +}
> +
>  #endif /* !CONFIG_CPUSETS */
> 
>  #endif /* _LINUX_CPUSET_H */
> Index: container-2.6.20/kernel/container.c
> ===================================================================
> --- container-2.6.20.orig/kernel/container.c
> +++ container-2.6.20/kernel/container.c
> @@ -55,7 +55,6 @@
>  #include <linux/time.h>
>  #include <linux/backing-dev.h>
>  #include <linux/sort.h>
> -#include <linux/cpuset.h>
> 
>  #include <asm/uaccess.h>
>  #include <asm/atomic.h>
> @@ -63,17 +62,56 @@
> 
>  #define CONTAINER_SUPER_MAGIC		0x27e0eb
> 
> -/*
> - * Tracks how many containers are currently defined in system.
> - * When there is only one container (the root container) we can
> - * short circuit some hooks.
> +static struct container_subsys *subsys[CONFIG_MAX_CONTAINER_SUBSYS];
> +static int subsys_count = 0;
> +
> +/* A containerfs_root represents the root of a container hierarchy,
> + * and may be associated with a superblock to form an active
> + * hierarchy */
> +struct containerfs_root {
> +	struct super_block *sb;
> +
> +	/* The bitmask of subsystems attached to this hierarchy */
> +	unsigned long subsys_bits;
> +
> +	/* A list running through the attached subsystems */
> +	struct list_head subsys_list;
> +
> +	/* The root container for this hierarchy */
> +	struct container top_container;
> +
> +	/* Tracks how many containers are currently defined in hierarchy.*/
> +	int number_of_containers;
> +
> +};
> +
> +/* The set of hierarchies in use. Hierarchy 0 is the "dummy
> + * container", reserved for the subsystems that are otherwise
> + * unattached - it never has more than a single container, and all
> + * tasks are part of that container. */
> +
> +static struct containerfs_root rootnode[CONFIG_MAX_CONTAINER_HIERARCHIES];
> +
> +/* dummytop is a shorthand for the dummy hierarchy's top container */
> +#define dummytop (&rootnode[0].top_container)
> +
> +/* This flag indicates whether tasks in the fork and exit paths should
> + * take callback_mutex and check for fork/exit handlers to call. This
> + * avoids us having to take locks in the fork/exit path if none of the
> + * subsystems need to be called.
> + *
> + * It is protected via RCU, with the invariant that a process in an
> + * rcu_read_lock() section will never see this as 0 if there are
> + * actually registered subsystems with a fork or exit
> + * handler. (Sometimes it may be 1 without there being any registered
> + * subsystems with such a handler, but such periods are safe and of
> + * short duration).
>   */
> -int number_of_containers __read_mostly;
> +static int need_forkexit_callback = 0;
> 
>  /* bits in struct container flags field */
>  typedef enum {
>  	CONT_REMOVED,
> -	CONT_NOTIFY_ON_RELEASE,
>  } container_flagbits_t;
> 
>  /* convenient tests for these bits */
> @@ -82,31 +120,144 @@ inline int container_is_removed(const st
>  	return test_bit(CONT_REMOVED, &cont->flags);
>  }
> 
> -static inline int notify_on_release(const struct container *cont)
> -{
> -	return test_bit(CONT_NOTIFY_ON_RELEASE, &cont->flags);
> +/* for_each_subsys() allows you to act on each subsystem attached to
> + * an active hierarchy */
> +#define for_each_subsys(_hierarchy, _ss) \
> +list_for_each_entry(_ss, &rootnode[_hierarchy].subsys_list, sibling)
> +
> +/* The default container group - used by init and its children prior
> + * to any hierarchies being mounted. It contains a pointer to the top
> + * container in each hierarchy. Also used to anchor the list of
> + * container groups */
> +static struct container_group init_container_group;
> +static DEFINE_SPINLOCK(container_group_lock);
> +static int container_group_count;
> +
> +static void release_container_group(struct kref *k) {
> +	struct container_group *cg =
> +		container_of(k, struct container_group, ref);
> +	int i;
> +	spin_lock(&container_group_lock);
> +	/* Release reference counts on all the containers pointed to
> +	 * by this container_group */
> +	for (i = 0; i < CONFIG_MAX_CONTAINER_HIERARCHIES; i++) {
> +		struct container *cont = cg->container[i];
> +		if (!cont) continue;
> +		atomic_dec(&cont->count);
> +	}
> +	list_del(&cg->list);
> +	container_group_count--;
> +	spin_unlock(&container_group_lock);
> +	kfree(cg);
>  }
> 
> -static struct container top_container = {
> -	.count = ATOMIC_INIT(0),
> -	.sibling = LIST_HEAD_INIT(top_container.sibling),
> -	.children = LIST_HEAD_INIT(top_container.children),
> -};
> +static inline void get_container_group(struct container_group *cg) {
> +	kref_get(&cg->ref);
> +}
> 
> -/* The path to use for release notifications. No locking between
> - * setting and use - so if userspace updates this while subcontainers
> - * exist, you could miss a notification */
> -static char release_agent_path[PATH_MAX] = "/sbin/container_release_agent";
> +static inline void put_container_group(struct container_group *cg) {
> +	kref_put(&cg->ref, release_container_group);
> +}
> 
> -void container_set_release_agent_path(const char *path)
> -{
> -	container_manage_lock();
> -	strcpy(release_agent_path, path);
> -	container_manage_unlock();
> +/*
> + * find_existing_container_group() is a helper for
> + * find_container_group(), and checks to see whether an existing
> + * container_group is suitable. This currently walks a linked-list for
> + * simplicity; a later patch will use a hash table for better
> + * performance
> + */
> +
> +static struct container_group *find_existing_container_group(
> +	struct container_group *oldcg,
> +	struct container *cont)
> +{
> +	int h = cont->hierarchy;
> +	struct list_head *l = &init_container_group.list;
> +	do {
> +		int i;
> +		struct container_group *cg =
> +			list_entry(l, struct container_group, list);
> +
> +		/* A container group matches what we want if its
> +		 * container set is the same as "oldcg", except for the
> +		 * hierarchy for "cont" which should match "cont" */
> +		for (i = 0; i < CONFIG_MAX_CONTAINER_HIERARCHIES; i++) {
> +			if (i == h) {
> +				if (cg->container[i] != cont)
> +					break;
> +			} else {
> +				if (cg->container[i] != oldcg->container[i])
> +					break;
> +			}
> +		}
> +		if (i == CONFIG_MAX_CONTAINER_HIERARCHIES) {
> +			/* All hierarchies matched what we want - success */
> +			return cg;
> +		}
> +		/* Try the next container group */
> +		l = l->next;
> +	} while (l != &init_container_group.list);
> +
> +	/* No existing container group matched */
> +	return NULL;
>  }
> 
> -static struct vfsmount *container_mount;
> -static struct super_block *container_sb;
> +/*
> + * find_container_group() takes an existing container group and a
> + * container object, and returns a container_group object that's
> + * equivalent to the old group, but with the given container
> + * substituted into the appropriate hierarchy. Must be called with
> + * manage_mutex held
> + */
> +
> +static struct container_group *find_container_group(
> +	struct container_group *oldcg, struct container *cont)
> +{
> +	struct container_group *res;
> +	struct container_subsys *ss;
> +	int h = cont->hierarchy;
> +	int i;
> +
> +	BUG_ON(oldcg->container[h] == cont);
> +	/* First see if we already have a container group that matches
> +	 * the desired set */
> +	spin_lock(&container_group_lock);
> +	res = find_existing_container_group(oldcg, cont);
> +	if (res)
> +		get_container_group(res);
> +	spin_unlock(&container_group_lock);
> +
> +	if (res)
> +		return res;
> +
> +	res = kmalloc(sizeof(*res), GFP_KERNEL);
> +	if (!res)
> +		return NULL;
> +
> +	/* Copy the old container group into the new one but overwrite
> +	 * the appropriate hierarchy with the new container object and
> +	 * subsystem states and reset the reference count. */
> +	*res = *oldcg;
> +	kref_init(&res->ref);
> +	res->container[h] = cont;
> +	for_each_subsys(h, ss) {
> +		res->subsys[ss->subsys_id] = cont->subsys[ss->subsys_id];
> +	}
> +	/* Take reference counts on all the referenced containers,
> +	 * including the new one */
> +	for (i = 0; i < CONFIG_MAX_CONTAINER_HIERARCHIES; i++) {
> +		BUG_ON(!res->container[i]);
> +		atomic_inc(&res->container[i]->count);
> +	}
> +
> +	/* Link this container group into the list */
> +	spin_lock(&container_group_lock);
> +	list_add(&res->list, &init_container_group.list);
> +	container_group_count++;
> +	spin_unlock(&container_group_lock);
> +
> +	return res;
> +}
> 
>  /*
>   * We have two global container mutexes below.  They can nest.
> @@ -156,44 +307,109 @@ static struct super_block *container_sb;
>   * small pieces of code, such as when reading out possibly multi-word
>   * cpumasks and nodemasks.
>   *
> - * The fork and exit callbacks container_fork() and container_exit(), don't
> - * (usually) take either mutex.  These are the two most performance
> - * critical pieces of code here.  The exception occurs on container_exit(),
> - * when a task in a notify_on_release container exits.  Then manage_mutex
> - * is taken, and if the container count is zero, a usermode call made
> - * to /sbin/container_release_agent with the name of the container (path
> - * relative to the root of container file system) as the argument.
> - *
> - * A container can only be deleted if both its 'count' of using tasks
> - * is zero, and its list of 'children' containers is empty.  Since all
> - * tasks in the system use _some_ container, and since there is always at
> - * least one task in the system (init, pid == 1), therefore, top_container
> - * always has either children containers and/or using tasks.  So we don't
> + * The fork and exit callbacks container_fork() and container_exit(),
> + * don't take either mutex, unless some subsystem has registered a
> + * fork/exit callback.
> + *
> + * A container can only be deleted if all three conditions below hold:
> + *
> + * - its 'count' of using container groups is zero
> + * - its list of 'children' containers is empty.
> + * - all of its subsystems' state records have a zero 'refcnt'
> + *
> + * Since all tasks in the system use _some_ container group, and since
> + * there is always at least one task in the system (init, pid == 1),
> + * therefore, the top_container in each hierarchy always has either
> + * children containers and/or using container groups.  So we don't
>   * need a special hack to ensure that top_container cannot be deleted.
>   *
>   * The above "Tale of Two Semaphores" would be complete, but for:
>   *
>   *	The task_lock() exception
>   *
> - * The need for this exception arises from the action of attach_task(),
> - * which overwrites one tasks container pointer with another.  It does
> - * so using both mutexes, however there are several performance
> - * critical places that need to reference task->container without the
> - * expense of grabbing a system global mutex.  Therefore except as
> - * noted below, when dereferencing or, as in attach_task(), modifying
> - * a tasks container pointer we use task_lock(), which acts on a spinlock
> + * The need for this exception arises from the action of
> + * attach_task(), which overwrites a task's container group pointer
> + * with a pointer to a different group.  It does so using both
> + * mutexes, however there are several performance critical places that
> + * need to reference task->containers without the expense of grabbing
> + * a system global mutex.  Therefore except as noted below, when
> + * dereferencing or, as in attach_task(), modifying a task's
> + * containers pointer we use task_lock(), which acts on a spinlock
>   * (task->alloc_lock) already in the task_struct routinely used for
>   * such matters.
>   *
>   * P.S.  One more locking exception.  RCU is used to guard the
> - * update of a tasks container pointer by attach_task() and the
> + * update of a task's containers pointer by attach_task() and the
>   * access of task->container->mems_generation via that pointer in
>   * the routine container_update_task_memory_state().
> + *
> + * Some container subsystems and other external code also use these
> + * mutexes, exposed through the container_lock()/container_unlock()
> + * and container_manage_lock()/container_manage_unlock() functions.
> + *
> + * E.g. the out of memory (OOM) code needs to prevent containers from
> + * being changed while it scans the tasklist looking for a task in an
> + * overlapping container. The tasklist_lock is a spinlock, so must be
> + * taken inside callback_mutex.
> + *
> + * Some container subsystems (including cpusets) also use
> + * callback_mutex as a primary lock for synchronizing access to
> + * subsystem state. Deciding on best practices of when to use
> + * fine-grained locks vs container_lock()/container_unlock() is still
> + * a TODO.
> + *
> + * Note that manage_mutex and callback_mutex should both nest inside
> + * any inode->i_mutex, unless the inode isn't accessible to any code
> + * outside the current thread.
>   */
> 
>  static DEFINE_MUTEX(manage_mutex);
>  static DEFINE_MUTEX(callback_mutex);
> 
> +/**
> + * container_lock - lock out any changes to container structures
> + *
> + */
> +
> +void container_lock(void)
> +{
> +	mutex_lock(&callback_mutex);
> +}
> +
> +/**
> + * container_unlock - release lock on container changes
> + *
> + * Undo the lock taken in a previous container_lock() call.
> + */
> +
> +void container_unlock(void)
> +{
> +	mutex_unlock(&callback_mutex);
> +}
> +
> +/**
> + * container_manage_lock() - lock out anyone else considering making
> + * changes to container structures. This is a more heavy-weight lock
> + * than the callback_mutex taken by container_lock() */
> +
> +void container_manage_lock(void)
> +{
> +	mutex_lock(&manage_mutex);
> +}
> +
> +/**
> + * container_manage_unlock
> + *
> + * Undo the lock taken in a previous container_manage_lock() call.
> + */
> +
> +void container_manage_unlock(void)
> +{
> +	mutex_unlock(&manage_mutex);
> +}
> +
> +
> +
>  /*
>   * A couple of forward declarations required, due to cyclic reference loop:
>   *  container_mkdir -> container_create -> container_populate_dir -> container_add_file
> @@ -202,15 +418,18 @@ static DEFINE_MUTEX(callback_mutex);
> 
>  static int container_mkdir(struct inode *dir, struct dentry *dentry, int mode);
>  static int container_rmdir(struct inode *unused_dir, struct dentry *dentry);
> +static int container_populate_dir(struct container *cont);
> +static struct inode_operations container_dir_inode_operations;
> +struct file_operations proc_containerstats_operations;
> 
>  static struct backing_dev_info container_backing_dev_info = {
>  	.ra_pages = 0,		/* No readahead */
>  	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
>  };
> 
> -static struct inode *container_new_inode(mode_t mode)
> +static struct inode *container_new_inode(mode_t mode, struct super_block *sb)
>  {
> -	struct inode *inode = new_inode(container_sb);
> +	struct inode *inode = new_inode(sb);
> 
>  	if (inode) {
>  		inode->i_mode = mode;
> @@ -238,7 +457,8 @@ static struct dentry_operations containe
>  	.d_iput = container_diput,
>  };
> 
> -static struct dentry *container_get_dentry(struct dentry *parent, const char *name)
> +static struct dentry *container_get_dentry(struct dentry *parent,
> +					   const char *name)
>  {
>  	struct dentry *d = lookup_one_len(name, parent, strlen(name));
>  	if (!IS_ERR(d))
> @@ -255,19 +475,19 @@ static void remove_dir(struct dentry *d)
>  	dput(parent);
>  }
> 
> -/*
> - * NOTE : the dentry must have been dget()'ed
> - */
> -static void container_d_remove_dir(struct dentry *dentry)
> +static void container_clear_directory(struct dentry *dentry)
>  {
>  	struct list_head *node;
> -
> +	BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
>  	spin_lock(&dcache_lock);
>  	node = dentry->d_subdirs.next;
>  	while (node != &dentry->d_subdirs) {
>  		struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
>  		list_del_init(node);
>  		if (d->d_inode) {
> +			/* This should never be called on a container
> +			 * directory with child containers */
> +			BUG_ON(d->d_inode->i_mode & S_IFDIR);
>  			d = dget_locked(d);
>  			spin_unlock(&dcache_lock);
>  			d_delete(d);
> @@ -277,37 +497,222 @@ static void container_d_remove_dir(struc
>  		}
>  		node = dentry->d_subdirs.next;
>  	}
> +	spin_unlock(&dcache_lock);
> +}
> +
> +/*
> + * NOTE : the dentry must have been dget()'ed
> + */
> +static void container_d_remove_dir(struct dentry *dentry)
> +{
> +	container_clear_directory(dentry);
> +
> +	spin_lock(&dcache_lock);
>  	list_del_init(&dentry->d_u.d_child);
>  	spin_unlock(&dcache_lock);
>  	remove_dir(dentry);
>  }
> 
> +static int rebind_subsystems(struct containerfs_root *root,
> +			      unsigned long final_bits)
> +{
> +	unsigned long added_bits, removed_bits;
> +	struct container *cont = &root->top_container;
> +	int i;
> +	int hierarchy = cont->hierarchy;
> +
> +	removed_bits = root->subsys_bits & ~final_bits;
> +	added_bits = final_bits & ~root->subsys_bits;
> +	/* Check that any added subsystems are currently free */
> +	for (i = 0; i < subsys_count; i++) {
> +		unsigned long long bit = 1ull << i;
> +		struct container_subsys *ss = subsys[i];
> +		if (!(bit & added_bits))
> +			continue;
> +		if (ss->hierarchy != 0) {
> +			/* Subsystem isn't free */
> +			return -EBUSY;
> +		}
> +	}
> +
> +	/* Currently we don't handle adding/removing subsystems when
> +	 * any subcontainers exist. This is theoretically supportable
> +	 * but involves complex error handling, so it's being left until
> +	 * later */
> +	if (!list_empty(&cont->children)) {
> +		return -EBUSY;
> +	}
> +
> +	mutex_lock(&callback_mutex);
> +	/* Process each subsystem */
> +	for (i = 0; i < subsys_count; i++) {
> +		struct container_subsys *ss = subsys[i];
> +		unsigned long bit = 1UL << i;
> +		if (bit & added_bits) {
> +			/* We're binding this subsystem to this hierarchy */
> +			BUG_ON(cont->subsys[i]);
> +			BUG_ON(dummytop->subsys[i]->container != dummytop);
> +			cont->subsys[i] = dummytop->subsys[i];
> +			cont->subsys[i]->container = cont;
> +			list_add(&ss->sibling, &root->subsys_list);
> +			rcu_assign_pointer(ss->hierarchy, hierarchy);
> +			if (ss->bind)
> +				ss->bind(ss, cont);
> +
> +		} else if (bit & removed_bits) {
> +			/* We're removing this subsystem */
> +			BUG_ON(cont->subsys[i] != dummytop->subsys[i]);
> +			BUG_ON(cont->subsys[i]->container != cont);
> +			if (ss->bind)
> +				ss->bind(ss, dummytop);
> +			dummytop->subsys[i]->container = dummytop;
> +			cont->subsys[i] = NULL;
> +			rcu_assign_pointer(subsys[i]->hierarchy, 0);
> +			list_del(&ss->sibling);
> +		} else if (bit & final_bits) {
> +			/* Subsystem state should already exist */
> +			BUG_ON(!cont->subsys[i]);
> +		} else {
> +			/* Subsystem state shouldn't exist */
> +			BUG_ON(cont->subsys[i]);
> +		}
> +	}
> +	root->subsys_bits = final_bits;
> +	mutex_unlock(&callback_mutex);
> +	synchronize_rcu();
> +
> +	return 0;
> +}
> +
> +/*
> + * Release the last use of a hierarchy.  Will never be called when
> + * there are active subcontainers since each subcontainer bumps the
> + * value of sb->s_active.
> + */
> +
> +static void container_put_super(struct super_block *sb) {
> +
> +	struct containerfs_root *root = sb->s_fs_info;
> +	struct container *cont = &root->top_container;
> +	int ret;
> +
> +	root->sb = NULL;
> +	sb->s_fs_info = NULL;
> +
> +	mutex_lock(&manage_mutex);
> +
> +	BUG_ON(root->number_of_containers != 1);
> +	BUG_ON(!list_empty(&cont->children));
> +	BUG_ON(!list_empty(&cont->sibling));
> +	BUG_ON(!root->subsys_bits);
> +
> +	/* Rebind all subsystems back to the default hierarchy */
> +	ret = rebind_subsystems(root, 0);
> +	BUG_ON(ret);
> +
> +	mutex_unlock(&manage_mutex);
> +}
> +
> +static int container_show_options(struct seq_file *seq, struct vfsmount *vfs)
> +{
> +	struct containerfs_root *root = vfs->mnt_sb->s_fs_info;
> +	struct container_subsys *ss;
> +	for_each_subsys(root->top_container.hierarchy, ss) {
> +		seq_printf(seq, ",%s", ss->name);
> +	}
> +	return 0;
> +}
> +
> +/* Convert a hierarchy specifier into a bitmask. LL=manage_mutex */
> +static int parse_containerfs_options(char *opts, unsigned long *bits)
> +{
> +	char *token, *o = opts ?: "all";
> +
> +	*bits = 0;
> +
> +	while ((token = strsep(&o, ",")) != NULL) {
> +		if (!*token)
> +			return -EINVAL;
> +		if (!strcmp(token, "all")) {
> +			*bits = (1 << subsys_count) - 1;
> +		} else {
> +			struct container_subsys *ss;
> +			int i;
> +			for (i = 0; i < subsys_count; i++) {
> +				ss = subsys[i];
> +				if (!strcmp(token, ss->name)) {
> +					*bits |= 1 << i;
> +					break;
> +				}
> +			}
> +			if (i == subsys_count)
> +				return -ENOENT;
> +		}
> +	}
> +
> +	/* We can't have an empty hierarchy */
> +	if (!*bits)
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
> +static int container_remount(struct super_block *sb, int *flags, char *data)
> +{
> +	int ret = 0;
> +	unsigned long subsys_bits;
> +	struct containerfs_root *root = sb->s_fs_info;
> +	struct container *cont = &root->top_container;
> +
> +	mutex_lock(&cont->dentry->d_inode->i_mutex);
> +	mutex_lock(&manage_mutex);
> +
> +	/* See what subsystems are wanted */
> +	ret = parse_containerfs_options(data, &subsys_bits);
> +	if (ret)
> +		goto out_unlock;
> +
> +	ret = rebind_subsystems(root, subsys_bits);
> +
> +	/* (re)populate subsystem files */
> +	if (!ret)
> +		container_populate_dir(cont);
> +
> + out_unlock:
> +	mutex_unlock(&manage_mutex);
> +	mutex_unlock(&cont->dentry->d_inode->i_mutex);
> +	return ret;
> +}
> +
>  static struct super_operations container_ops = {
>  	.statfs = simple_statfs,
>  	.drop_inode = generic_delete_inode,
> +	.put_super = container_put_super,
> +	.show_options = container_show_options,
> +	.remount_fs = container_remount,
>  };
> 
> -static int container_fill_super(struct super_block *sb, void *unused_data,
> -							int unused_silent)
> +static int container_fill_super(struct super_block *sb, void *options,
> +				int unused_silent)
>  {
>  	struct inode *inode;
>  	struct dentry *root;
> +	struct containerfs_root *hroot = options;
> 
>  	sb->s_blocksize = PAGE_CACHE_SIZE;
>  	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
>  	sb->s_magic = CONTAINER_SUPER_MAGIC;
>  	sb->s_op = &container_ops;
> -	container_sb = sb;
> 
> -	inode = container_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR);
> -	if (inode) {
> -		inode->i_op = &simple_dir_inode_operations;
> -		inode->i_fop = &simple_dir_operations;
> -		/* directories start off with i_nlink == 2 (for "." entry) */
> -		inode->i_nlink++;
> -	} else {
> +	inode = container_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb);
> +	if (!inode)
>  		return -ENOMEM;
> -	}
> +
> +	inode->i_op = &simple_dir_inode_operations;
> +	inode->i_fop = &simple_dir_operations;
> +	inode->i_op = &container_dir_inode_operations;
> +	/* directories start off with i_nlink == 2 (for "." entry) */
> +	inc_nlink(inode);
> 
>  	root = d_alloc_root(inode);
>  	if (!root) {
> @@ -315,6 +720,12 @@ static int container_fill_super(struct s
>  		return -ENOMEM;
>  	}
>  	sb->s_root = root;
> +	root->d_fsdata = &hroot->top_container;
> +	hroot->top_container.dentry = root;
> +
> +	sb->s_fs_info = hroot;
> +	hroot->sb = sb;
> +
>  	return 0;
>  }
> 
> @@ -322,7 +733,82 @@ static int container_get_sb(struct file_
>  			 int flags, const char *unused_dev_name,
>  			 void *data, struct vfsmount *mnt)
>  {
> -	return get_sb_single(fs_type, flags, data, container_fill_super, mnt);
> +	int i;
> +	unsigned long subsys_bits = 0;
> +	int ret = 0;
> +	struct containerfs_root *root = NULL;
> +	int hierarchy;
> +
> +	mutex_lock(&manage_mutex);
> +
> +	/* First find the desired set of resource controllers */
> +	ret = parse_containerfs_options(data, &subsys_bits);
> +	if (ret)
> +		goto out_unlock;
> +
> +	/* See if we already have a hierarchy containing this set */
> +
> +	for (i = 1; i < CONFIG_MAX_CONTAINER_HIERARCHIES; i++) {
> +		root = &rootnode[i];
> +		/* We match - use this hierarchy */
> +		if (root->subsys_bits == subsys_bits) break;
> +		/* We clash - fail */
> +		if (root->subsys_bits & subsys_bits) {
> +			ret = -EBUSY;
> +			goto out_unlock;
> +		}
> +	}
> +
> +	if (i == CONFIG_MAX_CONTAINER_HIERARCHIES) {
> +		/* No existing hierarchy matched this set - but we
> +		 * know that all the subsystems are free */
> +		for (i = 1; i < CONFIG_MAX_CONTAINER_HIERARCHIES; i++) {
> +			root = &rootnode[i];
> +			if (!root->sb && !root->subsys_bits) break;
> +		}
> +	}
> +
> +	if (i == CONFIG_MAX_CONTAINER_HIERARCHIES) {
> +		ret = -ENOSPC;
> +		goto out_unlock;
> +	}
> +
> +	hierarchy = i;
> +
> +	if (!root->sb) {
> +		/* We need a new superblock for this container combination */
> +		struct container *cont = &root->top_container;
> +
> +		BUG_ON(root->subsys_bits);
> +		ret = get_sb_nodev(fs_type, flags, root,
> +				   container_fill_super, mnt);
> +		if (ret)
> +			goto out_unlock;
> +
> +		BUG_ON(!list_empty(&cont->sibling));
> +		BUG_ON(!list_empty(&cont->children));
> +		BUG_ON(root->number_of_containers != 1);
> +
> +		ret = rebind_subsystems(root, subsys_bits);
> +
> +		/* It's safe to nest i_mutex inside manage_mutex in
> +		 * this case, since no-one else can be accessing this
> +		 * directory yet */
> +		mutex_lock(&cont->dentry->d_inode->i_mutex);
> +		container_populate_dir(cont);
> +		mutex_unlock(&cont->dentry->d_inode->i_mutex);
> +		BUG_ON(ret);
> +
> +	} else {
> +		/* Reuse the existing superblock */
> +		ret = simple_set_mnt(mnt, root->sb);
> +		if (!ret)
> +			atomic_inc(&root->sb->s_active);
> +	}
> +
> + out_unlock:
> +	mutex_unlock(&manage_mutex);
> +	return ret;
>  }
> 
>  static struct file_system_type container_fs_type = {
> @@ -372,135 +858,79 @@ int container_path(const struct containe
>  }
> 
>  /*
> - * Notify userspace when a container is released, by running
> - * /sbin/container_release_agent with the name of the container (path
> - * relative to the root of container file system) as the argument.
> - *
> - * Most likely, this user command will try to rmdir this container.
> - *
> - * This races with the possibility that some other task will be
> - * attached to this container before it is removed, or that some other
> - * user task will 'mkdir' a child container of this container.  That's ok.
> - * The presumed 'rmdir' will fail quietly if this container is no longer
> - * unused, and this container will be reprieved from its death sentence,
> - * to continue to serve a useful existence.  Next time it's released,
> - * we will get notified again, if it still has 'notify_on_release' set.
> - *
> - * The final arg to call_usermodehelper() is 0, which means don't
> - * wait.  The separate /sbin/container_release_agent task is forked by
> - * call_usermodehelper(), then control in this thread returns here,
> - * without waiting for the release agent task.  We don't bother to
> - * wait because the caller of this routine has no use for the exit
> - * status of the /sbin/container_release_agent task, so no sense holding
> - * our caller up for that.
> - *
> - * When we had only one container mutex, we had to call this
> - * without holding it, to avoid deadlock when call_usermodehelper()
> - * allocated memory.  With two locks, we could now call this while
> - * holding manage_mutex, but we still don't, so as to minimize
> - * the time manage_mutex is held.
> + * Attach task 'tsk' to container 'cont'
> + *
> + * Call holding manage_mutex.  May take callback_mutex and task_lock of
> + * the task 'tsk' during call.
>   */
> 
> -static void container_release_agent(const char *pathbuf)
> +static int attach_task(struct container *cont, struct task_struct *tsk)
>  {
> -	char *argv[3], *envp[3];
> -	int i;
> -
> -	if (!pathbuf)
> -		return;
> -
> -	i = 0;
> -	argv[i++] = release_agent_path;
> -	argv[i++] = (char *)pathbuf;
> -	argv[i] = NULL;
> -
> -	i = 0;
> -	/* minimal command environment */
> -	envp[i++] = "HOME=/";
> -	envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
> -	envp[i] = NULL;
> +	int retval = 0;
> +	struct container_subsys *ss;
> +	struct container_group *oldcg, *newcg;
> +	struct container *oldcont;
> +	int h = cont->hierarchy;
> 
> -	call_usermodehelper(argv[0], argv, envp, 0);
> -	kfree(pathbuf);
> -}
> +	/* Nothing to do if the task is already in that container */
> +	if (tsk->containers->container[h] == cont)
> +		return 0;
> 
> -/*
> - * Either cont->count of using tasks transitioned to zero, or the
> - * cont->children list of child containers just became empty.  If this
> - * cont is notify_on_release() and now both the user count is zero and
> - * the list of children is empty, prepare container path in a kmalloc'd
> - * buffer, to be returned via ppathbuf, so that the caller can invoke
> - * container_release_agent() with it later on, once manage_mutex is dropped.
> - * Call here with manage_mutex held.
> - *
> - * This check_for_release() routine is responsible for kmalloc'ing
> - * pathbuf.  The above container_release_agent() is responsible for
> - * kfree'ing pathbuf.  The caller of these routines is responsible
> - * for providing a pathbuf pointer, initialized to NULL, then
> - * calling check_for_release() with manage_mutex held and the address
> - * of the pathbuf pointer, then dropping manage_mutex, then calling
> - * container_release_agent() with pathbuf, as set by check_for_release().
> - */
> -
> -static void check_for_release(struct container *cont, char **ppathbuf)
> -{
> -	if (notify_on_release(cont) && atomic_read(&cont->count) == 0 &&
> -	    list_empty(&cont->children)) {
> -		char *buf;
> -
> -		buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
> -		if (!buf)
> -			return;
> -
> -		if (container_path(cont, buf, PAGE_SIZE) < 0)
> -			kfree(buf);
> -		else
> -			*ppathbuf = buf;
> +	for_each_subsys(h, ss) {
> +		if (ss->can_attach) {
> +			retval = ss->can_attach(ss, cont, tsk);
> +			if (retval) {
> +				put_task_struct(tsk);
> +				return retval;
> +			}
> +		}
>  	}
> -}
> 
> +	/* Locate or allocate a new container_group for this task,
> +	 * based on its final set of containers */
> + 	oldcg = tsk->containers;
> +	newcg = find_container_group(oldcg, cont);
> +	if (!newcg) {
> +		put_task_struct(tsk);
> +		return -ENOMEM;
> +	}
> 
> -/*
> - * update_flag - read a 0 or a 1 in a file and update associated flag
> - * bit:	the bit to update (CONT_NOTIFY_ON_RELEASE)
> - * cont: the container to update
> - * buf:	the buffer where we read the 0 or 1
> - *
> - * Call with manage_mutex held.
> - */
> -
> -static int update_flag(container_flagbits_t bit, struct container *cont, char *buf)
> -{
> -	int turning_on;
> +	mutex_lock(&callback_mutex);
> +	task_lock(tsk);
> +	rcu_assign_pointer(tsk->containers, newcg);
> +	task_unlock(tsk);
> 
> -	turning_on = (simple_strtoul(buf, NULL, 10) != 0);
> +	oldcont = oldcg->container[h];
> +	for_each_subsys(h, ss) {
> +		if (ss->attach) {
> +			ss->attach(ss, cont, oldcont, tsk);
> +		}
> +	}
> 
> -	mutex_lock(&callback_mutex);
> -	if (turning_on)
> -		set_bit(bit, &cont->flags);
> -	else
> -		clear_bit(bit, &cont->flags);
>  	mutex_unlock(&callback_mutex);
> 
> +	for_each_subsys(h, ss) {
> +		if (ss->post_attach) {
> +			ss->post_attach(ss, cont, oldcont, tsk);
> +		}
> +	}
> +
> +	synchronize_rcu();
> +	put_container_group(oldcg);
>  	return 0;
>  }
> 
> -
>  /*
> - * Attack task specified by pid in 'pidbuf' to container 'cont', possibly
> - * writing the path of the old container in 'ppathbuf' if it needs to be
> - * notified on release.
> + * Attach task with pid 'pid' to container 'cont'. Call with
> + * manage_mutex, may take callback_mutex and task_lock of task
>   *
> - * Call holding manage_mutex.  May take callback_mutex and task_lock of
> - * the task 'pid' during call.
>   */
> 
> -static int attach_task(struct container *cont, char *pidbuf, char **ppathbuf)
> +static int attach_task_by_pid(struct container *cont, char *pidbuf)
>  {
>  	pid_t pid;
>  	struct task_struct *tsk;
> -	struct container *oldcont;
> -	int retval = 0;
> +	int ret;
> 
>  	if (sscanf(pidbuf, "%d", &pid) != 1)
>  		return -EIO;
> @@ -527,43 +957,9 @@ static int attach_task(struct container 
>  		get_task_struct(tsk);
>  	}
> 
> -#ifdef CONFIG_CPUSETS
> -	retval = cpuset_can_attach_task(cont, tsk);
> -#endif
> -	if (retval) {
> -		put_task_struct(tsk);
> -		return retval;
> -	}
> -
> -	mutex_lock(&callback_mutex);
> -
> -	task_lock(tsk);
> -	oldcont = tsk->container;
> -	if (!oldcont) {
> -		task_unlock(tsk);
> -		mutex_unlock(&callback_mutex);
> -		put_task_struct(tsk);
> -		return -ESRCH;
> -	}
> -	atomic_inc(&cont->count);
> -	rcu_assign_pointer(tsk->container, cont);
> -	task_unlock(tsk);
> -
> -#ifdef CONFIG_CPUSETS
> -	cpuset_attach_task(cont, tsk);
> -#endif
> -
> -	mutex_unlock(&callback_mutex);
> -
> -#ifdef CONFIG_CPUSETS
> -	cpuset_post_attach_task(cont, oldcont, tsk);
> -#endif
> -
> +	ret = attach_task(cont, tsk);
>  	put_task_struct(tsk);
> -	synchronize_rcu();
> -	if (atomic_dec_and_test(&oldcont->count))
> -		check_for_release(oldcont, ppathbuf);
> -	return 0;
> +	return ret;
>  }
> 
>  /* The various types of files and directories in a container file system */
> @@ -571,9 +967,7 @@ static int attach_task(struct container 
>  typedef enum {
>  	FILE_ROOT,
>  	FILE_DIR,
> -	FILE_NOTIFY_ON_RELEASE,
>  	FILE_TASKLIST,
> -	FILE_RELEASE_AGENT,
>  } container_filetype_t;
> 
>  static ssize_t container_common_file_write(struct container *cont,
> @@ -584,7 +978,6 @@ static ssize_t container_common_file_wri
>  {
>  	container_filetype_t type = cft->private;
>  	char *buffer;
> -	char *pathbuf = NULL;
>  	int retval = 0;
> 
>  	if (nbytes >= PATH_MAX)
> @@ -608,26 +1001,9 @@ static ssize_t container_common_file_wri
>  	}
> 
>  	switch (type) {
> -	case FILE_NOTIFY_ON_RELEASE:
> -		retval = update_flag(CONT_NOTIFY_ON_RELEASE, cont, buffer);
> -		break;
>  	case FILE_TASKLIST:
> -		retval = attach_task(cont, buffer, &pathbuf);
> -		break;
> -	case FILE_RELEASE_AGENT:
> -	{
> -		if (nbytes < sizeof(release_agent_path)) {
> -			/* We never write anything other than '\0'
> -			 * into the last char of release_agent_path,
> -			 * so it always remains a NUL-terminated
> -			 * string */
> -			strncpy(release_agent_path, buffer, nbytes);
> -			release_agent_path[nbytes] = 0;
> -		} else {
> -			retval = -ENOSPC;
> -		}
> +		retval = attach_task_by_pid(cont, buffer);
>  		break;
> -	}
>  	default:
>  		retval = -EINVAL;
>  		goto out2;
> @@ -637,7 +1013,6 @@ static ssize_t container_common_file_wri
>  		retval = nbytes;
>  out2:
>  	mutex_unlock(&manage_mutex);
> -	container_release_agent(pathbuf);
>  out1:
>  	kfree(buffer);
>  	return retval;
> @@ -646,80 +1021,27 @@ out1:
>  static ssize_t container_file_write(struct file *file, const char __user *buf,
>  						size_t nbytes, loff_t *ppos)
>  {
> -	ssize_t retval = 0;
>  	struct cftype *cft = __d_cft(file->f_dentry);
>  	struct container *cont = __d_cont(file->f_dentry->d_parent);
>  	if (!cft)
>  		return -ENODEV;
> +	if (!cft->write)
> +		return -EINVAL;
> 
> -	/* special function ? */
> -	if (cft->write)
> -		retval = cft->write(cont, cft, file, buf, nbytes, ppos);
> -	else
> -		retval = -EINVAL;
> -
> -	return retval;
> +	return cft->write(cont, cft, file, buf, nbytes, ppos);
>  }
> 
> -static ssize_t container_common_file_read(struct container *cont,
> -					  struct cftype *cft,
> -					  struct file *file,
> -					  char __user *buf,
> -					  size_t nbytes, loff_t *ppos)
> +static ssize_t container_file_read(struct file *file, char __user *buf,
> +				   size_t nbytes, loff_t *ppos)
>  {
> -	container_filetype_t type = cft->private;
> -	char *page;
> -	ssize_t retval = 0;
> -	char *s;
> -
> -	if (!(page = (char *)__get_free_page(GFP_KERNEL)))
> -		return -ENOMEM;
> -
> -	s = page;
> -
> -	switch (type) {
> -	case FILE_NOTIFY_ON_RELEASE:
> -		*s++ = notify_on_release(cont) ? '1' : '0';
> -		break;
> -	case FILE_RELEASE_AGENT:
> -	{
> -		size_t n;
> -		container_manage_lock();
> -		n = strnlen(release_agent_path, sizeof(release_agent_path));
> -		n = min(n, (size_t) PAGE_SIZE);
> -		strncpy(s, release_agent_path, n);
> -		container_manage_unlock();
> -		s += n;
> -		break;
> -	}
> -	default:
> -		retval = -EINVAL;
> -		goto out;
> -	}
> -	*s++ = '\n';
> -
> -	retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
> -out:
> -	free_page((unsigned long)page);
> -	return retval;
> -}
> -
> -static ssize_t container_file_read(struct file *file, char __user *buf, size_t nbytes,
> -								loff_t *ppos)
> -{
> -	ssize_t retval = 0;
>  	struct cftype *cft = __d_cft(file->f_dentry);
>  	struct container *cont = __d_cont(file->f_dentry->d_parent);
>  	if (!cft)
>  		return -ENODEV;
> +	if (!cft->read)
> +		return -EINVAL;
> 
> -	/* special function ? */
> -	if (cft->read)
> -		retval = cft->read(cont, cft, file, buf, nbytes, ppos);
> -	else
> -		retval = -EINVAL;
> -
> -	return retval;
> +	return cft->read(cont, cft, file, buf, nbytes, ppos);
>  }
> 
>  static int container_file_open(struct inode *inode, struct file *file)
> @@ -780,7 +1102,7 @@ static struct inode_operations container
>  	.rename = container_rename,
>  };
> 
> -static int container_create_file(struct dentry *dentry, int mode)
> +static int container_create_file(struct dentry *dentry, int mode, struct super_block *sb)
>  {
>  	struct inode *inode;
> 
> @@ -789,7 +1111,7 @@ static int container_create_file(struct 
>  	if (dentry->d_inode)
>  		return -EEXIST;
> 
> -	inode = container_new_inode(mode);
> +	inode = container_new_inode(mode, sb);
>  	if (!inode)
>  		return -ENOMEM;
> 
> @@ -798,7 +1120,11 @@ static int container_create_file(struct 
>  		inode->i_fop = &simple_dir_operations;
> 
>  		/* start off with i_nlink == 2 (for "." entry) */
> -		inode->i_nlink++;
> +		inc_nlink(inode);
> +
> +		/* start with the directory inode held, so that we can
> +		 * populate it without racing with another mkdir */
> +		mutex_lock(&inode->i_mutex);
>  	} else if (S_ISREG(mode)) {
>  		inode->i_size = 0;
>  		inode->i_fop = &container_file_operations;
> @@ -818,20 +1144,19 @@ static int container_create_file(struct 
>   *	mode:	mode to set on new directory.
>   */
> 
> -static int container_create_dir(struct container *cont, const char *name, int mode)
> +static int container_create_dir(struct container *cont, struct dentry *dentry,
> +				int mode)
>  {
> -	struct dentry *dentry = NULL;
>  	struct dentry *parent;
>  	int error = 0;
> 
>  	parent = cont->parent->dentry;
> -	dentry = container_get_dentry(parent, name);
>  	if (IS_ERR(dentry))
>  		return PTR_ERR(dentry);
> -	error = container_create_file(dentry, S_IFDIR | mode);
> +	error = container_create_file(dentry, S_IFDIR | mode, cont->root->sb);
>  	if (!error) {
>  		dentry->d_fsdata = cont;
> -		parent->d_inode->i_nlink++;
> +		inc_nlink(parent->d_inode);
>  		cont->dentry = dentry;
>  	}
>  	dput(dentry);
> @@ -845,19 +1170,40 @@ int container_add_file(struct container 
>  	struct dentry *dentry;
>  	int error;
> 
> -	mutex_lock(&dir->d_inode->i_mutex);
> +	BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
>  	dentry = container_get_dentry(dir, cft->name);
>  	if (!IS_ERR(dentry)) {
> -		error = container_create_file(dentry, 0644 | S_IFREG);
> +		error = container_create_file(dentry, 0644 | S_IFREG, cont->root->sb);
>  		if (!error)
>  			dentry->d_fsdata = (void *)cft;
>  		dput(dentry);
>  	} else
>  		error = PTR_ERR(dentry);
> -	mutex_unlock(&dir->d_inode->i_mutex);
>  	return error;
>  }
> 
> +/* Count the number of tasks in a container. Could be made more
> + * time-efficient but less space-efficient with more linked lists
> + * running through each container and the container_group structures
> + * that referenced it. */
> +
> +int container_task_count(const struct container *cont) {
> +	int count = 0;
> +	int hierarchy = cont->hierarchy;
> +	struct list_head *l;
> +	spin_lock(&container_group_lock);
> +	l = &init_container_group.list;
> +	do {
> +		struct container_group *cg =
> +			list_entry(l, struct container_group, list);
> +		if (cg->container[hierarchy] == cont)
> +			count += atomic_read(&cg->ref.refcount);
> +		l = l->next;
> +	} while (l != &init_container_group.list);
> +	spin_unlock(&container_group_lock);
> +	return count;
> +}
> +
>  /*
>   * Stuff for reading the 'tasks' file.
>   *
> @@ -881,20 +1227,23 @@ struct ctr_struct {
>  };
> 
>  /*
> - * Load into 'pidarray' up to 'npids' of the tasks using container 'cont'.
> - * Return actual number of pids loaded.  No need to task_lock(p)
> - * when reading out p->container, as we don't really care if it changes
> - * on the next cycle, and we are not going to try to dereference it.
> + * Load into 'pidarray' up to 'npids' of the tasks using container
> + * 'cont'.  Return actual number of pids loaded.  No need to
> + * task_lock(p) when reading out p->container, since we're in an RCU
> + * read section, so the container_group can't go away, and is
> + * immutable after creation.
>   */
>  static int pid_array_load(pid_t *pidarray, int npids, struct container *cont)
>  {
>  	int n = 0;
>  	struct task_struct *g, *p;
> +	int h = cont->hierarchy;
> 
> +	rcu_read_lock();
>  	read_lock(&tasklist_lock);
> 
>  	do_each_thread(g, p) {
> -		if (p->container == cont) {
> +		if (p->containers->container[h] == cont) {
>  			pidarray[n++] = pid_nr(task_pid(p));
>  			if (unlikely(n == npids))
>  				goto array_full;
> @@ -903,6 +1252,7 @@ static int pid_array_load(pid_t *pidarra
> 
>  array_full:
>  	read_unlock(&tasklist_lock);
> +	rcu_read_unlock();
>  	return n;
>  }
> 
> @@ -953,7 +1303,7 @@ static int container_tasks_open(struct i
>  	 * caller from the case that the additional container users didn't
>  	 * show up until sometime later on.
>  	 */
> -	npids = atomic_read(&cont->count);
> +	npids = container_task_count(cont);
>  	pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
>  	if (!pidarray)
>  		goto err1;
> @@ -1020,38 +1370,34 @@ static struct cftype cft_tasks = {
>  	.private = FILE_TASKLIST,
>  };
> 
> -static struct cftype cft_notify_on_release = {
> -	.name = "notify_on_release",
> -	.read = container_common_file_read,
> -	.write = container_common_file_write,
> -	.private = FILE_NOTIFY_ON_RELEASE,
> -};
> -
> -static struct cftype cft_release_agent = {
> -	.name = "release_agent",
> -	.read = container_common_file_read,
> -	.write = container_common_file_write,
> -	.private = FILE_RELEASE_AGENT,
> -};
> -
>  static int container_populate_dir(struct container *cont)
>  {
>  	int err;
> +	struct container_subsys *ss;
> +
> +	/* First clear out any existing files */
> +	container_clear_directory(cont->dentry);
> 
> -	if ((err = container_add_file(cont, &cft_notify_on_release)) < 0)
> -		return err;
>  	if ((err = container_add_file(cont, &cft_tasks)) < 0)
>  		return err;
> -	if ((cont == &top_container) &&
> -	    (err = container_add_file(cont, &cft_release_agent)) < 0)
> -		return err;
> -#ifdef CONFIG_CPUSETS
> -	if ((err = cpuset_populate_dir(cont)) < 0)
> -		return err;
> -#endif
> +
> +	for_each_subsys(cont->hierarchy, ss) {
> +		if (ss->populate && (err = ss->populate(ss, cont)) < 0)
> +			return err;
> +	}
> +
>  	return 0;
>  }
> 
> +static void init_container_css(struct container_subsys *ss,
> +			       struct container *cont)
> +{
> +	struct container_subsys_state *css = cont->subsys[ss->subsys_id];
> +	css->container = cont;
> +	spin_lock_init(&css->refcnt_lock);
> +	atomic_set(&css->refcnt, 0);
> +}
> +
>  /*
>   *	container_create - create a container
>   *	parent:	container that will be parent of the new container.
> @@ -1061,61 +1407,83 @@ static int container_populate_dir(struct
>   *	Must be called with the mutex on the parent inode held
>   */
> 
> -static long container_create(struct container *parent, const char *name, int mode)
> +static long container_create(struct container *parent, struct dentry *dentry,
> +			     int mode)
>  {
>  	struct container *cont;
> -	int err;
> +	struct containerfs_root *root = parent->root;
> +	int err = 0;
> +	struct container_subsys *ss;
> +	struct super_block *sb = root->sb;
> 
> -	cont = kmalloc(sizeof(*cont), GFP_KERNEL);
> +	cont = kzalloc(sizeof(*cont), GFP_KERNEL);
>  	if (!cont)
>  		return -ENOMEM;
> 
> +	/* Grab a reference on the superblock so the hierarchy doesn't
> +	 * get deleted on unmount if there are child containers.  This
> +	 * can be done outside manage_mutex, since the sb can't
> +	 * disappear while someone has an open control file on the
> +	 * fs */
> +	atomic_inc(&sb->s_active);
> +
>  	mutex_lock(&manage_mutex);
> +
>  	cont->flags = 0;
> -	if (notify_on_release(parent))
> -		set_bit(CONT_NOTIFY_ON_RELEASE, &cont->flags);
>  	atomic_set(&cont->count, 0);
>  	INIT_LIST_HEAD(&cont->sibling);
>  	INIT_LIST_HEAD(&cont->children);
> 
>  	cont->parent = parent;
> -
> -#ifdef CONFIG_CPUSETS
> -	err = cpuset_create(cont);
> -	if (err)
> -		goto err_unlock_free;
> -#endif
> +	cont->root = parent->root;
> +	cont->hierarchy = parent->hierarchy;
> +	cont->top_container = parent->top_container;
> +
> +	for_each_subsys(cont->hierarchy, ss) {
> +		err = ss->create(ss, cont);
> +		if (err) goto err_destroy;
> +		init_container_css(ss, cont);
> +	}
> 
>  	mutex_lock(&callback_mutex);
>  	list_add(&cont->sibling, &cont->parent->children);
> -	number_of_containers++;
> +	root->number_of_containers++;
>  	mutex_unlock(&callback_mutex);
> 
> -	err = container_create_dir(cont, name, mode);
> +	err = container_create_dir(cont, dentry, mode);
>  	if (err < 0)
>  		goto err_remove;
> 
> -	/*
> -	 * Release manage_mutex before container_populate_dir() because it
> -	 * will down() this new directory's i_mutex and if we race with
> -	 * another mkdir, we might deadlock.
> -	 */
> -	mutex_unlock(&manage_mutex);
> +	/* The container directory was pre-locked for us */
> +	BUG_ON(!mutex_is_locked(&cont->dentry->d_inode->i_mutex));
> 
>  	err = container_populate_dir(cont);
>  	/* If err < 0, we have a half-filled directory - oh well ;) */
> +
> +	mutex_unlock(&manage_mutex);
> +	mutex_unlock(&cont->dentry->d_inode->i_mutex);
> +
>  	return 0;
> 
>   err_remove:
> -#ifdef CONFIG_CPUSETS
> -	cpuset_destroy(cont);
> -#endif
> +
>  	mutex_lock(&callback_mutex);
>  	list_del(&cont->sibling);
> -	number_of_containers--;
> +	root->number_of_containers--;
>  	mutex_unlock(&callback_mutex);
> - err_unlock_free:
> +
> + err_destroy:
> +
> +	for_each_subsys(cont->hierarchy, ss) {
> +		if (cont->subsys[ss->subsys_id])
> +			ss->destroy(ss, cont);
> +	}
> +
>  	mutex_unlock(&manage_mutex);
> +
> +	/* Release the reference count that we took on the superblock */
> +	deactivate_super(sb);
> +
>  	kfree(cont);
>  	return err;
>  }
> @@ -1125,26 +1493,20 @@ static int container_mkdir(struct inode 
>  	struct container *c_parent = dentry->d_parent->d_fsdata;
> 
>  	/* the vfs holds inode->i_mutex already */
> -	return container_create(c_parent, dentry->d_name.name, mode | S_IFDIR);
> +	return container_create(c_parent, dentry, mode | S_IFDIR);
>  }
> 
> -/*
> - * Locking note on the strange update_flag() call below:
> - *
> - * If the container being removed is marked cpu_exclusive, then simulate
> - * turning cpu_exclusive off, which will call update_cpu_domains().
> - * The lock_cpu_hotplug() call in update_cpu_domains() must not be
> - * made while holding callback_mutex.  Elsewhere the kernel nests
> - * callback_mutex inside lock_cpu_hotplug() calls.  So the reverse
> - * nesting would risk an ABBA deadlock.
> - */
> -
>  static int container_rmdir(struct inode *unused_dir, struct dentry *dentry)
>  {
>  	struct container *cont = dentry->d_fsdata;
>  	struct dentry *d;
>  	struct container *parent;
> -	char *pathbuf = NULL;
> +	struct container_subsys *ss;
> +	struct super_block *sb;
> +	struct containerfs_root *root;
> +	unsigned long flags;
> +	int css_busy = 0;
> +	int hierarchy;
> 
>  	/* the vfs holds both inode->i_mutex already */
> 
> @@ -1157,82 +1519,331 @@ static int container_rmdir(struct inode 
>  		mutex_unlock(&manage_mutex);
>  		return -EBUSY;
>  	}
> +
> +	hierarchy = cont->hierarchy;
>  	parent = cont->parent;
> +	root = cont->root;
> +	sb = root->sb;
> +
> +	local_irq_save(flags);
> +	/* Check each container, locking the refcnt lock and testing
> +	 * the refcnt. This will lock out any calls to css_get() */
> +	for_each_subsys(hierarchy, ss) {
> +		struct container_subsys_state *css;
> +		css = cont->subsys[ss->subsys_id];
> +		spin_lock(&css->refcnt_lock);
> +		css_busy += atomic_read(&css->refcnt);
> +	}
> +	/* Go through and release all the locks; if we weren't busy,
> +	 * then set the refcount to -1 to prevent css_get() from adding
> +	 * a refcount */
> +	for_each_subsys(hierarchy, ss) {
> +		struct container_subsys_state *css;
> +		css = cont->subsys[ss->subsys_id];
> +		if (!css_busy) atomic_dec(&css->refcnt);
> +		spin_unlock(&css->refcnt_lock);
> +	}
> +	local_irq_restore(flags);
> +	if (css_busy) {
> +		mutex_unlock(&manage_mutex);
> +		return -EBUSY;
> +	}
> +
> +	for_each_subsys(hierarchy, ss) {
> +		if (cont->subsys[ss->subsys_id])
> +			ss->destroy(ss, cont);
> +	}
> +
>  	mutex_lock(&callback_mutex);
>  	set_bit(CONT_REMOVED, &cont->flags);
> -	list_del(&cont->sibling);	/* delete my sibling from parent->children */
> +	/* delete my sibling from parent->children */
> +	list_del(&cont->sibling);
>  	spin_lock(&cont->dentry->d_lock);
>  	d = dget(cont->dentry);
>  	cont->dentry = NULL;
>  	spin_unlock(&d->d_lock);
> +
>  	container_d_remove_dir(d);
>  	dput(d);
> -	number_of_containers--;
> +	root->number_of_containers--;
>  	mutex_unlock(&callback_mutex);
> -#ifdef CONFIG_CPUSETS
> -	cpuset_destroy(cont);
> -#endif
> -	if (list_empty(&parent->children))
> -		check_for_release(parent, &pathbuf);
> +
>  	mutex_unlock(&manage_mutex);
> -	container_release_agent(pathbuf);
> +	/* Drop the active superblock reference that we took when we
> +	 * created the container */
> +	deactivate_super(sb);
>  	return 0;
>  }
> 
> -/*
> - * container_init_early - probably not needed yet, but will be needed
> - * once cpusets are hooked into this code
> +static atomic_t namecnt;
> +static void get_unused_name(char *buf) {
> +	sprintf(buf, "node%d", atomic_inc_return(&namecnt));
> +}
> +
> +/**
> + * container_clone - duplicate the current container and move this
> + * task into the new child
>   */
> +int container_clone(struct task_struct *tsk, struct container_subsys *subsys)
> +{
> +	struct dentry *dentry;
> +	int ret = 0;
> +	char nodename[32];
> +	struct container *parent, *child;
> +	struct inode *inode;
> +	int h;
> +
> +	/* We shouldn't be called by an unregistered subsystem */
> +	BUG_ON(subsys->subsys_id < 0);
> +
> +	/* First figure out what hierarchy and container we're dealing
> +	 * with, and pin them so we can drop manage_mutex */
> +	mutex_lock(&manage_mutex);
> + again:
> +	h = subsys->hierarchy;
> +	if (h == 0) {
> +		printk(KERN_INFO
> +		       "Not cloning container for unused subsystem %s\n",
> +		       subsys->name);
> +		mutex_unlock(&manage_mutex);
> +		return 0;
> +	}
> +	parent = tsk->containers->container[h];
> +	/* Pin the hierarchy */
> +	atomic_inc(&parent->root->sb->s_active);
> +	/* Keep the container alive */
> +	atomic_inc(&parent->count);
> +	mutex_unlock(&manage_mutex);
> +
> +	/* Now do the VFS work to create a container */
> +	get_unused_name(nodename);
> +	inode = parent->dentry->d_inode;
> +
> +	/* Hold the parent directory mutex across this operation to
> +	 * stop anyone else deleting the new container */
> +	mutex_lock(&inode->i_mutex);
> +	dentry = container_get_dentry(parent->dentry, nodename);
> +	if (IS_ERR(dentry)) {
> +		printk(KERN_INFO
> +		       "Couldn't allocate dentry for %s: %ld\n", nodename,
> +		       PTR_ERR(dentry));
> +		ret = PTR_ERR(dentry);
> +		goto out_release;
> +	}
> +
> +	/* Create the container directory, which also creates the container */
> +	ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755);
> +	child = __d_cont(dentry);
> +	dput(dentry);
> +	if (ret) {
> +		printk(KERN_INFO
> +		       "Failed to create container %s: %d\n", nodename,
> +		       ret);
> +		goto out_release;
> +	}
> +
> +	if (!child) {
> +		printk(KERN_INFO
> +		       "Couldn't find new container %s\n", nodename);
> +		ret = -ENOMEM;
> +		goto out_release;
> +	}
> +
> +	/* The container now exists. Retake manage_mutex and check
> +	 * that we're still in the same state that we thought we
> +	 * were. */
> +	mutex_lock(&manage_mutex);
> +	if ((h != subsys->hierarchy) ||
> +	    (parent != tsk->containers->container[h])) {
> +		/* Aargh, we raced ... */
> +		mutex_unlock(&inode->i_mutex);
> +		atomic_dec(&parent->count);
> +		deactivate_super(parent->root->sb);
> +		printk(KERN_INFO
> +		       "Race in container_clone() - leaking container %s\n",
> +		       nodename);
> +		goto again;
> +	}
> +
> +	/* All seems fine. Finish by moving the task into the new container */
> +	ret = attach_task(child, tsk);
> +	mutex_unlock(&manage_mutex);
> +
> + out_release:
> +	mutex_unlock(&inode->i_mutex);
> +	atomic_dec(&parent->count);
> +	deactivate_super(parent->root->sb);
> +	return ret;
> +}
> +
> +int container_is_descendant(const struct container *cont) {
> +	int ret;
> +	struct container *target;
> +	container_lock();
> +	target = current->containers->container[cont->hierarchy];
> +	while (cont != target && cont!= target->top_container) {
> +		cont = cont->parent;
> +	}
> +	ret = (cont == target);
> +	container_unlock();
> +	return ret;
> +}
> +
> +/**
> + * container_init_early - initialize containers at system boot
> + *
> + * Description: Initialize the container housekeeping structures
> + **/
> 
>  int __init container_init_early(void)
>  {
> -	struct task_struct *tsk = current;
> +	int i;
> +
> +	kref_init(&init_container_group.ref);
> +	get_container_group(&init_container_group);
> +	INIT_LIST_HEAD(&init_container_group.list);
> +	container_group_count = 1;
> +
> +	for (i = 0; i < CONFIG_MAX_CONTAINER_HIERARCHIES; i++) {
> +		struct containerfs_root *root = &rootnode[i];
> +		struct container *cont = &root->top_container;
> +		INIT_LIST_HEAD(&root->subsys_list);
> +		root->number_of_containers = 1;
> +
> +		cont->root = root;
> +		cont->hierarchy = i;
> +		INIT_LIST_HEAD(&cont->sibling);
> +		INIT_LIST_HEAD(&cont->children);
> +		cont->top_container = cont;
> +		atomic_set(&cont->count, 1);
> +
> +		init_container_group.container[i] = cont;
> +	}
> +	init_task.containers = &init_container_group;
> 
> -	tsk->container = &top_container;
>  	return 0;
>  }
> 
>  /**
> - * container_init - initialize containers at system boot
> - *
> - * Description: Initialize top_container and the container internal file system,
> + * container_init - register container filesystem and /proc file
>   **/
> 
>  int __init container_init(void)
>  {
> -	struct dentry *root;
>  	int err;
> -
> -	init_task.container = &top_container;
> +	struct proc_dir_entry *entry;
> 
>  	err = register_filesystem(&container_fs_type);
>  	if (err < 0)
>  		goto out;
> -	container_mount = kern_mount(&container_fs_type);
> -	if (IS_ERR(container_mount)) {
> -		printk(KERN_ERR "container: could not mount!\n");
> -		err = PTR_ERR(container_mount);
> -		container_mount = NULL;
> -		goto out;
> -	}
> -	root = container_mount->mnt_sb->s_root;
> -	root->d_fsdata = &top_container;
> -	root->d_inode->i_nlink++;
> -	top_container.dentry = root;
> -	root->d_inode->i_op = &container_dir_inode_operations;
> -	number_of_containers = 1;
> -	err = container_populate_dir(&top_container);
> +
> +	entry = create_proc_entry("containers", 0, NULL);
> +	if (entry)
> +		entry->proc_fops = &proc_containerstats_operations;
> +
>  out:
>  	return err;
>  }
> 
> +#include <asm/proto.h>

I did have a problem with this include: <asm/proto.h> doesn't exist on
s390, so I've just been running without it (with no problems).  A quick
'find' suggests the header only exists on x86_64, so I'd expect build
failures on all other arches.

-serge
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Follow-Ups:
- Re: [PATCH 3/7] containers (V7): Add generic multi-subsystem API to containers
  - From: Cedric Le Goater <[email protected]>
References:
- [PATCH 0/7] containers (V7): Generic Process Containers
  - From: [email protected]
Prev by Date: Re: [PATCH 6/7] containers (V7): BeanCounters over generic process containers
Next by Date: Re: Documenting MS_RELATIME
Previous by thread: Re: [PATCH 3/7] containers (V7): Add generic multi-subsystem API to containers
Next by thread: Re: [PATCH 3/7] containers (V7): Add generic multi-subsystem API to containers
Index(es):
- Date
- Thread
[Index of Archives] [Kernel Newbies] [Netfilter] [Bugtraq] [Photo] [Stuff] [Gimp] [Yosemite News] [MIPS Linux] [ARM Linux] [Linux Security] [Linux RAID] [Video 4 Linux] [Linux for the blind] [Linux Resources]