Re: [PATCH 1/2] (repost) New System call, unshare (fwd)

* Janak Desai (janak@us.ibm.com) wrote:
> diff -Naurp linux-2.6.13-mm1/kernel/fork.c linux-2.6.13-mm1+unshare-patch1/kernel/fork.c
> --- linux-2.6.13-mm1/kernel/fork.c	2005-09-07 13:25:01.000000000 +0000
> +++ linux-2.6.13-mm1+unshare-patch1/kernel/fork.c	2005-09-07 13:40:11.000000000 +0000
> @@ -58,6 +58,17 @@ int nr_threads; 		/* The idle threads do
>  
>  int max_threads;		/* tunable limit on nr_threads */
>  
> +/*
> + * mm_copy gets called from clone or unshare system calls. When called
> + * from clone, mm_struct may be shared depending on the clone flags
> + * argument, however, when called from the unshare system call, a private
> + * copy of mm_struct is made.
> + */
> +enum mm_copy_share {
> +	MAY_SHARE,
> +	UNSHARE,
> +};
> +
>  DEFINE_PER_CPU(unsigned long, process_counts) = 0;
>  
>   __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
> @@ -448,16 +459,26 @@ void mm_release(struct task_struct *tsk,
>  	}
>  }
>  
> -static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
> +static int copy_mm(unsigned long clone_flags, struct task_struct * tsk,
> +			enum mm_copy_share copy_share_action)

minor nitpick, pretty verbose name, could just be share.

>  {
>  	struct mm_struct * mm, *oldmm;
>  	int retval;
>  
> -	tsk->min_flt = tsk->maj_flt = 0;
> -	tsk->nvcsw = tsk->nivcsw = 0;
> +	/*
> +	 * If the process memory is being duplicated as part of the
> +	 * unshare system call, we are working with the current process
> +	 * and not a newly allocated task strucutre, and should not
> +	 * zero out fault info, context switch counts, mm and active_mm
> +	 * fields.
> +	 */
> +	if (copy_share_action == MAY_SHARE) {
> +		tsk->min_flt = tsk->maj_flt = 0;
> +		tsk->nvcsw = tsk->nivcsw = 0;
>  
> -	tsk->mm = NULL;
> -	tsk->active_mm = NULL;
> +		tsk->mm = NULL;
> +		tsk->active_mm = NULL;
> +	}
>  
>  	/*
>  	 * Are we cloning a kernel thread?
> @@ -1002,7 +1023,7 @@ static task_t *copy_process(unsigned lon
>  		goto bad_fork_cleanup_fs;
>  	if ((retval = copy_signal(clone_flags, p)))
>  		goto bad_fork_cleanup_sighand;
> -	if ((retval = copy_mm(clone_flags, p)))
> +	if ((retval = copy_mm(clone_flags, p, MAY_SHARE)))
>  		goto bad_fork_cleanup_signal;
>  	if ((retval = copy_keys(clone_flags, p)))
>  		goto bad_fork_cleanup_mm;
> @@ -1317,3 +1338,172 @@ void __init proc_caches_init(void)
>  			sizeof(struct mm_struct), 0,
>  			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
>  }
> +
> +/*
> + * unshare_mm is called from the unshare system call handler function to
> + * make a private copy of the mm_struct structure. It calls copy_mm with
> + * CLONE_VM flag cleard, to ensure that a private copy of mm_struct is made,
> + * and with mm_copy_share enum set to UNSHARE, to ensure that copy_mm
> + * does not clear fault info, context switch counts, mm and active_mm
> + * fields of the mm_struct.
> + */
> +static int unshare_mm(unsigned long unshare_flags, struct task_struct *tsk)
> +{
> +	int retval = 0;
> +	struct mm_struct *mm = tsk->mm;
> +
> +	/*
> +	 * If the virtual memory is being shared, make a private
> +	 * copy and disassociate the process from the shared virtual
> +	 * memory.
> +	 */
> +	if (atomic_read(&mm->mm_users) > 1) {
> +		retval = copy_mm((unshare_flags & ~CLONE_VM), tsk, UNSHARE);

Looks like this could allow aio to be done on behalf of wrong mm?  Hmm,
and what about the futex code?

> +
> +		/*
> +		 * If copy_mm was successful, decrement the number of users
> +		 * on the original, shared, mm_struct.
> +		 */
> +		if (!retval)
> +			atomic_dec(&mm->mm_users);

This is not sufficient.  Should be mmput().  It's trivial to bump the
refcount without it actually being shared, so you may have just leaked
the mm_struct if you were last holder.

> +	}
> +	return retval;
> +}
> +
> +/*
> + * unshare_sighand is called from the unshare system call handler function to
> + * make a private copy of the sighand_struct structure. It calls copy_sighand
> + * with CLONE_SIGHAND cleared to ensure that a new signal handler structure
> + * is cloned from the current shared one.
> + */
> +static int unshare_sighand(unsigned long unshare_flags, struct task_struct *tsk)
> +{
> +	int retval = 0;
> +	struct sighand_struct *sighand = tsk->sighand;
> +
> +	/*
> +	 * If the signal handlers are being shared, make a private
> +	 * copy and disassociate the process from the shared signal
> +	 * handlers.
> +	 */
> +	if (atomic_read(&sighand->count) > 1) {
> +		retval = copy_sighand((unshare_flags & ~CLONE_SIGHAND), tsk);
> +
> +		/*
> +		 * If copy_sighand was successful, decrement the use count
> +		 * on the original, shared, sighand_struct.
> +		 */
> +		if (!retval)
> +			atomic_dec(&sighand->count);

Actually, if other thread that was sharing sighand is doing execve() or
exit(), this atomic_dec could drop count to either 0 (it's now leaked) or
1, in which case the exit/execve will drop count to 0 and destroy
sighand (which you later restore under error conditions).

> +	}
> +	return retval;
> +}
> +
> +/*
> + * unshare_namespace is called from the unshare system call handler
> + * function to make a private copy of the current shared namespace. It
> + * calls copy_namespace with CLONE_NEWNS set to ensure that a new
> + * namespace is cloned from the current namespace.
> + */
> +static int unshare_namespace(struct task_struct *tsk)
> +{
> +	int retval = 0;
> +	struct namespace *namespace = tsk->namespace;
> +
> +	/*
> +	 * If the namespace is being shared, make a private copy
> +	 * and disassociate the process from the shared namespace.
> +	 */
> +	if (atomic_read(&namespace->count) > 1) {
> +		retval = copy_namespace(CLONE_NEWNS, tsk);
> +
> +		/*
> +		 * If copy_namespace was successful, decrement the use count
> +		 * on the original, shared, namespace struct.
> +		 */
> +		if (!retval)
> +			atomic_dec(&namespace->count);

This can leak namespace.

> +	}
> +	return retval;
> +}
> +
> +/*
> + * unshare allows a process to 'unshare' part of the process
> + * context which was originally shared using clone.
> + */
> +asmlinkage long sys_unshare(unsigned long unshare_flags)
> +{
> +	struct task_struct *tsk = current;
> +	int retval = 0;
> +	struct namespace *namespace;
> +	struct mm_struct *mm;
> +	struct sighand_struct *sighand;
> +
> +	if (unshare_flags & ~(CLONE_NEWNS | CLONE_VM | CLONE_SIGHAND))
> +		goto bad_unshare_invalid_val;

Just start with retval (I'd call it err) set to -EINVAL, and goto
unshare_out directly.

> +	/*
> +	 * Shared signal handlers imply shared VM, so if CLONE_SIGHAND is
> +	 * set, CLONE_VM must also be set in the system call argument.
> +	 */
> +	if ((unshare_flags & CLONE_SIGHAND) && !(unshare_flags & CLONE_VM))
> +		goto bad_unshare_invalid_val;

ditto

> +	task_lock(tsk);

The current copy_* are able to make assumptions about visibility of new
task (it's not yet globally visible).  Did you do careful review to
make sure you're not violating such assumptions.

Simple example, unshare_* below are called with task_lock held, but then
they do allocations with GFP_KERNEL via copy_*.

> +	namespace = tsk->namespace;
> +	mm = tsk->mm;
> +	sighand = tsk->sighand;
> +
> +	if (unshare_flags & CLONE_VM) {
> +		retval = unshare_mm(unshare_flags, tsk);
> +		if (retval)
> +			goto unshare_unlock_task;
> +		else if (unshare_flags & CLONE_SIGHAND) {

Looks like this doesn't properly handle:

clone(CLONE_SIGHAND|CLONE_VM);
unshare(CLONE_VM);

I think you'll need to check if sighand->count > 1.

> +			retval = unshare_sighand(unshare_flags, tsk);
> +			if (retval)
> +				goto bad_unshare_cleanup_mm;
> +		}
> +	}
> +
> +	if (unshare_flags & CLONE_NEWNS) {
> +		retval = unshare_namespace(tsk);

This poses a problem for:

clone(CLONE_FS)
unshare(CLONE_NEWNS)

You must guarantee unshared ->fs, since copy_namespace is going to write
to it.

> +		if (retval)
> +			goto bad_unshare_cleanup_sighand;
> +	}
> +
> +unshare_unlock_task:
> +	task_unlock(tsk);
> +
> +unshare_out:
> +	return retval;
> +
> +bad_unshare_cleanup_sighand:
> +	/*
> +	 * If signal handlers were unshared (private copy was made),
> +	 * clean them up (delete the private copy) and restore
> +	 * the task to point to the old, shared, value.
> +	 */
> +	if (unshare_flags & CLONE_SIGHAND) {
> +		exit_sighand(tsk);
> +		tsk->sighand = sighand;

sighand could be long gone, this is trouble waiting to happen.

> +		atomic_inc(&sighand->count);
> +	}
> +
> +bad_unshare_cleanup_mm:
> +	/*
> +	 * If mm struct was unshared (private copy was made),
> +	 * clean it up (delete the private copy) and restore
> +	 * the task to point to the old, shared, value.
> +	 */
> +	if (unshare_flags & CLONE_VM) {
> +		if (tsk->mm)
> +			mmput(tsk->mm);
> +		tsk->mm = mm;

mm could be destroyed already.

> +		atomic_inc(&mm->mm_users);
> +	}
> +	goto unshare_unlock_task;
> +
> +bad_unshare_invalid_val:
> +	retval = -EINVAL;
> +	goto unshare_out;
> +}
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
Follow-Ups:
- Re: [PATCH 1/2] (repost) New System call, unshare (fwd)
  - From: Janak Desai <janak@us.ibm.com>
- Re: [PATCH 1/2] (repost) New System call, unshare (fwd)
  - From: Janak Desai <janak@us.ibm.com>
References:
- [PATCH 1/2] (repost) New System call, unshare (fwd)
  - From: Janak Desai <janak@us.ibm.com>
Prev by Date: Re: [ham] Re: Gracefully killing kswapd, or any kernel thread
Next by Date: Re: [patch] KGDB for Real-Time Preemption systems
Previous by thread: [PATCH 1/2] (repost) New System call, unshare (fwd)
Next by thread: Re: [PATCH 1/2] (repost) New System call, unshare (fwd)
Index(es):
- Date
- Thread
[Index of Archives] [Kernel Newbies] [Netfilter] [Bugtraq] [Photo] [Gimp] [Yosemite News] [MIPS Linux] [ARM Linux] [Linux Security] [Linux RAID] [Video 4 Linux] [Linux for the blind]