[PATCH] AF_UNIX recv/shutdown race

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On some of our systems (Linux 2.6, 4-way SMP), we have found a race where
occasionally recv() can detect a shutdown before the last bytes written to
the socket, and will exhibit the odd behavior where recv() will return 0
to indicate a shutdown socket, and a subsequent call will return the last
bit data, after which it will return 0 again.

This is demonstrated by the attached test cases (cli.c and srv.c).  Start
one srv and several cli processes (perhaps 4-8) on a 2x dual core P4 (most
likely other systems as well, but I haven't reproduced it on other
configurations), and you will soon see this behavior.

What happens is that in unix_stream_recvmsg, the system checks for any
skbuffs on the queue ready to return.  Meanwhile, immediately afterwards,
another process/processor writes an skbuff and shuts down the sock.  Our
original reader process then checks shutdown, and returns 0.  On the next
call, it finally sees the skbuff written.

This patch combines af_unix's unix_sk(sk).readlock with
unix_state_rlock/runlock(sk).  readlock was only ever used by bind,
autobind, and unix_{stream,dgram}_recvmsg.  I can't see any way of
ensuring correctness for unix_stream_recvmsg without forcing it to hold
the state lock across getting the skbuf and checking shutdown.


Signed-off-by: Joseph Malicki <[email protected]>
-------------------------------
diff -rup linux-2.6.18.orig/include/net/af_unix.h
linux-2.6.18/include/net/af_unix.h
--- linux-2.6.18.orig/include/net/af_unix.h	2006-09-19 23:42:06.000000000
-0400
+++ linux-2.6.18/include/net/af_unix.h	2006-10-02 19:42:07.000000000 -0400
@@ -78,7 +78,6 @@ struct unix_sock {
         struct unix_address     *addr;
         struct dentry		*dentry;
         struct vfsmount		*mnt;
-	struct mutex		readlock;
         struct sock		*peer;
         struct sock		*other;
         struct sock		*gc_tree;
diff -rup linux-2.6.18.orig/net/unix/af_unix.c
linux-2.6.18/net/unix/af_unix.c
--- linux-2.6.18.orig/net/unix/af_unix.c	2006-09-19 23:42:06.000000000
-0400 +++ linux-2.6.18/net/unix/af_unix.c	2006-10-06 15:28:44.000000000
-0400 @@ -50,6 +50,8 @@
  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
  *	     				the core infrastructure is doing that
  *	     				for all net proto families now (2.5.69+)
+ *	       Joseph Malicki	:	Fix SMP race between write/shutdown
+ *					and read.
  *
  *
  * Known differences from reference BSD that was tested:
@@ -593,7 +595,6 @@ static struct sock * unix_create1(struct
 	u->mnt	  = NULL;
 	spin_lock_init(&u->lock);
 	atomic_set(&u->inflight, sock ? 0 : -1);
-	mutex_init(&u->readlock); /* single task reading lock */
 	init_waitqueue_head(&u->peer_wait);
 	unix_insert_socket(unix_sockets_unbound, sk);
 out:
@@ -650,7 +651,7 @@ static int unix_autobind(struct socket *
 	struct unix_address * addr;
 	int err;

-	mutex_lock(&u->readlock);
+	unix_state_rlock(sk);

 	err = 0;
 	if (u->addr)
@@ -687,7 +688,7 @@ retry:
 	spin_unlock(&unix_table_lock);
 	err = 0;

-out:	mutex_unlock(&u->readlock);
+out:	unix_state_runlock(sk);
 	return err;
 }

@@ -770,7 +771,7 @@ static int unix_bind(struct socket *sock
 		goto out;
 	addr_len = err;

-	mutex_lock(&u->readlock);
+	unix_state_rlock(sk);

 	err = -EINVAL;
 	if (u->addr)
@@ -842,7 +843,7 @@ static int unix_bind(struct socket *sock
 out_unlock:
 	spin_unlock(&unix_table_lock);
 out_up:
-	mutex_unlock(&u->readlock);
+	unix_state_runlock(sk);
 out:
 	return err;

@@ -1572,7 +1573,7 @@ static int unix_dgram_recvmsg(struct kio

 	msg->msg_namelen = 0;

-	mutex_lock(&u->readlock);
+	unix_state_rlock(sk);

 	skb = skb_recv_datagram(sk, flags, noblock, &err);
 	if (!skb)
@@ -1628,7 +1629,7 @@ static int unix_dgram_recvmsg(struct kio
 out_free:
 	skb_free_datagram(sk,skb);
 out_unlock:
-	mutex_unlock(&u->readlock);
+	unix_state_runlock(sk);
 out:
 	return err;
 }
@@ -1674,7 +1675,6 @@ static int unix_stream_recvmsg(struct ki
 	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
 	struct scm_cookie tmp_scm;
 	struct sock *sk = sock->sk;
-	struct unix_sock *u = unix_sk(sk);
 	struct sockaddr_un *sunaddr=msg->msg_name;
 	int copied = 0;
 	int check_creds = 0;
@@ -1704,7 +1704,7 @@ static int unix_stream_recvmsg(struct ki
 		memset(&tmp_scm, 0, sizeof(tmp_scm));
 	}

-	mutex_lock(&u->readlock);
+	unix_state_rlock(sk);

 	do
 	{
@@ -1728,7 +1728,7 @@ static int unix_stream_recvmsg(struct ki
 			err = -EAGAIN;
 			if (!timeo)
 				break;
-			mutex_unlock(&u->readlock);
+			unix_state_runlock(sk);

 			timeo = unix_stream_data_wait(sk, timeo);

@@ -1736,7 +1736,7 @@ static int unix_stream_recvmsg(struct ki
 				err = sock_intr_errno(timeo);
 				goto out;
 			}
-			mutex_lock(&u->readlock);
+			unix_state_rlock(sk);
 			continue;
 		}

@@ -1802,7 +1802,7 @@ static int unix_stream_recvmsg(struct ki
 		}
 	} while (size);

-	mutex_unlock(&u->readlock);
+	unix_state_runlock(sk);
 	scm_recv(sock, msg, siocb->scm, flags);
 out:
 	return copied ? : err;


diff -rup linux-2.6.18.orig/include/net/af_unix.h linux-2.6.18/include/net/af_unix.h
--- linux-2.6.18.orig/include/net/af_unix.h	2006-09-19 23:42:06.000000000 -0400
+++ linux-2.6.18/include/net/af_unix.h	2006-10-02 19:42:07.000000000 -0400
@@ -78,7 +78,6 @@ struct unix_sock {
         struct unix_address     *addr;
         struct dentry		*dentry;
         struct vfsmount		*mnt;
-	struct mutex		readlock;
         struct sock		*peer;
         struct sock		*other;
         struct sock		*gc_tree;
diff -rup linux-2.6.18.orig/net/unix/af_unix.c linux-2.6.18/net/unix/af_unix.c
--- linux-2.6.18.orig/net/unix/af_unix.c	2006-09-19 23:42:06.000000000 -0400
+++ linux-2.6.18/net/unix/af_unix.c	2006-10-06 15:28:44.000000000 -0400
@@ -50,6 +50,8 @@
  *	     Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
  *	     				the core infrastructure is doing that
  *	     				for all net proto families now (2.5.69+)
+ *	       Joseph Malicki	:	Fix SMP race between write/shutdown
+ *					and read.
  *
  *
  * Known differences from reference BSD that was tested:
@@ -593,7 +595,6 @@ static struct sock * unix_create1(struct
 	u->mnt	  = NULL;
 	spin_lock_init(&u->lock);
 	atomic_set(&u->inflight, sock ? 0 : -1);
-	mutex_init(&u->readlock); /* single task reading lock */
 	init_waitqueue_head(&u->peer_wait);
 	unix_insert_socket(unix_sockets_unbound, sk);
 out:
@@ -650,7 +651,7 @@ static int unix_autobind(struct socket *
 	struct unix_address * addr;
 	int err;
 
-	mutex_lock(&u->readlock);
+	unix_state_rlock(sk);
 
 	err = 0;
 	if (u->addr)
@@ -687,7 +688,7 @@ retry:
 	spin_unlock(&unix_table_lock);
 	err = 0;
 
-out:	mutex_unlock(&u->readlock);
+out:	unix_state_runlock(sk);
 	return err;
 }
 
@@ -770,7 +771,7 @@ static int unix_bind(struct socket *sock
 		goto out;
 	addr_len = err;
 
-	mutex_lock(&u->readlock);
+	unix_state_rlock(sk);
 
 	err = -EINVAL;
 	if (u->addr)
@@ -842,7 +843,7 @@ static int unix_bind(struct socket *sock
 out_unlock:
 	spin_unlock(&unix_table_lock);
 out_up:
-	mutex_unlock(&u->readlock);
+	unix_state_runlock(sk);
 out:
 	return err;
 
@@ -1572,7 +1573,7 @@ static int unix_dgram_recvmsg(struct kio
 
 	msg->msg_namelen = 0;
 
-	mutex_lock(&u->readlock);
+	unix_state_rlock(sk);
 
 	skb = skb_recv_datagram(sk, flags, noblock, &err);
 	if (!skb)
@@ -1628,7 +1629,7 @@ static int unix_dgram_recvmsg(struct kio
 out_free:
 	skb_free_datagram(sk,skb);
 out_unlock:
-	mutex_unlock(&u->readlock);
+	unix_state_runlock(sk);
 out:
 	return err;
 }
@@ -1674,7 +1675,6 @@ static int unix_stream_recvmsg(struct ki
 	struct sock_iocb *siocb = kiocb_to_siocb(iocb);
 	struct scm_cookie tmp_scm;
 	struct sock *sk = sock->sk;
-	struct unix_sock *u = unix_sk(sk);
 	struct sockaddr_un *sunaddr=msg->msg_name;
 	int copied = 0;
 	int check_creds = 0;
@@ -1704,7 +1704,7 @@ static int unix_stream_recvmsg(struct ki
 		memset(&tmp_scm, 0, sizeof(tmp_scm));
 	}
 
-	mutex_lock(&u->readlock);
+	unix_state_rlock(sk);
 
 	do
 	{
@@ -1728,7 +1728,7 @@ static int unix_stream_recvmsg(struct ki
 			err = -EAGAIN;
 			if (!timeo)
 				break;
-			mutex_unlock(&u->readlock);
+			unix_state_runlock(sk);
 
 			timeo = unix_stream_data_wait(sk, timeo);
 
@@ -1736,7 +1736,7 @@ static int unix_stream_recvmsg(struct ki
 				err = sock_intr_errno(timeo);
 				goto out;
 			}
-			mutex_lock(&u->readlock);
+			unix_state_rlock(sk);
 			continue;
 		}
 
@@ -1802,7 +1802,7 @@ static int unix_stream_recvmsg(struct ki
 		}
 	} while (size);
 
-	mutex_unlock(&u->readlock);
+	unix_state_runlock(sk);
 	scm_recv(sock, msg, siocb->scm, flags);
 out:
 	return copied ? : err;
/* a client in the unix domain */
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <stdio.h>

int main(int argc, char *argv[])
{
  int conn;
  struct sockaddr_un  serv_addr;

  char buffer[4];
  int  len;
   
  while( 1 ) {  
    bzero((char *)&serv_addr,sizeof(serv_addr));
    serv_addr.sun_family = AF_UNIX;
    strcpy(serv_addr.sun_path, "/tmp/ugh");
    
    conn = socket(AF_UNIX, SOCK_STREAM,0);
    if( conn < 0 ) {
      perror( "couldn't create socket" );
      return -1;
    }
    
    if( connect( conn, (struct sockaddr *)&serv_addr,
                 strlen(serv_addr.sun_path) + sizeof(serv_addr.sun_family) ) < 0 ) {
      perror( "couldn't connect" );
      return -1; 
    }
    
    bzero(buffer, 4);
    len = recv(conn,buffer,3,MSG_WAITALL);
    if( len != 3 ) {
      perror( "eh?" );
      printf( "HUH: %d\n", len );
      len = recv(conn,buffer,3,MSG_WAITALL);
      if( len == 3 ) {
        printf( "WTF: %d, %s\n", len, buffer );
      }
      return -1;
    }
    else {
      printf( "%s\n", buffer );
    }
    shutdown(conn,2);
    close(conn);
  }

   return 0;
}
/* a server in the unix domain.  The pathname of 
   the socket address is passed as an argument */
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <stdio.h>

int main(int argc, char *argv[])
{
  int listener;
  struct sockaddr_un serv_addr;
  int len;
  int conn;
  
  listener = socket(AF_UNIX,SOCK_STREAM,0);
  if( listener < 0 ) {
    perror( "couldn't create socket" );
    return -1;
  }


  bzero( (char *)&serv_addr, sizeof(serv_addr) );
  serv_addr.sun_family = AF_UNIX;
  unlink( "/tmp/ugh" );
  strcpy( serv_addr.sun_path, "/tmp/ugh" );
  
  if( bind( listener, (struct sockaddr *)&serv_addr,
            strlen(serv_addr.sun_path) + sizeof(serv_addr.sun_family) ) < 0 ) {
    perror( "bind failed" );
    return -1;         
  }

  listen( listener, 0 );
  
  while( 1 ) {
    conn = accept( listener, NULL, NULL );
    if( conn < 0 ) {
      perror( "accept() failed" );
      return -1;
    }
    len = write( conn, "one", 3 );
    if(len != 3) {
	    printf("short write: %d!\n", len);
    }
    shutdown( conn, 2 );
    close( conn );
  }
  
  return 0;
}

[Index of Archives]     [Kernel Newbies]     [Netfilter]     [Bugtraq]     [Photo]     [Stuff]     [Gimp]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Video 4 Linux]     [Linux for the blind]     [Linux Resources]
  Powered by Linux