[RFC][PATCH 2/9] deadlock prevention core

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



The core of the VM deadlock avoidance framework.

>From the 'user' side of things it provides a function to mark a 'struct sock'
as SOCK_MEMALLOC, meaning this socket may dip into the memalloc reserves on
the receive side.

>From the net_device side of things, the extra 'struct net_device *' argument
to {,__}netdev_alloc_skb() is used to attribute/account the memalloc usage.
Converted drivers will make use of this new API and will set NETIF_F_MEMALLOC
to indicate the driver fully supports this feature.

When a SOCK_MEMALLOC socket is marked, the device is checked for this feature
and tries to increase the memalloc pool; if both succeed, the device is marked
with IFF_MEMALLOC, indicating to {,__}netdev_alloc_skb() that it is OK to dip
into the memalloc pool.

Memalloc sk_buff allocations are not done from the SLAB but are done using 
alloc_pages(). sk_buff::memalloc records this exception so that kfree_skbmem()
can do the right thing.

Signed-off-by: Peter Zijlstra <[email protected]>
Signed-off-by: Daniel Phillips <[email protected]>

---
 include/linux/gfp.h       |    3 -
 include/linux/if.h        |    1 
 include/linux/mmzone.h    |    1 
 include/linux/netdevice.h |   48 ++++++++++++++-----
 include/linux/skbuff.h    |    3 -
 include/net/sock.h        |    8 +++
 mm/page_alloc.c           |   29 ++++++++++-
 net/core/dev.c            |    1 
 net/core/skbuff.c         |  114 +++++++++++++++++++++++++++++++++++++++++++---
 net/core/sock.c           |   54 +++++++++++++++++++++
 net/ethernet/eth.c        |    1 
 net/ipv4/af_inet.c        |   14 +++++
 net/ipv4/icmp.c           |    5 ++
 net/ipv4/tcp_ipv4.c       |    6 ++
 net/ipv4/udp.c            |   11 ++++
 15 files changed, 274 insertions(+), 25 deletions(-)

Index: linux-2.6/include/linux/gfp.h
===================================================================
--- linux-2.6.orig/include/linux/gfp.h
+++ linux-2.6/include/linux/gfp.h
@@ -46,6 +46,7 @@ struct vm_area_struct;
 #define __GFP_ZERO	((__force gfp_t)0x8000u)/* Return zeroed page on success */
 #define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) /* Don't use emergency reserves */
 #define __GFP_HARDWALL   ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */
+#define __GFP_MEMALLOC  ((__force gfp_t)0x40000u) /* Use emergency reserves */
 
 #define __GFP_BITS_SHIFT 20	/* Room for 20 __GFP_FOO bits */
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
@@ -54,7 +55,7 @@ struct vm_area_struct;
 #define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \
 			__GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \
 			__GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP| \
-			__GFP_NOMEMALLOC|__GFP_HARDWALL)
+			__GFP_NOMEMALLOC|__GFP_HARDWALL|__GFP_MEMALLOC)
 
 /* This equals 0, but use constants in case they ever change */
 #define GFP_NOWAIT	(GFP_ATOMIC & ~__GFP_HIGH)
Index: linux-2.6/include/linux/mmzone.h
===================================================================
--- linux-2.6.orig/include/linux/mmzone.h
+++ linux-2.6/include/linux/mmzone.h
@@ -420,6 +420,7 @@ int percpu_pagelist_fraction_sysctl_hand
 					void __user *, size_t *, loff_t *);
 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
 			struct file *, void __user *, size_t *, loff_t *);
+int adjust_memalloc_reserve(int bytes);
 
 #include <linux/topology.h>
 /* Returns the number of the current Node. */
Index: linux-2.6/include/linux/netdevice.h
===================================================================
--- linux-2.6.orig/include/linux/netdevice.h
+++ linux-2.6/include/linux/netdevice.h
@@ -298,18 +298,22 @@ struct net_device
 
 	/* Net device features */
 	unsigned long		features;
-#define NETIF_F_SG		1	/* Scatter/gather IO. */
-#define NETIF_F_IP_CSUM		2	/* Can checksum only TCP/UDP over IPv4. */
-#define NETIF_F_NO_CSUM		4	/* Does not require checksum. F.e. loopack. */
-#define NETIF_F_HW_CSUM		8	/* Can checksum all the packets. */
-#define NETIF_F_HIGHDMA		32	/* Can DMA to high memory. */
-#define NETIF_F_FRAGLIST	64	/* Scatter/gather IO. */
-#define NETIF_F_HW_VLAN_TX	128	/* Transmit VLAN hw acceleration */
-#define NETIF_F_HW_VLAN_RX	256	/* Receive VLAN hw acceleration */
-#define NETIF_F_HW_VLAN_FILTER	512	/* Receive filtering on VLAN */
-#define NETIF_F_VLAN_CHALLENGED	1024	/* Device cannot handle VLAN packets */
-#define NETIF_F_GSO		2048	/* Enable software GSO. */
-#define NETIF_F_LLTX		4096	/* LockLess TX */
+#define NETIF_F_SG		0x0001	/* Scatter/gather IO. */
+#define NETIF_F_IP_CSUM		0x0002	/* Can checksum only TCP/UDP over IPv4. */
+#define NETIF_F_NO_CSUM		0x0004	/* Does not require checksum. F.e. loopack. */
+#define NETIF_F_HW_CSUM		0x0008	/* Can checksum all the packets. */
+
+#define NETIF_F_HIGHDMA		0x0010	/* Can DMA to high memory. */
+#define NETIF_F_FRAGLIST	0x0020	/* Scatter/gather IO. */
+#define NETIF_F_HW_VLAN_TX	0x0040	/* Transmit VLAN hw acceleration */
+#define NETIF_F_HW_VLAN_RX	0x0080	/* Receive VLAN hw acceleration */
+
+#define NETIF_F_HW_VLAN_FILTER	0x0100	/* Receive filtering on VLAN */
+#define NETIF_F_VLAN_CHALLENGED	0x0200	/* Device cannot handle VLAN packets */
+#define NETIF_F_GSO		0x0400	/* Enable software GSO. */
+#define NETIF_F_LLTX		0x0800	/* LockLess TX */
+
+#define NETIF_F_MEMALLOC	0x1000	/* Supports {SOCK,__GFP}_MEMALLOC */
 
 	/* Segmentation offload features */
 #define NETIF_F_GSO_SHIFT	16
@@ -409,6 +413,12 @@ struct net_device
 	struct Qdisc		*qdisc_sleeping;
 	struct list_head	qdisc_list;
 	unsigned long		tx_queue_len;	/* Max frames per queue allowed */
+	int			rx_reserve;
+	atomic_t		rx_reserve_used;
+
+	atomic_t		memalloc_socks;
+	unsigned long		memalloc_reserve;
+	spinlock_t		memalloc_lock;
 
 	/* Partially transmitted GSO packet. */
 	struct sk_buff		*gso_skb;
@@ -686,6 +696,20 @@ static inline void dev_kfree_skb_irq(str
  */
 extern void dev_kfree_skb_any(struct sk_buff *skb);
 
+/*
+ * Support for critical network IO under low memory conditions
+ */
+static inline int dev_reserve_used(struct net_device *dev)
+{
+	return atomic_read(&dev->rx_reserve_used);
+}
+
+static inline void dev_unreserve_skb(struct net_device *dev)
+{
+	if (atomic_dec_return(&dev->rx_reserve_used) < 0)
+		atomic_inc(&dev->rx_reserve_used);
+}
+
 #define HAVE_NETIF_RX 1
 extern int		netif_rx(struct sk_buff *skb);
 extern int		netif_rx_ni(struct sk_buff *skb);
Index: linux-2.6/include/linux/skbuff.h
===================================================================
--- linux-2.6.orig/include/linux/skbuff.h
+++ linux-2.6/include/linux/skbuff.h
@@ -282,7 +282,8 @@ struct sk_buff {
 				nfctinfo:3;
 	__u8			pkt_type:3,
 				fclone:2,
-				ipvs_property:1;
+				ipvs_property:1,
+				memalloc:1;
 	__be16			protocol;
 
 	void			(*destructor)(struct sk_buff *skb);
Index: linux-2.6/include/net/sock.h
===================================================================
--- linux-2.6.orig/include/net/sock.h
+++ linux-2.6/include/net/sock.h
@@ -391,6 +391,7 @@ enum sock_flags {
 	SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */
 	SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */
 	SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */
+	SOCK_MEMALLOC, /* protocol can use memalloc reserve */
 };
 
 static inline void sock_copy_flags(struct sock *nsk, struct sock *osk)
@@ -413,6 +414,13 @@ static inline int sock_flag(struct sock 
 	return test_bit(flag, &sk->sk_flags);
 }
 
+static inline int sk_is_memalloc(struct sock *sk)
+{
+	return sock_flag(sk, SOCK_MEMALLOC);
+}
+
+extern int sk_set_memalloc(struct sock *sk);
+
 static inline void sk_acceptq_removed(struct sock *sk)
 {
 	sk->sk_ack_backlog--;
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c
+++ linux-2.6/mm/page_alloc.c
@@ -82,6 +82,7 @@ EXPORT_SYMBOL(zone_table);
 
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
 int min_free_kbytes = 1024;
+int var_free_kbytes;
 
 unsigned long __meminitdata nr_kernel_pages;
 unsigned long __meminitdata nr_all_pages;
@@ -970,8 +971,8 @@ restart:
 
 	/* This allocation should allow future memory freeing. */
 
-	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
-			&& !in_interrupt()) {
+	if ((((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
+			&& !in_interrupt()) || (gfp_mask & __GFP_MEMALLOC)) {
 		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
 nofail_alloc:
 			/* go through the zonelist yet again, ignoring mins */
@@ -2196,7 +2197,8 @@ static void setup_per_zone_lowmem_reserv
  */
 void setup_per_zone_pages_min(void)
 {
-	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
+	unsigned pages_min = (min_free_kbytes + var_free_kbytes)
+		>> (PAGE_SHIFT - 10);
 	unsigned long lowmem_pages = 0;
 	struct zone *zone;
 	unsigned long flags;
@@ -2248,6 +2250,27 @@ void setup_per_zone_pages_min(void)
 	calculate_totalreserve_pages();
 }
 
+int adjust_memalloc_reserve(int pages)
+{
+	int kbytes = var_free_kbytes + (pages << (PAGE_SHIFT - 10));
+	if (kbytes < 0)
+		return -EINVAL;
+	var_free_kbytes = kbytes;
+	setup_per_zone_pages_min();
+	if (pages > 0) {
+		int i;
+		pg_data_t *pgdat;
+		for_each_online_pgdat(pgdat) {
+			for (i = 0; i < pgdat->nr_zones; ++i)
+				wakeup_kswapd(&pgdat->node_zones[i], 0);
+		}
+	}
+	printk(KERN_DEBUG "RX reserve: %d\n", var_free_kbytes);
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(adjust_memalloc_reserve);
+
 /*
  * Initialise min_free_kbytes.
  *
Index: linux-2.6/net/core/skbuff.c
===================================================================
--- linux-2.6.orig/net/core/skbuff.c
+++ linux-2.6/net/core/skbuff.c
@@ -43,6 +43,7 @@
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
+#include <linux/pagemap.h>
 #include <linux/interrupt.h>
 #include <linux/in.h>
 #include <linux/inet.h>
@@ -125,6 +126,8 @@ EXPORT_SYMBOL(skb_truesize_bug);
  *
  */
 
+#define ceiling_log2(x)	fls((x) - 1)
+
 /**
  *	__alloc_skb	-	allocate a network buffer
  *	@size: size to allocate
@@ -147,6 +150,49 @@ struct sk_buff *__alloc_skb(unsigned int
 	struct sk_buff *skb;
 	u8 *data;
 
+	size = SKB_DATA_ALIGN(size);
+
+	if (gfp_mask & __GFP_MEMALLOC) {
+		/*
+		 * We have to do higher order allocations for icky jumbo
+		 * frame drivers :-(
+		 * They really should be migrated to scater/gather DMA
+		 * and use skb fragments.
+		 */
+		unsigned int data_offset =
+			sizeof(struct sk_buff) + sizeof(unsigned int);
+		unsigned long length = size + data_offset +
+			sizeof(struct skb_shared_info);
+		unsigned int pages;
+		unsigned int order;
+		struct page *page;
+		void *kaddr;
+
+		/*
+		 * force fclone alloc in order to fudge a lacking in skb_clone().
+		 */
+		fclone = 1;
+		if (fclone) {
+			data_offset += sizeof(struct sk_buff) + sizeof(atomic_t);
+			length += sizeof(struct sk_buff) + sizeof(atomic_t);
+		}
+		pages = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		order = ceiling_log2(pages);
+
+		skb = NULL;
+		if (!(page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order)))
+			goto out;
+
+		kaddr = pfn_to_kaddr(page_to_pfn(page));
+		skb = (struct sk_buff *)kaddr;
+
+		*((unsigned int *)(kaddr + data_offset -
+					sizeof(unsigned int))) = order;
+		data = (u8 *)(kaddr + data_offset);
+
+		goto allocated;
+	}
+
 	cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
 
 	/* Get the HEAD */
@@ -155,12 +201,13 @@ struct sk_buff *__alloc_skb(unsigned int
 		goto out;
 
 	/* Get the DATA. Size must match skb_add_mtu(). */
-	size = SKB_DATA_ALIGN(size);
 	data = ____kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
 	if (!data)
 		goto nodata;
 
+allocated:
 	memset(skb, 0, offsetof(struct sk_buff, truesize));
+	skb->memalloc = !!(gfp_mask & __GFP_MEMALLOC);
 	skb->truesize = size + sizeof(struct sk_buff);
 	atomic_set(&skb->users, 1);
 	skb->head = data;
@@ -185,6 +232,7 @@ struct sk_buff *__alloc_skb(unsigned int
 		atomic_set(fclone_ref, 1);
 
 		child->fclone = SKB_FCLONE_UNAVAILABLE;
+		child->memalloc = skb->memalloc;
 	}
 out:
 	return skb;
@@ -250,7 +298,7 @@ nodata:
 }
 
 /**
- *	__netdev_alloc_skb - allocate an skbuff for rx on a specific device
+ *	___netdev_alloc_skb - allocate an skbuff for rx on a specific device
  *	@dev: network device to receive on
  *	@length: length to allocate
  *	@gfp_mask: get_free_pages mask, passed to alloc_skb
@@ -262,7 +310,7 @@ nodata:
  *
  *	%NULL is returned if there is no free memory.
  */
-struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
+static struct sk_buff *___netdev_alloc_skb(struct net_device *dev,
 		unsigned int length, gfp_t gfp_mask)
 {
 	struct sk_buff *skb;
@@ -273,6 +321,34 @@ struct sk_buff *__netdev_alloc_skb(struc
 	return skb;
 }
 
+struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
+		unsigned length, gfp_t gfp_mask)
+{
+	struct sk_buff *skb;
+
+	if (dev && (dev->flags & IFF_MEMALLOC)) {
+		WARN_ON(gfp_mask & (__GFP_NOMEMALLOC | __GFP_MEMALLOC));
+		gfp_mask &= ~(__GFP_NOMEMALLOC | __GFP_MEMALLOC);
+
+		if ((skb = ___netdev_alloc_skb(dev, length,
+					       gfp_mask | __GFP_NOMEMALLOC)))
+			goto done;
+		if (dev_reserve_used(dev) >= dev->rx_reserve)
+			goto out;
+		if (!(skb = ___netdev_alloc_skb(dev, length,
+						gfp_mask | __GFP_MEMALLOC)))
+			goto out;
+		atomic_inc(&dev->rx_reserve_used);
+	} else
+		if (!(skb = ___netdev_alloc_skb(dev, length, gfp_mask)))
+			goto out;
+
+done:
+	skb->dev = dev;
+out:
+	return skb;
+}
+
 static void skb_drop_list(struct sk_buff **listp)
 {
 	struct sk_buff *list = *listp;
@@ -313,10 +389,19 @@ static void skb_release_data(struct sk_b
 		if (skb_shinfo(skb)->frag_list)
 			skb_drop_fraglist(skb);
 
-		kfree(skb->head);
+		if (!skb->memalloc)
+			kfree(skb->head);
 	}
 }
 
+static void free_skb_pages(struct kmem_cache *cache, void *objp)
+{
+	struct sk_buff *skb = (struct sk_buff *)objp;
+	unsigned int order =
+		*(unsigned int *)(skb->head - sizeof(unsigned int));
+	free_pages((unsigned long)skb, order);
+}
+
 /*
  *	Free an skbuff by memory without cleaning the state.
  */
@@ -324,17 +409,21 @@ void kfree_skbmem(struct sk_buff *skb)
 {
 	struct sk_buff *other;
 	atomic_t *fclone_ref;
+	void (*free_skb)(struct kmem_cache *, void *);
 
 	skb_release_data(skb);
+
+	free_skb = skb->memalloc ? free_skb_pages : kmem_cache_free;
+
 	switch (skb->fclone) {
 	case SKB_FCLONE_UNAVAILABLE:
-		kmem_cache_free(skbuff_head_cache, skb);
+		free_skb(skbuff_head_cache, skb);
 		break;
 
 	case SKB_FCLONE_ORIG:
 		fclone_ref = (atomic_t *) (skb + 2);
 		if (atomic_dec_and_test(fclone_ref))
-			kmem_cache_free(skbuff_fclone_cache, skb);
+			free_skb(skbuff_fclone_cache, skb);
 		break;
 
 	case SKB_FCLONE_CLONE:
@@ -347,7 +436,7 @@ void kfree_skbmem(struct sk_buff *skb)
 		skb->fclone = SKB_FCLONE_UNAVAILABLE;
 
 		if (atomic_dec_and_test(fclone_ref))
-			kmem_cache_free(skbuff_fclone_cache, other);
+			free_skb(skbuff_fclone_cache, other);
 		break;
 	};
 }
@@ -363,6 +452,8 @@ void kfree_skbmem(struct sk_buff *skb)
 
 void __kfree_skb(struct sk_buff *skb)
 {
+	struct net_device *dev = skb->dev;
+
 	dst_release(skb->dst);
 #ifdef CONFIG_XFRM
 	secpath_put(skb->sp);
@@ -389,6 +480,8 @@ void __kfree_skb(struct sk_buff *skb)
 #endif
 
 	kfree_skbmem(skb);
+	if (dev && (dev->flags & IFF_MEMALLOC))
+		dev_unreserve_skb(dev);
 }
 
 /**
@@ -434,10 +527,15 @@ struct sk_buff *skb_clone(struct sk_buff
 		n->fclone = SKB_FCLONE_CLONE;
 		atomic_inc(fclone_ref);
 	} else {
+		/*
+		 * should we special-case skb->memalloc cloning?
+		 * for now fudge it by forcing fast-clone alloc.
+		 */
 		n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
 		if (!n)
 			return NULL;
 		n->fclone = SKB_FCLONE_UNAVAILABLE;
+		n->memalloc = 0;
 	}
 
 #define C(x) n->x = skb->x
@@ -686,6 +784,8 @@ int pskb_expand_head(struct sk_buff *skb
 	if (skb_shared(skb))
 		BUG();
 
+	BUG_ON(skb->memalloc);
+
 	size = SKB_DATA_ALIGN(size);
 
 	data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
Index: linux-2.6/net/ethernet/eth.c
===================================================================
--- linux-2.6.orig/net/ethernet/eth.c
+++ linux-2.6/net/ethernet/eth.c
@@ -275,6 +275,7 @@ void ether_setup(struct net_device *dev)
 	dev->mtu		= ETH_DATA_LEN;
 	dev->addr_len		= ETH_ALEN;
 	dev->tx_queue_len	= 1000;	/* Ethernet wants good queues */	
+	dev->rx_reserve		= 384;
 	dev->flags		= IFF_BROADCAST|IFF_MULTICAST;
 	
 	memset(dev->broadcast,0xFF, ETH_ALEN);
Index: linux-2.6/net/ipv4/icmp.c
===================================================================
--- linux-2.6.orig/net/ipv4/icmp.c
+++ linux-2.6/net/ipv4/icmp.c
@@ -938,6 +938,11 @@ int icmp_rcv(struct sk_buff *skb)
 			goto error;
 	}
 
+	if (unlikely(dev_reserve_used(skb->dev))) {
+		dev_unreserve_skb(skb->dev);
+		goto drop;
+	}
+
 	if (!pskb_pull(skb, sizeof(struct icmphdr)))
 		goto error;
 
Index: linux-2.6/net/ipv4/tcp_ipv4.c
===================================================================
--- linux-2.6.orig/net/ipv4/tcp_ipv4.c
+++ linux-2.6/net/ipv4/tcp_ipv4.c
@@ -1093,6 +1093,12 @@ int tcp_v4_rcv(struct sk_buff *skb)
 	if (!sk)
 		goto no_tcp_socket;
 
+	if (unlikely(dev_reserve_used(skb->dev))) {
+		dev_unreserve_skb(skb->dev);
+		if (!sk_is_memalloc(sk))
+			goto discard_and_relse;
+	}
+
 process:
 	if (sk->sk_state == TCP_TIME_WAIT)
 		goto do_time_wait;
Index: linux-2.6/net/ipv4/udp.c
===================================================================
--- linux-2.6.orig/net/ipv4/udp.c
+++ linux-2.6/net/ipv4/udp.c
@@ -1136,7 +1136,15 @@ int udp_rcv(struct sk_buff *skb)
 	sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, skb->dev->ifindex);
 
 	if (sk != NULL) {
-		int ret = udp_queue_rcv_skb(sk, skb);
+		int ret;
+
+		if (unlikely(dev_reserve_used(skb->dev))) {
+			dev_unreserve_skb(skb->dev);
+			if (!sk_is_memalloc(sk))
+				goto drop_noncritical;
+		}
+
+		ret = udp_queue_rcv_skb(sk, skb);
 		sock_put(sk);
 
 		/* a return value > 0 means to resubmit the input, but
@@ -1147,6 +1155,7 @@ int udp_rcv(struct sk_buff *skb)
 		return 0;
 	}
 
+drop_noncritical:
 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
 		goto drop;
 	nf_reset(skb);
Index: linux-2.6/include/linux/if.h
===================================================================
--- linux-2.6.orig/include/linux/if.h
+++ linux-2.6/include/linux/if.h
@@ -49,6 +49,7 @@
 
 #define IFF_LOWER_UP	0x10000		/* driver signals L1 up		*/
 #define IFF_DORMANT	0x20000		/* driver signals dormant	*/
+#define IFF_MEMALLOC	0x40000		/* driver has a reserve allocated */
 
 #define IFF_VOLATILE	(IFF_LOOPBACK|IFF_POINTOPOINT|IFF_BROADCAST|\
 		IFF_MASTER|IFF_SLAVE|IFF_RUNNING|IFF_LOWER_UP|IFF_DORMANT)
Index: linux-2.6/net/core/dev.c
===================================================================
--- linux-2.6.orig/net/core/dev.c
+++ linux-2.6/net/core/dev.c
@@ -2900,6 +2900,7 @@ int register_netdevice(struct net_device
 #ifdef CONFIG_NET_CLS_ACT
 	spin_lock_init(&dev->ingress_lock);
 #endif
+	spin_lock_init(&dev->memalloc_lock);
 
 	ret = alloc_divert_blk(dev);
 	if (ret)
Index: linux-2.6/net/ipv4/af_inet.c
===================================================================
--- linux-2.6.orig/net/ipv4/af_inet.c
+++ linux-2.6/net/ipv4/af_inet.c
@@ -131,6 +131,20 @@ static DEFINE_SPINLOCK(inetsw_lock);
 void inet_sock_destruct(struct sock *sk)
 {
 	struct inet_sock *inet = inet_sk(sk);
+	struct net_device *dev = ip_dev_find(inet->rcv_saddr);
+
+	if (dev && (dev->flags & IFF_MEMALLOC) &&
+			sk_is_memalloc(sk) &&
+			(atomic_read(&dev->memalloc_socks) == 1)) {
+		spin_lock(&dev->memalloc_lock);
+		if (atomic_dec_and_test(&dev->memalloc_socks)) {
+			dev->flags &= ~IFF_MEMALLOC;
+			WARN_ON(dev_reserve_used(dev));
+			atomic_set(&dev->rx_reserve_used, 0);
+			adjust_memalloc_reserve(-dev->memalloc_reserve);
+		}
+		spin_unlock(&dev->memalloc_lock);
+	}
 
 	__skb_queue_purge(&sk->sk_receive_queue);
 	__skb_queue_purge(&sk->sk_error_queue);
Index: linux-2.6/net/core/sock.c
===================================================================
--- linux-2.6.orig/net/core/sock.c
+++ linux-2.6/net/core/sock.c
@@ -111,6 +111,7 @@
 #include <linux/poll.h>
 #include <linux/tcp.h>
 #include <linux/init.h>
+#include <linux/inetdevice.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
@@ -195,6 +196,59 @@ __u32 sysctl_rmem_default = SK_RMEM_MAX;
 /* Maximal space eaten by iovec or ancilliary data plus some space */
 int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512);
 
+#define ceiling_log2(x) fls((x) - 1)
+
+static inline unsigned int skb_pages(unsigned int mtu)
+{
+	unsigned int pages = (mtu + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	unsigned int order = ceiling_log2(pages);
+	pages = 1 << order;
+	if (pages > 1) ++pages;
+
+	return pages;
+}
+
+int sk_set_memalloc(struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct net_device *dev = ip_dev_find(inet->rcv_saddr);
+	int err = 0;
+
+	if (!dev)
+		return -ENODEV;
+
+	if (!(dev->features & NETIF_F_MEMALLOC)) {
+		err = -EPERM;
+		goto out;
+	}
+
+	if (atomic_read(&dev->memalloc_socks) == 0) {
+		spin_lock(&dev->memalloc_lock);
+		if (atomic_read(&dev->memalloc_socks) == 0) {
+			dev->memalloc_reserve =
+				dev->rx_reserve * skb_pages(dev->mtu);
+			err = adjust_memalloc_reserve(dev->memalloc_reserve);
+			if (err) {
+				spin_unlock(&dev->memalloc_lock);
+				printk(KERN_WARNING
+			"%s: Unable to allocate RX reserve, error: %d\n",
+					dev->name, err);
+				goto out;
+			}
+			sock_set_flag(sk, SOCK_MEMALLOC);
+			dev->flags |= IFF_MEMALLOC;
+		}
+		atomic_inc(&dev->memalloc_socks);
+		spin_unlock(&dev->memalloc_lock);
+	} else
+		atomic_inc(&dev->memalloc_socks);
+
+out:
+	dev_put(dev);
+	return err;
+}
+EXPORT_SYMBOL_GPL(sk_set_memalloc);
+
 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 {
 	struct timeval tv;
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[Index of Archives]     [Kernel Newbies]     [Netfilter]     [Bugtraq]     [Photo]     [Stuff]     [Gimp]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Video 4 Linux]     [Linux for the blind]     [Linux Resources]
  Powered by Linux