[RFC][PATCH 5/5] I/OAT DMA support and TCP acceleration

TCP recv offload to I/OAT

Locks down the user iovec pages and sets up the DMA copy in tcp_recvmsg(), then
calls dma_async_try_early_copy() from tcp_v4_do_rcv() to offload the receive
copy to the I/OAT engine.
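
The flow, in rough outline (a condensed, non-compilable sketch of the code
added by the hunks below; urgent data, error paths and the prequeue
interaction are elided):

	/* tcp_recvmsg(), process context: reserve the per-CPU channel and
	 * pin the destination iovec so the engine can write user memory. */
	tp->ucopy.dma_chan = __get_cpu_var(net_dma);
	if (dma_lock_iovec_pages(msg->msg_iov, len, &tp->ucopy.locked_list))
		tp->ucopy.dma_chan = NULL;	/* fall back to CPU copy */

	/* tcp_v4_do_rcv() -> dma_async_try_early_copy(), softirq context:
	 * if the skb matches the fast-path prediction, queue the payload
	 * copy on the channel and mark the skb so recvmsg skips it. */
	dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan, skb,
			tp->tcp_header_len, tp->ucopy.iov, chunk,
			tp->ucopy.locked_list);
	skb->copied_early = 1;

	/* back in tcp_recvmsg(), before returning to the user: flush the
	 * channel, wait for the last cookie, then free the early-copied
	 * skbs parked on async_wait_queue and unpin the pages. */
	dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
	while (dma_async_memcpy_complete(tp->ucopy.dma_chan,
			tp->ucopy.dma_cookie, &done, &used) == DMA_IN_PROGRESS)
		;	/* reap skbs whose cookies have completed */
	__skb_queue_purge(&tp->async_wait_queue);
	dma_unlock_iovec_pages(tp->ucopy.locked_list);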

---
 tcp.c       |  166 +++++++++++++++++++++++++++++++++++++++++++++---------------
 tcp_input.c |   63 +++++++++++++++++++++-
 tcp_ipv4.c  |   16 +++++
 3 files changed, 202 insertions(+), 43 deletions(-)

diff -rup a/net/ipv4/tcp.c b/net/ipv4/tcp.c
--- a/net/ipv4/tcp.c	2005-12-21 12:10:49.000000000 -0800
+++ b/net/ipv4/tcp.c	2005-12-21 12:14:42.000000000 -0800
@@ -257,6 +257,7 @@
 #include <linux/fs.h>
 #include <linux/random.h>
 #include <linux/bootmem.h>
+#include <linux/dmaengine.h>
 
 #include <net/icmp.h>
 #include <net/tcp.h>
@@ -941,7 +942,7 @@ void tcp_cleanup_rbuf(struct sock *sk, i
 	struct tcp_sock *tp = tcp_sk(sk);
 	int time_to_ack = 0;
 
-#if TCP_DEBUG
+#if TCP_DEBUG && !defined(CONFIG_NET_DMA)
 	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
 
 	BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
@@ -1132,6 +1133,21 @@ int tcp_recvmsg(struct kiocb *iocb, stru
 
 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
 
+#ifdef CONFIG_NET_DMA
+	if ((len > 1024) && !(flags & MSG_PEEK) && !sysctl_tcp_low_latency) {
+		preempt_disable();
+		tp->ucopy.dma_chan = __get_cpu_var(net_dma);
+		preempt_enable();
+	} else {
+		tp->ucopy.dma_chan = NULL;
+	}
+
+	if (tp->ucopy.dma_chan)
+		if (dma_lock_iovec_pages(msg->msg_iov, len,
+		                         &tp->ucopy.locked_list))
+			tp->ucopy.dma_chan = NULL;
+#endif
+
 	do {
 		struct sk_buff *skb;
 		u32 offset;
@@ -1161,7 +1177,12 @@ int tcp_recvmsg(struct kiocb *iocb, stru
 				       "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
 				break;
 			}
-			offset = *seq - TCP_SKB_CB(skb)->seq;
+
+			if (!skb->copied_early)
+				offset = *seq - TCP_SKB_CB(skb)->seq;
+			else
+				offset = 0;
+
 			if (skb->h.th->syn)
 				offset--;
 			if (offset < skb->len)
@@ -1273,6 +1294,8 @@ int tcp_recvmsg(struct kiocb *iocb, stru
 		} else
 			sk_wait_data(sk, &timeo);
 
+		tp->ucopy.wakeup = 0;
+
 		if (user_recv) {
 			int chunk;
 
@@ -1304,45 +1327,69 @@ do_prequeue:
 		}
 		continue;
 
-	found_ok_skb:
+found_ok_skb:
 		/* Ok so how much can we use? */
-		used = skb->len - offset;
-		if (len < used)
-			used = len;
-
-		/* Do we have urgent data here? */
-		if (tp->urg_data) {
-			u32 urg_offset = tp->urg_seq - *seq;
-			if (urg_offset < used) {
-				if (!urg_offset) {
-					if (!sock_flag(sk, SOCK_URGINLINE)) {
-						++*seq;
-						offset++;
-						used--;
-						if (!used)
-							goto skip_copy;
-					}
-				} else
-					used = urg_offset;
+		if (!skb->copied_early) {
+			used = skb->len - offset;
+			if (len < used)
+				used = len;
+
+			/* Do we have urgent data here? */
+			if (tp->urg_data) {
+				u32 urg_offset = tp->urg_seq - *seq;
+				if (urg_offset < used) {
+					if (!urg_offset) {
+						if (!sock_flag(sk, SOCK_URGINLINE)) {
+							++*seq;
+							offset++;
+							used--;
+							if (!used)
+								goto skip_copy;
+						}
+					} else
+						used = urg_offset;
+				}
 			}
-		}
 
-		if (!(flags & MSG_TRUNC)) {
-			err = skb_copy_datagram_iovec(skb, offset,
-						      msg->msg_iov, used);
-			if (err) {
-				/* Exception. Bailout! */
-				if (!copied)
-					copied = -EFAULT;
-				break;
+			if (!(flags & MSG_TRUNC)) {
+				if (tp->ucopy.dma_chan) {
+					tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec(
+						tp->ucopy.dma_chan, skb, offset,
+						msg->msg_iov, used,
+						tp->ucopy.locked_list);
+
+					if (tp->ucopy.dma_cookie < 0) {
+
+						printk(KERN_ALERT "dma_cookie < 0\n");
+
+						/* Exception. Bailout! */
+						if (!copied)
+							copied = -EFAULT;
+						break;
+					}
+					if ((offset + used) == skb->len)
+						skb->copied_early = 1;
+
+				} else {
+					err = skb_copy_datagram_iovec(skb, offset,
+							msg->msg_iov, used);
+					if (err) {
+						/* Exception. Bailout! */
+						if (!copied)
+							copied = -EFAULT;
+						break;
+					}
+				}
 			}
-		}
 
-		*seq += used;
-		copied += used;
-		len -= used;
+			*seq += used;
+			copied += used;
+			len -= used;
 
-		tcp_rcv_space_adjust(sk);
+			tcp_rcv_space_adjust(sk);
+		} else {
+			used = skb->len;
+		}
 
 skip_copy:
 		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
@@ -1354,15 +1401,27 @@ skip_copy:
 
 		if (skb->h.th->fin)
 			goto found_fin_ok;
-		if (!(flags & MSG_PEEK))
-			sk_eat_skb(sk, skb);
+		if (!(flags & MSG_PEEK)) {
+			if (!skb->copied_early)
+				sk_eat_skb(sk, skb);
+			else {
+				__skb_unlink(skb, &sk->sk_receive_queue);
+				__skb_queue_tail(&tp->async_wait_queue, skb);
+			}
+		}
 		continue;
 
 	found_fin_ok:
 		/* Process the FIN. */
 		++*seq;
-		if (!(flags & MSG_PEEK))
-			sk_eat_skb(sk, skb);
+		if (!(flags & MSG_PEEK)) {
+			if (!skb->copied_early)
+				sk_eat_skb(sk, skb);
+			else {
+				__skb_unlink(skb, &sk->sk_receive_queue);
+				__skb_queue_tail(&tp->async_wait_queue, skb);
+			}
+		}
 		break;
 	} while (len > 0);
 
@@ -1383,8 +1442,37 @@ skip_copy:
 
 		tp->ucopy.task = NULL;
 		tp->ucopy.len = 0;
+		tp->ucopy.bytes_early_copied = 0;
 	}
 
+
+#ifdef CONFIG_NET_DMA
+	if (tp->ucopy.dma_chan) {
+		struct sk_buff *skb;
+		dma_cookie_t done, used;
+
+		dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+
+		while (dma_async_memcpy_complete(tp->ucopy.dma_chan,
+		                                 tp->ucopy.dma_cookie, &done,
+		                                 &used) == DMA_IN_PROGRESS) {
+			/* do partial cleanup of async_wait_queue */
+			while ((skb = skb_peek(&tp->async_wait_queue)) &&
+			       (dma_async_is_complete(TCP_SKB_CB(skb)->dma_cookie,
+			                       done, used) == DMA_SUCCESS)) {
+				__skb_dequeue(&tp->async_wait_queue);
+				kfree_skb(skb);
+			}
+		}
+
+		/* Safe to free early-copied skbs now */
+		__skb_queue_purge(&tp->async_wait_queue);
+		dma_unlock_iovec_pages(tp->ucopy.locked_list);
+		tp->ucopy.dma_chan = NULL;
+		tp->ucopy.locked_list = NULL;
+	}
+#endif
+
 	/* According to UNIX98, msg_name/msg_namelen are ignored
 	 * on connected socket. I was just happy when found this 8) --ANK
 	 */
diff -rup a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
--- a/net/ipv4/tcp_input.c	2005-12-21 12:05:00.000000000 -0800
+++ b/net/ipv4/tcp_input.c	2005-12-21 12:14:42.000000000 -0800
@@ -70,6 +70,7 @@
 #include <net/tcp.h>
 #include <net/inet_common.h>
 #include <linux/ipsec.h>
+#include <linux/dmaengine.h>
 #include <asm/unaligned.h>
 
 int sysctl_tcp_timestamps = 1;
@@ -3623,10 +3624,12 @@ tcp_checksum_complete_user(struct sock *
  *	the rest is checked inline. Fast processing is turned on in 
  *	tcp_data_queue when everything is OK.
  */
+
 int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 			struct tcphdr *th, unsigned len)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	int eaten = 0;
 
 	/*
 	 *	Header prediction.
@@ -3715,12 +3718,11 @@ int tcp_rcv_established(struct sock *sk,
 				goto discard;
 			}
 		} else {
-			int eaten = 0;
-
 			if (tp->ucopy.task == current &&
 			    tp->copied_seq == tp->rcv_nxt &&
 			    len - tcp_header_len <= tp->ucopy.len &&
-			    sock_owned_by_user(sk)) {
+			    sock_owned_by_user(sk) &&
+			    !skb->copied_early) {
 				__set_current_state(TASK_RUNNING);
 
 				if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) {
@@ -3767,6 +3769,9 @@ int tcp_rcv_established(struct sock *sk,
 				__skb_queue_tail(&sk->sk_receive_queue, skb);
 				sk_stream_set_owner_r(skb, sk);
 				tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+
+				if (skb->copied_early)
+					tcp_cleanup_rbuf(sk, skb->len);
 			}
 
 			tcp_event_data_recv(sk, tp, skb);
@@ -3781,6 +3786,9 @@ int tcp_rcv_established(struct sock *sk,
 
 			__tcp_ack_snd_check(sk, 0);
 no_ack:
+			if (skb->copied_early)
+				return 0;
+
 			if (eaten)
 				__kfree_skb(skb);
 			else
@@ -3864,6 +3872,55 @@ discard:
 	return 0;
 }
 
+#ifdef CONFIG_NET_DMA
+void dma_async_try_early_copy(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int dma_cookie;
+	int chunk = skb->len - tp->tcp_header_len;
+	struct tcphdr *th = skb->h.th;
+
+	if ((tp->ucopy.task == NULL) || (tp->ucopy.dma_chan == NULL) || tp->ucopy.wakeup)
+		return;
+
+	if ((tcp_flag_word(th) & TCP_HP_BITS) != tp->pred_flags ||
+		TCP_SKB_CB(skb)->seq != tp->rcv_nxt)
+		return;
+
+	if (tp->ucopy.dma_chan &&
+	    chunk > 0 &&
+	    chunk <= (tp->ucopy.len) &&
+	    tp->copied_seq == tp->rcv_nxt &&
+	    skb->ip_summed == CHECKSUM_UNNECESSARY) {
+
+		dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan,
+			skb, tp->tcp_header_len,
+			tp->ucopy.iov, chunk, tp->ucopy.locked_list);
+
+		if (dma_cookie < 0)
+			return;
+
+		tp->ucopy.dma_cookie = dma_cookie;
+		skb->copied_early = 1;
+
+		tp->ucopy.bytes_early_copied += chunk;
+		tp->copied_seq += chunk;
+		tp->ucopy.len -= chunk;
+		tcp_rcv_space_adjust(sk);
+
+		if (tp->ucopy.len == 0) {
+			tp->ucopy.wakeup = 1;
+			wake_up_interruptible(sk->sk_sleep);
+		}
+	} else if (chunk > 0) {
+		tp->ucopy.wakeup = 1;
+		wake_up_interruptible(sk->sk_sleep);
+	}
+}
+
+EXPORT_SYMBOL(dma_async_try_early_copy);
+#endif /* CONFIG_NET_DMA */
+
 static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 					 struct tcphdr *th, unsigned len)
 {
diff -rup a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
--- a/net/ipv4/tcp_ipv4.c	2005-12-21 12:10:49.000000000 -0800
+++ b/net/ipv4/tcp_ipv4.c	2005-12-21 12:14:42.000000000 -0800
@@ -1148,6 +1148,11 @@ int tcp_v4_do_rcv(struct sock *sk, struc
 {
 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
 		TCP_CHECK_TIMER(sk);
+
+#ifdef CONFIG_NET_DMA
+		dma_async_try_early_copy(sk, skb);
+#endif
+
 		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
 			goto reset;
 		TCP_CHECK_TIMER(sk);
@@ -1256,10 +1261,19 @@ process:
 	bh_lock_sock(sk);
 	ret = 0;
 	if (!sock_owned_by_user(sk)) {
-		if (!tcp_prequeue(sk, skb))
+#ifdef CONFIG_NET_DMA
+		struct tcp_sock *tp = tcp_sk(sk);
+		if (tp->ucopy.dma_chan)
 			ret = tcp_v4_do_rcv(sk, skb);
+		else
+#endif
+		{
+			if (!tcp_prequeue(sk, skb))
+				ret = tcp_v4_do_rcv(sk, skb);
+		}
 	} else
 		sk_add_backlog(sk, skb);
+
 	bh_unlock_sock(sk);
 
 	sock_put(sk);
