[RFC][PATCH 5/7] VPIDs: vpid/pid conversion in VPID enabled case

This is the main patch: it contains all the vpid-to-pid conversions
and the auxiliary infrastructure. Virtual pids are distinguished from
real ones by having the VPID_BIT bit set. Conversion between vpid and
pid is performed in one of two ways: a fast path, when a vpid and its
corresponding pid differ only in the VPID_BIT bit (the "linear" case),
and a more involved path, when a pid may correspond to an arbitrary
vpid (the "sparse" case); in the latter case a hash-table based
mapping is used.
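
For illustration only (not part of the patch), the two conversion
paths boil down to roughly the sketch below; the linear_* helpers are
made-up names, init_task_pid stands for the global pid of the VPS
init task, and the sparse branch is the vpid_hash lookup introduced
by the patch below:

#define VPID_BIT	10
#define VPID_DIV	(1 << VPID_BIT)

/* "linear" case: a vpid and its pid differ only in VPID_BIT;
 * vpid 1 is special-cased to the VPS init task's global pid. */
static int linear_vpid_to_pid(int vpid, int init_task_pid)
{
	return (vpid == 1) ? init_task_pid : vpid - VPID_DIV;
}

static int linear_pid_to_vpid(int pid)
{
	return pid + VPID_DIV;	/* real pids never have VPID_BIT set */
}

/* "sparse" case: any vpid may correspond to any pid, so the patch
 * keeps (vpid, vpsid) -> pid entries in a hash table:
 *
 *	map = __lookup_vpid_mapping(vpid, vps->id);
 *	return map ? map->pid : -1;
 */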

Note that this patch assumes a public vps_info_t pointer type
representing a VPS (without such a type the patch is useless), so
that it can be used with any virtualization solution. The
virtualization solution should provide the following "interface"
(a hypothetical sketch follows the list):
1. vps_info_t has a member int id;
2. vps_info_t has a member struct task_struct *init_task;
3. the following macros/functions are defined:
a. inside_vps() - returns true if the current task is
inside a VPS;
b. task_inside_vps(task_t *) - returns true if the task
belongs to a VPS;
c. current_vps() - returns the vps_info_t of the current VPS;
d. task_vps(task_t *) - returns the vps_info_t the task
belongs to;
e. set_sparse_vpid(vps_info_t) - switches the VPS into the
state where "sparse" conversion is used;
f. sparse_vpid(vps_info_t) - returns true if the VPS is in
the "sparse" state and false if it is in the "linear" one;
g. get_vps_tasks_num(vps_info_t) - returns the number of
tasks that belong to the VPS.
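
As a purely hypothetical illustration of such a provider, the
interface might be implemented along the lines of the sketch below;
the task_struct ->vps pointer, the flags word, the VPS_SPARSE_VPID
flag and the nr_tasks counter are all assumptions invented for the
example, not part of this patch set:

typedef struct vps_info {
	int			id;
	struct task_struct	*init_task;
	unsigned long		flags;		/* assumed */
	atomic_t		nr_tasks;	/* assumed */
} *vps_info_t;

#define VPS_SPARSE_VPID		0x1		/* assumed flag */

/* assumes a (hypothetical) ->vps pointer in task_struct */
#define task_vps(tsk)		((tsk)->vps)
#define current_vps()		task_vps(current)
#define inside_vps()		(current_vps() != NULL)
#define task_inside_vps(tsk)	(task_vps(tsk) != NULL)
#define sparse_vpid(vps)	((vps)->flags & VPS_SPARSE_VPID)
#define set_sparse_vpid(vps)	((vps)->flags |= VPS_SPARSE_VPID)
#define get_vps_tasks_num(vps)	atomic_read(&(vps)->nr_tasks)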

Kirill
--- ./include/linux/pid.h.vpid_virt	2006-02-02 14:32:41.162807472 +0300
+++ ./include/linux/pid.h	2006-02-02 14:33:17.963212960 +0300
@@ -10,11 +10,17 @@ enum pid_type
 	PIDTYPE_MAX
 };
 
+#define VPID_BIT	10
+#define VPID_DIV	(1 << VPID_BIT)
+
 struct pid
 {
 	/* Try to keep pid_chain in the same cacheline as nr for find_pid */
 	int nr;
 	struct hlist_node pid_chain;
+#ifdef CONFIG_VIRTUAL_PIDS
+	int vnr;
+#endif
 	/* list of pids with the same nr, only one of them is in the hash */
 	struct list_head pid_list;
 };
@@ -30,6 +36,52 @@ struct pid
 #define comb_pid_to_vpid(pid)		(pid)
 #define alloc_vpid(pid, vpid)		(pid)
 #define free_vpid(vpid)			do { } while (0)
+#else /* CONFIG_VIRTUAL_PIDS */
+#define __is_virtual_pid(pid)		((pid) & VPID_DIV)
+#define is_virtual_pid(pid)		(__is_virtual_pid(pid) || \
+		(((pid) == 1) && inside_vps()))
+
+extern int vpid_to_pid(int pid);
+extern int __vpid_to_pid(int pid);
+extern pid_t pid_type_to_vpid(int type, pid_t pid);
+extern pid_t __pid_type_to_vpid(int type, pid_t pid);
+
+static inline int comb_vpid_to_pid(int vpid)
+{
+	int pid = vpid;
+
+	if (vpid > 0) {
+		pid = vpid_to_pid(vpid);
+		if (unlikely(pid < 0))
+			return 0;
+	} else if (vpid < 0) {
+		pid = vpid_to_pid(-vpid);
+		if (unlikely(pid < 0))
+			return 0;
+		pid = -pid;
+	}
+	return pid;
+}
+
+static inline int comb_pid_to_vpid(int pid)
+{
+	int vpid = pid;
+
+	if (pid > 0) {
+		vpid = pid_type_to_vpid(PIDTYPE_PID, pid);
+		if (unlikely(vpid < 0))
+			return 0;
+	} else if (pid < 0) {
+		vpid = pid_type_to_vpid(PIDTYPE_PGID, -pid);
+		if (unlikely(vpid < 0))
+			return 0;
+		vpid = -vpid;
+	}
+	return vpid;
+}
+
+extern int alloc_vpid(int pid, int vpid);
+extern void free_vpid(int vpid);
 #endif
 
 #define pid_task(elem, type) \
--- ./include/linux/sched.h.vpid_virt	2006-02-02 14:32:41.164807168 +0300
+++ ./include/linux/sched.h	2006-02-02 14:58:53.129832160 +0300
@@ -1327,6 +1327,80 @@ static inline pid_t get_task_ppid(struct
 		return 0;
 	return (p->pid > 1 ? p->group_leader->real_parent->tgid : 0);
 }
+#else
+static inline pid_t virt_pid(struct task_struct *tsk)
+{
+	return tsk->pids[PIDTYPE_PID].vnr;
+}
+
+static inline pid_t virt_tgid(struct task_struct *tsk)
+{
+	return tsk->pids[PIDTYPE_TGID].vnr;
+}
+
+static inline pid_t virt_pgid(struct task_struct *tsk)
+{
+	return tsk->pids[PIDTYPE_PGID].vnr;
+}
+
+static inline pid_t virt_sid(struct task_struct *tsk)
+{
+	return tsk->pids[PIDTYPE_SID].vnr;
+}
+
+static inline pid_t get_task_pid_ve(struct task_struct *tsk,
+		struct task_struct *ve_tsk)
+{
+	return task_inside_vps(ve_tsk) ? virt_pid(tsk) : tsk->pid;
+}
+
+static inline pid_t get_task_pid(struct task_struct *tsk)
+{
+	return inside_vps() ? virt_pid(tsk) : tsk->pid;
+}
+
+static inline pid_t get_task_tgid(struct task_struct *tsk)
+{
+	return inside_vps() ? virt_tgid(tsk) : tsk->pid;
+}
+
+static inline pid_t get_task_pgid(struct task_struct *tsk)
+{
+	return inside_vps() ? virt_pgid(tsk) : tsk->signal->pgrp;
+}
+
+static inline pid_t get_task_sid(struct task_struct *tsk)
+{
+	return inside_vps() ? virt_sid(tsk) : tsk->signal->session;
+}
+
+static inline int set_virt_pid(struct task_struct *tsk, pid_t pid)
+{
+	tsk->pids[PIDTYPE_PID].vnr = pid;
+	return pid;
+}
+
+static inline void set_virt_tgid(struct task_struct *tsk, pid_t pid)
+{
+	tsk->pids[PIDTYPE_TGID].vnr = pid;
+}
+
+static inline void set_virt_pgid(struct task_struct *tsk, pid_t pid)
+{
+	tsk->pids[PIDTYPE_PGID].vnr = pid;
+}
+
+static inline void set_virt_sid(struct task_struct *tsk, pid_t pid)
+{
+	tsk->pids[PIDTYPE_SID].vnr = pid;
+}
+
+static inline pid_t get_task_ppid(struct task_struct *p)
+{
+	if (!pid_alive(p))
+		return 0;
+	return (get_task_pid(p) > 1) ? get_task_pid(p->real_parent) : 0;
+}
 #endif
 
 /* set thread flags in other task's structures
--- ./kernel/pid.c.vpid_virt	2006-02-02 14:15:35.165782728 +0300
+++ ./kernel/pid.c	2006-02-02 14:58:34.632644160 +0300
@@ -27,6 +27,14 @@
 #include <linux/bootmem.h>
 #include <linux/hash.h>
 
+#ifdef CONFIG_VIRTUAL_PIDS
+static void __free_vpid(int vpid, struct task_struct *ve_tsk);
+#define PIDMAP_NRFREE	(BITS_PER_PAGE / 2)
+#else
+#define __free_vpid(vpid, tsk)	do { } while (0)
+#define PIDMAP_NRFREE	BITS_PER_PAGE
+#endif
+
 #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift)
 static struct hlist_head *pid_hash[PIDTYPE_MAX];
 static int pidhash_shift;
@@ -58,7 +66,7 @@ typedef struct pidmap {
 } pidmap_t;
 
 static pidmap_t pidmap_array[PIDMAP_ENTRIES] =
-	 { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } };
+	 { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(PIDMAP_NRFREE), NULL } };
 
 static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
 
@@ -67,6 +75,7 @@ fastcall void free_pidmap(int pid)
 	pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE;
 	int offset = pid & BITS_PER_PAGE_MASK;
 
+	BUG_ON(__is_virtual_pid(pid) || pid == 1);
 	clear_bit(offset, map->page);
 	atomic_inc(&map->nr_free);
 }
@@ -77,6 +86,8 @@ int alloc_pidmap(void)
 	pidmap_t *map;
 
 	pid = last + 1;
+	if (__is_virtual_pid(pid))
+		pid += VPID_DIV;
 	if (pid >= pid_max)
 		pid = RESERVED_PIDS;
 	offset = pid & BITS_PER_PAGE_MASK;
@@ -107,6 +118,8 @@ int alloc_pidmap(void)
 				}
 				offset = find_next_offset(map, offset);
 				pid = mk_pid(map, offset);
+				if (__is_virtual_pid(pid))
+					pid += VPID_DIV;
 			/*
 			 * find_next_offset() found a bit, the pid from it
 			 * is in-bounds, and if we fell back to the last
@@ -127,6 +140,8 @@ int alloc_pidmap(void)
 				break;
 		}
 		pid = mk_pid(map, offset);
+		if (__is_virtual_pid(pid))
+			pid += VPID_DIV;
 	}
 	return -1;
 }
@@ -201,6 +216,7 @@ void fastcall detach_pid(task_t *task, e
 		if (tmp != type && find_pid(tmp, nr))
 			return;
 
+	__free_vpid(task->pids[type].vnr, task);
 	free_pidmap(nr);
 }
 
@@ -234,6 +250,9 @@ void switch_exec_pids(task_t *leader, ta
 
 	leader->pid = leader->tgid = thread->pid;
 	thread->pid = thread->tgid;
+	set_virt_tgid(leader, virt_pid(thread));
+	set_virt_pid(leader, virt_pid(thread));
+	set_virt_pid(thread, virt_tgid(thread));
 
 	attach_pid(thread, PIDTYPE_PID, thread->pid);
 	attach_pid(thread, PIDTYPE_TGID, thread->tgid);
@@ -247,6 +266,344 @@ void switch_exec_pids(task_t *leader, ta
 	attach_pid(leader, PIDTYPE_SID, leader->signal->session);
 }
 
+#ifdef CONFIG_VIRTUAL_PIDS
+/* Virtual PID bits.
+ *
+ * At the moment all internal kernel structures store the real global pid.
+ * The only place where virtual PIDs are used is the user frontend. We
+ * remap virtual pids obtained from user space to global ones (vpid_to_pid)
+ * and map globals to virtuals before showing them to users (pid_type_to_vpid).
+ *
+ * We hold virtual PIDs inside struct pid, so the global -> virtual map is easy.
+ */
+
+pid_t __pid_type_to_vpid(int type, pid_t pid)
+{
+	struct pid *p;
+
+	if (unlikely(is_virtual_pid(pid)))
+		return -1;
+
+	read_lock(&tasklist_lock);
+	p = find_pid(type, pid);
+	if (p) {
+		pid = p->vnr;
+	} else {
+		pid = -1;
+	}
+	read_unlock(&tasklist_lock);
+	return pid;
+}
+
+pid_t pid_type_to_vpid(int type, pid_t pid)
+{
+	int vpid;
+
+	if (unlikely(pid <= 0))
+		return pid;
+
+	BUG_ON(is_virtual_pid(pid));
+
+	if (!inside_vps())
+		return pid;
+
+	vpid = __pid_type_to_vpid(type, pid);
+	if (unlikely(vpid == -1)) {
+		/* This is allowed: a global pid can be used everywhere.
+		 * It can happen when the kernel remembers stray pids:
+		 * signal queues, locks, etc.
+		 */
+		vpid = pid;
+	}
+	return vpid;
+}
+
+/* To map virtual pids to global ones we maintain a special hash table.
+ *
+ * Mapping entries are allocated when a process with a non-trivial
+ * mapping is forked, which is possible only after a VE has migrated.
+ * Mappings are destroyed when a global pid is removed from the global
+ * pidmap, which means we do not need to refcount mappings.
+ */
+
+static struct hlist_head *vpid_hash;
+
+struct vpid_mapping
+{
+	int     pid;
+	int     vpid;
+	int     vpsid;
+	struct hlist_node link;
+};
+
+static kmem_cache_t *vpid_mapping_cachep;
+
+static inline int vpid_hashfn(int vnr, int vpsid)
+{
+	return hash_long((unsigned long)(vnr + (vpsid << 16)), pidhash_shift);
+}
+
+struct vpid_mapping *__lookup_vpid_mapping(int vnr, int vpsid)
+{
+	struct hlist_node *elem;
+	struct vpid_mapping *map;
+
+	hlist_for_each_entry(map, elem,
+			&vpid_hash[vpid_hashfn(vnr, vpsid)], link) {
+		if (map->vpid == vnr && map->vpsid == vpsid)
+			return map;
+	}
+	return NULL;
+}
+
+/* __vpid_to_pid() is the raw version of vpid_to_pid(). It is to be used
+ * only under tasklist_lock. In some places we must use only this version
+ * (e.g. __kill_pg_info() is called under the write lock!)
+ *
+ * The caller should pass a virtual pid. This function returns an error
+ * when it sees a global pid.
+ */
+int __vpid_to_pid(int pid)
+{
+	struct vpid_mapping *map;
+	vps_info_t vps;
+
+	if (unlikely(!is_virtual_pid(pid) || !inside_vps()))
+		return -1;
+
+	vps = current_vps();
+	if (!sparse_vpid(vps)) {
+		if (pid != 1)
+			return pid - VPID_DIV;
+		return vps->init_task->pid;
+	}
+
+	map = __lookup_vpid_mapping(pid, vps->id);
+	if (map)
+		return map->pid;
+	return -1;
+}
+
+int vpid_to_pid(int pid)
+{
+	/* User gave bad pid. It is his problem. */
+	if (unlikely(pid <= 0))
+		return pid;
+
+	if (!is_virtual_pid(pid))
+		return pid;
+
+	read_lock(&tasklist_lock);
+	pid = __vpid_to_pid(pid);
+	read_unlock(&tasklist_lock);
+	return pid;
+}
+
+/*
+ * In the simple case we have a trivial vpid-to-pid conversion rule:
+ * vpid == 1 -> vps->init_task->pid
+ * else         vpid & ~VPID_DIV
+ *
+ * When things get more complex, we need to allocate mappings...
+ */
+
+static int add_mapping(int pid, int vpid, int vpsid, struct hlist_head *cache)
+{
+	if (pid > 0 && vpid > 0 && !__lookup_vpid_mapping(vpid, vpsid)) {
+		struct vpid_mapping *m;
+
+		if (hlist_empty(cache)) {
+			m = kmem_cache_alloc(vpid_mapping_cachep, GFP_ATOMIC);
+			if (unlikely(m == NULL))
+				return -ENOMEM;
+		} else {
+			m = hlist_entry(cache->first, struct vpid_mapping,
+					link);
+			hlist_del(&m->link);
+		}
+		m->pid = pid;
+		m->vpid = vpid;
+		m->vpsid = vpsid;
+		hlist_add_head(&m->link,
+				&vpid_hash[vpid_hashfn(vpid, vpsid)]);
+	}
+	return 0;
+}
+
+static int switch_to_sparse_mapping(int pid, vps_info_t vps)
+{
+	struct hlist_head cache;
+	task_t *g, *t;
+	int pcount;
+	int err;
+
+	/* The transition happens under write_lock_irq, so we try to make
+	 * it more reliable and fast by preallocating mapping entries.
+	 * The task count may not be enough: we could have lots of orphaned
+	 * process groups and sessions, which also require mappings.
+	 */
+	INIT_HLIST_HEAD(&cache);
+	pcount = get_vps_tasks_num(vps);
+	err = -ENOMEM;
+	while (pcount > 0) {
+		struct vpid_mapping *m;
+		m = kmem_cache_alloc(vpid_mapping_cachep, GFP_KERNEL);
+		if (!m)
+			goto out;
+		hlist_add_head(&m->link, &cache);
+		pcount--;
+	}
+
+	write_lock_irq(&tasklist_lock);
+	err = 0;
+	if (sparse_vpid(vps))
+		goto out_sparse;
+
+	err = -ENOMEM;
+	do_each_thread(g, t) {
+		if (t->pid == pid)
+			continue;
+		if (add_mapping(t->pid, virt_pid(t), vps->id, &cache))
+			goto out_unlock;
+	} while_each_thread(g, t);
+
+	for_each_process(t) {
+		if (t->pid == pid)
+			continue;
+
+		if (add_mapping(t->tgid, virt_tgid(t), vps->id,
+					&cache))
+			goto out_unlock;
+		if (add_mapping(t->signal->pgrp, virt_pgid(t), vps->id,
+					&cache))
+			goto out_unlock;
+		if (add_mapping(t->signal->session, virt_sid(t), vps->id,
+					&cache))
+			goto out_unlock;
+	}
+	set_sparse_vpid(vps);
+	err = 0;
+
+out_unlock:
+	if (err) {
+		int i;
+
+		for (i = 0; i < (1 << pidhash_shift); i++) {
+			struct hlist_node *elem, *next;
+			struct vpid_mapping *map;
+
+			hlist_for_each_entry_safe(map, elem, next, &vpid_hash[i], link) {
+				if (map->vpsid == vps->id) {
+					hlist_del(elem);
+					hlist_add_head(elem, &cache);
+				}
+			}
+		}
+	}
+out_sparse:
+	write_unlock_irq(&tasklist_lock);
+
+out:
+	while (!hlist_empty(&cache)) {
+		struct vpid_mapping *m;
+		m = hlist_entry(cache.first, struct vpid_mapping, link);
+		hlist_del(&m->link);
+		kmem_cache_free(vpid_mapping_cachep, m);
+	}
+	return err;
+}
+
+int alloc_vpid(int pid, int virt_pid)
+{
+	int result;
+	struct vpid_mapping *m;
+	vps_info_t vps;
+
+	if (!inside_vps())
+		return pid;
+
+	vps = current_vps();
+	if (!sparse_vpid(vps)) {
+		if (virt_pid == -1)
+			return pid + VPID_DIV;
+
+		if (virt_pid == 1 || virt_pid == pid + VPID_DIV)
+			return virt_pid;
+
+		if ((result = switch_to_sparse_mapping(pid, vps)) < 0)
+			return result;
+	}
+
+	m = kmem_cache_alloc(vpid_mapping_cachep, GFP_KERNEL);
+	if (!m)
+		return -ENOMEM;
+
+	m->pid = pid;
+	m->vpsid = vps->id;
+
+	result = (virt_pid == -1) ? pid + VPID_DIV : virt_pid;
+
+	write_lock_irq(&tasklist_lock);
+	if (unlikely(__lookup_vpid_mapping(result, m->vpsid))) {
+		if (virt_pid > 0) {
+			result = -EEXIST;
+			goto out;
+		}
+
+		/* No luck. Now we search for some non-existing vpid.
+		 * This is a weak spot: we do a linear search. */
+		do {
+			result++;
+			if (!__is_virtual_pid(result))
+				result += VPID_DIV;
+			if (result >= pid_max)
+				result = RESERVED_PIDS + VPID_DIV;
+		} while (__lookup_vpid_mapping(result, m->vpsid) != NULL);
+
+		/* Also set last_pid in the hope that future
+		 * alloc_pidmap() calls will avoid collisions. */
+		last_pid = result - VPID_DIV;
+	}
+	if (result > 0) {
+		m->vpid = result;
+		hlist_add_head(&m->link,
+				&vpid_hash[vpid_hashfn(result, m->vpsid)]);
+	}
+out:
+	write_unlock_irq(&tasklist_lock);
+	if (result < 0)
+		kmem_cache_free(vpid_mapping_cachep, m);
+	return result;
+}
+
+static void __free_vpid(int vpid, struct task_struct *ve_tsk)
+{
+	struct vpid_mapping *m;
+	vps_info_t vps;
+
+	if (!__is_virtual_pid(vpid) && (vpid != 1 || !task_inside_vps(ve_tsk)))
+		return;
+
+	vps = task_vps(ve_tsk);
+	if (!sparse_vpid(vps))
+		return;
+
+	m = __lookup_vpid_mapping(vpid, vps->id);
+	BUG_ON(m == NULL);
+	hlist_del(&m->link);
+	kmem_cache_free(vpid_mapping_cachep, m);
+}
+EXPORT_SYMBOL(alloc_vpid);
+
+void free_vpid(int vpid)
+{
+	write_lock_irq(&tasklist_lock);
+	__free_vpid(vpid, current);
+	write_unlock_irq(&tasklist_lock);
+}
+
+#endif
+
 /*
  * The pid hash table is scaled according to the amount of memory in the
  * machine.  From a minimum of 16 slots up to 4096 slots at one gigabyte or
@@ -273,6 +630,14 @@ void __init pidhash_init(void)
 		for (j = 0; j < pidhash_size; j++)
 			INIT_HLIST_HEAD(&pid_hash[i][j]);
 	}
+
+#ifdef CONFIG_VIRTUAL_PIDS
+	vpid_hash = alloc_bootmem(pidhash_size * sizeof(struct hlist_head));
+	if (!vpid_hash)
+		panic("Could not alloc vpid_hash!\n");
+	for (j = 0; j < pidhash_size; j++)
+		INIT_HLIST_HEAD(&vpid_hash[j]);
+#endif
 }
 
 void __init pidmap_init(void)
@@ -289,4 +654,12 @@ void __init pidmap_init(void)
 
 	for (i = 0; i < PIDTYPE_MAX; i++)
 		attach_pid(current, i, 0);
+
+#ifdef CONFIG_VIRTUAL_PIDS
+	vpid_mapping_cachep =
+		kmem_cache_create("vpid_mapping",
+				sizeof(struct vpid_mapping),
+				__alignof__(struct vpid_mapping),
+				SLAB_PANIC, NULL, NULL);
+#endif
 }
