Re: [ckrm-tech] [RFC][PATCH 2/7] UBC: core (structures, API)

Matt Helsley wrote:

[snip]

+obj-$(CONFIG_USER_RESOURCE) += beancounter.o
--- /dev/null	2006-07-18 14:52:43.075228448 +0400
+++ ./kernel/ub/beancounter.c	2006-08-10 15:09:34.000000000 +0400
@@ -0,0 +1,398 @@
+/*
+ *  kernel/ub/beancounter.c
+ *
+ *  Copyright (C) 2006 OpenVZ. SWsoft Inc
+ *  Original code by (C) 1998      Alan Cox
+ *                       1998-2000 Andrey Savochkin <[email protected]>
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+
+#include <ub/beancounter.h>
+
+static kmem_cache_t *ub_cachep;
+static struct user_beancounter default_beancounter;
+static struct user_beancounter default_subbeancounter;
+
+static void init_beancounter_struct(struct user_beancounter *ub, uid_t id);
+
+struct user_beancounter ub0;
+
+const char *ub_rnames[] = {
+};
+
+#define ub_hash_fun(x) ((((x) >> 8) ^ (x)) & (UB_HASH_SIZE - 1))
+#define ub_subhash_fun(p, id) ub_hash_fun((p)->ub_uid + (id) * 17)
+
+struct hlist_head ub_hash[UB_HASH_SIZE];
+spinlock_t ub_hash_lock;
+
+EXPORT_SYMBOL(ub_hash);
+EXPORT_SYMBOL(ub_hash_lock);
+
+/*
+ *	Per user resource beancounting. Resources are tied to their luid.



You haven't explained what an luid is at this point in the patch series.
Patch 0 says:

diff-ubc-syscalls.patch:
    Patch adds system calls for UB management:
    1. sys_getluid    - get current UB id

But I have no idea what that l is there for. Why not sys_get_ubid() for
instance?

it is a historical name given by Alan Cox (imho) and Andrey Savochkin.
will rename it to sys_getubid, sys_setubid and field
ub_uid will be renamed into ub_id.

+ *	The resource structure itself is tagged both to the process and
+ *	the charging resources (a socket doesn't want to have to search for
+ *	things at irq time for example). Reference counters keep things in
+ *	hand.
+ *
+ *	The case where a user creates resource, kills all his processes and
+ *	then starts new ones is correctly handled this way. The refcounters
+ *	will mean the old entry is still around with resource tied to it.
+ */
+



So we create one beancounter object for every resource the user's tasks
allocate? For instance, one per socket? Or does "resource structure"
refer to something else?

no, user_beancounter structure exists one for a set of processes and its resources.

+struct user_beancounter *beancounter_findcreate(uid_t uid,
+		struct user_beancounter *p, int mask)
+{
+	struct user_beancounter *new_ub, *ub, *tmpl_ub;
+	unsigned long flags;
+	struct hlist_head *slot;
+	struct hlist_node *pos;
+
+	if (mask & UB_LOOKUP_SUB) {
+		WARN_ON(p == NULL);
+		tmpl_ub = &default_subbeancounter;
+		slot = &ub_hash[ub_subhash_fun(p, uid)];
+	} else {
+		WARN_ON(p != NULL);
+		tmpl_ub = &default_beancounter;
+		slot = &ub_hash[ub_hash_fun(uid)];
+	}
+	new_ub = NULL;
+
+retry:
+	spin_lock_irqsave(&ub_hash_lock, flags);
+	hlist_for_each_entry (ub, pos, slot, hash)
+		if (ub->ub_uid == uid && ub->parent == p)
+			break;
+
+	if (pos != NULL) {
+		get_beancounter(ub);
+		spin_unlock_irqrestore(&ub_hash_lock, flags);
+
+		if (new_ub != NULL) {
+			put_beancounter(new_ub->parent);
+			kmem_cache_free(ub_cachep, new_ub);
+		}
+		return ub;
+	}
+
+	if (!(mask & UB_ALLOC))
+		goto out_unlock;
+
+	if (new_ub != NULL)
+		goto out_install;
+
+	if (mask & UB_ALLOC_ATOMIC) {



This block..

+		new_ub = kmem_cache_alloc(ub_cachep, GFP_ATOMIC);
+		if (new_ub == NULL)
+			goto out_unlock;
+
+		memcpy(new_ub, tmpl_ub, sizeof(*new_ub));
+		init_beancounter_struct(new_ub, uid);
+		if (p)
+			new_ub->parent = get_beancounter(p);



ending here is almost exactly the same as the block ..

+		goto out_install;
+	}
+
+	spin_unlock_irqrestore(&ub_hash_lock, flags);
+

From here..

+	new_ub = kmem_cache_alloc(ub_cachep, GFP_KERNEL);
+	if (new_ub == NULL)
+		goto out;
+
+	memcpy(new_ub, tmpl_ub, sizeof(*new_ub));
+	init_beancounter_struct(new_ub, uid);
+	if (p)
+		new_ub->parent = get_beancounter(p);



to here. You could make a flag variable that holds GFP_ATOMIC or
GFP_KERNEL based on mask & UB_ALLOC_ATOMIC and perhaps turn this block
into a small helper.

yeah, Oleg Nesterov already pointed to this. will cleanup.

+	goto retry;
+
+out_install:
+	hlist_add_head(&new_ub->hash, slot);
+out_unlock:
+	spin_unlock_irqrestore(&ub_hash_lock, flags);
+out:
+	return new_ub;
+}
+
+EXPORT_SYMBOL(beancounter_findcreate);
+
+void ub_print_uid(struct user_beancounter *ub, char *str, int size)
+{
+	if (ub->parent != NULL)
+		snprintf(str, size, "%u.%u", ub->parent->ub_uid, ub->ub_uid);
+	else
+		snprintf(str, size, "%u", ub->ub_uid);
+}
+
+EXPORT_SYMBOL(ub_print_uid);

From what I can see this patch doesn't really justify the need for the

EXPORT_SYMBOL. Shouldn't that be done in the patch where it's needed
outside of the kernel/ub code itself?

AFAIK these exports were used in our checkpointing code (OpenVZ)
ok, will remove exports from this patch series...

+void ub_print_resource_warning(struct user_beancounter *ub, int res,
+		char *str, unsigned long val, unsigned long held)
+{
+	char uid[64];
+
+	ub_print_uid(ub, uid, sizeof(uid));
+	printk(KERN_WARNING "UB %s %s warning: %s "
+			"(held %lu, fails %lu, val %lu)\n",
+			uid, ub_rnames[res], str,
+			(res < UB_RESOURCES ? ub->ub_parms[res].held : held),
+			(res < UB_RESOURCES ? ub->ub_parms[res].failcnt : 0),
+			val);
+}
+
+EXPORT_SYMBOL(ub_print_resource_warning);
+
+static inline void verify_held(struct user_beancounter *ub)
+{
+	int i;
+
+	for (i = 0; i < UB_RESOURCES; i++)
+		if (ub->ub_parms[i].held != 0)
+			ub_print_resource_warning(ub, i,
+					"resource is held on put", 0, 0);
+}
+
+void __put_beancounter(struct user_beancounter *ub)
+{
+	unsigned long flags;
+	struct user_beancounter *parent;
+
+again:
+	parent = ub->parent;
+	/* equevalent to atomic_dec_and_lock_irqsave() */



nit: s/que/qui/

thanks!

+	local_irq_save(flags);
+	if (likely(!atomic_dec_and_lock(&ub->ub_refcount, &ub_hash_lock))) {
+		if (unlikely(atomic_read(&ub->ub_refcount) < 0))
+			printk(KERN_ERR "UB: Bad ub refcount: ub=%p, "
+					"luid=%d, ref=%d\n",
+					ub, ub->ub_uid,
+					atomic_read(&ub->ub_refcount));



This seems to be for debugging purposes only. If not, perhaps this
printk ought to be rate limited?

this is BUG_ON with more description. there is no much need to ratelimit it
as when it happens something really wrong happened in your system.

+		local_irq_restore(flags);
+		return;
+	}
+
+	if (unlikely(ub == &ub0)) {
+		printk(KERN_ERR "Trying to put ub0\n");



Same thing for this printk.

will replace it with real BUG_ON here.

+		spin_unlock_irqrestore(&ub_hash_lock, flags);
+		return;
+	}
+
+	verify_held(ub);
+	hlist_del(&ub->hash);
+	spin_unlock_irqrestore(&ub_hash_lock, flags);
+
+	kmem_cache_free(ub_cachep, ub);
+
+	ub = parent;
+	if (ub != NULL)
+		goto again;



Couldn't this be replaced by a do { } while (ub != NULL); loop?

this is ugly from indentation POV. also restarts are frequently used everywhere...

+}
+
+EXPORT_SYMBOL(__put_beancounter);
+
+/*
+ *	Generic resource charging stuff
+ */
+
+int __charge_beancounter_locked(struct user_beancounter *ub,
+		int resource, unsigned long val, enum severity strict)
+{
+	/*
+	 * ub_value <= UB_MAXVALUE, value <= UB_MAXVALUE, and only one addition

+ * at the moment is possible so an overflow is impossible.+ */

+	ub->ub_parms[resource].held += val;
+
+	switch (strict) {
+		case UB_BARRIER:
+			if (ub->ub_parms[resource].held >
+					ub->ub_parms[resource].barrier)
+				break;
+			/* fallthrough */
+		case UB_LIMIT:
+			if (ub->ub_parms[resource].held >
+					ub->ub_parms[resource].limit)
+				break;
+			/* fallthrough */
+		case UB_FORCE:
+			ub_adjust_held_minmax(ub, resource);
+			return 0;
+		default:
+			BUG();
+	}
+
+	ub->ub_parms[resource].failcnt++;
+	ub->ub_parms[resource].held -= val;
+	return -ENOMEM;
+}
+
+int charge_beancounter(struct user_beancounter *ub,
+		int resource, unsigned long val, enum severity strict)
+{
+	int retval;
+	struct user_beancounter *p, *q;
+	unsigned long flags;
+
+	retval = -EINVAL;
+	BUG_ON(val > UB_MAXVALUE);
+
+	local_irq_save(flags);



<factor>

+	for (p = ub; p != NULL; p = p->parent) {



Seems rather expensive to walk up the tree for every charge. Especially
if the administrator wants a fine degree of resource control and makes a
tall tree. This would be a problem especially when it comes to resources
that require frequent and fast allocation.

in heirarchical accounting you always have to update all the nodes :/
with flat UBC this doesn't introduce significant overhead.

+		spin_lock(&p->ub_lock);
+		retval = __charge_beancounter_locked(p, resource, val, strict);
+		spin_unlock(&p->ub_lock);
+		if (retval)
+			goto unroll;



This can be factored by passing a flag that breaks the loop on an error:

		if (retval && do_break_err)
			return retval;

how about uncharge here?
didn't get your idea, sorry...

+	}



</factor>

+out_restore:
+	local_irq_restore(flags);
+	return retval;
+



<factor>

+unroll:
+	for (q = ub; q != p; q = q->parent) {
+		spin_lock(&q->ub_lock);
+		__uncharge_beancounter_locked(q, resource, val);
+		spin_unlock(&q->ub_lock);
+	}



</factor>

+	goto out_restore;
+}
+
+EXPORT_SYMBOL(charge_beancounter);
+
+void charge_beancounter_notop(struct user_beancounter *ub,
+		int resource, unsigned long val)
+{
+	struct user_beancounter *p;
+	unsigned long flags;
+
+	local_irq_save(flags);



<factor>

+	for (p = ub; p->parent != NULL; p = p->parent) {
+		spin_lock(&p->ub_lock);
+		__charge_beancounter_locked(p, resource, val, UB_FORCE);
+		spin_unlock(&p->ub_lock);
+	}



<factor>

+	local_irq_restore(flags);



Again, this could be factored with charge_beancounter using a helper
function.

+}
+
+EXPORT_SYMBOL(charge_beancounter_notop);
+
+void __uncharge_beancounter_locked(struct user_beancounter *ub,
+		int resource, unsigned long val)
+{
+	if (unlikely(ub->ub_parms[resource].held < val)) {
+		ub_print_resource_warning(ub, resource,
+				"uncharging too much", val, 0);
+		val = ub->ub_parms[resource].held;
+	}
+	ub->ub_parms[resource].held -= val;
+	ub_adjust_held_minmax(ub, resource);
+}
+
+void uncharge_beancounter(struct user_beancounter *ub,
+		int resource, unsigned long val)
+{
+	unsigned long flags;
+	struct user_beancounter *p;
+
+	for (p = ub; p != NULL; p = p->parent) {
+		spin_lock_irqsave(&p->ub_lock, flags);
+		__uncharge_beancounter_locked(p, resource, val);
+		spin_unlock_irqrestore(&p->ub_lock, flags);
+	}
+}



Looks like your unroll: label in charge_beancounter above.

ok, will introduce helpers.

+
+EXPORT_SYMBOL(uncharge_beancounter);
+
+void uncharge_beancounter_notop(struct user_beancounter *ub,
+		int resource, unsigned long val)
+{
+	struct user_beancounter *p;
+	unsigned long flags;
+
+	local_irq_save(flags);



<factor>

+	for (p = ub; p->parent != NULL; p = p->parent) {
+		spin_lock(&p->ub_lock);
+		__uncharge_beancounter_locked(p, resource, val);
+		spin_unlock(&p->ub_lock);
+	}



</factor>

+	local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL(uncharge_beancounter_notop);
+
+/*
+ *	Initialization
+ *
+ *	struct user_beancounter contains
+ *	 - limits and other configuration settings
+ *	 - structural fields: lists, spinlocks and so on.
+ *
+ *	Before these parts are initialized, the structure should be memset
+ *	to 0 or copied from a known clean structure.  That takes care of a lot
+ *	of fields not initialized explicitly.
+ */
+
+static void init_beancounter_struct(struct user_beancounter *ub, uid_t id)
+{
+	atomic_set(&ub->ub_refcount, 1);
+	spin_lock_init(&ub->ub_lock);
+	ub->ub_uid = id;
+}
+
+static void init_beancounter_nolimits(struct user_beancounter *ub)
+{
+	int k;
+
+	for (k = 0; k < UB_RESOURCES; k++) {
+		ub->ub_parms[k].limit = UB_MAXVALUE;
+		ub->ub_parms[k].barrier = UB_MAXVALUE;
+	}
+}
+
+static void init_beancounter_syslimits(struct user_beancounter *ub)
+{
+	int k;
+
+	for (k = 0; k < UB_RESOURCES; k++)
+		ub->ub_parms[k].barrier = ub->ub_parms[k].limit;
+}
+
+void __init ub_init_early(void)
+{
+	struct user_beancounter *ub;
+	struct hlist_head *slot;
+
+	ub = &ub0;
+



<factor>

+	memset(ub, 0, sizeof(*ub));
+	init_beancounter_nolimits(ub);
+	init_beancounter_struct(ub, 0);
+



</factor>

???

+	spin_lock_init(&ub_hash_lock);
+	slot = &ub_hash[ub_hash_fun(ub->ub_uid)];
+	hlist_add_head(&ub->hash, slot);
+}
+
+void __init ub_init_late(void)
+{
+	struct user_beancounter *ub;
+
+	ub_cachep = kmem_cache_create("user_beancounters",
+			sizeof(struct user_beancounter),
+			0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (ub_cachep == NULL)
+		panic("Can't create ubc caches\n");
+
+	ub = &default_beancounter;



<factor>

+	memset(ub, 0, sizeof(default_beancounter));
+	init_beancounter_syslimits(ub);

<<<< this one is different from the above :)

+	init_beancounter_struct(ub, 0);
+



</factor>

+	ub = &default_subbeancounter;



<factor>

+	memset(ub, 0, sizeof(default_subbeancounter));
+	init_beancounter_nolimits(ub);
+	init_beancounter_struct(ub, 0);



</factor>

+}



Cheers,
	-Matt Helsley


-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Follow-Ups:
- Re: [ckrm-tech] [RFC][PATCH 2/7] UBC: core (structures, API)
  - From: Matt Helsley <[email protected]>

References:
- [RFC][PATCH] UBC: user resource beancounters
  - From: Kirill Korotaev <[email protected]>
- [RFC][PATCH 2/7] UBC: core (structures, API)
  - From: Kirill Korotaev <[email protected]>
- Re: [ckrm-tech] [RFC][PATCH 2/7] UBC: core (structures, API)
  - From: Matt Helsley <[email protected]>

Prev by Date: Re: [patch 0/5] -fstack-protector feature for the kernel (try 2)
Next by Date: [PATCH] char/moxa.c: fix endianess and multiple-card issues
Previous by thread: Re: [ckrm-tech] [RFC][PATCH 2/7] UBC: core (structures, API)
Next by thread: Re: [ckrm-tech] [RFC][PATCH 2/7] UBC: core (structures, API)
Index(es):
- Date
- Thread

[Index of Archives] [Kernel Newbies] [Netfilter] [Bugtraq] [Photo] [Stuff] [Gimp] [Yosemite News] [MIPS Linux] [ARM Linux] [Linux Security] [Linux RAID] [Video 4 Linux] [Linux for the blind] [Linux Resources]