OpenVZ Forum



Members   Search      Help    Register    Login    Home
Today's Messages (off)  | Unanswered Messages (on)

Forum: Devel
 Topic: [patch 04/10][NETNS][IP6_FIB] make the fib table per network namespace
[patch 04/10][NETNS][IP6_FIB] make the fib table per network namespace [message #23326] Thu, 15 November 2007 09:01
Daniel Lezcano is currently offline Daniel Lezcano
Messages: 417
Registered: June 2006
Senior Member
From: openvz.org
The patch makes the ip6 fib be accessed relative
to the network namespace.

Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com>
Signed-off-by: Benjamin Thery <benjamin.thery@bull.net>
---
 include/net/ip6_fib.h |    4 ++--
 net/ipv6/fib6_rules.c |    4 ++--
 net/ipv6/ip6_fib.c    |   46 +++++++++++++++++++++-------------------------
 net/ipv6/route.c      |   12 ++++++------
 4 files changed, 31 insertions(+), 35 deletions(-)

Index: linux-2.6-netns/include/net/ip6_fib.h
===================================================================
--- linux-2.6-netns.orig/include/net/ip6_fib.h
+++ linux-2.6-netns/include/net/ip6_fib.h
@@ -194,8 +194,8 @@ typedef struct rt6_info *(*pol_lookup_t)
  *	exported functions
  */
 
-extern struct fib6_table *	fib6_get_table(u32 id);
-extern struct fib6_table *	fib6_new_table(u32 id);
+extern struct fib6_table *	fib6_get_table(struct net *net, u32 id);
+extern struct fib6_table *	fib6_new_table(struct net *net, u32 id);
 extern struct dst_entry *	fib6_rule_lookup(struct flowi *fl, int flags,
 						 pol_lookup_t lookup);
 
Index: linux-2.6-netns/net/ipv6/fib6_rules.c
===================================================================
--- linux-2.6-netns.orig/net/ipv6/fib6_rules.c
+++ linux-2.6-netns/net/ipv6/fib6_rules.c
@@ -71,7 +71,7 @@ static int fib6_rule_action(struct fib_r
 		goto discard_pkt;
 	}
 
-	table = fib6_get_table(rule->table);
+	table = fib6_get_table(&init_net, rule->table);
 	if (table)
 		rt = lookup(table, flp, flags);
 
@@ -151,7 +151,7 @@ static int fib6_rule_configure(struct fi
 		if (rule->table == RT6_TABLE_UNSPEC)
 			goto errout;
 
-		if (fib6_new_table(rule->table) == NULL) {
+		if (fib6_new_table(&init_net, rule->table) == NULL) {
 			err = -ENOBUFS;
 			goto errout;
 		}
Index: linux-2.6-netns/net/ipv6/ip6_fib.c
===================================================================
--- linux-2.6-netns.orig/net/ipv6/ip6_fib.c
+++ linux-2.6-netns/net/ipv6/ip6_fib.c
@@ -172,7 +172,7 @@ static __inline__ void rt6_release(struc
 #define FIB_TABLE_HASHSZ 1
 #endif
 
-static void fib6_link_table(struct fib6_table *tb)
+static void fib6_link_table(struct net *net, struct fib6_table *tb)
 {
 	unsigned int h;
 
@@ -188,7 +188,7 @@ static void fib6_link_table(struct fib6_
 	 * No protection necessary, this is the only list mutatation
 	 * operation, tables never disappear once they exist.
 	 */
-	hlist_add_head_rcu(&tb->tb6_hlist, &init_net.fib_table_hash[h]);
+	hlist_add_head_rcu(&tb->tb6_hlist, &net->fib_table_hash[h]);
 }
 
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
@@ -207,24 +207,24 @@ static struct fib6_table *fib6_alloc_tab
 	return table;
 }
 
-struct fib6_table *fib6_new_table(u32 id)
+struct fib6_table *fib6_new_table(struct net *net, u32 id)
 {
 	struct fib6_table *tb;
 
 	if (id == 0)
 		id = RT6_TABLE_MAIN;
-	tb = fib6_get_table(id);
+	tb = fib6_get_table(net, id);
 	if (tb)
 		return tb;
 
 	tb = fib6_alloc_table(id);
 	if (tb != NULL)
-		fib6_link_table(tb);
+		fib6_link_table(net, tb);
 
 	return tb;
 }
 
-struct fib6_table *fib6_get_table(u32 id)
+struct fib6_table *fib6_get_table(struct net *net, u32 id)
 {
 	struct fib6_table *tb;
 	struct hlist_head *head;
@@ -235,7 +235,7 @@ struct fib6_table *fib6_get_table(u32 id
 		id = RT6_TABLE_MAIN;
 	h = id & (FIB_TABLE_HASHSZ - 1);
 	rcu_read_lock();
-	head = &init_net.fib_table_hash[h];
+	head = &net->fib_table_hash[h];
 	hlist_for_each_entry_rcu(tb, node, head, tb6_hlist) {
 		if (tb->tb6_id == id) {
 			rcu_read_unlock();
@@ -247,33 +247,33 @@ struct fib6_table *fib6_get_table(u32 id
 	return NULL;
 }
 
-static void __init fib6_tables_init(void)
+static void __init fib6_tables_init(struct net *net)
 {
-	fib6_link_table(init_net.fib6_main_tbl);
-	fib6_link_table(init_net.fib6_local_tbl);
+	fib6_link_table(net, net->fib6_main_tbl);
+	fib6_link_table(net, net->fib6_local_tbl);
 }
 
 #else
 
-struct fib6_table *fib6_new_table(u32 id)
+struct fib6_table *fib6_new_table(struct net *net, u32 id)
 {
-	return fib6_get_table(id);
+	return fib6_get_table(net, id);
 }
 
-struct fib6_table *fib6_get_table(u32 id)
+struct fib6_table *fib6_get_table(struct net *net, u32 id)
 {
-	return init_net.fib6_main_tbl;
+	return net->fib6_main_tbl;
 }
 
 struct dst_entry *fib6_rule_lookup(struct flowi *fl, int flags,
 				   pol_lookup_t lookup)
 {
-	return (struct dst_entry *) lookup(init_net.fib6_main_tbl, fl, flags);
+	return (struct dst_entry *) lookup(fl->fl_net->fib6_main_tbl, fl, flags);
 }
 
-static void __init fib6_tables_init(void)
+static void __init fib6_tables_init(struct net *net)
 {
-	fib6_link_table(init_net.fib6_main_tbl);
+	fib6_link_table(net, net->fib6_main_tbl);
 }
 
 #endif
@@ -357,9 +357,6 @@ static int inet6_dump_fib(struct sk_buff
 	struct hlist_head *head;
 	int res = 0;
 
-	if (net != &init_net)
-		return 0;
-
 	s_h = cb->args[0];
 	s_e = cb->args[1];
 
@@ -388,7 +385,7 @@ static int inet6_dump_fib(struct sk_buff
 
 	for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
 		e = 0;
-		head = &init_net.fib_table_hash[h];
+		head = &net->fib_table_hash[h];
 		hlist_for_each_entry(tb, node, head, tb6_hlist) {
 			if (e < s_e)
 				goto next;
@@ -1466,9 +1463,6 @@ static int fib6_net_init(struct net *net
 {
 	int ret;
 
-	if (net != &init_net)
-		return -EPERM;
-
 	ret = -ENOMEM;
 	net->fib_table_hash = kzalloc(sizeof(*net->fib_table_hash)*FIB_TABLE_HASHSZ,
 				      GFP_KERNEL);
@@ -1494,7 +1488,9 @@ static int fib6_net_init(struct net *net
 	net->fib6_local_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
 #endif
 
-	fib6_tables_init();
+	fib6_tables_init(net);
+
+	return 0;
 
 out_fib6_main_tbl:
 	kfree(net->fib_table_hash);
Index: linux-2.6-netns/net/ipv6/route.c
===================================================================
--- linux-2.6-netns.orig/net/ipv6/route.c
+++ linux-2.6-netns/net/ipv6/route.c
@@ -1065,7 +1065,7 @@ int ip6_route_add(struct fib6_config *cf
 	if (cfg->fc_metric == 0)
 		cfg->fc_metric = IP6_RT_PRIO_USER;
 
-	table = fib6_new_table(cfg->fc_table);
+	table = fib6_new_table(&init_net, cfg->fc_table);
 	if (table == NULL) {
 		err = -ENOBUFS;
 		goto out;
@@ -1268,7 +1268,7 @@ static int ip6_route_del(struct fib6_con
 	struct rt6_info *rt;
 	int err = -ESRCH;
 
-	table = fib6_get_table(cfg->fc_table);
+	table = fib6_get_table(&init_net, cfg->fc_table);
 	if (table == NULL)
 		return err;
 
@@ -1582,7 +1582,7 @@ static struct rt6_info *rt6_get_route_in
 	struct rt6_info *rt = NULL;
 	struct fib6_table *table;
 
-	table = fib6_get_table(RT6_TABLE_INFO);
+	table = fib6_get_table(&init_net, RT6_TABLE_INFO);
 	if (table == NULL)
 		return NULL;
 
@@ -1637,7 +1637,7 @@ struct rt6_info *rt6_get_dflt_router(str
 	struct rt6_info *rt;
 	struct fib6_table *table;
 
-	table = fib6_get_table(RT6_TABLE_DFLT);
+	table = fib6_get_table(&init_net, RT6_TABLE_DFLT);
 	if (table == NULL)
 		return NULL;
 
@@ -1679,7 +1679,7 @@ void rt6_purge_dflt_routers(void)
 	struct fib6_table *table;
 
 	/* NOTE: Keep consistent with rt6_get_dflt_router */
-	table = fib6_get_table(RT6_TABLE_DFLT);
+	table = fib6_get_table(&init_net, RT6_TABLE_DFLT);
 	if (table == NULL)
 		return;
 
@@ -1841,7 +1841,7 @@ struct rt6_info *addrconf_dst_alloc(stru
 
 	ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
 	rt->rt6i_dst.plen = 128;
-	rt->rt6i_table = fib6_get_table(RT6_TABLE_LOCAL);
+	rt->rt6i_table = fib6_get_table(&init_net, RT6_TABLE_LOCAL);
 
 	atomic_set(&rt->u.dst.__refcnt, 1);
 

-- 
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: [patch 03/10][NETNS][IP6_FIB] move the fib table to the network namespace
[patch 03/10][NETNS][IP6_FIB] move the fib table to the network namespace [message #23322] Thu, 15 November 2007 09:01
Daniel Lezcano is currently offline Daniel Lezcano
Messages: 417
Registered: June 2006
Senior Member
From: openvz.org
Move the global definitions of the fib tables into the network namespace
structure and make all accesses go through the initial network namespace.

Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com>
Signed-off-by: Benjamin Thery <benjamin.thery@bull.net>
---
 include/net/net_namespace.h |    9 +++
 net/ipv6/ip6_fib.c          |  110 ++++++++++++++++++++++++++++----------------
 2 files changed, 80 insertions(+), 39 deletions(-)

Index: linux-2.6-netns/include/net/net_namespace.h
===================================================================
--- linux-2.6-netns.orig/include/net/net_namespace.h
+++ linux-2.6-netns/include/net/net_namespace.h
@@ -31,6 +31,15 @@ struct net {
 	struct hlist_head 	*dev_name_head;
 	struct hlist_head	*dev_index_head;
 
+	/* ipv6 routing table */
+#ifdef CONFIG_IPV6
+	struct hlist_head       *fib_table_hash;
+	struct fib6_table       *fib6_main_tbl;
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+	struct fib6_table       *fib6_local_tbl;
+#endif /* CONFIG_IPV6_MULTIPLE_TABLES */
+#endif /* CONFIG_IPV6 */
+
 	struct sock 		*rtnl;			/* rtnetlink socket */
 
 	/* List of all packet sockets. */
Index: linux-2.6-netns/net/ipv6/ip6_fib.c
===================================================================
--- linux-2.6-netns.orig/net/ipv6/ip6_fib.c
+++ linux-2.6-netns/net/ipv6/ip6_fib.c
@@ -166,14 +166,11 @@ static __inline__ void rt6_release(struc
 		dst_free(&rt->u.dst);
 }
 
-static struct fib6_table *fib6_main_tbl;
-
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
 #define FIB_TABLE_HASHSZ 256
 #else
 #define FIB_TABLE_HASHSZ 1
 #endif
-static struct hlist_head *fib_table_hash;
 
 static void fib6_link_table(struct fib6_table *tb)
 {
@@ -191,13 +188,11 @@ static void fib6_link_table(struct fib6_
 	 * No protection necessary, this is the only list mutatation
 	 * operation, tables never disappear once they exist.
 	 */
-	hlist_add_head_rcu(&tb->tb6_hlist, &fib_table_hash[h]);
+	hlist_add_head_rcu(&tb->tb6_hlist, &init_net.fib_table_hash[h]);
 }
 
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
 
-static struct fib6_table *fib6_local_tbl;
-
 static struct fib6_table *fib6_alloc_table(u32 id)
 {
 	struct fib6_table *table;
@@ -232,6 +227,7 @@ struct fib6_table *fib6_new_table(u32 id
 struct fib6_table *fib6_get_table(u32 id)
 {
 	struct fib6_table *tb;
+	struct hlist_head *head;
 	struct hlist_node *node;
 	unsigned int h;
 
@@ -239,7 +235,8 @@ struct fib6_table *fib6_get_table(u32 id
 		id = RT6_TABLE_MAIN;
 	h = id & (FIB_TABLE_HASHSZ - 1);
 	rcu_read_lock();
-	hlist_for_each_entry_rcu(tb, node, &fib_table_hash[h], tb6_hlist) {
+	head = &init_net.fib_table_hash[h];
+	hlist_for_each_entry_rcu(tb, node, head, tb6_hlist) {
 		if (tb->tb6_id == id) {
 			rcu_read_unlock();
 			return tb;
@@ -252,8 +249,8 @@ struct fib6_table *fib6_get_table(u32 id
 
 static void __init fib6_tables_init(void)
 {
-	fib6_link_table(fib6_main_tbl);
-	fib6_link_table(fib6_local_tbl);
+	fib6_link_table(init_net.fib6_main_tbl);
+	fib6_link_table(init_net.fib6_local_tbl);
 }
 
 #else
@@ -265,18 +262,18 @@ struct fib6_table *fib6_new_table(u32 id
 
 struct fib6_table *fib6_get_table(u32 id)
 {
-	return fib6_main_tbl;
+	return init_net.fib6_main_tbl;
 }
 
 struct dst_entry *fib6_rule_lookup(struct flowi *fl, int flags,
 				   pol_lookup_t lookup)
 {
-	return (struct dst_entry *) lookup(fib6_main_tbl, fl, flags);
+	return (struct dst_entry *) lookup(init_net.fib6_main_tbl, fl, flags);
 }
 
 static void __init fib6_tables_init(void)
 {
-	fib6_link_table(fib6_main_tbl);
+	fib6_link_table(init_net.fib6_main_tbl);
 }
 
 #endif
@@ -357,6 +354,7 @@ static int inet6_dump_fib(struct sk_buff
 	struct fib6_walker_t *w;
 	struct fib6_table *tb;
 	struct hlist_node *node;
+	struct hlist_head *head;
 	int res = 0;
 
 	if (net != &init_net)
@@ -390,7 +388,8 @@ static int inet6_dump_fib(struct sk_buff
 
 	for (h = s_h; h < FIB_TABLE_HASHSZ; h++, s_e = 0) {
 		e = 0;
-		hlist_for_each_entry(tb, node, &fib_table_hash[h], tb6_hlist) {
+		head = &init_net.fib_table_hash[h];
+		hlist_for_each_entry(tb, node, head, tb6_hlist) {
 			if (e < s_e)
 				goto next;
 			res = fib6_dump_table(tb, skb, cb);
@@ -1358,12 +1357,13 @@ void fib6_clean_all(int (*func)(struct r
 {
 	struct fib6_table *table;
 	struct hlist_node *node;
+	struct hlist_head *head;
 	unsigned int h;
 
 	rcu_read_lock();
 	for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
-		hlist_for_each_entry_rcu(table, node, &fib_table_hash[h],
-					 tb6_hlist) {
+		head = &init_net.fib_table_hash[h];
+		hlist_for_each_entry_rcu(table, node, head, tb6_hlist) {
 			write_lock_bh(&table->tb6_lock);
 			fib6_clean_tree(&table->tb6_root, func, prune, arg);
 			write_unlock_bh(&table->tb6_lock);
@@ -1462,42 +1462,74 @@ void fib6_run_gc(unsigned long dummy)
 	spin_unlock_bh(&fib6_gc_lock);
 }
 
-void __init fib6_init(void)
+static int fib6_net_init(struct net *net)
 {
-	fib6_node_kmem = kmem_cache_create("fib6_nodes",
-					   sizeof(struct fib6_node),
-					   0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
-					   NULL);
+	int ret;
 
-	fib_table_hash = kzalloc(sizeof(*fib_table_hash)*FIB_TABLE_HASHSZ, GFP_KERNEL);
-	if (!fib_table_hash)
-		panic("IPV6: Failed to allocate fib_table_hash.\n");
-
-	fib6_main_tbl = kzalloc(sizeof(*fib6_main_tbl), GFP_KERNEL);
-	if (!fib6_main_tbl)
-		panic("IPV6: Failed to allocate fib6_main_tbl.\n");
-
-	fib6_main_tbl->tb6_id = RT6_TABLE_MAIN;
-	fib6_main_tbl->tb6_root.leaf = &ip6_null_entry;
-	fib6_main_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
+	if (net != &init_net)
+		return -EPERM;
+
+	ret = -ENOMEM;
+	net->fib_table_hash = kzalloc(sizeof(*net->fib_table_hash)*FIB_TABLE_HASHSZ,
+				      GFP_KERNEL);
+	if (!net->fib_table_hash)
+		goto out;
+
+	net->fib6_main_tbl = kzalloc(sizeof(*net->fib6_main_tbl), GFP_KERNEL);
+	if (!net->fib6_main_tbl)
+		goto out_fib6_main_tbl;
+
+	net->fib6_main_tbl->tb6_id = RT6_TABLE_MAIN;
+	net->fib6_main_tbl->tb6_root.leaf = &ip6_null_entry;
+	net->fib6_main_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
 
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
-	fib6_local_tbl = kzalloc(sizeof(*fib6_local_tbl), GFP_KERNEL);
-	if (!fib6_local_tbl)
-		panic("IPV6: Failed to allocate fib6_local_tbl.\n");
-
-	fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL;
-	fib6_local_tbl->tb6_root.leaf = &ip6_null_entry;
-	fib6_local_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
+	net->fib6_local_tbl = kzalloc(sizeof(*net->fib6_local_tbl), GFP_KERNEL);
+	if (!net->fib6_local_tbl) {
+		kfree(net->fib6_main_tbl);
+		goto out_fib6_main_tbl;
+	}
+	net->fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL;
+	net->fib6_local_tbl->tb6_root.leaf = &ip6_null_entry;
+	net->fib6_local_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
 #endif
 
 	fib6_tables_init();
 
-	__rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib);
+out_fib6_main_tbl:
+	kfree(net->fib_table_hash);
+out:
+	return ret;
+ }
+
+static void fib6_net_exit(struct net *net)
+{
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+	kfree(net->fib6_local_tbl);
+#endif
+	kfree(net->fib6_main_tbl);
+	kfree(net->fib_table_hash);
+}
+
+static struct pernet_operations fib6_net_ops = {
+	.init = fib6_net_init,
+	.exit = fib6_net_exit,
+};
+
+void __init fib6_init(void)
+{
+	fib6_node_kmem = kmem_cache_create("fib6_nodes",
+					   sizeof(struct fib6_node),
+					   0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+					   NULL);
+
+	register_pernet_subsys(&fib6_net_ops);
+        __rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib);
 }
 
 void fib6_gc_cleanup(void)
 {
 	del_timer(&ip6_fib_timer);
+	unregister_pernet_subsys(&fib6_net_ops);
 	kmem_cache_destroy(fib6_node_kmem);
 }

-- 
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: [patch 02/10][NETNS][IP6_FIB] dynamically allocate fib tables
[patch 02/10][NETNS][IP6_FIB] dynamically allocate fib tables [message #23318] Thu, 15 November 2007 09:01
Daniel Lezcano is currently offline Daniel Lezcano
Messages: 417
Registered: June 2006
Senior Member
From: openvz.org
The fib tables are dynamically allocated in the init and exit functions.
That provides the ability to do multiple instantiations of these tables.

Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com>
Signed-off-by: Benjamin Thery <benjamin.thery@bull.net>
---
 net/ipv6/ip6_fib.c |   43 +++++++++++++++++++++++++------------------
 1 file changed, 25 insertions(+), 18 deletions(-)

Index: linux-2.6-netns/net/ipv6/ip6_fib.c
===================================================================
--- linux-2.6-netns.orig/net/ipv6/ip6_fib.c
+++ linux-2.6-netns/net/ipv6/ip6_fib.c
@@ -166,22 +166,14 @@ static __inline__ void rt6_release(struc
 		dst_free(&rt->u.dst);
 }
 
-static struct fib6_table __fib6_main_tbl = {
-	.tb6_id		= RT6_TABLE_MAIN,
-	.tb6_root	= {
-		.leaf		= &ip6_null_entry,
-		.fn_flags	= RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
-	},
-};
-
-static struct fib6_table *fib6_main_tbl = &__fib6_main_tbl;
+static struct fib6_table *fib6_main_tbl;
 
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
 #define FIB_TABLE_HASHSZ 256
 #else
 #define FIB_TABLE_HASHSZ 1
 #endif
-static struct hlist_head fib_table_hash[FIB_TABLE_HASHSZ];
+static struct hlist_head *fib_table_hash;
 
 static void fib6_link_table(struct fib6_table *tb)
 {
@@ -203,15 +195,8 @@ static void fib6_link_table(struct fib6_
 }
 
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
-static struct fib6_table __fib6_local_tbl = {
-	.tb6_id		= RT6_TABLE_LOCAL,
-	.tb6_root 	= {
-		.leaf		= &ip6_null_entry,
-		.fn_flags	= RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO,
-	},
-};
 
-static struct fib6_table *fib6_local_tbl = &__fib6_local_tbl;
+static struct fib6_table *fib6_local_tbl;
 
 static struct fib6_table *fib6_alloc_table(u32 id)
 {
@@ -1484,6 +1469,28 @@ void __init fib6_init(void)
 					   0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
 					   NULL);
 
+	fib_table_hash = kzalloc(sizeof(*fib_table_hash)*FIB_TABLE_HASHSZ, GFP_KERNEL);
+	if (!fib_table_hash)
+		panic("IPV6: Failed to allocate fib_table_hash.\n");
+
+	fib6_main_tbl = kzalloc(sizeof(*fib6_main_tbl), GFP_KERNEL);
+	if (!fib6_main_tbl)
+		panic("IPV6: Failed to allocate fib6_main_tbl.\n");
+
+	fib6_main_tbl->tb6_id = RT6_TABLE_MAIN;
+	fib6_main_tbl->tb6_root.leaf = &ip6_null_entry;
+	fib6_main_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
+
+#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+	fib6_local_tbl = kzalloc(sizeof(*fib6_local_tbl), GFP_KERNEL);
+	if (!fib6_local_tbl)
+		panic("IPV6: Failed to allocate fib6_local_tbl.\n");
+
+	fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL;
+	fib6_local_tbl->tb6_root.leaf = &ip6_null_entry;
+	fib6_local_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
+#endif
+
 	fib6_tables_init();
 
 	__rtnl_register(PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib);

-- 
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: [patch 01/10][NETNS][IP6_FIB] make mindless changes
[patch 01/10][NETNS][IP6_FIB] make mindless changes [message #23319] Thu, 15 November 2007 09:01
Daniel Lezcano is currently offline Daniel Lezcano
Messages: 417
Registered: June 2006
Senior Member
From: openvz.org
This patch replaces all references to the static global variables
fib6_main_tbl and fib6_local_tbl with pointers. That provides the
minimal changes needed to dynamically allocate these tables for the
network namespaces.

Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com>
Signed-off-by: Benjamin Thery <benjamin.thery@bull.net>
---
 net/ipv6/ip6_fib.c |   18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

Index: linux-2.6-netns/net/ipv6/ip6_fib.c
===================================================================
--- linux-2.6-netns.orig/net/ipv6/ip6_fib.c
+++ linux-2.6-netns/net/ipv6/ip6_fib.c
@@ -166,7 +166,7 @@ static __inline__ void rt6_release(struc
 		dst_free(&rt->u.dst);
 }
 
-static struct fib6_table fib6_main_tbl = {
+static struct fib6_table __fib6_main_tbl = {
 	.tb6_id		= RT6_TABLE_MAIN,
 	.tb6_root	= {
 		.leaf		= &ip6_null_entry,
@@ -174,6 +174,8 @@ static struct fib6_table fib6_main_tbl =
 	},
 };
 
+static struct fib6_table *fib6_main_tbl = &__fib6_main_tbl;
+
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
 #define FIB_TABLE_HASHSZ 256
 #else
@@ -201,7 +203,7 @@ static void fib6_link_table(struct fib6_
 }
 
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
-static struct fib6_table fib6_local_tbl = {
+static struct fib6_table __fib6_local_tbl = {
 	.tb6_id		= RT6_TABLE_LOCAL,
 	.tb6_root 	= {
 		.leaf		= &ip6_null_entry,
@@ -209,6 +211,8 @@ static struct fib6_table fib6_local_tbl 
 	},
 };
 
+static struct fib6_table *fib6_local_tbl = &__fib6_local_tbl;
+
 static struct fib6_table *fib6_alloc_table(u32 id)
 {
 	struct fib6_table *table;
@@ -263,8 +267,8 @@ struct fib6_table *fib6_get_table(u32 id
 
 static void __init fib6_tables_init(void)
 {
-	fib6_link_table(&fib6_main_tbl);
-	fib6_link_table(&fib6_local_tbl);
+	fib6_link_table(fib6_main_tbl);
+	fib6_link_table(fib6_local_tbl);
 }
 
 #else
@@ -276,18 +280,18 @@ struct fib6_table *fib6_new_table(u32 id
 
 struct fib6_table *fib6_get_table(u32 id)
 {
-	return &fib6_main_tbl;
+	return fib6_main_tbl;
 }
 
 struct dst_entry *fib6_rule_lookup(struct flowi *fl, int flags,
 				   pol_lookup_t lookup)
 {
-	return (struct dst_entry *) lookup(&fib6_main_tbl, fl, flags);
+	return (struct dst_entry *) lookup(fib6_main_tbl, fl, flags);
 }
 
 static void __init fib6_tables_init(void)
 {
-	fib6_link_table(&fib6_main_tbl);
+	fib6_link_table(fib6_main_tbl);
 }
 
 #endif

-- 
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: [PATCH 2/2][INET] (resend) Move the reqsk_queue_yank_listen_sk from header
[PATCH 2/2][INET] (resend) Move the reqsk_queue_yank_listen_sk from header [message #23284] Thu, 15 November 2007 03:43
Pavel Emelianov is currently offline Pavel Emelianov
Messages: 1149
Registered: September 2006
Senior Member
From: openvz.org
This function is used in the net/core/request_sock.c only.
No need in keeping it in the header file.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>

---

diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 0a954ee..cff4608 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -124,18 +124,6 @@ struct request_sock_queue {
 extern int reqsk_queue_alloc(struct request_sock_queue *queue,
 			     unsigned int nr_table_entries);
 
-static inline struct listen_sock *reqsk_queue_yank_listen_sk(struct request_sock_queue *queue)
-{
-	struct listen_sock *lopt;
-
-	write_lock_bh(&queue->syn_wait_lock);
-	lopt = queue->listen_opt;
-	queue->listen_opt = NULL;
-	write_unlock_bh(&queue->syn_wait_lock);
-
-	return lopt;
-}
-
 extern void __reqsk_queue_destroy(struct request_sock_queue *queue);
 extern void reqsk_queue_destroy(struct request_sock_queue *queue);
 
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index dd78b85..45aed75 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -93,6 +93,19 @@ void __reqsk_queue_destroy(struct request_sock_queue *queue)
 
 EXPORT_SYMBOL(__reqsk_queue_destroy);
 
+static inline struct listen_sock *reqsk_queue_yank_listen_sk(
+		struct request_sock_queue *queue)
+{
+	struct listen_sock *lopt;
+
+	write_lock_bh(&queue->syn_wait_lock);
+	lopt = queue->listen_opt;
+	queue->listen_opt = NULL;
+	write_unlock_bh(&queue->syn_wait_lock);
+
+	return lopt;
+}
+
 void reqsk_queue_destroy(struct request_sock_queue *queue)
 {
 	/* make all the listen_opt local to us */
 Topic: [PATCH] [NETFILTER] ipt_SAME: add compat conversion functions
[PATCH] [NETFILTER] ipt_SAME: add compat conversion functions [message #23137] Tue, 13 November 2007 07:40
finist is currently offline finist
Messages: 438
Registered: January 2006
Location: Moscow, Russia
Senior Member
From: openvz.org
[NETFILTER]: ipt_SAME: add compat conversion functions

ipt_SAME needs compat conversion functions because its entry structure (ipt_same_info)
contains a pointer located between fields that are filled/checked in both kernel and userspace.

Signed-off-by: Konstantin Khorenko <khorenko@sw.ru>

---
Thank you,
	Konstantin Khorenko

SWsoft Virtuozzo/OpenVZ Linux kernel team


--- ./net/ipv4/netfilter/ipt_SAME.c.SAME	2007-11-06 13:55:16.000000000 +0300
+++ ./net/ipv4/netfilter/ipt_SAME.c	2007-11-09 16:50:38.000000000 +0300
@@ -152,6 +152,47 @@ same_target(struct sk_buff *skb,
 	return nf_nat_setup_info(ct, &newrange, hooknum);
 }
 
+#ifdef CONFIG_COMPAT
+struct compat_ipt_same_info
+{
+	unsigned char info;
+	u_int32_t rangesize;
+	u_int32_t ipnum;
+	compat_uptr_t iparray;
+
+	/* hangs off end. */
+	struct nf_nat_range range[IPT_SAME_MAX_RANGE];
+};
+
+static void compat_from_user(void *dst, void *src)
+{
+	const struct compat_ipt_same_info *cl = src;
+	struct ipt_same_info l = {
+		.info		= cl->info,
+		.rangesize	= cl->rangesize,
+		.ipnum		= 0,
+		.iparray	= NULL,
+	};
+
+	memcpy(l.range, cl->range, sizeof(l.range));
+	memcpy(dst, &l, sizeof(l));
+}
+
+static int compat_to_user(void __user *dst, void *src)
+{
+	const struct ipt_same_info *l = src;
+	struct compat_ipt_same_info cl = {
+		.info		= l->info,
+		.rangesize	= l->rangesize,
+		.ipnum		= 0,
+		.iparray	= (compat_uptr_t)NULL,
+	};
+
+	memcpy(cl.range, l->range, sizeof(cl.range));
+	return copy_to_user(dst, &cl, sizeof(cl)) ? -EFAULT : 0;
+}
+#endif /* CONFIG_COMPAT */
+
 static struct xt_target same_reg __read_mostly = {
 	.name		= "SAME",
 	.family		= AF_INET,
@@ -161,6 +202,11 @@ static struct xt_target same_reg __read_
 	.hooks		= (1 << NF_IP_PRE_ROUTING | 1 << NF_IP_POST_ROUTING),
 	.checkentry	= same_check,
 	.destroy	= same_destroy,
+#ifdef CONFIG_COMPAT
+	.compatsize	= sizeof(struct compat_ipt_same_info),
+	.compat_from_user = compat_from_user,
+	.compat_to_user	= compat_to_user,
+#endif
 	.me		= THIS_MODULE,
 };
 Topic: [RFC][PATCH][LLC] Use existing sock refcnt debugging
[RFC][PATCH][LLC] Use existing sock refcnt debugging [message #23019] Fri, 09 November 2007 09:43
Pavel Emelianov is currently offline Pavel Emelianov
Messages: 1149
Registered: September 2006
Senior Member
From: openvz.org
Hi, Arnaldo.

I've grep-ed through the code and found one more place, where
the sk refcnt debugging is required, but is still performed in
an old fashion - this is the LLC2.

The problem in using the sk_refcnt_debug_xxx here is that these
socks do not provide the sk_destruct callback to catch the 
moment of the sock destruction.

Making this callback mandatory is not a good solution, as most 
often it will be empty and thus useless. Making this callback 
be set under the #ifdef SOCK_REFCNT_DEBUG is even more ugly 
than the previous one.

So, I propose to extend the sk_refcnt_debug_xxx set of helpers
for those socks not having the sk_destruct callback by default,
like the LLC2 ones.

The new helper, sk_refcnt_debug_inc_undo(sk), sets the
sk_destruct callback to sk_refcnt_debug_dec() when
SOCK_REFCNT_DEBUG is on.

What do you think about it?

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>

---

diff --git a/include/net/sock.h b/include/net/sock.h
index 5504fb9..1404ab9 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -654,10 +654,21 @@ static inline void sk_refcnt_debug_release(const struct sock *sk)
 		printk(KERN_DEBUG "Destruction of the %s socket %p delayed, refcnt=%d\n",
 		       sk->sk_prot->name, sk, atomic_read(&sk->sk_refcnt));
 }
+
+/*
+ * this one is to be used *only* for thouse socks, that
+ * do not have their own sk_destruct callback
+ */
+static inline void sk_refcnt_debug_inc_undo(struct sock *sk)
+{
+	sk_refcnt_debug_inc(sk);
+	sk->sk_destruct = sk_refcnt_debug_dec;
+}
 #else /* SOCK_REFCNT_DEBUG */
 #define sk_refcnt_debug_inc(sk) do { } while (0)
 #define sk_refcnt_debug_dec(sk) do { } while (0)
 #define sk_refcnt_debug_release(sk) do { } while (0)
+#define sk_refcnt_debug_inc_undo(sk) do { } while (0)
 #endif /* SOCK_REFCNT_DEBUG */
 
 /* Called with local bh disabled */
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c
index 5c0b484..6ee8778 100644
--- a/net/llc/llc_conn.c
+++ b/net/llc/llc_conn.c
@@ -775,11 +775,6 @@ drop_unlock:
 	goto out;
 }
 
-#undef LLC_REFCNT_DEBUG
-#ifdef LLC_REFCNT_DEBUG
-static atomic_t llc_sock_nr;
-#endif
-
 /**
  *	llc_backlog_rcv - Processes rx frames and expired timers.
  *	@sk: LLC sock (p8022 connection)
@@ -875,11 +870,7 @@ struct sock *llc_sk_alloc(struct net *net, int family, gfp_t priority, struct pr
 		goto out;
 	llc_sk_init(sk);
 	sock_init_data(NULL, sk);
-#ifdef LLC_REFCNT_DEBUG
-	atomic_inc(&llc_sock_nr);
-	printk(KERN_DEBUG "LLC socket %p created in %s, now we have %d alive\n", sk,
-		__FUNCTION__, atomic_read(&llc_sock_nr));
-#endif
+	sk_refcnt_debug_inc_undo(sk);
 out:
 	return sk;
 }
@@ -905,18 +896,7 @@ void llc_sk_free(struct sock *sk)
 	skb_queue_purge(&sk->sk_receive_queue);
 	skb_queue_purge(&sk->sk_write_queue);
 	skb_queue_purge(&llc->pdu_unack_q);
-#ifdef LLC_REFCNT_DEBUG
-	if (atomic_read(&sk->sk_refcnt) != 1) {
-		printk(KERN_DEBUG "Destruction of LLC sock %p delayed in %s, cnt=%d\n",
-			sk, __FUNCTION__, atomic_read(&sk->sk_refcnt));
-		printk(KERN_DEBUG "%d LLC sockets are still alive\n",
-			atomic_read(&llc_sock_nr));
-	} else {
-		atomic_dec(&llc_sock_nr);
-		printk(KERN_DEBUG "LLC socket %p released in %s, %d are still alive\n", sk,
-			__FUNCTION__, atomic_read(&llc_sock_nr));
-	}
-#endif
+	sk_refcnt_debug_release(sk);
 	sock_put(sk);
 }
 Topic: [PATCH 0/2] fix for OOPS in pernet list operations if CONFIG_NET_NS undefined
[PATCH 0/2] fix for OOPS in pernet list operations if CONFIG_NET_NS undefined [message #22906] Wed, 07 November 2007 06:56
den is currently offline den
Messages: 493
Registered: December 2005
Senior Member
From: openvz.org
These patches address the oops reported by Cedric Le Goater
a week ago. The pernet_operations were discarded during kernel boot and
this breaks further operations.

Though, the patch from Pavel Emelyanov was partially reverted
by the Eric W. Biederman [commit 2b008b0a8e96b726c603c5e1a5a7a509b5f61e35]

So, I revert the Eric patch (actually, Eric one can be simply dropped) and
fix original code. There is no need for such complex code if CONFIG_NET_NS
is not defined.
 Topic: Re: PID namespaces break initrd+hibernate combination?
Re: PID namespaces break initrd+hibernate combination? [message #22833] Mon, 05 November 2007 22:25
Sukadev Bhattiprolu is currently offline Sukadev Bhattiprolu
Messages: 413
Registered: August 2006
Senior Member
From: openvz.org
Nigel Cunningham wrote:
> Hi all.
>
> Please excuse me if this has already been answered. I'm not currently subscribed to LKML.
>
> I've just been preparing a new tux-on-ice release against Linus' current tree, and encountered a failure to freeze pid 1 when seeking to resume, using an initrd:
>
> [   74.192734] Freezing of tasks failed after 19.99 seconds (1 tasks refusing to freeze):
> [   74.193502]   task                        PC stack   pid father
> [   74.193504] swapper       S ffff810002023030  4968     1      0
> [   74.193512]  ffff81000203fdb0 0000000000000046 ffff810002023040 ffff810003249140
> [   74.194296]  ffff81000203fd80 ffffffff803150a1 ffff81000203fdb0 ffff810002023180
> [   74.195087]  ffff810002023030 0000000000000004 0000000000000001 0000000000000001
> [   74.195860] Call Trace:
> [   74.196123]  [<ffffffff803150a1>] security_task_wait+0x11/0x20
> [   74.196692]  [<ffffffff802320cd>] do_wait+0x51d/0xda0
> [   74.197187]  [<ffffffff802292f0>] default_wake_function+0x0/0x10
> [   74.197772]  [<ffffffff8023297c>] sys_wait4+0x2c/0x30
> [   74.198264]  [<ffffffff805f4bb5>] initrd_load+0x175/0x370
> [   74.198794]  [<ffffffff805f211f>] prepare_namespace+0x8f/0x1d0
> [   74.199362]  [<ffffffff805f174d>] kernel_init+0x1ad/0x2b0
> [   74.199889]  [<ffffffff8047e526>] _spin_unlock_irq+0x26/0x60
> [   74.200439]  [<ffffffff8022afc7>] finish_task_switch+0x67/0xc0
> [   74.201008]  [<ffffffff8020c548>] child_rip+0xa/0x12
> [   74.201494]  [<ffffffff80364770>] acpi_os_acquire_lock+0x9/0xb
> [   74.202063]  [<ffffffff805f15a0>] kernel_init+0x0/0x2b0
> [   74.202570]  [<ffffffff8020c53e>] child_rip+0x0/0x12
>
> I believe it might be related to pid namespaces, but am not completely sure yet (will do a git bisect if needs be).
>   
Hmm. prepare_namespace() seems more about mounting root filesystem and 
less about pid namespace.
but I could be wrong. Do you have any more console messages ? CCing the 
containers list
> So, then, I'm writing to ask: Is this a known issue? Is there any fix already available that I've not found in my googling?
>
> Regards,
>
> Nigel
>   


_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: [PATCH 2/2][NETFILTER] Use the list_for_each_entry in nf_sockopt.c
[PATCH 2/2][NETFILTER] Use the list_for_each_entry in nf_sockopt.c [message #22700] Thu, 01 November 2007 11:58
Pavel Emelianov is currently offline Pavel Emelianov
Messages: 1149
Registered: September 2006
Senior Member
From: openvz.org
The list_head pointer used to iterate over the list serves no purpose
other than to obtain the struct nf_sockopt_ops pointer (and actually
not in a 100% clean way).

So use the list_for_each_entry, removing one unneeded variable
from each place of use.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>

---

diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c
index a5e5e30..87bc144 100644
--- a/net/netfilter/nf_sockopt.c
+++ b/net/netfilter/nf_sockopt.c
@@ -23,14 +23,13 @@ static inline int overlap(int min1, int max1, int min2, int max2)
 /* Functions to register sockopt ranges (exclusive). */
 int nf_register_sockopt(struct nf_sockopt_ops *reg)
 {
-	struct list_head *i;
+	struct nf_sockopt_ops *ops;
 	int ret = 0;
 
 	if (mutex_lock_interruptible(&nf_sockopt_mutex) != 0)
 		return -EINTR;
 
-	list_for_each(i, &nf_sockopts) {
-		struct nf_sockopt_ops *ops = (struct nf_sockopt_ops *)i;
+	list_for_each_entry(ops, &nf_sockopts, list) {
 		if (ops->pf == reg->pf
 		    && (overlap(ops->set_optmin, ops->set_optmax,
 				reg->set_optmin, reg->set_optmax)
@@ -64,7 +63,6 @@ EXPORT_SYMBOL(nf_unregister_sockopt);
 static struct nf_sockopt_ops *nf_sockopt_find(struct sock *sk, int pf,
 		int val, int get)
 {
-	struct list_head *i;
 	struct nf_sockopt_ops *ops;
 
 	if (sk->sk_net != &init_net)
@@ -73,8 +71,7 @@ static struct nf_sockopt_ops *nf_sockopt_find(struct sock *sk, int pf,
 	if (mutex_lock_interruptible(&nf_sockopt_mutex) != 0)
 		return ERR_PTR(-EINTR);
 
-	list_for_each(i, &nf_sockopts) {
-		ops = (struct nf_sockopt_ops *)i;
+	list_for_each_entry(ops, &nf_sockopts, list) {
 		if (ops->pf == pf) {
 			if (!try_module_get(ops->owner))
 				goto out_nosup;
-- 
1.5.3.4
 Topic: Re: [dm-devel] Re: dm: bounce_pfn limit added
Re: [dm-devel] Re: dm: bounce_pfn limit added [message #22641] Wed, 31 October 2007 20:00
Alasdair G Kergon is currently offline Alasdair G Kergon
Messages: 6
Registered: September 2007
Junior Member
From: openvz.org
On Wed, Oct 31, 2007 at 05:00:16PM -0500, Kiyoshi Ueda wrote:
> How about the case that other dm device is stacked on the dm device?
> (e.g. dm-linear over dm-multipath over i2o with bounce_pfn=64GB, and
>       the multipath table is changed to i2o with bounce_pfn=1GB.)
 
Let's not broaden the problem out in that direction yet - that's a
known flaw in the way all these device restrictions are handled.
(Which would, it happens, also be resolved by the dm architectural
changes I'm contemplating.)

Yes, we could certainly take this patch - it won't do much harm (just
hit performance in some configurations).  But I am not yet convinced
that there isn't some further underlying problem with the way the
responsibility for this bouncing is divided up between the various
layers: I still don't feel I completely understand this problem yet.

- How does that bio_alloc() in blk_queue_bounce() guarantee never to
lead to a deadlock (in the device-mapper context)?
- Are some functions failing to take account of the hw_segments
(and perhaps other) restrictions?
- Are things actually simpler if the bouncing is dealt with just once 
prior to entering the device stack (even though that may involve
bouncing some data that does not need it) or is it better to endeavour
to keep the bouncing as close to the final layer as possible?

Alasdair
-- 
agk@redhat.com
 Topic: [RFD] net list protected by rcu
[RFD] net list protected by rcu [message #22616] Wed, 31 October 2007 10:42
Daniel Lezcano is currently offline Daniel Lezcano
Messages: 417
Registered: June 2006
Senior Member
From: openvz.org
Hi,

Benjamin and I are currently looking at using IPv6 with the network 
namespaces.

There is a special case where we must browse the network namespace list 
to check the routes ages at a given time for garbage collecting.

fib6_run_gc
  => fib6_clean_all

In this function we browse the network namespace list with the usual 
macro: for_each_net, which should be protected by rtnl_lock.

The function fib6_run_gc is a timer callback, which means it is called 
from an interrupt handler. In this case we cannot use rtnl_lock, 
because it takes a mutex, and that is forbidden in an interrupt 
handler.

If we set aside the fact that there is perhaps a better solution than 
browsing the netns list (e.g. a gc timer per namespace), can we 
consider simply using RCU to protect the network namespace list?

So we can remove the rtnl_lock calls in the network namespaces and just 
use rcu_read_lock for browsing the netns list in the network code. That 
will be more flexible: we can use it in an interrupt handler, we can nest 
it with another rcu_read_lock, and we don't add more locking contention 
for the network.


_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: [PATCH] memory cgroup enhancements take 4 [1/8] fix zone handling in try_to_free_mem_cgroup_page
[PATCH] memory cgroup enhancements take 4 [1/8] fix zone handling in try_to_free_mem_cgroup_page [message #22587] Wed, 31 October 2007 06:24
KAMEZAWA Hiroyuki is currently offline KAMEZAWA Hiroyuki
Messages: 463
Registered: September 2006
Senior Member
From: openvz.org
Because NODE_DATA(node)->node_zonelists[] is guaranteed to contain
all necessary zones, it is not necessary to use for_each_online_node.

And this for_each_online_node() makes reclaim routine start always
from node 0. This is not good. This patch makes reclaim start from
caller's node and just use usual (default) zonelist order.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>

 mm/vmscan.c |   10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

Index: devel-2.6.23-mm1/mm/vmscan.c
===================================================================
--- devel-2.6.23-mm1.orig/mm/vmscan.c
+++ devel-2.6.23-mm1/mm/vmscan.c
@@ -1375,15 +1375,13 @@ unsigned long try_to_free_mem_cgroup_pag
 		.mem_cgroup = mem_cont,
 		.isolate_pages = mem_cgroup_isolate_pages,
 	};
-	int node;
+	int node = numa_node_id();
 	struct zone **zones;
 	int target_zone = gfp_zone(GFP_HIGHUSER_MOVABLE);
 
-	for_each_online_node(node) {
-		zones = NODE_DATA(node)->node_zonelists[target_zone].zones;
-		if (do_try_to_free_pages(zones, sc.gfp_mask, &sc))
-			return 1;
-	}
+	zones = NODE_DATA(node)->node_zonelists[target_zone].zones;
+	if (do_try_to_free_pages(zones, sc.gfp_mask, &sc))
+		return 1;
 	return 0;
 }
 #endif

_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: [IPV6] cleanup : remove proc_net_remove called twice
[IPV6] cleanup : remove proc_net_remove called twice [message #22537] Tue, 30 October 2007 08:55
Daniel Lezcano is currently offline Daniel Lezcano
Messages: 417
Registered: June 2006
Senior Member
From: openvz.org
The file /proc/net/if_inet6 is removed twice.
First time in:
         inet6_exit
              ->addrconf_cleanup
And followed a few lines after by:
         inet6_exit
              -> if6_proc_exit

Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com>
---
  net/ipv6/addrconf.c |    4 ----
  1 file changed, 4 deletions(-)

Index: net-2.6/net/ipv6/addrconf.c
===================================================================
--- net-2.6.orig/net/ipv6/addrconf.c
+++ net-2.6/net/ipv6/addrconf.c
@@ -4288,8 +4288,4 @@ void __exit addrconf_cleanup(void)
         del_timer(&addr_chk_timer);

         rtnl_unlock();
-
-#ifdef CONFIG_PROC_FS
-       proc_net_remove(&init_net, "if_inet6");
-#endif
  }
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: [PATCH][NETNS] fix net released by rcu callback
[PATCH][NETNS] fix net released by rcu callback [message #22517] Tue, 30 October 2007 06:33
Daniel Lezcano is currently offline Daniel Lezcano
Messages: 417
Registered: June 2006
Senior Member
From: openvz.org
When a network namespace reference is held by a network subsystem,
and this reference is decremented in an RCU update callback, we
must ensure that there are no outstanding RCU updates left before
trying to free the network namespace.

In the normal case, the rcu_barrier is called when the network namespace
is exiting in the cleanup_net function.

But when a network namespace creation fails, and the subsystems are
undone (like the cleanup), the rcu_barrier is missing.

This patch adds the missing rcu_barrier.

Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com>
---
  net/core/net_namespace.c |    2 ++
  1 file changed, 2 insertions(+)

Index: net-2.6/net/core/net_namespace.c
===================================================================
--- net-2.6.orig/net/core/net_namespace.c
+++ net-2.6/net/core/net_namespace.c
@@ -112,6 +112,8 @@ out_undo:
                 if (ops->exit)
                         ops->exit(net);
         }
+
+       rcu_barrier();
         goto out;
  }
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: Fw: [PATCH 2/2] Warn when container-init defaults fatal signals
Fw: [PATCH 2/2] Warn when container-init defaults fatal signals [message #22494] Mon, 29 October 2007 19:05
Sukadev Bhattiprolu is currently offline Sukadev Bhattiprolu
Messages: 413
Registered: August 2006
Senior Member
From: openvz.org
Resending to Eric's correct address...

Suka

----- Forwarded message from sukadev@us.ibm.com -----

| Date: Sat, 27 Oct 2007 12:09:28 -0700
| From: sukadev@us.ibm.com
| To: eric@us.ibm.com, Pavel Emelianov <xemul@openvz.org>,        Oleg Nesterov
|  <oleg@tv-sign.ru>
| Cc: Containers <containers@lists.osdl.org>, clg@fr.ibm.com
| Subject: [PATCH 2/2] Warn when container-init defaults fatal signals
| 
| 
| From: Sukadev Bhattiprolu <sukadev@us.ibm.com>
| Subject: [PATCH 2/2] Warn when container-init defaults fatal signals
| 
| Print a warning the first time a container-init (other than global init)
| forks a child process without explicitly ignoring or handling a fatal signal.
| Comments in the patch below explain the gory background :-)
| 
| Signed-off-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>
| 
| ---
|  kernel/fork.c |   51 +++++++++++++++++++++++++++++++++++++++++++++++++++
|  1 file changed, 51 insertions(+)
| 
| Index: 2.6.23-mm1/kernel/fork.c
| ===================================================================
| --- 2.6.23-mm1.orig/kernel/fork.c	2007-10-27 11:46:38.000000000 -0700
| +++ 2.6.23-mm1/kernel/fork.c	2007-10-27 11:48:36.000000000 -0700
| @@ -966,6 +966,53 @@ static void rt_mutex_init_task(struct ta
|  }
| 
|  /*
| + * Container-init process must appear like a normal process to its sibling
| + * in the parent namespace and should be killable (or not) in the usual way.
| + *
| + * But it must be immune to any unwanted signals from within its own namespace.
| + *
| + * At the time of sending the signal, sig_init_ignore() checks and ignores
| + * if receiver is container-init and the signal is unwanted.
| + *
| + * A limitation with the check in sig_init_ignore() is that if the signal is
| + * blocked by the container-init at the time of the check, we cannot ignore
| + * the signal because the container-init may install a handler for the signal
| + * before unblocking it.
| + *
| + * But if the container-init unblocks the signal without installing the handler,
| + * the unwanted signal will still be delivered to the container-init. If the
| + * unwanted signal is fatal (i.e default action is to terminate), we end up
| + * terminating the container-init and hence the container.
| + *
| + * There does not seem to be an easy/clean way to address this blocked-signal
| + * issue in the kernel.  For now, it appears easier to let the container-init
| + * decide what it wants to do with signals i.e have it _explicitly_ ignore or
| + * handle all fatal signals.
| + *
| + * Following routine prints a warning if the container-init does not
| + * explicitly ignore or handle fatal signals.
| + *
| + * Return 1 if the warning is printed.  Return 0 otherwise.
| + */
| +static int check_fatal_signals(struct task_struct *task)
| +{
| +	int i;
| +
| +	if (!is_container_init(task))
| +		return 0;
| +
| +	for (i = 1; i < _NSIG; i++) {
| +		if (!sig_fatal(task, i))
| +			continue;
| +
| +		printk(KERN_WARNING "Container init %d does not handle/ignore "
| +				"all fatal signals\n", task_pid_nr(task));
| +		return 1;
| +	}
| +	return 0;
| +}
| +
| +/*
|   * This creates a new process as a copy of the old one,
|   * but does not actually start it yet.
|   *
| @@ -983,6 +1030,10 @@ static struct task_struct *copy_process(
|  	int retval;
|  	struct task_struct *p;
|  	int cgroup_callbacks_done = 0;
| +	static int fatal_signal_warned;
| +
| +	if (!is_global_init(current) && !fatal_signal_warned)
| +		fatal_signal_warned = check_fatal_signals(current);
| 
|  	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
|  		return ERR_PTR(-EINVAL);
| _______________________________________________
| Containers mailing list
| Containers@lists.linux-foundation.org
| https://lists.linux-foundation.org/mailman/listinfo/containers

----- End forwarded message -----
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: [PATCH] Report usage in CFS cgroup
[PATCH] Report usage in CFS cgroup [message #22472] Mon, 29 October 2007 13:19
menage is currently offline menage
Messages: 5
Registered: August 2007
Junior Member
From: openvz.org
Report CPU usage in CFS Cgroup directories

Adds a cpu.usage file to the CFS cgroup that reports CPU usage in
milliseconds for that cgroup's tasks

Signed-off-by: Paul Menage <menage@google.com>

---
 kernel/sched.c |   36 +++++++++++++++++++++++++++++++-----
 1 file changed, 31 insertions(+), 5 deletions(-)

Index: container-2.6.23-mm1/kernel/sched.c
===================================================================
--- container-2.6.23-mm1.orig/kernel/sched.c
+++ container-2.6.23-mm1/kernel/sched.c
@@ -7005,15 +7005,41 @@ static u64 cpu_shares_read_uint(struct c
 	return (u64) tg->shares;
 }
 
-static struct cftype cpu_shares = {
-	.name = "shares",
-	.read_uint = cpu_shares_read_uint,
-	.write_uint = cpu_shares_write_uint,
+static u64 cpu_usage_read(struct cgroup *cgrp, struct cftype *cft)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+	int i;
+	u64 res = 0;
+	for_each_possible_cpu(i) {
+		unsigned long flags;
+		/*
+		 * Lock to prevent races with updating 64-bit counters
+		 * on 32-bit arches.
+		 */
+		spin_lock_irqsave(&cpu_rq(i)->lock, flags);
+		res += tg->se[i]->sum_exec_runtime;
+		spin_unlock_irqrestore(&cpu_rq(i)->lock, flags);
+	}
+	/* Convert from ns to ms */
+	do_div(res, 1000000);
+	return res;
+}
+
+static struct cftype cpu_files[] = {
+	{
+		.name = "shares",
+		.read_uint = cpu_shares_read_uint,
+		.write_uint = cpu_shares_write_uint,
+	},
+	{
+		.name = "usage",
+		.read_uint = cpu_usage_read,
+	},
 };
 
 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
 {
-	return cgroup_add_file(cont, ss, &cpu_shares);
+	return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
 }
 
 struct cgroup_subsys cpu_cgroup_subsys = {
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: [PATCH] watchdog: spin_lock_init() fixes
[PATCH] watchdog: spin_lock_init() fixes [message #22460] Mon, 29 October 2007 10:40
Alexey Dobriyan is currently offline Alexey Dobriyan
Messages: 195
Registered: August 2006
Senior Member
From: openvz.org
Some watchdog drivers initialize global spinlocks in module's init function
which is tolerable, but some do it in PCI probe function. So, switch to
static initialization to fix theoretical bugs and, more importantly, stop
giving people bad examples.

Signed-off-by: Alexey Dobriyan <adobriyan@sw.ru>
---

 drivers/watchdog/alim1535_wdt.c   |    4 +---
 drivers/watchdog/davinci_wdt.c    |    4 +---
 drivers/watchdog/i6300esb.c       |    4 +---
 drivers/watchdog/ib700wdt.c       |    4 +---
 drivers/watchdog/machzwd.c        |    7 ++-----
 drivers/watchdog/mpc83xx_wdt.c    |    5 +----
 drivers/watchdog/pc87413_wdt.c    |    4 +---
 drivers/watchdog/pnx4008_wdt.c    |    4 +---
 drivers/watchdog/sbc8360.c        |    3 +--
 drivers/watchdog/sc1200wdt.c      |    3 +--
 drivers/watchdog/sc520_wdt.c      |    4 +---
 drivers/watchdog/smsc37b787_wdt.c |    4 +---
 drivers/watchdog/w83627hf_wdt.c   |    4 +---
 drivers/watchdog/w83697hf_wdt.c   |    4 +---
 drivers/watchdog/w83877f_wdt.c    |    4 +---
 drivers/watchdog/w83977f_wdt.c    |    4 +---
 drivers/watchdog/wafer5823wdt.c   |    4 +---
 drivers/watchdog/wdt977.c         |    4 +---
 drivers/watchdog/wdt_pci.c        |    3 +--
 19 files changed, 20 insertions(+), 57 deletions(-)

--- a/drivers/watchdog/alim1535_wdt.c
+++ b/drivers/watchdog/alim1535_wdt.c
@@ -31,7 +31,7 @@ static unsigned long ali_is_open;
 static char ali_expect_release;
 static struct pci_dev *ali_pci;
 static u32 ali_timeout_bits;	/* stores the computed timeout */
-static spinlock_t ali_lock;	/* Guards the hardware */
+static DEFINE_SPINLOCK(ali_lock);	/* Guards the hardware */
 
 /* module parameters */
 static int timeout = WATCHDOG_TIMEOUT;
@@ -398,8 +398,6 @@ static int __init watchdog_init(void)
 {
 	int ret;
 
-	spin_lock_init(&ali_lock);
-
 	/* Check whether or not the hardware watchdog is there */
 	if (ali_find_watchdog() != 0) {
 		return -ENODEV;
--- a/drivers/watchdog/davinci_wdt.c
+++ b/drivers/watchdog/davinci_wdt.c
@@ -61,7 +61,7 @@
 
 static int heartbeat = DEFAULT_HEARTBEAT;
 
-static spinlock_t io_lock;
+static DEFINE_SPINLOCK(io_lock);
 static unsigned long wdt_status;
 #define WDT_IN_USE        0
 #define WDT_OK_TO_CLOSE   1
@@ -200,8 +200,6 @@ static int davinci_wdt_probe(struct platform_device *pdev)
 	int ret = 0, size;
 	struct resource *res;
 
-	spin_lock_init(&io_lock);
-
 	if (heartbeat < 1 || heartbeat > MAX_HEARTBEAT)
 		heartbeat = DEFAULT_HEARTBEAT;
 
--- a/drivers/watchdog/i6300esb.c
+++ b/drivers/watchdog/i6300esb.c
@@ -77,7 +77,7 @@
 
 /* internal variables */
 static void __iomem *BASEADDR;
-static spinlock_t esb_lock; /* Guards the hardware */
+static DEFINE_SPINLOCK(esb_lock); /* Guards the hardware */
 static unsigned long timer_alive;
 static struct pci_dev *esb_pci;
 static unsigned short triggered; /* The status of the watchdog upon boot */
@@ -456,8 +456,6 @@ static int __init watchdog_init (void)
 {
         int ret;
 
-        spin_lock_init(&esb_lock);
-
         /* Check whether or not the hardware watchdog is there */
         if (!esb_getdevice () || esb_pci == NULL)
                 return -ENODEV;
--- a/drivers/watchdog/ib700wdt.c
+++ b/drivers/watchdog/ib700wdt.c
@@ -48,7 +48,7 @@
 
 static struct platform_device *ibwdt_platform_device;
 static unsigned long ibwdt_is_open;
-static spinlock_t ibwdt_lock;
+static DEFINE_SPINLOCK(ibwdt_lock);
 static char expect_close;
 
 /* Module information */
@@ -308,8 +308,6 @@ static int __devinit ibwdt_probe(struct platform_device *dev)
 {
 	int res;
 
-	spin_lock_init(&ibwdt_lock);
-
 #if WDT_START != WDT_STOP
 	if (!request_region(WDT_STOP, 1, "IB700 WDT")) {
 		printk (KERN_ERR PFX "STOP method I/O %X is not available.\n", WDT_STOP);
--- a/drivers/watchdog/machzwd.c
+++ b/drivers/watchdog/machzwd.c
@@ -123,8 +123,8 @@ static void zf_ping(unsigned long data);
 static int zf_action = GEN_RESET;
 static unsigned long zf_is_open;
 static char zf_expect_close;
-static spinlock_t zf_lock;
-static spinlock_t zf_port_lock;
+static DEFINE_SPINLOCK(zf_lock);
+static DEFINE_SPINLOCK(zf_port_lock);
 static DEFINE_TIMER(zf_timer, zf_ping, 0, 0);
 static unsigned long next_heartbeat = 0;
 
@@ -438,9 +438,6 @@ static int __init zf_init(void)
 
 	zf_show_action(action);
 
-	spin_lock_init(&zf_lock);
-	spin_lock_init(&zf_port_lock);
-
 	if(!request_region(ZF_IOBASE, 3, "MachZ ZFL WDT")){
 		printk(KERN_ERR "cannot reserve I/O ports at %d\n",
 							ZF_IOBASE);
--- a/drivers/watchdog/mpc83xx_wdt.c
+++ b/drivers/watchdog/mpc83xx_wdt.c
@@ -56,7 +56,7 @@ static int prescale = 1;
 static unsigned int timeout_sec;
 
 static unsigned long wdt_is_open;
-static spinlock_t wdt_spinlock;
+static DEFINE_SPINLOCK(wdt_spinlock);
 
 static void mpc83xx_wdt_keepalive(void)
 {
@@ -185,9 +185,6 @@ static int __devinit mpc83xx_wdt_probe(struct platform_device *dev)
 	printk(KERN_INFO "WDT driver for MPC83xx initialized. "
 		"mode:%s timeout=%d (%d seconds)\n",
 		reset ? "reset":"interrupt", timeout, timeout_sec);
-
-	spin_lock_init(&wdt_spinlock);
-
 	return 0;
 
 err_unmap:
--- a/drivers/watchdog/pc87413_wdt.c
+++ b/drivers/watchdog/pc87413_wdt.c
@@ -61,7 +61,7 @@ static unsigned long timer_enabled = 0;  /* is the timer enabled? */
 
 static char expect_close;                /* is the close expected? */
 
-static spinlock_t io_lock;               /* to guard the watchdog from io races */
+static DEFINE_SPINLOCK(io_lock);/* to guard the watchdog from io races */
 
 static int nowayout = WATCHDOG_NOWAYOUT;
 
@@ -561,8 +561,6 @@ static int __init pc87413_init(void)
 {
 	int ret;
 
-	spin_lock_init(&io_lock);
-
 	printk(KERN_INFO PFX "Version " VERSION " at io 0x%X\n", WDT_INDEX_IO_PORT);
 
 	/* request_region(io, 2, "pc87413"); */
--- a/drivers/watchdog/pnx4008_wdt.c
+++ b/drivers/watchdog/pnx4008_wdt.c
@@ -80,7 +80,7 @@
 static int nowayout = WATCHDOG_NOWAYOUT;
 static int heartbeat = DEFAULT_HEARTBEAT;
 
-static spinlock_t io_lock;
+static DEFINE_SPINLOCK(io_lock);
 static unsigned long wdt_status;
 #define WDT_IN_USE        0
 #define WDT_OK_TO_CLOSE   1
@@ -254,8 +254,6 @@ static int pnx4008_wdt_probe(struct platform_device *pdev)
 	int ret = 0, size;
 	struct resource *res;
 
-	spin_lock_init(&io_lock);
-
 	if (heartbeat < 1 || heartbeat > MAX_HEARTBEAT)
 		heartbeat = DEFAULT_HEARTBEAT;
 
--- a/drivers/watchdog/sbc8360.c
+++ b/drivers/watchdog/sbc8360.c
@@ -54,7 +54,7 @@
 #include <asm/system.h>
 
 static unsigned long sbc8360_is_open;
-static spinlock_t sbc8360_lock;
+static DEFINE_SPINLOCK(sbc8360_lock);
 static char expect_close;
 
 #define PFX "sbc8360: "
@@ -359,7 +359,6 @@ static int __init sbc8360_init(void)
 		goto out_noreboot;
 	}
 
-	spin_lock_init(&sbc8360_lock);
 	res = misc_register(&sbc8360_miscdev);
 	if (res) {
 		printk(KERN_ERR PFX "failed to register misc device\n");
--- a/drivers/watchdog/sc1200wdt.c
+++ b/drivers/watchdog/sc1200wdt.c
@@ -74,7 +74,7 @@ static int io = -1;
 static int io_len = 2;		/* for non plug and play */
 static struct semaphore open_sem;
 static char expect_close;
-static spinlock_t sc1200wdt_lock;	/* io port access serialisation */
+static DEFINE_SPINLOCK(sc1200wdt_lock);	/* io port access serialisation */
 
 #if defined CONFIG_PNP
 static int isapnp = 1;
@@ -375,7 +375,6 @@ static int __init sc1200wdt_init(void)
 
 	printk("%s\n", banner);
 
-	spin_lock_init(&sc1200wdt_lock);
 	sema_init(&open_sem, 1);
 
 #if defined CONFIG_PNP
--- a/drivers/watchdog/sc520_wdt.c
+++ b/drivers/watchdog/sc520_wdt.c
@@ -125,7 +125,7 @@ static DEFINE_TIMER(timer, wdt_timer_ping, 0, 0);
 static unsigned long next_heartbeat;
 static unsigned long wdt_is_open;
 static char wdt_expect_close;
-static spinlock_t wdt_spinlock;
+static DEFINE_SPINLOCK(wdt_spinlock);
 
 /*
  *	Whack the dog
@@ -383,8 +383,6 @@ static int __init sc520_wdt_init(void)
 {
 	int rc = -EBUSY;
 
-	spin_lock_init(&wdt_spinlock);
-
 	/* Check that the timeout value is within it's range ; if not reset to the default */
 	if (wdt_set_heartbeat(timeout)) {
 		wdt_set_heartbeat(WATCHDOG_TIMEOUT);
--- a/drivers/watchdog/smsc37b787_wdt.c
+++ b/drivers/watchdog/smsc37b787_wdt.c
@@ -83,7 +83,7 @@ static unsigned long timer_enabled = 0;   /* is the timer enabled? */
 
 static char expect_close;       /* is the close expected? */
 
-static spinlock_t io_lock;	/* to guard the watchdog from io races */
+static DEFINE_SPINLOCK(io_lock);/* to guard the watchdog from io races */
 
 static int nowayout = WATCHDOG_NOWAYOUT;
 
@@ -540,8 +540,6 @@ static int __init wb_smsc_wdt_init(void)
 {
 	int ret;
 
-	spin_lock_init(&io_lock);
-
 	printk("SMsC 37B787 watchdog component driver " VERSION " initialising...\n");
 
 	if (!request_region(IOPORT, IOPORT_SIZE, "SMsC 37B787 watchdog")) {
--- a/drivers/watchdog/w83627hf_wdt.c
+++ b/drivers/watchdog/w83627hf_wdt.c
@@ -48,7 +48,7 @@
 
 static unsigned long wdt_is_open;
 static char expect_close;
-static spinlock_t io_lock;
+static DEFINE_SPINLOCK(io_lock);
 
 /* You must set this - there is no sane way to probe for this board. */
 static int wdt_io = 0x2E;
@@ -328,8 +328,6 @@ wdt_init(void)
 {
 	int ret;
 
-	spin_lock_init(&io_lock);
-
 	printk(KERN_INFO "WDT driver for the Winbond(TM) W83627HF/THF/HG Super I/O chip initialising.\n");
 
 	if (wdt_set_heartbeat(timeout)) {
--- a/drivers/watchdog/w83697hf_wdt.c
+++ b/drivers/watchdog/w83697hf_wdt.c
@@ -47,7 +47,7 @@
 
 static unsigned long wdt_is_open;
 static char expect_close;
-static spinlock_t io_lock;
+static DEFINE_SPINLOCK(io_lock);
 
 /* You must set this - there is no sane way to probe for this board. */
 static int wdt_io = 0x2e;
@@ -376,8 +376,6 @@ wdt_init(void)
 {
 	int ret, i, found = 0;
 
-	spin_lock_init(&io_lock);
-
 	printk (KERN_INFO PFX "WDT driver for W83697HF/HG initializi
...

 Topic: dm: struct io_restriction reordered
dm: struct io_restriction reordered [message #22442] Mon, 29 October 2007 02:31
vaverin is currently offline vaverin
Messages: 626
Registered: September 2005
Senior Member
From: openvz.org
This saves a few bytes of memory.

Signed-off-by:	Vasily Averin <vvs@sw.ru>

--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -110,12 +110,12 @@ struct target_type {
 };

 struct io_restrictions {
+	unsigned long		seg_boundary_mask;
 	unsigned int		max_sectors;
+	unsigned int		max_segment_size;
 	unsigned short		max_phys_segments;
 	unsigned short		max_hw_segments;
 	unsigned short		hardsect_size;
-	unsigned int		max_segment_size;
-	unsigned long		seg_boundary_mask;
 	unsigned char		no_cluster; /* inverted so that 0 is default */
 };
 Topic: i2o: debug messages corrected
i2o: debug messages corrected [message #22440] Mon, 29 October 2007 02:31
vaverin is currently offline vaverin
Messages: 626
Registered: September 2005
Senior Member
From: openvz.org
max_phys_segments and max_sectors were swapped

Signed-off-by:	Vasily Averin <vvs@sw.ru>

--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -1076,8 +1076,8 @@ static int i2o_block_probe(struct device *dev)
 	blk_queue_max_sectors(queue, max_sectors);
 	blk_queue_max_hw_segments(queue, i2o_sg_tablesize(c, body_size));

-	osm_debug("max sectors = %d\n", queue->max_phys_segments);
-	osm_debug("phys segments = %d\n", queue->max_sectors);
+	osm_debug("max sectors = %d\n", queue->max_sectors);
+	osm_debug("phys segments = %d\n", queue->max_phys_segments);
 	osm_debug("max hw segments = %d\n", queue->max_hw_segments);

 	/*
 Topic: [PATCH 2/2] Warn when container-init defaults fatal signals
[PATCH 2/2] Warn when container-init defaults fatal signals [message #22421] Sat, 27 October 2007 15:09
Sukadev Bhattiprolu is currently offline Sukadev Bhattiprolu
Messages: 413
Registered: August 2006
Senior Member
From: openvz.org
From: Sukadev Bhattiprolu <sukadev@us.ibm.com>
Subject: [PATCH 2/2] Warn when container-init defaults fatal signals

Print a warning the first time a container-init (other than global init)
forks a child process without explicitly ignoring or handling a fatal signal.
Comments in the patch below explain the gory background :-)

Signed-off-by: Sukadev Bhattiprolu <sukadev@us.ibm.com>

---
 kernel/fork.c |   51 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

Index: 2.6.23-mm1/kernel/fork.c
===================================================================
--- 2.6.23-mm1.orig/kernel/fork.c	2007-10-27 11:46:38.000000000 -0700
+++ 2.6.23-mm1/kernel/fork.c	2007-10-27 11:48:36.000000000 -0700
@@ -966,6 +966,53 @@ static void rt_mutex_init_task(struct ta
 }
 
 /*
+ * Container-init process must appear like a normal process to its sibling
+ * in the parent namespace and should be killable (or not) in the usual way.
+ *
+ * But it must be immune to any unwanted signals from within its own namespace.
+ *
+ * At the time of sending the signal, sig_init_ignore() checks and ignores
+ * if receiver is container-init and the signal is unwanted.
+ *
+ * A limitation with the check in sig_init_ignore() is that if the signal is
+ * blocked by the container-init at the time of the check, we cannot ignore
+ * the signal because the container-init may install a handler for the signal
+ * before unblocking it.
+ *
+ * But if the container-init unblocks the signal without installing the handler,
+ * the unwanted signal will still be delivered to the container-init. If the
+ * unwanted signal is fatal (i.e default action is to terminate), we end up
+ * terminating the container-init and hence the container.
+ *
+ * There does not seem to be an easy/clean way to address this blocked-signal
+ * issue in the kernel.  For now, it appears easier to let the container-init
+ * decide what it wants to do with signals i.e have it _explicitly_ ignore or
+ * handle all fatal signals.
+ *
+ * Following routine prints a warning if the container-init does not
+ * explicitly ignore or handle fatal signals.
+ *
+ * Return 1 if the warning is printed.  Return 0 otherwise.
+ */
+static int check_fatal_signals(struct task_struct *task)
+{
+	int i;
+
+	if (!is_container_init(task))
+		return 0;
+
+	for (i = 1; i < _NSIG; i++) {
+		if (!sig_fatal(task, i))
+			continue;
+
+		printk(KERN_WARNING "Container init %d does not handle/ignore "
+				"all fatal signals\n", task_pid_nr(task));
+		return 1;
+	}
+	return 0;
+}
+
+/*
  * This creates a new process as a copy of the old one,
  * but does not actually start it yet.
  *
@@ -983,6 +1030,10 @@ static struct task_struct *copy_process(
 	int retval;
 	struct task_struct *p;
 	int cgroup_callbacks_done = 0;
+	static int fatal_signal_warned;
+
+	if (!is_global_init(current) && !fatal_signal_warned)
+		fatal_signal_warned = check_fatal_signals(current);
 
 	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
 		return ERR_PTR(-EINVAL);
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: Help required regarding tool for OpenVZ
Help required regarding tool for OpenVZ [message #22290] Wed, 24 October 2007 10:50
KoolK is currently offline KoolK
Messages: 8
Registered: September 2007
Junior Member
From: openvz.org
Hi,

I'm new to OpenVZ. I'm interested in making a tool which can give
performance statistics for OpenVZ, which can help users to get details about
CPU usage, Memory usage etc for each VE.

  For this task, one file that can act as a source is /proc/vz/vzstat. Can
somebody guide me on what performance information a user or
performance analyzer might want to know? It would be great if somebody could also
point out other user information tools that need to be developed
for OpenVZ.

Thanks in advance...

With regards,
Khyati
 Topic: [PATCH] small memory leak with FIB rules
[PATCH] small memory leak with FIB rules [message #22278] Wed, 24 October 2007 08:52
den is currently offline den
Messages: 493
Registered: December 2005
Senior Member
From: openvz.org
This patch fixes a small memory leak. Default fib rules can be deleted by
the user if the rule does not carry the FIB_RULE_PERMANENT flag, e.g. by
	ip rule flush

Such a rule will never be freed, as its ref-counter starts at 2, and the rule
becomes clearly unreachable after removal.

Signed-off-by: Denis V. Lunev <den@openvz.org>

---------

 include/net/fib_rules.h |    3 ++
 net/core/fib_rules.c    |   22 ++++++++++++++++++++
 net/decnet/dn_rules.c   |   13 +-----------
 net/ipv4/fib_rules.c    |   51 ++++++++++++++++++------------------------------
 net/ipv6/fib6_rules.c   |   37 ++++++++++++++--------------------
 5 files changed, 62 insertions(+), 64 deletions(-)

---------

diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -107,4 +107,7 @@ extern int			fib_rules_unregister(struct fib_rules_ops *);
 extern int			fib_rules_lookup(struct fib_rules_ops *,
 						 struct flowi *, int flags,
 						 struct fib_lookup_arg *);
+extern int			fib_default_rule_add(struct fib_rules_ops *,
+			   			     u32 pref, u32 table,
+						     u32 flags);
 #endif
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 13de6f5..848132b 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -18,6 +18,28 @@
 static LIST_HEAD(rules_ops);
 static DEFINE_SPINLOCK(rules_mod_lock);
 
+int fib_default_rule_add(struct fib_rules_ops *ops,
+			 u32 pref, u32 table, u32 flags)
+{
+	struct fib_rule *r;
+
+	r = kzalloc(ops->rule_size, GFP_KERNEL);
+	if (r == NULL)
+		return -ENOMEM;
+
+	atomic_set(&r->refcnt, 1);
+	r->action = FR_ACT_TO_TBL;
+	r->pref = pref;
+	r->table = table;
+	r->flags = flags;
+
+	/* The lock is not required here, the list is unreachable
+	 * at the moment this function is called */
+	list_add_tail(&r->list, &ops->rules_list);
+	return 0;
+}
+EXPORT_SYMBOL(fib_default_rule_add);
+
 static void notify_rule_change(int event, struct fib_rule *rule,
 			       struct fib_rules_ops *ops, struct nlmsghdr *nlh,
 			       u32 pid);
diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
index ddd3f04..ffebea0 100644
--- a/net/decnet/dn_rules.c
+++ b/net/decnet/dn_rules.c
@@ -48,15 +48,6 @@ struct dn_fib_rule
 	u8			flags;
 };
 
-static struct dn_fib_rule default_rule = {
-	.common = {
-		.refcnt =		ATOMIC_INIT(2),
-		.pref =			0x7fff,
-		.table =		RT_TABLE_MAIN,
-		.action =		FR_ACT_TO_TBL,
-	},
-};
-
 
 int dn_fib_lookup(struct flowi *flp, struct dn_fib_res *res)
 {
@@ -262,8 +253,8 @@ static struct fib_rules_ops dn_fib_rules_ops = {
 
 void __init dn_fib_rules_init(void)
 {
-	list_add_tail(&default_rule.common.list,
-			&dn_fib_rules_ops.rules_list);
+	BUG_ON(fib_default_rule_add(&dn_fib_rules_ops, 0x7fff,
+			            RT_TABLE_MAIN, 0));
 	fib_rules_register(&dn_fib_rules_ops);
 }
 
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index f16839c..a0ada3a 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -49,33 +49,6 @@ struct fib4_rule
 #endif
 };
 
-static struct fib4_rule default_rule = {
-	.common = {
-		.refcnt =	ATOMIC_INIT(2),
-		.pref =		0x7FFF,
-		.table =	RT_TABLE_DEFAULT,
-		.action =	FR_ACT_TO_TBL,
-	},
-};
-
-static struct fib4_rule main_rule = {
-	.common = {
-		.refcnt =	ATOMIC_INIT(2),
-		.pref =		0x7FFE,
-		.table =	RT_TABLE_MAIN,
-		.action =	FR_ACT_TO_TBL,
-	},
-};
-
-static struct fib4_rule local_rule = {
-	.common = {
-		.refcnt =	ATOMIC_INIT(2),
-		.table =	RT_TABLE_LOCAL,
-		.action =	FR_ACT_TO_TBL,
-		.flags =	FIB_RULE_PERMANENT,
-	},
-};
-
 #ifdef CONFIG_NET_CLS_ROUTE
 u32 fib_rules_tclass(struct fib_result *res)
 {
@@ -319,11 +292,27 @@ static struct fib_rules_ops fib4_rules_ops = {
 	.owner		= THIS_MODULE,
 };
 
-void __init fib4_rules_init(void)
+static int __init fib_default_rules_init(void)
 {
-	list_add_tail(&local_rule.common.list, &fib4_rules_ops.rules_list);
-	list_add_tail(&main_rule.common.list, &fib4_rules_ops.rules_list);
-	list_add_tail(&default_rule.common.list, &fib4_rules_ops.rules_list);
+	int err;
+
+	err = fib_default_rule_add(&fib4_rules_ops, 0,
+				   RT_TABLE_LOCAL, FIB_RULE_PERMANENT);
+	if (err < 0)
+		return err;
+	err = fib_default_rule_add(&fib4_rules_ops, 0x7FFE,
+				   RT_TABLE_MAIN, 0);
+	if (err < 0)
+		return err;
+	err = fib_default_rule_add(&fib4_rules_ops, 0x7FFF,
+				   RT_TABLE_DEFAULT, 0);
+	if (err < 0)
+		return err;
+	return 0;
+}
 
+void __init fib4_rules_init(void)
+{
+	BUG_ON(fib_default_rules_init());
 	fib_rules_register(&fib4_rules_ops);
 }
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index 706622a..428c6b0 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -31,25 +31,6 @@ struct fib6_rule
 
 static struct fib_rules_ops fib6_rules_ops;
 
-static struct fib6_rule main_rule = {
-	.common = {
-		.refcnt =	ATOMIC_INIT(2),
-		.pref =		0x7FFE,
-		.action =	FR_ACT_TO_TBL,
-		.table =	RT6_TABLE_MAIN,
-	},
-};
-
-static struct fib6_rule local_rule = {
-	.common = {
-		.refcnt =	ATOMIC_INIT(2),
-		.pref =		0,
-		.action =	FR_ACT_TO_TBL,
-		.table =	RT6_TABLE_LOCAL,
-		.flags =	FIB_RULE_PERMANENT,
-	},
-};
-
 struct dst_entry *fib6_rule_lookup(struct flowi *fl, int flags,
 				   pol_lookup_t lookup)
 {
@@ -270,11 +251,23 @@ static struct fib_rules_ops fib6_rules_ops = {
 	.owner			= THIS_MODULE,
 };
 
-void __init fib6_rules_init(void)
+static int __init fib6_default_rules_init(void)
 {
-	list_add_tail(&local_rule.common.list, &fib6_rules_ops.rules_list);
-	list_add_tail(&main_rule.common.list, &fib6_rules_ops.rules_list);
+	int err;
+
+	err = fib_default_rule_add(&fib6_rules_ops, 0,
+				   RT6_TABLE_LOCAL, FIB_RULE_PERMANENT);
+	if (err < 0)
+		return err;
+	err = fib_default_rule_add(&fib6_rules_ops, 0x7FFE, RT6_TABLE_MAIN, 0);
+	if (err < 0)
+		return err;
+	return 0;
+}
 
+void __init fib6_rules_init(void)
+{
+	BUG_ON(fib6_default_rules_init());
 	fib_rules_register(&fib6_rules_ops);
 }
 Topic: Re: [RFC] what the hell is going on with /proc/self?
Re: [RFC] what the hell is going on with /proc/self? [message #22235] Tue, 23 October 2007 22:57
ebiederm is currently offline ebiederm
Messages: 1354
Registered: February 2006
Senior Member
From: openvz.org
Al Viro <viro@ftp.linux.org.uk> writes:

> On Tue, Oct 23, 2007 at 03:20:39PM -0500, Matt Mackall wrote:
>> On Tue, Oct 23, 2007 at 03:03:36AM +0100, Al Viro wrote:
>> > 	What is the proc_base_stuff[] nonsense about?  AFAICS, that
>> > went in with no reason whatsoever in
>> > commit 801199ce805a2412bbcd9bfe213092ec656013dd
>> > Author: Eric W. Biederman <ebiederm@xmission.com>
>> > Date:   Mon Oct 2 02:18:48 2006 -0700
>> > 
>> > 	Rationale is very weak and patch adds considerable complexity
>> > for no good reason.  Besides, it's obfuscated just for the hell of it:
>> > 	if (!IS_ERR(result) || PTR_ERR(result) != -ENOENT)
>> > instead of
>> > 	if (result != ERR_PTR(-ENOENT))
>> > etc.
>> > 
>> > 	Unless there are _real_ plans that would justify that animal,
>> > I'm going to get rid of it in the pending patch series (/proc/self
>> > cleanups, saner dentry retention for non-process parts, etc.).
>> 
>> Seems obvious to cc: Eric.
>
> Doh...  Sorry, thought I'd done that.  Eric, my apologies.

No problem.

It has been a while so let me see if I can dredge up what goes
on there, partly I ran out of steam when working on that.

One useful aspect of that change to use common infrastructure was in
removing the hard coded inode numbers from /proc.  By going through
proc_fill_cache  I was able to ensure the inode numbers matched
up for /proc/self no matter what they were.

Another aspect of the change that I didn't feel comfortable using
to justify it then, but that I do think is important now, is that if
the pid namespace goes away (aka a secondary init and all of its
children exit) /proc/self disappears.

I believe that my original patch was smaller and had a bunch more
code reuse until I discovered that I would goof up the
security modules if I called security_task_to_inode on /proc/self.

So it is my desire (and I think it a reasonable one) that if all
of the tasks in a pid namespace that correspond to a mount of proc
exit then all of the files associated with the pid namespace itself
should disappear.

Getting all of the files that are process related into the
infrastructure of fs/proc/base.c is one way to make the process
related files disappear.  Especially as it seemed well connected with
the concept of splitting proc up into its constituent
filesystems: /proc/<pid>, /proc/sys and /proc/{generic}.

The more I look at that, the amount of /proc that doesn't become
namespace related and thus desirable to be shown per process is getting
quite small, so a different technique than using the infrastructure
in fs/proc/base.c may be desirable.

When eventually we get to the device namespace (bleh) we get
things like /proc/scsi/ /proc/tty/ /proc/ide/ and /proc/devices
that should become per namespace as well.  Which probably means
at least half of the existing /proc/{generic}
stuff looks to become per namespace.

So in practice the things that I see needing to happen with /proc
right now.
- Figure out how to move /proc/net into /proc/<pid>/net leaving
  behind a /proc/net symlink to /proc/self/net.
- Fix proc_kill_inodes to cope with multiple super blocks.
- /proc/self I still think needs to get the pid and not the tgid
  as sometimes I think we get incorrect behavior when coming from
  a thread.
- /proc/sysvipc should really become a /proc/self/sysvipc symlink.
- /proc/sys needs to become /proc/self/sys/ and the work pushed
  down in that direction.

For the stuff that isn't per namespace enhancing the caching does
have the challenge that we are likely to hit the current bug in
proc_kill_inodes because the inodes will stay around longer.
Although ideally if this could be its own filesystem we would
only have a single instance of those dentries.

Similarly it would be nice if per namespace things like
/proc/<pid>/mounts, /proc/<pid>/net, /proc/<pid>/sys,
/proc/<pid>/sysvipc could share the same dentry trees,
across different /proc/<pid> instances.  Especially as that
would allow using stat to detect if two processes were sharing
the same namespace.  The only thing that suggests itself is making
kernel mounts that are per namespace, for these things.  Which is one
of the reasons I'm interested in splitting /proc up into separate
filesystems.

So in summary.  I really don't care how the internals of /proc look,
or which path we take.  As long as we do improve /proc and ultimately
sort through the per namespace details.

Eric
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: [PATCH 0/2] CFS CGroup: cleanup & usage reporting
[PATCH 0/2] CFS CGroup: cleanup & usage reporting [message #22175] Mon, 22 October 2007 20:49
Paul Menage is currently offline Paul Menage
Messages: 642
Registered: September 2006
Senior Member
From: openvz.org
These two patches consist of a small cleanup to CFS, and adding a control file reporting CPU usage in milliseconds in each CGroup directory. They're just bundled together since the second patch depends slightly on the cleanups in the first patch.

_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: [PATCH 1/3] [NETNS49] Add struct net to flush_cache in fib_rules_ops
[PATCH 1/3] [NETNS49] Add struct net to flush_cache in fib_rules_ops [message #21970] Thu, 18 October 2007 06:02
den is currently offline den
Messages: 493
Registered: December 2005
Senior Member
From: openvz.org
commit d5daed837d7b0d2492a4226eea248cc7d92bcf81
Author: Denis V. Lunev <den@openvz.org>
Date:   Thu Oct 18 12:41:20 2007 +0400

    Add struct net to flush_cache in fib_rules_ops
    
    Signed-off-by: Denis V. Lunev <den@openvz.org>

diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index ba8caa9..163c920 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -61,7 +61,8 @@ struct fib_rules_ops
 
 	/* Called after modifications to the rules set, must flush
 	 * the route cache if one exists. */
-	void			(*flush_cache)(struct fib_rules_ops *ops);
+	void			(*flush_cache)(struct net *net,
+						struct fib_rules_ops *ops);
 
 	int			nlgroup;
 	const struct nla_policy	*policy;
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 407d4bf..220bee6 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -43,10 +43,10 @@ static void rules_ops_put(struct fib_rules_ops *ops)
 		module_put(ops->owner);
 }
 
-static void flush_route_cache(struct fib_rules_ops *ops)
+static void flush_route_cache(struct net *net, struct fib_rules_ops *ops)
 {
 	if (ops->flush_cache)
-		ops->flush_cache(ops);
+		ops->flush_cache(net, ops);
 }
 
 int fib_rules_register(struct net *net, struct fib_rules_ops *ops)
@@ -323,7 +323,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 		list_add_rcu(&rule->list, ops->rules_list);
 
 	notify_rule_change(net, RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).pid);
-	flush_route_cache(ops);
+	flush_route_cache(net, ops);
 	rules_ops_put(ops);
 	return 0;
 
@@ -415,7 +415,7 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 		notify_rule_change(net, RTM_DELRULE, rule, ops, nlh,
 				   NETLINK_CB(skb).pid);
 		fib_rule_put(rule);
-		flush_route_cache(ops);
+		flush_route_cache(net, ops);
 		rules_ops_put(ops);
 		return 0;
 	}
diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
index 9f7a206..2850b66 100644
--- a/net/decnet/dn_rules.c
+++ b/net/decnet/dn_rules.c
@@ -243,7 +243,7 @@ static u32 dn_fib_rule_default_pref(struct fib_rules_ops *ops)
 	return 0;
 }
 
-static void dn_fib_rule_flush_cache(struct fib_rules_ops *ops)
+static void dn_fib_rule_flush_cache(struct net *net, struct fib_rules_ops *ops)
 {
 	dn_rt_cache_flush(-1);
 }
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 6bf5b33..c93b278 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -279,7 +279,7 @@ static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule)
 	       + nla_total_size(4); /* flow */
 }
 
-static void fib4_rule_flush_cache(struct fib_rules_ops *ops)
+static void fib4_rule_flush_cache(struct net *net, struct fib_rules_ops *ops)
 {
 	rt_cache_flush(-1);
 }
 Topic: [PATCH 3/3] [NETNS49] support for per/namespace routing cache cleanup v2
[PATCH 3/3] [NETNS49] support for per/namespace routing cache cleanup v2 [message #21969] Thu, 18 October 2007 06:02
den is currently offline den
Messages: 493
Registered: December 2005
Senior Member
From: openvz.org
commit 4fb98b7f7e461b7ce5b0b3c1420ffa5f68e125f5
Author: Denis V. Lunev <den@openvz.org>
Date:   Thu Oct 18 13:39:44 2007 +0400

    /proc/sys/net/route/flush should be accessible inside the net namespace.
    Though, the complete opening of this file will result in a DoS or
    significant entire host slowdown if a namespace process will continually
    flush routes.
    
    This patch introduces per/namespace route flush facility.
    
    Each namespace wanting to flush the cache copies the global generation
    count to itself and starts the timer. The cache is dropped for a specific
    namespace iff the namespace counter is greater than or equal to the global one.
    
    So, in general, unwanted namespaces do nothing. They hold very old low
    counter and they are unaffected by the requested cleanup.
    
    Changes from V1:
    - added struct net * parameter to rt_cache_flush (thanks Daniel)
    - rt_secret_rebuild drop all the cache
    
    Signed-off-by: Denis V. Lunev <den@openvz.org>

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 85abf14..b492ce8 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -143,6 +143,8 @@ struct net {
 
 	/* iptable_filter.c */
 	struct xt_table		*ip_packet_filter;
+
+	unsigned long		rt_flush_required;
 };
 
 extern struct net init_net;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index ed1842b..3d900eb 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -643,23 +643,97 @@ static void rt_check_expire(unsigned long dummy)
 	mod_timer(&rt_periodic_timer, jiffies + ip_rt_gc_interval);
 }
 
+
+static DEFINE_SPINLOCK(rt_flush_lock);
+
+#ifdef CONFIG_NET_NS
+static unsigned long rt_flush_gen;
+
+/* called under rt_flush_lock */
+static void rt_flush_required_set(struct net *net)
+{
+	/*
+	 * If the global generation rt_flush_gen is equal to G, then
+	 * the pass considering entries labelled by G is yet to come.
+	 */
+	net->rt_flush_required = rt_flush_gen;
+}
+
+static unsigned long rt_flush_required_reset(void)
+{
+	unsigned long g;
+
+	spin_lock_bh(&rt_flush_lock);
+	g = rt_flush_gen++;
+	spin_unlock_bh(&rt_flush_lock);
+	return g;
+}
+
+static int rt_flush_required_check(struct net *net, unsigned long gen)
+{
+	/* can be checked without the lock */
+	return net->rt_flush_required >= gen;
+}
+
+#else
+
+static void rt_flush_required_reset(struct net *net)
+{
+}
+
+static unsigned long rt_flush_required_reset(void)
+{
+	return 0;
+}
+#endif
+
+
 /* This can run from both BH and non-BH contexts, the latter
  * in the case of a forced flush event.
  */
-static void rt_run_flush(unsigned long dummy)
+static void rt_run_flush(unsigned long cleanup_all)
 {
 	int i;
 	struct rtable *rth, *next;
+	unsigned long gen;
 
 	rt_deadline = 0;
 
 	get_random_bytes(&rt_hash_rnd, 4);
+	gen = rt_flush_required_reset();
 
 	for (i = rt_hash_mask; i >= 0; i--) {
+		struct rtable **prev, *p, *tail;
+
 		spin_lock_bh(rt_hash_lock_addr(i));
 		rth = rt_hash_table[i].chain;
-		if (rth)
+		if (rth == NULL)
+			goto done;
+
+		if (cleanup_all) {
 			rt_hash_table[i].chain = NULL;
+			goto done;
+		}
+
+		/* defer releasing the head of the list after spin_unlock */
+		for (tail = rth; tail; tail = tail->u.dst.rt_next)
+			if (!rt_flush_required_check(tail->fl.fl_net, gen))
+				break;
+		if (rth != tail)
+			rt_hash_table[i].chain = tail;
+
+		/* call rt_free on entries after the tail requiring flush */
+		prev = &rt_hash_table[i].chain;
+		for (p = *prev; p; p = next) {
+			next = p->u.dst.rt_next;
+			if (!rt_flush_required_check(p->fl.fl_net, gen)) {
+				prev = &p->u.dst.rt_next;
+			} else {
+				*prev = next;
+				rt_free(p);
+			}
+		}
+done:
 		spin_unlock_bh(rt_hash_lock_addr(i));
 
 		for (; rth; rth = next) {
@@ -669,8 +743,6 @@ static void rt_run_flush(unsigned long dummy)
 	}
 }
 
-static DEFINE_SPINLOCK(rt_flush_lock);
-
 static int __rt_cache_flush(int delay)
 {
 	unsigned long now = jiffies;
@@ -698,6 +770,8 @@ static int __rt_cache_flush(int delay)
 			delay = tmo;
 	}
 
+	rt_flush_required_set(current->nsproxy->net_ns);
+
 	if (delay <= 0) {
 		spin_unlock_bh(&rt_flush_lock);
 		return 1;
@@ -722,7 +796,7 @@ static void rt_secret_rebuild(unsigned long dummy)
 	unsigned long now = jiffies;
 
 	__rt_cache_flush(0);
-	rt_run_flush(0);
+	rt_run_flush(1);
 	mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
 }
 Topic: [PATCH 3/3] [NETNS49] support for per/namespace routing cache cleanup v2
[PATCH 3/3] [NETNS49] support for per/namespace routing cache cleanup v2 [message #21968] Thu, 18 October 2007 06:02
den is currently offline den
Messages: 493
Registered: December 2005
Senior Member
From: openvz.org
 Topic: [PATCH] [NETNS49] Add struct net to flush_cache in fib_rules_ops
[PATCH] [NETNS49] Add struct net to flush_cache in fib_rules_ops [message #21966] Thu, 18 October 2007 06:00
den is currently offline den
Messages: 493
Registered: December 2005
Senior Member
From: openvz.org
commit d5daed837d7b0d2492a4226eea248cc7d92bcf81
Author: Denis V. Lunev <den@openvz.org>
Date:   Thu Oct 18 12:41:20 2007 +0400

    Add struct net to flush_cache in fib_rules_ops
    
    Signed-off-by: Denis V. Lunev <den@openvz.org>

diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index ba8caa9..163c920 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -61,7 +61,8 @@ struct fib_rules_ops
 
 	/* Called after modifications to the rules set, must flush
 	 * the route cache if one exists. */
-	void			(*flush_cache)(struct fib_rules_ops *ops);
+	void			(*flush_cache)(struct net *net,
+						struct fib_rules_ops *ops);
 
 	int			nlgroup;
 	const struct nla_policy	*policy;
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 407d4bf..220bee6 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -43,10 +43,10 @@ static void rules_ops_put(struct fib_rules_ops *ops)
 		module_put(ops->owner);
 }
 
-static void flush_route_cache(struct fib_rules_ops *ops)
+static void flush_route_cache(struct net *net, struct fib_rules_ops *ops)
 {
 	if (ops->flush_cache)
-		ops->flush_cache(ops);
+		ops->flush_cache(net, ops);
 }
 
 int fib_rules_register(struct net *net, struct fib_rules_ops *ops)
@@ -323,7 +323,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 		list_add_rcu(&rule->list, ops->rules_list);
 
 	notify_rule_change(net, RTM_NEWRULE, rule, ops, nlh, NETLINK_CB(skb).pid);
-	flush_route_cache(ops);
+	flush_route_cache(net, ops);
 	rules_ops_put(ops);
 	return 0;
 
@@ -415,7 +415,7 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 		notify_rule_change(net, RTM_DELRULE, rule, ops, nlh,
 				   NETLINK_CB(skb).pid);
 		fib_rule_put(rule);
-		flush_route_cache(ops);
+		flush_route_cache(net, ops);
 		rules_ops_put(ops);
 		return 0;
 	}
diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
index 9f7a206..2850b66 100644
--- a/net/decnet/dn_rules.c
+++ b/net/decnet/dn_rules.c
@@ -243,7 +243,7 @@ static u32 dn_fib_rule_default_pref(struct fib_rules_ops *ops)
 	return 0;
 }
 
-static void dn_fib_rule_flush_cache(struct fib_rules_ops *ops)
+static void dn_fib_rule_flush_cache(struct net *net, struct fib_rules_ops *ops)
 {
 	dn_rt_cache_flush(-1);
 }
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 6bf5b33..c93b278 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -279,7 +279,7 @@ static size_t fib4_rule_nlmsg_payload(struct fib_rule *rule)
 	       + nla_total_size(4); /* flow */
 }
 
-static void fib4_rule_flush_cache(struct fib_rules_ops *ops)
+static void fib4_rule_flush_cache(struct net *net, struct fib_rules_ops *ops)
 {
 	rt_cache_flush(-1);
 }
 Topic: [PATCH 1/1] namespaces: introduce sys_hijack (v6)
[PATCH 1/1] namespaces: introduce sys_hijack (v6) [message #21917] Wed, 17 October 2007 17:54
serue is currently offline serue
Messages: 750
Registered: February 2006
Senior Member
From: openvz.org
Ok hopefully friday I'll have some time to figure out the proper
way to do hijack on a cgroup (and maybe do a s390 or powerpc
version of the syscall), but in the meantime here is the
current incarnation of the hijack patch.

-serge

>From 4b1f38e451519d58b710dc746137348f392b160f Mon Sep 17 00:00:00 2001
From: sergeh@us.ibm.com <hallyn@kernel.(none)>
Date: Tue, 16 Oct 2007 09:36:49 -0700
Subject: [PATCH 1/1] namespaces: introduce sys_hijack (v6)

Move most of do_fork() into a new do_fork_task() which acts on
a new argument, task, rather than on current.  do_fork() becomes
a call to do_fork_task(current, ...).

Introduce sys_hijack (for x86 only so far).  It is like clone, but
 in place of a stack pointer (which is assumed null) it accepts a
pid.  The process identified by that pid is the one which is
actually cloned.  Some state - including the file table, the signals
and sighand (and hence tty), and the ->parent - is taken from the
calling process.

The effect is a sort of namespace enter.  The following program
uses sys_hijack to 'enter' all namespaces of the specified pid.
For instance in one terminal, do

	mount -t cgroup -ons /cgroup
	hostname
	  qemu
	ns_exec -u /bin/sh
	  hostname serge
          echo $$
            1073
	  cat /proc/$$/cgroup
	    ns:/node_1073

In another terminal then do

	hostname
	  qemu
	cat /proc/$$/cgroup
	  ns:/
	hijack 1073
	  hostname
	    serge
	  cat /proc/$$/cgroup
	    ns:/node_1073

Changelog:
	Aug 23: send a stop signal to the hijacked process
		(like ptrace does).
	Oct 09: Update for 2.6.23-rc8-mm2 (mainly pidns)
		Don't take task_lock under rcu_read_lock
		Send hijacked process to cgroup_fork() as
		the first argument.
		Removed some unneeded task_locks.
	Oct 16: Fix bug introduced into alloc_pid.
	Oct 16: Add 'int which' argument to sys_hijack to
		allow later expansion to use cgroup in place
		of pid to specify what to hijack.

==============================================================
hijack.c
==============================================================

/*
 * Body of the hijacked clone: replace the process image with /bin/sh.
 *
 * execl() only returns when it fails, so reaching the return statement
 * below means the shell could not be started.  Without an explicit return
 * the function would fall off the end of a non-void function and main()
 * would hand an indeterminate value back as the exit status.
 *
 * Returns -1 if the exec failed; does not return on success.
 */
int do_clone_task(void)
{
	/* The variadic terminator must be a null *pointer*, hence the cast. */
	execl("/bin/sh", "/bin/sh", (char *)NULL);
	return -1;
}

/*
 * Usage: hijack <pid>
 *
 * Invokes the sys_hijack syscall on the given pid.  In the new (cloned)
 * process a shell is exec'ed; the calling process waits for every child
 * to exit and reports the last wait status it collected.
 *
 * Exit status: 0 once the cloned process has been waited for, 1 on a
 * usage error or if the syscall fails; in the clone, whatever
 * do_clone_task() returns if the exec fails.
 */
int main(int argc, char *argv[])
{
	int pid;
	int ret;
	/* Initialized so the final printf is defined even if waitpid()
	 * fails before reaping anything. */
	int status = 0;

	if (argc < 2)
		return 1;
	pid = atoi(argv[1]);

	/*
	 * 327 is __NR_hijack on i386 in this patch.  Arguments, in register
	 * order: clone flags (SIGCHLD), 'which' selector (1 == HIJACK_PID),
	 * and the pid to hijack.
	 */
	ret = syscall(327, SIGCHLD, 1, (unsigned long) pid);

	if (ret == 0) {
		/* We are the cloned process. */
		return do_clone_task();
	} else if (ret < 0) {
		perror("sys_hijack");
		return 1;
	}

	printf("waiting on cloned process %d\n", ret);
	while (waitpid(-1, &status, __WALL) != -1)
		;
	printf("cloned process %d exited with %d\n", ret, status);

	/* Report success rather than leaking the child's pid (truncated to
	 * 8 bits) as our exit status. */
	return 0;
}
==============================================================

Signed-off-by: Serge Hallyn <serue@us.ibm.com>
---
 arch/i386/kernel/process.c       |   75 +++++++++++++++++++++++++++++++++++++-
 arch/i386/kernel/syscall_table.S |    1 +
 arch/s390/kernel/process.c       |   12 +++++-
 include/asm-i386/unistd.h        |    3 +-
 include/linux/cgroup.h           |    5 ++-
 include/linux/ptrace.h           |    1 +
 include/linux/sched.h            |    8 ++++
 include/linux/syscalls.h         |    1 +
 kernel/cgroup.c                  |    8 ++--
 kernel/fork.c                    |   67 +++++++++++++++++++++++++---------
 kernel/ptrace.c                  |    7 ++++
 11 files changed, 159 insertions(+), 29 deletions(-)

diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index bfcd01e..cc18a23 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -455,8 +455,15 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
 	unsigned long unused,
 	struct task_struct * p, struct pt_regs * regs)
 {
+	return copy_a_thread(current, nr, clone_flags, esp, unused,
+		p, regs);
+}
+
+int copy_a_thread(struct task_struct *tsk, int nr, unsigned long clone_flags,
+	unsigned long esp, unsigned long unused,
+	struct task_struct * p, struct pt_regs * regs)
+{
 	struct pt_regs * childregs;
-	struct task_struct *tsk;
 	int err;
 
 	childregs = task_pt_regs(p);
@@ -471,7 +478,6 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
 
 	savesegment(gs,p->thread.gs);
 
-	tsk = current;
 	if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
 		p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
 						IO_BITMAP_BYTES, GFP_KERNEL);
@@ -783,6 +789,71 @@ asmlinkage int sys_clone(struct pt_regs regs)
 	return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
 }
 
+static int hijack_pid(struct pt_regs regs)
+{
+	unsigned long clone_flags = regs.ebx;
+	pid_t pid = regs.edx;
+	struct task_struct *task;
+	int ret = -EINVAL;
+
+	rcu_read_lock();
+	task = find_task_by_vpid(pid);
+	if (task)
+		get_task_struct(task);
+	rcu_read_unlock();
+	
+	if (task) {
+		task_lock(task);
+		put_task_struct(task);
+	}
+
+	if (task) {
+		if (!ptrace_may_attach_locked(task)) {
+			ret = -EPERM;
+			goto out_put_task;
+		}
+		if (task->ptrace) {
+			ret = -EBUSY;
+			goto out_put_task;
+		}
+		force_sig_specific(SIGSTOP, task);
+
+		task_unlock(task);
+		ret = do_fork_task(task, clone_flags, regs.esp, &regs, 0,
+			NULL, NULL);
+		wake_up_process(task);
+		task = NULL;
+	}
+
+out_put_task:
+	if (task)
+		task_unlock(task);
+	return ret;
+}
+
+static int hijack_cgroup(struct pt_regs regs)
+{
+	unsigned long clone_flags = regs.ebx;
+	int fd = regs.edx;
+
+	return -ENOSYS;
+}
+
+asmlinkage int sys_hijack(struct pt_regs regs)
+{
+	int which = regs.ecx;
+
+	switch (which) {
+	case HIJACK_PID:
+		return hijack_pid(regs);
+	case HIJACK_CGROUP:
+		return hijack_cgroup(regs);
+	default:
+		return -EINVAL;
+	}
+
+}
+
 /*
  * This is trivial, and on the face of it looks like it
  * could equally well be done in user mode.
diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index df6e41e..495930c 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -326,3 +326,4 @@ ENTRY(sys_call_table)
 	.long sys_fallocate
 	.long sys_revokeat		/* 325 */
 	.long sys_frevoke
+	.long sys_hijack
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 70c5737..f256e7a 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -223,6 +223,14 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long new_stackp,
 	unsigned long unused,
         struct task_struct * p, struct pt_regs * regs)
 {
+	return copy_a_thread(current, nr, clone_flags, new_stackp, unused,
+				 p, regs);
+}
+
+int copy_a_thread(struct task_struct *task, int nr, unsigned long clone_flags,
+	unsigned long new_stackp, unsigned long unused,
+        struct task_struct * p, struct pt_regs * regs)
+{
         struct fake_frame
           {
 	    struct stack_frame sf;
@@ -251,8 +259,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long new_stackp,
 	 * save fprs to current->thread.fp_regs to merge them with
 	 * the emulated registers and then copy the result to the child.
 	 */
-	save_fp_regs(&current->thread.fp_regs);
-	memcpy(&p->thread.fp_regs, &current->thread.fp_regs,
+	save_fp_regs(&task->thread.fp_regs);
+	memcpy(&p->thread.fp_regs, &task->thread.fp_regs,
 	       sizeof(s390_fp_regs));
         p->thread.user_seg = __pa((unsigned long) p->mm->pgd) | _SEGMENT_TABLE;
 	/* Set a new TLS ?  */
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index 006c1b3..fe6eeb4 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -332,10 +332,11 @@
 #define __NR_fallocate		324
 #define __NR_revokeat		325
 #define __NR_frevoke		326
+#define __NR_hijack		327
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 327
+#define NR_syscalls 328
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 8747932..cb6d335 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -26,7 +26,7 @@ extern int cgroup_init(void);
 extern void cgroup_init_smp(void);
 extern void cgroup_lock(void);
 extern void cgroup_unlock(void);
-extern void cgroup_fork(struct task_struct *p);
+extern void cgroup_fork(struct task_struct *parent, struct task_struct *p);
 extern void cgroup_fork_callbacks(struct task_struct *p);
 extern void cgroup_post_fork(struct task_struct *p);
 extern void cgroup_exit(struct task_struct *p, int run_callbacks);
@@ -309,7 +309,8 @@ void cgroup_iter_end(struct cgroup *cont, struct cgroup_iter *it);
 static inline int cgroup_init_early(void) { return 0; }
 static inline int cgroup_init(void) { return 0; }
 static inline void cgroup_init_smp(void) {}
-static inline void cgroup_fork(struct task_struct *p) {}
+static inline void cgroup_fork(struct task_struct *parent,
+					 struct task_struct *p) {}
 static inline void cgroup_fork_callbacks(struct task_struct *p) {}
 static inline void cgroup_post_fork(struct task_struct *p) {}
 static inline void cgroup_exit(struct task_struct *p, int callbacks) {}
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index ae8146a..727a4a9 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -97,6 +97,7 @@ extern void __ptrace_link(struct task_struct *child,
 extern void __ptrace_unlink(struct task_struct *child);
 extern void ptrace_untrace(struct task_struct *child);
 extern int ptrace_may_attach(struct task_struct *task);
+extern int ptrace_may_attach_locked(struct task_struct *task);
 
 static inline void ptrace_link(struct task_struct *child,
 			       struct task_struct *new_parent)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4f21af1..eb20ccd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -28,6 +28,12 @@
 #define CLONE_NEWPID		0x20000000	/* New pid namespace */
 
 /*
+ * Hijack flags
+ */
+#define HIJACK_PID	1	/* 'id' is a pid */
+#define HIJACK_CGROUP	2	/* 'id' is an open fd for a cgroup dir */
+
+/*
  * Scheduling policies
  */
 #
...

 Topic: Support for autofs in RHEL5-based VPS
Support for autofs in RHEL5-based VPS [message #21924] Wed, 17 October 2007 17:36
dranch is currently offline dranch
Messages: 33
Registered: August 2007
Member
From: openvz.org
Hello OpenVZ developers,

Does anyone have any ideas of why autofs doesn't work in the current 
OpenVZ patches on a RHEL 2.6.18 kernel?  I noticed that there were some 
autofs patches posted to this back in February 2007 but I don't know if 
they specifically solve this issue.  I have an on-going forum thread 
here which details the issues, there seems to be a workaround using a 
vanilla 2.6.18 kernel and AMD, etc.

   http://forum.openvz.org/index.php?t=msg&goto=21652&#msg_21652


Thoughts?

--David
 Topic: Guidance required...
Guidance required... [message #21763] Mon, 15 October 2007 23:26
KoolK is currently offline KoolK
Messages: 8
Registered: September 2007
Junior Member
From: openvz.org
Hi,



I have joined the devel list recently. I want to contribute to OpenVZ. I have
read a few papers on OpenVZ and one of the papers "Performance Evaluation of
Virtualization Technologies for Server Consolidation" ( HP Labs) -  says
that ,  "For OpenVZ, there is no existing tool to directly measure the CPU
consumption by a particular container. We use the data provided from
/proc/vz/vestat to measure the amount of CPU time spent by a particular
VE." Also, on the wiki roadmap, one suggested topic is "fix broken loadavg/cpu
usage statistics". According to my understanding, both refer to the same problem. But
when I used top in VPSs, it worked and gave CPU usage statistics for
particular VPS. So I doubt the problem is already solved?



Can anybody tell me whether this problem is solved? If not, can somebody
elaborate on this problem?

If that problem is already solved, please suggest a small starting
task for me.


Thanks in advance,

Khyati Sanghvi

M.Tech Student,

IIITB - India.
 Topic: [PATCH RFC] cgroups: implement device whitelist cgroup+lsm
[PATCH RFC] cgroups: implement device whitelist cgroup+lsm [message #21597] Thu, 11 October 2007 16:04
serue is currently offline serue
Messages: 750
Registered: February 2006
Senior Member
From: openvz.org
Here is an LSM-based alternative to Pavel's device control
cgroup, purely for discussion, not for any sort of code
review (please :).

thanks,
-serge

>From 4266131c40b629e3b04c0d9d01569a95fa967e3e Mon Sep 17 00:00:00 2001
From: Serge E. Hallyn <serue@us.ibm.com>
Date: Thu, 11 Oct 2007 15:27:48 -0400
Subject: [PATCH RFC] cgroups: implement device whitelist cgroup+lsm

Implement a cgroup using the LSM interface to enforce open and mknod
on device files.  Not a line of this code is expected to be used in a
final version, this is just a proof of concept.

No stacking is implemented, so to test this you must have

	CGROUPS=y
	SECURITY=y

but all other LSMs =n (no capabilities, no selinux, no rootplug).

This implements a simple device access whitelist.  A whitelist entry
has 4 fields.  'type' is a (all), c (char), or b (block).  'all' means it
applies to all types, all major numbers, and all minor numbers.  Major and
minor are obvious.  Access is a composition of r (read), w (write), and
m (mknod).

The root devcgroup starts with rwm to 'all'.  A child devcg gets a copy
of the parent.  Admins can then add and remove devices to the whitelist.
Once CAP_HOST_ADMIN is introduced it will be needed to add entries as
well or remove entries from another cgroup, though just CAP_SYS_ADMIN
will suffice to remove entries for your own group.

An entry is added by doing "echo <type> <maj> <min> <access>" > devcg.allow,
for instance:

	echo b 7 0 mrw > /cgroups/1/devcg.allow

An entry is removed by doing likewise into devcg.deny.  Since this is a
pure whitelist, not acls, you can only remove entries which exist in the
whitelist.  You must explicitly

	echo a 0 0 mrw > /cgroups/1/devcg.deny

to remove the "allow all" entry which is automatically inherited from
the root cgroup.

While composing this with the ns_cgroup may seem logical, it may not
be the right thing to do.  Note that each newly created devcg gets
a copy of the parent whitelist.  So if you had done

	mount -t cgroup -o ns,devcg none /cgroups

then once a process in /cgroups/1 had done an unshare(CLONE_NEWNS)
it would be under /cgroups/1/node_<pid>
if an admin did

	echo b 7 0 m > /cgroups/1/devcg.deny

then the entry would still be in the whitelist for /cgroups/1/node_<pid>.
Something to discuss if we get that far before nixing this whole idea.

Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
---
 include/linux/cgroup_subsys.h |    6 +
 init/Kconfig                  |    7 +
 kernel/Makefile               |    1 +
 kernel/dev_cgroup.c           |  554 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 568 insertions(+), 0 deletions(-)
 create mode 100644 kernel/dev_cgroup.c

diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index d822977..cf55cb2 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -36,3 +36,9 @@ SUBSYS(mem_cgroup)
 #endif
 
 /* */
+
+#ifdef CONFIG_CGROUP_DEV
+SUBSYS(devcg)
+#endif
+
+/* */
diff --git a/init/Kconfig b/init/Kconfig
index 6bb603a..0b3b684 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -319,6 +319,13 @@ config CPUSETS
 
 	  Say N if unsure.
 
+config CGROUP_DEV
+	bool "Device controller for cgroups"
+	depends on CGROUPS && SECURITY && EXPERIMENTAL
+	help
+	  Provides a cgroup implementing whitelists for devices which
+	  a process in the cgroup can mknod or open.
+
 config FAIR_GROUP_SCHED
 	bool "Fair group CPU scheduler"
 	default y
diff --git a/kernel/Makefile b/kernel/Makefile
index 76f782f..6ded46d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -43,6 +43,7 @@ obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_CGROUP_CPUACCT) += cpu_acct.o
 obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
+obj-$(CONFIG_CGROUP_DEV) += dev_cgroup.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
 obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
diff --git a/kernel/dev_cgroup.c b/kernel/dev_cgroup.c
new file mode 100644
index 0000000..87c8fb4
--- /dev/null
+++ b/kernel/dev_cgroup.c
@@ -0,0 +1,554 @@
+/*
+ * dev_cgroup.c - device cgroup subsystem
+ *
+ * Copyright 2007 IBM Corp
+ */
+
+#include <linux/module.h>
+#include <linux/cgroup.h>
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/security.h>
+
+#include <asm/uaccess.h>
+
+#define ACC_MKNOD 1
+#define ACC_READ  2
+#define ACC_WRITE 4
+
+#define DEV_BLOCK 1
+#define DEV_CHAR  2
+#define DEV_ALL   4  /* this represents all devices */
+
+/*
+ * whitelist locking rules:
+ * cgroup_lock() cannot be taken under cgroup->lock.
+ * cgroup->lock can be taken with or without cgroup_lock().
+ *
+ * modifications always require cgroup_lock
+ * modifications to a list which is visible require the
+ *   cgroup->lock *and* cgroup_lock()
+ * walking the list requires cgroup->lock or cgroup_lock().
+ *
+ * reasoning: dev_whitelist_copy() needs to kmalloc, so needs
+ *   a mutex, which the cgroup_lock() is.  Since modifying
+ *   a visible list requires both locks, either lock can be
+ *   taken for walking the list.  Since the wh->spinlock is taken
+ *   for modifying a public-accessible list, the spinlock is
+ *   sufficient for just walking the list.
+ */
+
+struct dev_whitelist_item {
+	u32 major, minor;
+	short type;
+	short access;
+	struct list_head list;
+};
+
+struct dev_cgroup {
+	struct cgroup_subsys_state css;
+	struct list_head whitelist;
+	spinlock_t lock;
+};
+
+struct cgroup_subsys devcg_subsys;
+
+static inline struct dev_cgroup *cgroup_to_devcg(
+		struct cgroup *cgroup)
+{
+	return container_of(cgroup_subsys_state(cgroup, devcg_subsys_id),
+			    struct dev_cgroup, css);
+}
+
+/*
+ * Once 64-bit caps and CAP_HOST_ADMIN exist, we will be
+ * requiring (CAP_HOST_ADMIN|CAP_MKNOD) to create a device
+ * not in the whitelist, * (CAP_HOST_ADMIN|CAP_SYS_ADMIN)
+ * to edit the whitelist,
+ */
+static int devcg_can_attach(struct cgroup_subsys *ss,
+		struct cgroup *new_cgroup, struct task_struct *task)
+{
+	struct cgroup *orig;
+
+	if (current != task) {
+		if (!cgroup_is_descendant(new_cgroup))
+			return -EPERM;
+	}
+
+	if (atomic_read(&new_cgroup->count) != 0)
+		return -EPERM;
+
+	orig = task_cgroup(task, devcg_subsys_id);
+	if (orig && orig != new_cgroup->parent)
+		return -EPERM;
+
+	return 0;
+}
+
+/*
+ * called under cgroup_lock()
+ */
+int dev_whitelist_copy(struct list_head *dest, struct list_head *orig)
+{
+	struct dev_whitelist_item *wh, *tmp, *new;
+
+	list_for_each_entry(wh, orig, list) {
+		new = kmalloc(sizeof(*wh), GFP_KERNEL);
+		if (!new)
+			goto free_and_exit;
+		new->major = wh->major;
+		new->minor = wh->minor;
+		new->type = wh->type;
+		new->access = wh->access;
+		list_add_tail(&new->list, dest);
+	}
+
+	return 0;
+
+free_and_exit:
+	list_for_each_entry_safe(wh, tmp, dest, list) {
+		list_del(&wh->list);
+		kfree(wh);
+	}
+	return -ENOMEM;
+}
+
+/* Stupid prototype - don't bother combining existing entries */
+/*
+ * called under cgroup_lock()
+ * since the list is visible to other tasks, we need the spinlock also
+ */
+void dev_whitelist_add(struct dev_cgroup *dev_cgroup, struct dev_whitelist_item *wh)
+{
+	spin_lock(&dev_cgroup->lock);
+	list_add_tail(&wh->list, &dev_cgroup->whitelist);
+	spin_unlock(&dev_cgroup->lock);
+}
+
+/*
+ * called under cgroup_lock()
+ * since the list is visible to other tasks, we need the spinlock also
+ */
+void dev_whitelist_rm(struct dev_cgroup *dev_cgroup, struct dev_whitelist_item *wh)
+{
+	struct dev_whitelist_item *walk, *tmp;
+
+	spin_lock(&dev_cgroup->lock);
+	list_for_each_entry_safe(walk, tmp, &dev_cgroup->whitelist, list) {
+		if (walk->type & DEV_ALL) {
+			list_del(&walk->list);
+			kfree(walk);
+			continue;
+		}
+		if (walk->type != wh->type)
+			continue;
+		if (walk->major != wh->major || walk->minor != wh->minor)
+			continue;
+		walk->access &= ~wh->access;
+		if (!walk->access) {
+			list_del(&walk->list);
+			kfree(walk);
+		}
+	}
+	spin_unlock(&dev_cgroup->lock);
+}
+
+/*
+ * Rules: you can only create a cgroup if
+ *     1. you are capable(CAP_SYS_ADMIN)
+ *     2. the target cgroup is a descendant of your own cgroup
+ *
+ * Note: called from kernel/cgroup.c with cgroup_lock() held.
+ */
+static struct cgroup_subsys_state *devcg_create(struct cgroup_subsys *ss,
+						struct cgroup *cgroup)
+{
+	struct dev_cgroup *dev_cgroup, *parent_dev_cgroup;
+	struct cgroup *parent_cgroup;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+	if (!cgroup_is_descendant(cgroup))
+		return ERR_PTR(-EPERM);
+
+	dev_cgroup = kzalloc(sizeof(*dev_cgroup), GFP_KERNEL);
+	if (!dev_cgroup)
+		return ERR_PTR(-ENOMEM);
+	INIT_LIST_HEAD(&dev_cgroup->whitelist);
+	parent_cgroup = cgroup->parent;
+
+	if (parent_cgroup == NULL) {
+		struct dev_whitelist_item *wh;
+		wh = kmalloc(sizeof(*wh), GFP_KERNEL);
+		wh->minor = wh->major = 0;
+		wh->type = DEV_ALL;
+		wh->access = ACC_MKNOD | ACC_READ | ACC_WRITE;
+		list_add(&wh->list, &dev_cgroup->whitelist);
+	} else {
+		parent_dev_cgroup = cgroup_to_devcg(parent_cgroup);
+		ret = dev_whitelist_copy(&dev_cgroup->whitelist,
+				&parent_dev_cgroup->whitelist);
+		if (ret) {
+			kfree(dev_cgroup);
+			return ERR_PTR(ret);
+		}
+	}
+
+	spin_lock_init(&dev_cgroup->lock);
+	return &dev_cgroup->css;
+}
+
+static void devcg_destroy(struct cgroup_subsys *ss,
+			struct cgroup *cgroup)
+{
+	struct dev_cgroup *dev_cgroup;
+	struct dev_whitelist_item *wh, *tmp;
+
+	dev_cgroup = cgroup_to_devcg(cgroup);
+	list_for_each_entry_safe(wh, tmp, &dev_cgroup->whitelist, list) {
+		list_del(&wh->list);
+		kfree(wh);
+	}
+	kfree(dev_cgroup);
+}
+
+#define DEVCG_ALLOW
...

 Topic: [patch 2/2][NETNS49][IPV4][UDPLITE] activate udplite per network namespace
[patch 2/2][NETNS49][IPV4][UDPLITE] activate udplite per network namespace [message #21474] Tue, 09 October 2007 13:00
Daniel Lezcano is currently offline Daniel Lezcano
Messages: 417
Registered: June 2006
Senior Member
From: openvz.org
This patch activates the udplite protocol for multiple
network namespaces.

Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com>

---
 net/ipv4/udplite.c |   12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

Index: linux-2.6-netns/net/ipv4/udplite.c
===================================================================
--- linux-2.6-netns.orig/net/ipv4/udplite.c
+++ linux-2.6-netns/net/ipv4/udplite.c
@@ -30,18 +30,11 @@ static int udplite_v4_get_port(struct so
 
 static int udplite_rcv(struct sk_buff *skb)
 {
-	if (skb->dev->nd_net != &init_net) {
-		kfree_skb(skb);
-		return 0;
-	}
 	return __udp4_lib_rcv(skb, udplite_hash, IPPROTO_UDPLITE);
 }
 
 static void udplite_err(struct sk_buff *skb, u32 info)
 {
-	if (skb->dev->nd_net != &init_net)
-		return;
-
 	return __udp4_lib_err(skb, info, udplite_hash);
 }
 
@@ -83,7 +76,7 @@ static struct inet_protosw udplite4_prot
 	.ops		=  &inet_dgram_ops,
 	.capability	= -1,
 	.no_check	=  0,		/* must checksum (RFC 3828) */
-	.flags		=  INET_PROTOSW_PERMANENT,
+	.flags		=  INET_PROTOSW_PERMANENT | INET_PROTOSW_NETNS,
 };
 
 #ifdef CONFIG_PROC_FS
@@ -99,9 +92,6 @@ static struct udp_seq_afinfo udplite4_se
 
 static int udplite4_proc_net_init(struct net *net)
 {
-	if (net != &init_net)
-		return -EPERM;
-
 	if (udp_proc_register(net, &udplite4_seq_afinfo)) /* udplite4_proc_init() */
 		printk(KERN_ERR "%s: Cannot register /proc!\n", __FUNCTION__);
 

-- 
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: [patch 1/2][NETNS49][IPV4][UDPLITE] add udp_proc_register per net
[patch 1/2][NETNS49][IPV4][UDPLITE] add udp_proc_register per net [message #21473] Tue, 09 October 2007 13:00
Daniel Lezcano is currently offline Daniel Lezcano
Messages: 417
Registered: June 2006
Senior Member
From: openvz.org
This patch adds a pernet subsystem to register /proc/net/udplite
per network namespace.

Signed-off-by: Daniel Lezcano <dlezcano@fr.ibm.com>

---
 net/ipv4/udplite.c |   24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

Index: linux-2.6-netns/net/ipv4/udplite.c
===================================================================
--- linux-2.6-netns.orig/net/ipv4/udplite.c
+++ linux-2.6-netns/net/ipv4/udplite.c
@@ -96,6 +96,27 @@ static struct udp_seq_afinfo udplite4_se
 	.seq_show	= udp4_seq_show,
 	.seq_fops	= &udplite4_seq_fops,
 };
+
+static int udplite4_proc_net_init(struct net *net)
+{
+	if (net != &init_net)
+		return -EPERM;
+
+	if (udp_proc_register(net, &udplite4_seq_afinfo)) /* udplite4_proc_init() */
+		printk(KERN_ERR "%s: Cannot register /proc!\n", __FUNCTION__);
+
+	return 0;
+}
+
+static void udplite4_proc_net_exit(struct net *net)
+{
+	udp_proc_unregister(net, &udplite4_seq_afinfo);
+}
+
+static struct pernet_operations udplite4_proc_net_ops = {
+	.init = udplite4_proc_net_init,
+	.exit = udplite4_proc_net_exit,
+};
 #endif
 
 void __init udplite4_register(void)
@@ -109,8 +130,7 @@ void __init udplite4_register(void)
 	inet_register_protosw(&udplite4_protosw);
 
 #ifdef CONFIG_PROC_FS
-	if (udp_proc_register(&init_net, &udplite4_seq_afinfo)) /* udplite4_proc_init() */
-		printk(KERN_ERR "%s: Cannot register /proc!\n", __FUNCTION__);
+	register_pernet_subsys(&udplite4_proc_net_ops);
 #endif
 	return;
 

-- 
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: bar/lim for privvmpages
bar/lim for privvmpages [message #21414] Mon, 08 October 2007 09:24
Dietmar Maurer is currently offline Dietmar Maurer
Messages: 52
Registered: March 2007
Member
From: openvz.org
Hi all,

Documentation says:

> There should be a safety gap between the |barrier| and the |limit| for
> |privvmpages| parameter to reduce the number of memory allocation

But I can't find any hints about how large that should be. I already looked at
the code in vzctl-3.0.18/src/validate.c, but was unable to find
something.

I want a privvmpages limit of 1GB - what's the suggested barrier for
that?

- Dietmar
 Topic: [PATCH] virtualization of sysv msg queues is incomplete
[PATCH] virtualization of sysv msg queues is incomplete [message #21411] Mon, 08 October 2007 07:04
Kirill Korotaev is currently offline Kirill Korotaev
Messages: 137
Registered: January 2006
Senior Member
From: openvz.org
Virtualization of sysv msg queues is incomplete:
msg_hdrs and msg_bytes variables visible from userspace are global.
Let's make them per-namespace.

Signed-Off-By: Alexey Kuznetsov <alexey@openvz.org>
Signed-Off-By: Kirill Korotaev <dev@openvz.org>

---
 include/linux/ipc.h |    2 ++
 ipc/msg.c           |   21 ++++++++++-----------
 2 files changed, 12 insertions(+), 11 deletions(-)

--- ./include/linux/ipc.h.ve1012	2007-10-08 14:35:40.000000000 +0400
+++ ./include/linux/ipc.h	2007-10-08 14:40:31.000000000 +0400
@@ -111,6 +111,8 @@ struct ipc_namespace {
 	int		msg_ctlmax;
 	int		msg_ctlmnb;
 	int		msg_ctlmni;
+	atomic_t	msg_bytes;
+	atomic_t	msg_hdrs;
 
 	size_t		shm_ctlmax;
 	size_t		shm_ctlall;
--- ./ipc/msg.c.ve1012	2007-10-08 14:35:40.000000000 +0400
+++ ./ipc/msg.c	2007-10-08 14:41:41.000000000 +0400
@@ -66,9 +66,6 @@ struct msg_sender {
 #define SEARCH_NOTEQUAL		3
 #define SEARCH_LESSEQUAL	4
 
-static atomic_t msg_bytes =	ATOMIC_INIT(0);
-static atomic_t msg_hdrs =	ATOMIC_INIT(0);
-
 static struct ipc_ids init_msg_ids;
 
 #define msg_ids(ns)	(*((ns)->ids[IPC_MSG_IDS]))
@@ -89,6 +86,8 @@ static void __msg_init_ns(struct ipc_nam
 	ns->msg_ctlmax = MSGMAX;
 	ns->msg_ctlmnb = MSGMNB;
 	ns->msg_ctlmni = MSGMNI;
+	atomic_set(&ns->msg_bytes, 0);
+	atomic_set(&ns->msg_hdrs, 0);
 	ipc_init_ids(ids);
 }
 
@@ -277,10 +276,10 @@ static void freeque(struct ipc_namespace
 		struct msg_msg *msg = list_entry(tmp, struct msg_msg, m_list);
 
 		tmp = tmp->next;
-		atomic_dec(&msg_hdrs);
+		atomic_dec(&ns->msg_hdrs);
 		free_msg(msg);
 	}
-	atomic_sub(msq->q_cbytes, &msg_bytes);
+	atomic_sub(msq->q_cbytes, &ns->msg_bytes);
 	security_msg_queue_free(msq);
 	ipc_rcu_putref(msq);
 }
@@ -447,8 +446,8 @@ asmlinkage long sys_msgctl(int msqid, in
 		mutex_lock(&msg_ids(ns).mutex);
 		if (cmd == MSG_INFO) {
 			msginfo.msgpool = msg_ids(ns).in_use;
-			msginfo.msgmap = atomic_read(&msg_hdrs);
-			msginfo.msgtql = atomic_read(&msg_bytes);
+			msginfo.msgmap = atomic_read(&ns->msg_hdrs);
+			msginfo.msgtql = atomic_read(&ns->msg_bytes);
 		} else {
 			msginfo.msgmap = MSGMAP;
 			msginfo.msgpool = MSGPOOL;
@@ -719,8 +718,8 @@ long do_msgsnd(int msqid, long mtype, vo
 		list_add_tail(&msg->m_list, &msq->q_messages);
 		msq->q_cbytes += msgsz;
 		msq->q_qnum++;
-		atomic_add(msgsz, &msg_bytes);
-		atomic_inc(&msg_hdrs);
+		atomic_add(msgsz, &ns->msg_bytes);
+		atomic_inc(&ns->msg_hdrs);
 	}
 
 	err = 0;
@@ -824,8 +823,8 @@ long do_msgrcv(int msqid, long *pmtype, 
 			msq->q_rtime = get_seconds();
 			msq->q_lrpid = task_tgid_vnr(current);
 			msq->q_cbytes -= msg->m_ts;
-			atomic_sub(msg->m_ts, &msg_bytes);
-			atomic_dec(&msg_hdrs);
+			atomic_sub(msg->m_ts, &ns->msg_bytes);
+			atomic_dec(&ns->msg_hdrs);
 			ss_wakeup(&msq->q_senders, 0);
 			msg_unlock(msq);
 			break;

_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
 Topic: [PATCH 2/5] make netlink processing routines semi-synchronious (inspired by rtnl) v2
[PATCH 2/5] make netlink processing routines semi-synchronious (inspired by rtnl) v2 [message #21350] Fri, 05 October 2007 10:46
den is currently offline den
Messages: 493
Registered: December 2005
Senior Member
From: openvz.org
The code in netfilter/nfnetlink.c and in ./net/netlink/genetlink.c looks
like outdated copy/paste from rtnetlink.c. Push them into sync with the
original.

Changes from v1:
- deleted comment in nfnetlink_rcv_msg by request of Patrick McHardy

Signed-off-by: Denis V. Lunev <den@openvz.org>
Acked-by: Patrick McHardy <kaber@trash.net>

--- ./net/netfilter/nfnetlink.c.nlk3	2007-10-01 09:47:53.000000000 +0400
+++ ./net/netfilter/nfnetlink.c	2007-10-01 17:13:09.000000000 +0400
@@ -44,26 +44,14 @@ static struct sock *nfnl = NULL;
 static const struct nfnetlink_subsystem *subsys_table[NFNL_SUBSYS_COUNT];
 static DEFINE_MUTEX(nfnl_mutex);
 
-static void nfnl_lock(void)
+static inline void nfnl_lock(void)
 {
 	mutex_lock(&nfnl_mutex);
 }
 
-static int nfnl_trylock(void)
-{
-	return !mutex_trylock(&nfnl_mutex);
-}
-
-static void __nfnl_unlock(void)
-{
-	mutex_unlock(&nfnl_mutex);
-}
-
-static void nfnl_unlock(void)
+static inline void nfnl_unlock(void)
 {
 	mutex_unlock(&nfnl_mutex);
-	if (nfnl->sk_receive_queue.qlen)
-		nfnl->sk_data_ready(nfnl, 0);
 }
 
 int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n)
@@ -147,9 +135,7 @@ static int nfnetlink_rcv_msg(struct sk_b
 	ss = nfnetlink_get_subsys(type);
 	if (!ss) {
 #ifdef CONFIG_KMOD
-		/* don't call nfnl_unlock, since it would reenter
-		 * with further packet processing */
-		__nfnl_unlock();
+		nfnl_unlock();
 		request_module("nfnetlink-subsys-%d", NFNL_SUBSYS_ID(type));
 		nfnl_lock();
 		ss = nfnetlink_get_subsys(type);
@@ -188,10 +174,9 @@ static void nfnetlink_rcv(struct sock *s
 	unsigned int qlen = 0;
 
 	do {
-		if (nfnl_trylock())
-			return;
+		nfnl_lock();
 		qlen = netlink_run_queue(sk, qlen, nfnetlink_rcv_msg);
-		__nfnl_unlock();
+		nfnl_unlock();
 	} while (qlen);
 }
 
--- ./net/netlink/genetlink.c.nlk3	2007-08-26 19:30:38.000000000 +0400
+++ ./net/netlink/genetlink.c	2007-10-01 16:05:29.000000000 +0400
@@ -22,22 +22,14 @@ struct sock *genl_sock = NULL;
 
 static DEFINE_MUTEX(genl_mutex); /* serialization of message processing */
 
-static void genl_lock(void)
+static inline void genl_lock(void)
 {
 	mutex_lock(&genl_mutex);
 }
 
-static int genl_trylock(void)
-{
-	return !mutex_trylock(&genl_mutex);
-}
-
-static void genl_unlock(void)
+static inline void genl_unlock(void)
 {
 	mutex_unlock(&genl_mutex);
-
-	if (genl_sock && genl_sock->sk_receive_queue.qlen)
-		genl_sock->sk_data_ready(genl_sock, 0);
 }
 
 #define GENL_FAM_TAB_SIZE	16
@@ -483,8 +475,7 @@ static void genl_rcv(struct sock *sk, in
 	unsigned int qlen = 0;
 
 	do {
-		if (genl_trylock())
-			return;
+		genl_lock();
 		qlen = netlink_run_queue(sk, qlen, genl_rcv_msg);
 		genl_unlock();
 	} while (qlen && genl_sock && genl_sock->sk_receive_queue.qlen);
 Topic: post1
post1 [message #21330] Fri, 05 October 2007 02:52
Anoop Thomas is currently offline Anoop Thomas
Messages: 1
Registered: October 2007
Junior Member
From: openvz.org
geeks,
   
  I am a newbie and would like to develop and learn with you guys.
   

       
---------------------------------
 Explore your hobbies and interests. Click here to begin.
 Topic: [PATCH] Simplify memory controller and resource counter I/O
[PATCH] Simplify memory controller and resource counter I/O [message #21325] Fri, 05 October 2007 00:35
Paul Menage is currently offline Paul Menage
Messages: 642
Registered: September 2006
Senior Member
From: openvz.org
Simplify the memory controller and resource counter I/O routines

This patch strips out some I/O boilerplate from resource counters and
the memory controller. It also adds locking to the resource counter
reads and writes, and forbids writes to the root memory cgroup's limit
file.

cgroup_write_uint() is extended to call memparse() rather than just
simple_strtoull()

Signed-off-by: Paul Menage <menage@google.com>

---

 include/linux/cgroup.h      |    2 
 include/linux/res_counter.h |   13 +----
 kernel/cgroup.c             |    2 
 kernel/res_counter.c        |   64 +++++----------------------
 mm/memcontrol.c             |  103 ++++++++++----------------------------------
 5 files changed, 45 insertions(+), 139 deletions(-)

Index: container-2.6.23-rc8-mm2/include/linux/res_counter.h
===================================================================
--- container-2.6.23-rc8-mm2.orig/include/linux/res_counter.h
+++ container-2.6.23-rc8-mm2/include/linux/res_counter.h
@@ -46,17 +46,12 @@ struct res_counter {
  *
  * @counter:     the counter in question
  * @member:  the field to work with (see RES_xxx below)
- * @buf:     the buffer to opeate on,...
- * @nbytes:  its size...
- * @pos:     and the offset.
+ * @val:     the value passed by the user (for write)
  */
 
-ssize_t res_counter_read(struct res_counter *counter, int member,
-		const char __user *buf, size_t nbytes, loff_t *pos,
-		int (*read_strategy)(unsigned long long val, char *s));
-ssize_t res_counter_write(struct res_counter *counter, int member,
-		const char __user *buf, size_t nbytes, loff_t *pos,
-		int (*write_strategy)(char *buf, unsigned long long *val));
+unsigned long long res_counter_read(struct res_counter *counter, int member);
+int res_counter_write(struct res_counter *counter, int member,
+		      unsigned long long val);
 
 /*
  * the field descriptors. one for each member of res_counter
Index: container-2.6.23-rc8-mm2/kernel/res_counter.c
===================================================================
--- container-2.6.23-rc8-mm2.orig/kernel/res_counter.c
+++ container-2.6.23-rc8-mm2/kernel/res_counter.c
@@ -75,58 +75,22 @@ res_counter_member(struct res_counter *c
 	return NULL;
 }
 
-ssize_t res_counter_read(struct res_counter *counter, int member,
-		const char __user *userbuf, size_t nbytes, loff_t *pos,
-		int (*read_strategy)(unsigned long long val, char *st_buf))
+unsigned long long res_counter_read(struct res_counter *counter, int member)
 {
-	unsigned long long *val;
-	char buf[64], *s;
-
-	s = buf;
-	val = res_counter_member(counter, member);
-	if (read_strategy)
-		s += read_strategy(*val, s);
-	else
-		s += sprintf(s, "%llu\n", *val);
-	return simple_read_from_buffer((void __user *)userbuf, nbytes,
-			pos, buf, s - buf);
+	unsigned long long val;
+	unsigned long flags;
+	spin_lock_irqsave(&counter->lock, flags);
+	val = *res_counter_member(counter, member);
+	spin_unlock_irqrestore(&counter->lock, flags);
+	return val;
 }
 
-ssize_t res_counter_write(struct res_counter *counter, int member,
-		const char __user *userbuf, size_t nbytes, loff_t *pos,
-		int (*write_strategy)(char *st_buf, unsigned long long *val))
+int res_counter_write(struct res_counter *counter, int member,
+		      unsigned long long val)
 {
-	int ret;
-	char *buf, *end;
-	unsigned long long tmp, *val;
-
-	buf = kmalloc(nbytes + 1, GFP_KERNEL);
-	ret = -ENOMEM;
-	if (buf == NULL)
-		goto out;
-
-	buf[nbytes] = '\0';
-	ret = -EFAULT;
-	if (copy_from_user(buf, userbuf, nbytes))
-		goto out_free;
-
-	ret = -EINVAL;
-
-	if (write_strategy) {
-		if (write_strategy(buf, &tmp)) {
-			goto out_free;
-		}
-	} else {
-		tmp = simple_strtoull(buf, &end, 10);
-		if (*end != '\0')
-			goto out_free;
-	}
-
-	val = res_counter_member(counter, member);
-	*val = tmp;
-	ret = nbytes;
-out_free:
-	kfree(buf);
-out:
-	return ret;
+	unsigned long flags;
+	spin_lock_irqsave(&counter->lock, flags);
+	*res_counter_member(counter, member) = val;
+	spin_unlock_irqrestore(&counter->lock, flags);
+	return 0;
 }
Index: container-2.6.23-rc8-mm2/mm/memcontrol.c
===================================================================
--- container-2.6.23-rc8-mm2.orig/mm/memcontrol.c
+++ container-2.6.23-rc8-mm2/mm/memcontrol.c
@@ -432,112 +432,59 @@ void mem_cgroup_uncharge(struct page_cgr
 	}
 }
 
-int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
-{
-	*tmp = memparse(buf, &buf);
-	if (*buf != '\0')
-		return -EINVAL;
-
-	/*
-	 * Round up the value to the closest page size
-	 */
-	*tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
-	return 0;
-}
-
-static ssize_t mem_cgroup_read(struct cgroup *cont,
-			struct cftype *cft, struct file *file,
-			char __user *userbuf, size_t nbytes, loff_t *ppos)
+static unsigned long long mem_cgroup_read(struct cgroup *cont,
+					  struct cftype *cft)
 {
 	return res_counter_read(&mem_cgroup_from_cont(cont)->res,
-				cft->private, userbuf, nbytes, ppos,
-				NULL);
+				cft->private);
 }
 
-static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
-				struct file *file, const char __user *userbuf,
-				size_t nbytes, loff_t *ppos)
+static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
+			    unsigned long long val)
 {
+	/* Don't allow the limit to be set for the root cgroup */
+	if (!cont->parent)
+		return -EINVAL;
 	return res_counter_write(&mem_cgroup_from_cont(cont)->res,
-				cft->private, userbuf, nbytes, ppos,
-				mem_cgroup_write_strategy);
+				 cft->private, PAGE_ALIGN(val));
 }
 
-static ssize_t mem_control_type_write(struct cgroup *cont,
-			struct cftype *cft, struct file *file,
-			const char __user *userbuf,
-			size_t nbytes, loff_t *pos)
-{
-	int ret;
-	char *buf, *end;
-	unsigned long tmp;
-	struct mem_cgroup *mem;
-
-	mem = mem_cgroup_from_cont(cont);
-	buf = kmalloc(nbytes + 1, GFP_KERNEL);
-	ret = -ENOMEM;
-	if (buf == NULL)
-		goto out;
-
-	buf[nbytes] = 0;
-	ret = -EFAULT;
-	if (copy_from_user(buf, userbuf, nbytes))
-		goto out_free;
-
-	ret = -EINVAL;
-	tmp = simple_strtoul(buf, &end, 10);
-	if (*end != '\0')
-		goto out_free;
-
-	if (tmp <= MEM_CGROUP_TYPE_UNSPEC || tmp >= MEM_CGROUP_TYPE_MAX)
-		goto out_free;
-
-	mem->control_type = tmp;
-	ret = nbytes;
-out_free:
-	kfree(buf);
-out:
-	return ret;
+static int mem_control_type_write(struct cgroup *cont, struct cftype *cft,
+				  u64 val)
+{
+	if (val <= MEM_CGROUP_TYPE_UNSPEC || val >= MEM_CGROUP_TYPE_MAX)
+		return -EINVAL;
+	mem_cgroup_from_cont(cont)->control_type = val;
+	return 0;
 }
 
-static ssize_t mem_control_type_read(struct cgroup *cont,
-				struct cftype *cft,
-				struct file *file, char __user *userbuf,
-				size_t nbytes, loff_t *ppos)
+static u64 mem_control_type_read(struct cgroup *cont,
+				 struct cftype *cft)
 {
-	unsigned long val;
-	char buf[64], *s;
-	struct mem_cgroup *mem;
-
-	mem = mem_cgroup_from_cont(cont);
-	s = buf;
-	val = mem->control_type;
-	s += sprintf(s, "%lu\n", val);
-	return simple_read_from_buffer((void __user *)userbuf, nbytes,
-			ppos, buf, s - buf);
+	return mem_cgroup_from_cont(cont)->control_type;
 }
 
 static struct cftype mem_cgroup_files[] = {
 	{
 		.name = "usage_in_bytes",
 		.private = RES_USAGE,
-		.read = mem_cgroup_read,
+		.read_uint = mem_cgroup_read,
 	},
 	{
 		.name = "limit_in_bytes",
 		.private = RES_LIMIT,
-		.write = mem_cgroup_write,
-		.read = mem_cgroup_read,
+		.write_uint = mem_cgroup_write,
+		.read_uint = mem_cgroup_read,
 	},
 	{
 		.name = "failcnt",
 		.private = RES_FAILCNT,
-		.read = mem_cgroup_read,
+		.read_uint = mem_cgroup_read,
 	},
 	{
 		.name = "control_type",
-		.write = mem_control_type_write,
-		.read = mem_control_type_read,
+		.write_uint = mem_control_type_write,
+		.read_uint = mem_control_type_read,
 	},
 };
 
Index: container-2.6.23-rc8-mm2/include/linux/cgroup.h
===================================================================
--- container-2.6.23-rc8-mm2.orig/include/linux/cgroup.h
+++ container-2.6.23-rc8-mm2/include/linux/cgroup.h
@@ -199,7 +199,7 @@ struct cftype {
 
 	/*
 	 * write_uint() is a shortcut for the common case of accepting
-	 * a single integer (as parsed by simple_strtoull) from
+	 * a single integer (as parsed by lib/cmdline.c:memparse()) from
 	 * userspace. Use in place of write(); return 0 or error.
 	 */
 	int (*write_uint) (struct cgroup *cont, struct cftype *cft, u64 val);
Index: container-2.6.23-rc8-mm2/kernel/cgroup.c
===================================================================
--- container-2.6.23-rc8-mm2.orig/kernel/cgroup.c
+++ container-2.6.23-rc8-mm2/kernel/cgroup.c
@@ -1296,7 +1296,7 @@ static ssize_t cgroup_write_uint(struct 
 	/* strip newline if necessary */
 	if (nbytes && (buffer[nbytes-1] == '\n'))
 		buffer[nbytes-1] = 0;
-	val = simple_strtoull(buffer, &end, 0);
+	val = memparse(buffer, &end);
 	if (*end)
 		return -EINVAL;
 
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
Pages (31): [ «    6  7  8  9  10  11  12  13  14  15  16  17  18  19  20  21    »]


Current Time: Sun May 19 14:58:36 EDT 2013
Powered by FUDforum Powered by Parallels Virtuozzo Containers