| Home » Mailing lists » Devel » [PATCH 1/5] net: Modify all rtnetlink methods to only work in the initial namespace Goto Forum:
	| 
		
			| [PATCH 1/5] net: Modify all rtnetlink methods to only work in the initial namespace [message #20979] | Sat, 29 September 2007 01:00  |  
			| 
				
				
					|  ebiederm Messages: 1354
 Registered: February 2006
 | Senior Member |  |  |  
	| Before I can enable rtnetlink to work in all network namespaces
I need to be certain that something won't break.  So this
patch deliberately disables all of the rtnletlink methods in everything
except the initial network namespace.  After the methods have been
audited this extra check can be disabled.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 net/bridge/br_netlink.c |    9 +++++++++
 net/core/fib_rules.c    |   11 +++++++++++
 net/core/neighbour.c    |   18 ++++++++++++++++++
 net/core/rtnetlink.c    |   19 +++++++++++++++++++
 net/decnet/dn_dev.c     |   12 ++++++++++++
 net/decnet/dn_fib.c     |    8 ++++++++
 net/decnet/dn_route.c   |    8 ++++++++
 net/decnet/dn_table.c   |    4 ++++
 net/ipv4/devinet.c      |   12 ++++++++++++
 net/ipv4/fib_frontend.c |   12 ++++++++++++
 net/ipv4/route.c        |    4 ++++
 net/ipv6/addrconf.c     |   31 +++++++++++++++++++++++++++++++
 net/ipv6/ip6_fib.c      |    4 ++++
 net/ipv6/route.c        |   12 ++++++++++++
 net/sched/act_api.c     |   10 ++++++++++
 net/sched/cls_api.c     |   10 ++++++++++
 net/sched/sch_api.c     |   21 +++++++++++++++++++++
 17 files changed, 205 insertions(+), 0 deletions(-)
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 53ab8e0..a4ffa2b 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -13,6 +13,7 @@
 #include <linux/kernel.h>
 #include <net/rtnetlink.h>
 #include <net/net_namespace.h>
+#include <net/sock.h>
 #include "br_private.h"
 
 static inline size_t br_nlmsg_size(void)
@@ -107,9 +108,13 @@ errout:
  */
 static int br_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 {
+	struct net *net = skb->sk->sk_net;
 	struct net_device *dev;
 	int idx;
 
+	if (net != &init_net)
+		return 0;
+
 	idx = 0;
 	for_each_netdev(&init_net, dev) {
 		/* not a bridge port */
@@ -135,12 +140,16 @@ skip:
  */
 static int br_rtm_setlink(struct sk_buff *skb,  struct nlmsghdr *nlh, void *arg)
 {
+	struct net *net = skb->sk->sk_net;
 	struct ifinfomsg *ifm;
 	struct nlattr *protinfo;
 	struct net_device *dev;
 	struct net_bridge_port *p;
 	u8 new_state;
 
+	if (net != &init_net)
+		return -EINVAL;
+
 	if (nlmsg_len(nlh) < sizeof(*ifm))
 		return -EINVAL;
 
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 13de6f5..357bfa0 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -206,6 +206,9 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 	struct nlattr *tb[FRA_MAX+1];
 	int err = -EINVAL, unresolved = 0;
 
+	if (net != &init_net)
+		return -EINVAL;
+
 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
 		goto errout;
 
@@ -336,12 +339,16 @@ errout:
 
 static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 {
+	struct net *net = skb->sk->sk_net;
 	struct fib_rule_hdr *frh = nlmsg_data(nlh);
 	struct fib_rules_ops *ops = NULL;
 	struct fib_rule *rule, *tmp;
 	struct nlattr *tb[FRA_MAX+1];
 	int err = -EINVAL;
 
+	if (net != &init_net)
+		return -EINVAL;
+
 	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*frh)))
 		goto errout;
 
@@ -517,9 +524,13 @@ skip:
 
 static int fib_nl_dumprule(struct sk_buff *skb, struct netlink_callback *cb)
 {
+	struct net *net = skb->sk->sk_net;
 	struct fib_rules_ops *ops;
 	int idx = 0, family;
 
+	if (net != &init_net)
+		return -EINVAL;
+
 	family = rtnl_msg_family(cb->nlh);
 	if (family != AF_UNSPEC) {
 		/* Protocol specific dump request */
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index c52df85..27001db 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1448,6 +1448,9 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	struct net_device *dev = NULL;
 	int err = -EINVAL;
 
+	if (net != &init_net)
+		return -EINVAL;
+
 	if (nlmsg_len(nlh) < sizeof(*ndm))
 		goto out;
 
@@ -1514,6 +1517,9 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	struct net_device *dev = NULL;
 	int err;
 
+	if (net != &init_net)
+		return -EINVAL;
+
 	err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL);
 	if (err < 0)
 		goto out;
@@ -1788,11 +1794,15 @@ static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = {
 
 static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 {
+	struct net *net = skb->sk->sk_net;
 	struct neigh_table *tbl;
 	struct ndtmsg *ndtmsg;
 	struct nlattr *tb[NDTA_MAX+1];
 	int err;
 
+	if (net != &init_net)
+		return -EINVAL;
+
 	err = nlmsg_parse(nlh, sizeof(*ndtmsg), tb, NDTA_MAX,
 			  nl_neightbl_policy);
 	if (err < 0)
@@ -1912,11 +1922,15 @@ errout:
 
 static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
 {
+	struct net *net = skb->sk->sk_net;
 	int family, tidx, nidx = 0;
 	int tbl_skip = cb->args[0];
 	int neigh_skip = cb->args[1];
 	struct neigh_table *tbl;
 
+	if (net != &init_net)
+		return 0;
+
 	family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
 
 	read_lock(&neigh_tbl_lock);
@@ -2041,9 +2055,13 @@ out:
 
 static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
 {
+	struct net *net = skb->sk->sk_net;
 	struct neigh_table *tbl;
 	int t, family, s_t;
 
+	if (net != &init_net)
+		return 0;
+
 	read_lock(&neigh_tbl_lock);
 	family = ((struct rtgenmsg *) nlmsg_data(cb->nlh))->rtgen_family;
 	s_t = cb->args[0];
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 8bc68e6..56fc4f9 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -707,6 +707,9 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 	int s_idx = cb->args[0];
 	struct net_device *dev;
 
+	if (net != &init_net)
+		return 0;
+
 	idx = 0;
 	for_each_netdev(net, dev) {
 		if (idx < s_idx)
@@ -909,6 +912,9 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	struct nlattr *tb[IFLA_MAX+1];
 	char ifname[IFNAMSIZ];
 
+	if (net != &init_net)
+		return -EINVAL;
+
 	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
 	if (err < 0)
 		goto errout;
@@ -957,6 +963,9 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	struct nlattr *tb[IFLA_MAX+1];
 	int err;
 
+	if (net != &init_net)
+		return -EINVAL;
+
 	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
 	if (err < 0)
 		return err;
@@ -1038,6 +1047,9 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	struct nlattr *linkinfo[IFLA_INFO_MAX+1];
 	int err;
 
+	if (net != &init_net)
+		return -EINVAL;
+
 #ifdef CONFIG_KMOD
 replay:
 #endif
@@ -1164,6 +1176,9 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 	struct sk_buff *nskb;
 	int err;
 
+	if (net != &init_net)
+		return -EINVAL;
+
 	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
 	if (err < 0)
 		return err;
@@ -1199,9 +1214,13 @@ errout:
 
 static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
 {
+	struct net *net = skb->sk->sk_net;
 	int idx;
 	int s_idx = cb->family;
 
+	if (net != &init_net)
+		return 0;
+
 	if (s_idx == 0)
 		s_idx = 1;
 	for (idx=1; idx<NPROTO; idx++) {
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index 26130af..6e82f7a 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -647,12 +647,16 @@ static const struct nla_policy dn_ifa_policy[IFA_MAX+1] = {
 
 static int dn_nl_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 {
+	struct net *net = skb->sk->sk_net;
 	struct nlattr *tb[IFA_MAX+1];
 	struct dn_dev *dn_db;
 	struct ifaddrmsg *ifm;
 	struct dn_ifaddr *ifa, **ifap;
 	int err = -EADDRNOTAVAIL;
 
+	if (net != &init_net)
+		goto errout;
+
 	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, dn_ifa_policy);
 	if (err < 0)
 		goto errout;
@@ -679,6 +683,7 @@ errout:
 
 static int dn_nl_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 {
+	struct net *net = skb->sk->sk_net;
 	struct nlattr *tb[IFA_MAX+1];
 	struct net_device *dev;
 	struct dn_dev *dn_db;
@@ -686,6 +691,9 @@ static int dn_nl_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	struct dn_ifaddr *ifa;
 	int err;
 
+	if (net != &init_net)
+		return -EINVAL;
+
 	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, dn_ifa_policy);
 	if (err < 0)
 		return err;
@@ -791,11 +799,15 @@ errout:
 
 static int dn_nl_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
 {
+	struct net *net = skb->sk->sk_net;
 	int idx, dn_idx = 0, skip_ndevs, skip_naddr;
 	struct net_device *dev;
 	struct dn_dev *dn_db;
 	struct dn_ifaddr *ifa;
 
+	if (net != &init_net)
+		return 0;
+
 	skip_ndevs = cb->args[0];
 	skip_naddr = cb->args[1];
 
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index 3760a20..5413e1b 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -506,10 +506,14 @@ static int dn_fib_check_attr(struct rtmsg *r, struct rtattr **rta)
 
 static int dn_fib_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 {
+	struct net *net = skb->sk->sk_net;
 	struct dn_fib_table *tb;
 	struct rtattr **rta = arg;
 	struct rtmsg *r = NLMSG_DATA(nlh);
 
+	if (net != &init_net)
+		return -EINVAL;
+
 	if (dn_fib_check_attr(r, rta))
 		return -EINVAL;
 
@@ -522,10 +526,14 @@ static int dn_fib_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *
 
 static int dn_fib_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 {
+	struct net *net = skb->sk->sk_net;
 	struct dn_fib_table *tb;
 	struct rtattr **rta = arg;
 	struct rtmsg *r = NLMSG_DATA(nlh);
 
+	if (net != &init_net)
+		return -EINVAL;
+
 	if (dn_fib_check_attr(r, rta))
 		return -EINVAL;
 
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index b7ebf99..a16374b 100644
--- a/...
 
 |  
	|  |  |  
	| 
		
			| [PATCH 2/5] net: Make rtnetlink infrastructure network namespace aware [message #20980 is a reply to message #20979] | Sat, 29 September 2007 01:02   |  
			| 
				
				
					|  ebiederm Messages: 1354
 Registered: February 2006
 | Senior Member |  |  |  
	| After this patch none of the netlink callback support anything
except the initial network namespace but the rtnetlink infrastructure
now handles multiple network namespaces.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 include/linux/rtnetlink.h   |    8 ++--
 include/net/net_namespace.h |    3 +
 net/bridge/br_netlink.c     |    4 +-
 net/core/fib_rules.c        |    4 +-
 net/core/neighbour.c        |    4 +-
 net/core/rtnetlink.c        |   96 +++++++++++++++++++++++++++++++++++++------
 net/decnet/dn_dev.c         |    4 +-
 net/decnet/dn_route.c       |    2 +-
 net/decnet/dn_table.c       |    4 +-
 net/ipv4/devinet.c          |    4 +-
 net/ipv4/fib_semantics.c    |    4 +-
 net/ipv4/ipmr.c             |    4 +-
 net/ipv4/route.c            |    2 +-
 net/ipv6/addrconf.c         |   14 +++---
 net/ipv6/route.c            |    6 +-
 net/sched/act_api.c         |    8 ++--
 net/sched/cls_api.c         |    2 +-
 net/sched/sch_api.c         |    4 +-
 net/wireless/wext.c         |    5 ++-
 19 files changed, 129 insertions(+), 53 deletions(-)
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 9c21e45..518247e 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -582,11 +582,11 @@ extern int __rtattr_parse_nested_compat(struct rtattr *tb[], int maxattr,
 ({	data = RTA_PAYLOAD(rta) >= len ? RTA_DATA(rta) : NULL; \
 	__rtattr_parse_nested_compat(tb, max, rta, len); })
 
-extern int rtnetlink_send(struct sk_buff *skb, u32 pid, u32 group, int echo);
-extern int rtnl_unicast(struct sk_buff *skb, u32 pid);
-extern int rtnl_notify(struct sk_buff *skb, u32 pid, u32 group,
+extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo);
+extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid);
+extern int rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group,
 		       struct nlmsghdr *nlh, gfp_t flags);
-extern void rtnl_set_sk_err(u32 group, int error);
+extern void rtnl_set_sk_err(struct net *net, u32 group, int error);
 extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics);
 extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst,
 			      u32 id, u32 ts, u32 tsage, long expires,
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 934c840..f75607a 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -10,6 +10,7 @@
 
 struct proc_dir_entry;
 struct net_device;
+struct sock;
 struct net {
 	atomic_t		count;		/* To decided when the network
 						 *  namespace should be freed.
@@ -29,6 +30,8 @@ struct net {
 	struct list_head 	dev_base_head;
 	struct hlist_head 	*dev_name_head;
 	struct hlist_head	*dev_index_head;
+
+	struct sock 		*rtnl;			/* rtnetlink socket */
 };
 
 #ifdef CONFIG_NET
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index a4ffa2b..f5d6933 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -97,10 +97,10 @@ void br_ifinfo_notify(int event, struct net_bridge_port *port)
 		kfree_skb(skb);
 		goto errout;
 	}
-	err = rtnl_notify(skb, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC);
+	err = rtnl_notify(skb, &init_net,0, RTNLGRP_LINK, NULL, GFP_ATOMIC);
 errout:
 	if (err < 0)
-		rtnl_set_sk_err(RTNLGRP_LINK, err);
+		rtnl_set_sk_err(&init_net, RTNLGRP_LINK, err);
 }
 
 /*
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 357bfa0..03c803c 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -577,10 +577,10 @@ static void notify_rule_change(int event, struct fib_rule *rule,
 		kfree_skb(skb);
 		goto errout;
 	}
-	err = rtnl_notify(skb, pid, ops->nlgroup, nlh, GFP_KERNEL);
+	err = rtnl_notify(skb, &init_net, pid, ops->nlgroup, nlh, GFP_KERNEL);
 errout:
 	if (err < 0)
-		rtnl_set_sk_err(ops->nlgroup, err);
+		rtnl_set_sk_err(&init_net, ops->nlgroup, err);
 }
 
 static void attach_rules(struct list_head *rules, struct net_device *dev)
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 27001db..c452584 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -2466,10 +2466,10 @@ static void __neigh_notify(struct neighbour *n, int type, int flags)
 		kfree_skb(skb);
 		goto errout;
 	}
-	err = rtnl_notify(skb, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
+	err = rtnl_notify(skb, &init_net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
 errout:
 	if (err < 0)
-		rtnl_set_sk_err(RTNLGRP_NEIGH, err);
+		rtnl_set_sk_err(&init_net, RTNLGRP_NEIGH, err);
 }
 
 #ifdef CONFIG_ARPD
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 56fc4f9..fc49104 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -60,7 +60,6 @@ struct rtnl_link
 };
 
 static DEFINE_MUTEX(rtnl_mutex);
-static struct sock *rtnl;
 
 void rtnl_lock(void)
 {
@@ -74,9 +73,38 @@ void __rtnl_unlock(void)
 
 void rtnl_unlock(void)
 {
-	mutex_unlock(&rtnl_mutex);
-	if (rtnl && rtnl->sk_receive_queue.qlen)
+	struct net *net;
+
+	/*
+	 * Loop through all of the rtnl sockets until none of them (in
+	 * a live network namespace) have queue packets.
+	 *
+	 * We have to be careful with the locking here as
+	 * sk_data_ready aka rtnetlink_rcv takes the rtnl_mutex.
+	 *
+	 * To ensure the network namespace does not exit while
+	 * we are processing packets on it's rtnl socket we
+	 * grab a reference to the network namespace, ignoring
+	 * it if the network namespace has already exited.
+	 */
+retry:
+	for_each_net(net) {
+		struct sock *rtnl = net->rtnl;
+
+		if (!rtnl || !rtnl->sk_receive_queue.qlen)
+			continue;
+
+		if (!maybe_get_net(net))
+			continue;
+
+		mutex_unlock(&rtnl_mutex);
 		rtnl->sk_data_ready(rtnl, 0);
+		mutex_lock(&rtnl_mutex);
+		put_net(net);
+		goto retry;
+	}
+	mutex_unlock(&rtnl_mutex);
+
 	netdev_run_todo();
 }
 
@@ -462,8 +490,9 @@ size_t rtattr_strlcpy(char *dest, const struct rtattr *rta, size_t size)
 	return ret;
 }
 
-int rtnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo)
+int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned group, int echo)
 {
+	struct sock *rtnl = net->rtnl;
 	int err = 0;
 
 	NETLINK_CB(skb).dst_group = group;
@@ -475,14 +504,17 @@ int rtnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo)
 	return err;
 }
 
-int rtnl_unicast(struct sk_buff *skb, u32 pid)
+int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid)
 {
+	struct sock *rtnl = net->rtnl;
+
 	return nlmsg_unicast(rtnl, skb, pid);
 }
 
-int rtnl_notify(struct sk_buff *skb, u32 pid, u32 group,
+int rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group,
 		struct nlmsghdr *nlh, gfp_t flags)
 {
+	struct sock *rtnl = net->rtnl;
 	int report = 0;
 
 	if (nlh)
@@ -491,8 +523,10 @@ int rtnl_notify(struct sk_buff *skb, u32 pid, u32 group,
 	return nlmsg_notify(rtnl, skb, pid, group, report, flags);
 }
 
-void rtnl_set_sk_err(u32 group, int error)
+void rtnl_set_sk_err(struct net *net, u32 group, int error)
 {
+	struct sock *rtnl = net->rtnl;
+
 	netlink_set_err(rtnl, 0, group, error);
 }
 
@@ -1205,7 +1239,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 		kfree_skb(nskb);
 		goto errout;
 	}
-	err = rtnl_unicast(nskb, NETLINK_CB(skb).pid);
+	err = rtnl_unicast(nskb, net, NETLINK_CB(skb).pid);
 errout:
 	dev_put(dev);
 
@@ -1256,10 +1290,10 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change)
 		kfree_skb(skb);
 		goto errout;
 	}
-	err = rtnl_notify(skb, 0, RTNLGRP_LINK, NULL, GFP_KERNEL);
+	err = rtnl_notify(skb, &init_net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL);
 errout:
 	if (err < 0)
-		rtnl_set_sk_err(RTNLGRP_LINK, err);
+		rtnl_set_sk_err(&init_net, RTNLGRP_LINK, err);
 }
 
 /* Protected by RTNL sempahore.  */
@@ -1270,6 +1304,7 @@ static int rtattr_max;
 
 static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 {
+	struct net *net = skb->sk->sk_net;
 	rtnl_doit_func doit;
 	int sz_idx, kind;
 	int min_len;
@@ -1298,6 +1333,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		return -EPERM;
 
 	if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) {
+		struct sock *rtnl;
 		rtnl_dumpit_func dumpit;
 
 		dumpit = rtnl_get_dumpit(family, type);
@@ -1305,6 +1341,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			return -EOPNOTSUPP;
 
 		__rtnl_unlock();
+		rtnl = net->rtnl;
 		err = netlink_dump_start(rtnl, skb, nlh, dumpit, NULL);
 		rtnl_lock();
 		return err;
@@ -1383,6 +1420,40 @@ static struct notifier_block rtnetlink_dev_notifier = {
 	.notifier_call	= rtnetlink_event,
 };
 
+
+static int rtnetlink_net_init(struct net *net)
+{
+	struct sock *sk;
+	sk = netlink_kernel_create(net, NETLINK_ROUTE, RTNLGRP_MAX,
+				   rtnetlink_rcv, &rtnl_mutex, THIS_MODULE);
+	if (!sk)
+		return -ENOMEM;
+
+	/* Don't hold an extra reference on the namespace */
+	put_net(sk->sk_net);
+	net->rtnl = sk;
+	return 0;
+}
+
+static void rtnetlink_net_exit(struct net *net)
+{
+	struct sock *sk = net->rtnl;
+	if (sk) {
+		/* At the last minute lie and say this is a socket for the
+		 * initial network namespace.  So the socket will be safe to
+		 * free.
+		 */
+		sk->sk_net = get_net(&init_net);
+		sock_put(sk);
+		net->rtnl = NULL;
+	}
+}
+
+static struct pernet_operations rtnetlink_net_ops = {
+	.init = rtnetlink_net_init,
+	.exit = rtnetlink_net_exit,
+};
+
 void __init rtnetlink_init(void)
 {
 	int i;
@@ -1395,10 +1466,9 @@ void __init rtnetlink_init(void)
 	if (!rta_buf)
 		panic("rtnetlink_init: cannot allocate rta_buf\n");
 
-	rtnl = netlink_kernel_create(&init_net, NETLINK_ROUTE, RTNLGRP_MAX,
-				     rtnetlink_rcv, &rtnl_mutex, THIS_MODULE);
-	if (rtnl == NULL)
+	if (register_pernet_subsys(&rtnetlink_net_ops))
 		panic("rtnetlink_init: cannot initialize rtnetlink\n");
+
 	netlink_set_nonroot(NETLINK_ROUTE, NL_NONROO...
 
 |  
	|  |  |  
	| 
		
			| [PATCH 3/5] net: Make the netlink methods in rtnetlink handle multiple network namespaces [message #20981 is a reply to message #20980] | Sat, 29 September 2007 01:04   |  
			| 
				
				
					|  ebiederm Messages: 1354
 Registered: February 2006
 | Senior Member |  |  |  
	| After the previous prep work this just consists of removing checks
limiting the code to work in the initial network namespace, and
updating rtmsg_ifinfo so we can generate events for devices in
something other then the initial network namespace.
Referring to network other network devices like the IFLA_LINK
and IFLA_MASTER attributes do, gets interesting if those network
devices happen to be in other network namespaces.  Currently
ifindex numbers are allocated globally so I have taken the path
of least resistance and not still report the information even
though the devices they are talking about are invisible.
If applications start getting confused or when ifindex
numbers become local to the network namespace we may need
to do something different in the future.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 net/core/rtnetlink.c |   27 +++------------------------
 1 files changed, 3 insertions(+), 24 deletions(-)
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index fc49104..809a9fb 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -741,9 +741,6 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
 	int s_idx = cb->args[0];
 	struct net_device *dev;
 
-	if (net != &init_net)
-		return 0;
-
 	idx = 0;
 	for_each_netdev(net, dev) {
 		if (idx < s_idx)
@@ -946,9 +943,6 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	struct nlattr *tb[IFLA_MAX+1];
 	char ifname[IFNAMSIZ];
 
-	if (net != &init_net)
-		return -EINVAL;
-
 	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
 	if (err < 0)
 		goto errout;
@@ -997,9 +991,6 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	struct nlattr *tb[IFLA_MAX+1];
 	int err;
 
-	if (net != &init_net)
-		return -EINVAL;
-
 	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
 	if (err < 0)
 		return err;
@@ -1081,9 +1072,6 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 	struct nlattr *linkinfo[IFLA_INFO_MAX+1];
 	int err;
 
-	if (net != &init_net)
-		return -EINVAL;
-
 #ifdef CONFIG_KMOD
 replay:
 #endif
@@ -1210,9 +1198,6 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
 	struct sk_buff *nskb;
 	int err;
 
-	if (net != &init_net)
-		return -EINVAL;
-
 	err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy);
 	if (err < 0)
 		return err;
@@ -1248,13 +1233,9 @@ errout:
 
 static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
 {
-	struct net *net = skb->sk->sk_net;
 	int idx;
 	int s_idx = cb->family;
 
-	if (net != &init_net)
-		return 0;
-
 	if (s_idx == 0)
 		s_idx = 1;
 	for (idx=1; idx<NPROTO; idx++) {
@@ -1276,6 +1257,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
 
 void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change)
 {
+	struct net *net = dev->nd_net;
 	struct sk_buff *skb;
 	int err = -ENOBUFS;
 
@@ -1290,10 +1272,10 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change)
 		kfree_skb(skb);
 		goto errout;
 	}
-	err = rtnl_notify(skb, &init_net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL);
+	err = rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL);
 errout:
 	if (err < 0)
-		rtnl_set_sk_err(&init_net, RTNLGRP_LINK, err);
+		rtnl_set_sk_err(net, RTNLGRP_LINK, err);
 }
 
 /* Protected by RTNL sempahore.  */
@@ -1392,9 +1374,6 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi
 {
 	struct net_device *dev = ptr;
 
-	if (dev->nd_net != &init_net)
-		return NOTIFY_DONE;
-
 	switch (event) {
 	case NETDEV_UNREGISTER:
 		rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
-- 
1.5.3.rc6.17.g1911
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers |  
	|  |  |  
	| 
		
			| [PATCH 4/5] net: Make AF_PACKET handle multiple network namespaces [message #20982 is a reply to message #20981] | Sat, 29 September 2007 01:07   |  
			| 
				
				
					|  ebiederm Messages: 1354
 Registered: February 2006
 | Senior Member |  |  |  
	| This is done by making packet_sklist_lock and packet_sklist per
network namespace and adding an additional filter condition on
received packets to ensure they came from the proper network
namespace.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 include/net/net_namespace.h |    4 +
 net/packet/af_packet.c      |  129 +++++++++++++++++++++++++++---------------
 2 files changed, 87 insertions(+), 46 deletions(-)
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index f75607a..5e9fb47 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -32,6 +32,10 @@ struct net {
 	struct hlist_head	*dev_index_head;
 
 	struct sock 		*rtnl;			/* rtnetlink socket */
+
+	/* List of all packet sockets. */
+	rwlock_t		packet_sklist_lock;
+	struct hlist_head	packet_sklist;
 };
 
 #ifdef CONFIG_NET
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index e11000a..1c3a5a8 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -135,10 +135,6 @@ dev->hard_header == NULL (ll header is added by device, we cannot control it)
    packet classifier depends on it.
  */
 
-/* List of all packet sockets. */
-static HLIST_HEAD(packet_sklist);
-static DEFINE_RWLOCK(packet_sklist_lock);
-
 static atomic_t packet_socks_nr;
 
 
@@ -252,9 +248,6 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct
 	struct sock *sk;
 	struct sockaddr_pkt *spkt;
 
-	if (dev->nd_net != &init_net)
-		goto out;
-
 	/*
 	 *	When we registered the protocol we saved the socket in the data
 	 *	field for just this event.
@@ -276,6 +269,9 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct
 	if (skb->pkt_type == PACKET_LOOPBACK)
 		goto out;
 
+	if (dev->nd_net != sk->sk_net)
+		goto out;
+
 	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
 		goto oom;
 
@@ -347,7 +343,7 @@ static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
 	 */
 
 	saddr->spkt_device[13] = 0;
-	dev = dev_get_by_name(&init_net, saddr->spkt_device);
+	dev = dev_get_by_name(sk->sk_net, saddr->spkt_device);
 	err = -ENODEV;
 	if (dev == NULL)
 		goto out_unlock;
@@ -455,15 +451,15 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet
 	int skb_len = skb->len;
 	unsigned int snaplen, res;
 
-	if (dev->nd_net != &init_net)
-		goto drop;
-
 	if (skb->pkt_type == PACKET_LOOPBACK)
 		goto drop;
 
 	sk = pt->af_packet_priv;
 	po = pkt_sk(sk);
 
+	if (dev->nd_net != sk->sk_net)
+		goto drop;
+
 	skb->dev = dev;
 
 	if (dev->header_ops) {
@@ -572,15 +568,15 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
 	struct sk_buff *copy_skb = NULL;
 	struct timeval tv;
 
-	if (dev->nd_net != &init_net)
-		goto drop;
-
 	if (skb->pkt_type == PACKET_LOOPBACK)
 		goto drop;
 
 	sk = pt->af_packet_priv;
 	po = pkt_sk(sk);
 
+	if (dev->nd_net != sk->sk_net)
+		goto drop;
+
 	if (dev->header_ops) {
 		if (sk->sk_type != SOCK_DGRAM)
 			skb_push(skb, skb->data - skb_mac_header(skb));
@@ -738,7 +734,7 @@ static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
 	}
 
 
-	dev = dev_get_by_index(&init_net, ifindex);
+	dev = dev_get_by_index(sk->sk_net, ifindex);
 	err = -ENXIO;
 	if (dev == NULL)
 		goto out_unlock;
@@ -805,15 +801,17 @@ static int packet_release(struct socket *sock)
 {
 	struct sock *sk = sock->sk;
 	struct packet_sock *po;
+	struct net *net;
 
 	if (!sk)
 		return 0;
 
+	net = sk->sk_net;
 	po = pkt_sk(sk);
 
-	write_lock_bh(&packet_sklist_lock);
+	write_lock_bh(&net->packet_sklist_lock);
 	sk_del_node_init(sk);
-	write_unlock_bh(&packet_sklist_lock);
+	write_unlock_bh(&net->packet_sklist_lock);
 
 	/*
 	 *	Unhook packet receive handler.
@@ -927,7 +925,7 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int add
 		return -EINVAL;
 	strlcpy(name,uaddr->sa_data,sizeof(name));
 
-	dev = dev_get_by_name(&init_net, name);
+	dev = dev_get_by_name(sk->sk_net, name);
 	if (dev) {
 		err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
 		dev_put(dev);
@@ -954,7 +952,7 @@ static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len
 
 	if (sll->sll_ifindex) {
 		err = -ENODEV;
-		dev = dev_get_by_index(&init_net, sll->sll_ifindex);
+		dev = dev_get_by_index(sk->sk_net, sll->sll_ifindex);
 		if (dev == NULL)
 			goto out;
 	}
@@ -983,9 +981,6 @@ static int packet_create(struct net *net, struct socket *sock, int protocol)
 	__be16 proto = (__force __be16)protocol; /* weird, but documented */
 	int err;
 
-	if (net != &init_net)
-		return -EAFNOSUPPORT;
-
 	if (!capable(CAP_NET_RAW))
 		return -EPERM;
 	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
@@ -1031,9 +1026,9 @@ static int packet_create(struct net *net, struct socket *sock, int protocol)
 		po->running = 1;
 	}
 
-	write_lock_bh(&packet_sklist_lock);
-	sk_add_node(sk, &packet_sklist);
-	write_unlock_bh(&packet_sklist_lock);
+	write_lock_bh(&net->packet_sklist_lock);
+	sk_add_node(sk, &net->packet_sklist);
+	write_unlock_bh(&net->packet_sklist_lock);
 	return(0);
 out:
 	return err;
@@ -1151,7 +1146,7 @@ static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
 		return -EOPNOTSUPP;
 
 	uaddr->sa_family = AF_PACKET;
-	dev = dev_get_by_index(&init_net, pkt_sk(sk)->ifindex);
+	dev = dev_get_by_index(sk->sk_net, pkt_sk(sk)->ifindex);
 	if (dev) {
 		strlcpy(uaddr->sa_data, dev->name, 15);
 		dev_put(dev);
@@ -1176,7 +1171,7 @@ static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
 	sll->sll_family = AF_PACKET;
 	sll->sll_ifindex = po->ifindex;
 	sll->sll_protocol = po->num;
-	dev = dev_get_by_index(&init_net, po->ifindex);
+	dev = dev_get_by_index(sk->sk_net, po->ifindex);
 	if (dev) {
 		sll->sll_hatype = dev->type;
 		sll->sll_halen = dev->addr_len;
@@ -1228,7 +1223,7 @@ static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
 	rtnl_lock();
 
 	err = -ENODEV;
-	dev = __dev_get_by_index(&init_net, mreq->mr_ifindex);
+	dev = __dev_get_by_index(sk->sk_net, mreq->mr_ifindex);
 	if (!dev)
 		goto done;
 
@@ -1282,7 +1277,7 @@ static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
 			if (--ml->count == 0) {
 				struct net_device *dev;
 				*mlp = ml->next;
-				dev = dev_get_by_index(&init_net, ml->ifindex);
+				dev = dev_get_by_index(sk->sk_net, ml->ifindex);
 				if (dev) {
 					packet_dev_mc(dev, ml, -1);
 					dev_put(dev);
@@ -1310,7 +1305,7 @@ static void packet_flush_mclist(struct sock *sk)
 		struct net_device *dev;
 
 		po->mclist = ml->next;
-		if ((dev = dev_get_by_index(&init_net, ml->ifindex)) != NULL) {
+		if ((dev = dev_get_by_index(sk->sk_net, ml->ifindex)) != NULL) {
 			packet_dev_mc(dev, ml, -1);
 			dev_put(dev);
 		}
@@ -1466,12 +1461,10 @@ static int packet_notifier(struct notifier_block *this, unsigned long msg, void
 	struct sock *sk;
 	struct hlist_node *node;
 	struct net_device *dev = data;
+	struct net *net = dev->nd_net;
 
-	if (dev->nd_net != &init_net)
-		return NOTIFY_DONE;
-
-	read_lock(&packet_sklist_lock);
-	sk_for_each(sk, node, &packet_sklist) {
+	read_lock(&net->packet_sklist_lock);
+	sk_for_each(sk, node, &net->packet_sklist) {
 		struct packet_sock *po = pkt_sk(sk);
 
 		switch (msg) {
@@ -1510,7 +1503,7 @@ static int packet_notifier(struct notifier_block *this, unsigned long msg, void
 			break;
 		}
 	}
-	read_unlock(&packet_sklist_lock);
+	read_unlock(&net->packet_sklist_lock);
 	return NOTIFY_DONE;
 }
 
@@ -1878,12 +1871,12 @@ static struct notifier_block packet_netdev_notifier = {
 };
 
 #ifdef CONFIG_PROC_FS
-static inline struct sock *packet_seq_idx(loff_t off)
+static inline struct sock *packet_seq_idx(struct net *net, loff_t off)
 {
 	struct sock *s;
 	struct hlist_node *node;
 
-	sk_for_each(s, node, &packet_sklist) {
+	sk_for_each(s, node, &net->packet_sklist) {
 		if (!off--)
 			return s;
 	}
@@ -1892,21 +1885,24 @@ static inline struct sock *packet_seq_idx(loff_t off)
 
 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
 {
-	read_lock(&packet_sklist_lock);
-	return *pos ? packet_seq_idx(*pos - 1) : SEQ_START_TOKEN;
+	struct net *net = seq->private;
+	read_lock(&net->packet_sklist_lock);
+	return *pos ? packet_seq_idx(net, *pos - 1) : SEQ_START_TOKEN;
 }
 
 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
+	struct net *net = seq->private;
 	++*pos;
 	return  (v == SEQ_START_TOKEN)
-		? sk_head(&packet_sklist)
+		? sk_head(&net->packet_sklist)
 		: sk_next((struct sock*)v) ;
 }
 
 static void packet_seq_stop(struct seq_file *seq, void *v)
 {
-	read_unlock(&packet_sklist_lock);
+	struct net *net = seq->private;
+	read_unlock(&net->packet_sklist_lock);
 }
 
 static int packet_seq_show(struct seq_file *seq, void *v)
@@ -1942,7 +1938,26 @@ static const struct seq_operations packet_seq_ops = {
 
 static int packet_seq_open(struct inode *inode, struct file *file)
 {
-	return seq_open(file, &packet_seq_ops);
+	struct seq_file *seq;
+	int res;
+	res = seq_open(file, &packet_seq_ops);
+	if (!res) {
+		seq = file->private_data;
+		seq->private = get_proc_net(inode);
+		if (!seq->private) {
+			seq_release(inode, file);
+			res = -ENXIO;
+		}
+	}
+	return res;
+}
+
+static int packet_seq_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq=  file->private_data;
+	struct net *net = seq->private;
+	put_net(net);
+	return seq_release(inode, file);
 }
 
 static const struct file_operations packet_seq_fops = {
@@ -1950,15 +1965,37 @@ static const struct file_operations...
 
 |  
	|  |  |  
	| 
		
			| [PATCH 5/5] net: Make AF_UNIX per network namespace safe. [message #20983 is a reply to message #20982] | Sat, 29 September 2007 01:08   |  
			| 
				
				
					|  ebiederm Messages: 1354
 Registered: February 2006
 | Senior Member |  |  |  
	| Because of the global nature of garbage collection, and because of the
cost of per namespace hash tables unix_socket_table has been kept
global.  With a filter added on lookups so we don't see sockets from
the wrong namespace.
Currently I don't fold the namesapce into the hash so multiple
namespaces using the same socket name will be guaranteed a hash
collision.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 net/unix/af_unix.c |  113 +++++++++++++++++++++++++++++++++++++++------------
 1 files changed, 86 insertions(+), 27 deletions(-)
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 10e7312..2792ef2 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -270,7 +270,8 @@ static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
 	spin_unlock(&unix_table_lock);
 }
 
-static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname,
+static struct sock *__unix_find_socket_byname(struct net *net,
+					      struct sockaddr_un *sunname,
 					      int len, int type, unsigned hash)
 {
 	struct sock *s;
@@ -279,6 +280,9 @@ static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname,
 	sk_for_each(s, node, &unix_socket_table[hash ^ type]) {
 		struct unix_sock *u = unix_sk(s);
 
+		if (s->sk_net != net)
+			continue;
+
 		if (u->addr->len == len &&
 		    !memcmp(u->addr->name, sunname, len))
 			goto found;
@@ -288,21 +292,22 @@ found:
 	return s;
 }
 
-static inline struct sock *unix_find_socket_byname(struct sockaddr_un *sunname,
+static inline struct sock *unix_find_socket_byname(struct net *net,
+						   struct sockaddr_un *sunname,
 						   int len, int type,
 						   unsigned hash)
 {
 	struct sock *s;
 
 	spin_lock(&unix_table_lock);
-	s = __unix_find_socket_byname(sunname, len, type, hash);
+	s = __unix_find_socket_byname(net, sunname, len, type, hash);
 	if (s)
 		sock_hold(s);
 	spin_unlock(&unix_table_lock);
 	return s;
 }
 
-static struct sock *unix_find_socket_byinode(struct inode *i)
+static struct sock *unix_find_socket_byinode(struct net *net, struct inode *i)
 {
 	struct sock *s;
 	struct hlist_node *node;
@@ -312,6 +317,9 @@ static struct sock *unix_find_socket_byinode(struct inode *i)
 		    &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
 		struct dentry *dentry = unix_sk(s)->dentry;
 
+		if (s->sk_net != net)
+			continue;
+
 		if(dentry && dentry->d_inode == i)
 		{
 			sock_hold(s);
@@ -630,9 +638,6 @@ out:
 
 static int unix_create(struct net *net, struct socket *sock, int protocol)
 {
-	if (net != &init_net)
-		return -EAFNOSUPPORT;
-
 	if (protocol && protocol != PF_UNIX)
 		return -EPROTONOSUPPORT;
 
@@ -676,6 +681,7 @@ static int unix_release(struct socket *sock)
 static int unix_autobind(struct socket *sock)
 {
 	struct sock *sk = sock->sk;
+	struct net *net = sk->sk_net;
 	struct unix_sock *u = unix_sk(sk);
 	static u32 ordernum = 1;
 	struct unix_address * addr;
@@ -702,7 +708,7 @@ retry:
 	spin_lock(&unix_table_lock);
 	ordernum = (ordernum+1)&0xFFFFF;
 
-	if (__unix_find_socket_byname(addr->name, addr->len, sock->type,
+	if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
 				      addr->hash)) {
 		spin_unlock(&unix_table_lock);
 		/* Sanity yield. It is unusual case, but yet... */
@@ -722,7 +728,8 @@ out:	mutex_unlock(&u->readlock);
 	return err;
 }
 
-static struct sock *unix_find_other(struct sockaddr_un *sunname, int len,
+static struct sock *unix_find_other(struct net *net,
+				    struct sockaddr_un *sunname, int len,
 				    int type, unsigned hash, int *error)
 {
 	struct sock *u;
@@ -740,7 +747,7 @@ static struct sock *unix_find_other(struct sockaddr_un *sunname, int len,
 		err = -ECONNREFUSED;
 		if (!S_ISSOCK(nd.dentry->d_inode->i_mode))
 			goto put_fail;
-		u=unix_find_socket_byinode(nd.dentry->d_inode);
+		u=unix_find_socket_byinode(net, nd.dentry->d_inode);
 		if (!u)
 			goto put_fail;
 
@@ -756,7 +763,7 @@ static struct sock *unix_find_other(struct sockaddr_un *sunname, int len,
 		}
 	} else {
 		err = -ECONNREFUSED;
-		u=unix_find_socket_byname(sunname, len, type, hash);
+		u=unix_find_socket_byname(net, sunname, len, type, hash);
 		if (u) {
 			struct dentry *dentry;
 			dentry = unix_sk(u)->dentry;
@@ -778,6 +785,7 @@ fail:
 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 {
 	struct sock *sk = sock->sk;
+	struct net *net = sk->sk_net;
 	struct unix_sock *u = unix_sk(sk);
 	struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
 	struct dentry * dentry = NULL;
@@ -852,7 +860,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 
 	if (!sunaddr->sun_path[0]) {
 		err = -EADDRINUSE;
-		if (__unix_find_socket_byname(sunaddr, addr_len,
+		if (__unix_find_socket_byname(net, sunaddr, addr_len,
 					      sk->sk_type, hash)) {
 			unix_release_addr(addr);
 			goto out_unlock;
@@ -918,6 +926,7 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
 			      int alen, int flags)
 {
 	struct sock *sk = sock->sk;
+	struct net *net = sk->sk_net;
 	struct sockaddr_un *sunaddr=(struct sockaddr_un*)addr;
 	struct sock *other;
 	unsigned hash;
@@ -934,7 +943,7 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
 			goto out;
 
 restart:
-		other=unix_find_other(sunaddr, alen, sock->type, hash, &err);
+		other=unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
 		if (!other)
 			goto out;
 
@@ -1014,6 +1023,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 {
 	struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
 	struct sock *sk = sock->sk;
+	struct net *net = sk->sk_net;
 	struct unix_sock *u = unix_sk(sk), *newu, *otheru;
 	struct sock *newsk = NULL;
 	struct sock *other = NULL;
@@ -1053,7 +1063,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 
 restart:
 	/*  Find listening sock. */
-	other = unix_find_other(sunaddr, addr_len, sk->sk_type, hash, &err);
+	other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
 	if (!other)
 		goto out;
 
@@ -1329,6 +1339,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
 {
 	struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
 	struct sock *sk = sock->sk;
+	struct net *net = sk->sk_net;
 	struct unix_sock *u = unix_sk(sk);
 	struct sockaddr_un *sunaddr=msg->msg_name;
 	struct sock *other = NULL;
@@ -1392,7 +1403,7 @@ restart:
 		if (sunaddr == NULL)
 			goto out_free;
 
-		other = unix_find_other(sunaddr, namelen, sk->sk_type,
+		other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
 					hash, &err);
 		if (other==NULL)
 			goto out_free;
@@ -1998,12 +2009,18 @@ static unsigned int unix_poll(struct file * file, struct socket *sock, poll_tabl
 
 
 #ifdef CONFIG_PROC_FS
-static struct sock *unix_seq_idx(int *iter, loff_t pos)
+struct unix_iter_state {
+	struct net *net;
+	int i;
+};
+static struct sock *unix_seq_idx(struct unix_iter_state *iter, loff_t pos)
 {
 	loff_t off = 0;
 	struct sock *s;
 
-	for (s = first_unix_socket(iter); s; s = next_unix_socket(iter, s)) {
+	for (s = first_unix_socket(&iter->i); s; s = next_unix_socket(&iter->i, s)) {
+		if (s->sk_net != iter->net)
+			continue;
 		if (off == pos)
 			return s;
 		++off;
@@ -2014,17 +2031,24 @@ static struct sock *unix_seq_idx(int *iter, loff_t pos)
 
 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
 {
+	struct unix_iter_state *iter = seq->private;
 	spin_lock(&unix_table_lock);
-	return *pos ? unix_seq_idx(seq->private, *pos - 1) : ((void *) 1);
+	return *pos ? unix_seq_idx(iter, *pos - 1) : ((void *) 1);
 }
 
 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
+	struct unix_iter_state *iter = seq->private;
+	struct sock *sk = v;
 	++*pos;
 
 	if (v == (void *)1)
-		return first_unix_socket(seq->private);
-	return next_unix_socket(seq->private, v);
+		sk = first_unix_socket(&iter->i);
+	else
+		sk = next_unix_socket(&iter->i, sk);
+	while (sk && (sk->sk_net != iter->net))
+		sk = next_unix_socket(&iter->i, sk);
+	return sk;
 }
 
 static void unix_seq_stop(struct seq_file *seq, void *v)
@@ -2088,7 +2112,7 @@ static int unix_seq_open(struct inode *inode, struct file *file)
 {
 	struct seq_file *seq;
 	int rc = -ENOMEM;
-	int *iter = kmalloc(sizeof(int), GFP_KERNEL);
+	struct unix_iter_state *iter = kmalloc(sizeof(*iter), GFP_KERNEL);
 
 	if (!iter)
 		goto out;
@@ -2099,7 +2123,12 @@ static int unix_seq_open(struct inode *inode, struct file *file)
 
 	seq	     = file->private_data;
 	seq->private = iter;
-	*iter = 0;
+	iter->i = 0;
+	iter->net = get_proc_net(inode);
+	if (!iter->net) {
+		seq_release_private(inode, file);
+		rc = -ENXIO;
+	}
 out:
 	return rc;
 out_kfree:
@@ -2107,12 +2136,20 @@ out_kfree:
 	goto out;
 }
 
+static int unix_seq_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = file->private_data;
+	struct unix_iter_state *iter = seq->private;
+	put_net(iter->net);
+	return seq_release_private(inode, file);
+}
+
 static const struct file_operations unix_seq_fops = {
 	.owner		= THIS_MODULE,
 	.open		= unix_seq_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= seq_release_private,
+	.release	= unix_seq_release,
 };
 
 #endif
@@ -2123,6 +2160,30 @@ static struct net_proto_family unix_family_ops = {
 	.owner	= THIS_MODULE,
 };
 
+
+static int unix_net_init(struct net *net)
+{
+	int error = -ENOMEM;
+
+#ifdef CONFIG_PROC_FS
+	if (!proc_net_fops_create(net, "unix", 0, &unix_seq_fops))
+		goto out;
+#endif
+	error = 0;
+out:
+	return 0;
+}
+
+static void unix_net_exit(struct net *net)
+{
+	proc_net_remove(net, "unix"...
 
 |  
	|  |  |  
	|  |  
	|  |  
	|  |  
	|  |  
	|  |  
	|  |  
	|  |  
	|  |  
	|  |  
	| 
		
			| Re:  Re: [PATCH 2/5] net: Make rtnetlink infrastructure network namespace aware [message #21026 is a reply to message #21017] | Mon, 01 October 2007 08:26   |  
			| 
				
				
					|  den Messages: 494
 Registered: December 2005
 | Senior Member |  |  |  
	| Patrick McHardy wrote:
> Eric W. Biederman wrote:
>> Patrick McHardy <kaber@trash.net> writes:
>>
>>
>>> Maybe I can save you some time: we used to do down_trylock()
>>> for the rtnl mutex, so senders would simply return if someone
>>> else was already processing the queue *or* the rtnl was locked
>>> for some other reason. In the first case the process already
>>> processing the queue would also process the new messages, but
>>> if it the rtnl was locked for some other reason (for example
>>> during module registration) the message would sit in the
>>> queue until the next rtnetlink sendmsg call, which is why
>>> rtnl_unlock does queue processing. Commit 6756ae4b changed
>>> the down_trylock to mutex_lock, so senders will now simply wait
>>> until the mutex is released and then call netlink_run_queue
>>> themselves. This means its not needed anymore.
>>
>> Sounds reasonable.
>>
>> I started looking through the code paths and I currently cannot
>> see anything that would leave a message on a kernel rtnl socket.
>>
>> However I did a quick test adding a WARN_ON if there were any messages
>> found in the queue during rtnl_unlock and I found this code path
>> getting invoked from linkwatch_event.  So there is clearly something I
>> don't understand, and it sounds at odds just a bit from your
>> description.
> 
> 
> That sounds like a bug. Did you place the WARN_ON before or after
> the mutex_unlock()?
The presence of the message in the queue during rtnl_unlock is quite
possible as normal user->kernel message processing path for rtnl is the
following:
netlink_sendmsg
   netlink_unicast
      netlink_sendskb
          skb_queue_tail
          netlink_data_ready
              rtnetlink_rcv
                  mutex_lock(&rtnl_mutex);
                  netlink_run_queue(sk, qlen, &rtnetlink_rcv_msg);
                  mutex_unlock(&rtnl_mutex);
so, the presence of the packet in the rtnl queue on rtnl_unlock is
normal race with a rtnetlink_rcv for me.
Regards,
	Den
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers |  
	|  |  |  
	|  |  
	| 
		
			| Re:  [PATCH 1/5] net: Modify all rtnetlink methods to only work	in the initial namespace [message #21524 is a reply to message #20979] | Wed, 10 October 2007 12:33   |  
			| 
				
				
					|  den Messages: 494
 Registered: December 2005
 | Senior Member |  |  |  
	| Eric W. Biederman wrote:
> Before I can enable rtnetlink to work in all network namespaces
> I need to be certain that something won't break.  So this
> patch deliberately disables all of the rtnletlink methods in everything
> except the initial network namespace.  After the methods have been
> audited this extra check can be disabled.
>
[...]
>  static int br_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
>  {
> +	struct net *net = skb->sk->sk_net;
>  	struct net_device *dev;
>  	int idx;
>  
I've read some code today greping 'init_net.loopback_dev' and found
interesting non-trivial for me issue.
Network namespace is extracted from the packet in two different ways in
TCP. This is a socket for outgoing path and a device for incoming.
Though, there are some places called uniformly both from incoming and
outgoing path.
Typical example is netfilters. They are called uniformly all around the
code. The prototype is the following:
static unsigned int reject6_target(struct sk_buff **pskb,
                           const struct net_device *in,
                           const struct net_device *out,
                           unsigned int hooknum,
                           const struct xt_target *target,
                           const void *targinfo);
So, we are bound to the following options:
- perform additional non-uniform hacks around to place 'struct net' into
  other and other structures like xt_target
- add 7th parameter here and over
- introduce an skb_net field in the 'struct sk_buff' making all code
  uniform, at least when we have an skb
I think that this is not the last place with such a parameter list and
we should make a decision at this point when the code in not mainline yet.
As far as I understand, netfilters are not touched by the Eric and we
can face some non-trivial problems there.
So, if my point about uniformity is valid, this patchset looks wrong and
should be re-worked :(
Regards,
	Den |  
	|  |  |  
	| 
		
			| Re:  [PATCH 1/5] net: Modify all rtnetlink methods to only work	in the initial namespace [message #21529 is a reply to message #21524] | Wed, 10 October 2007 14:05   |  
			| 
				
				
					|  Daniel Lezcano Messages: 417
 Registered: June 2006
 | Senior Member |  |  |  
	| Denis V. Lunev wrote:
> Eric W. Biederman wrote:
>> Before I can enable rtnetlink to work in all network namespaces
>> I need to be certain that something won't break.  So this
>> patch deliberately disables all of the rtnletlink methods in everything
>> except the initial network namespace.  After the methods have been
>> audited this extra check can be disabled.
>>
> [...]
>>  static int br_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
>>  {
>> +	struct net *net = skb->sk->sk_net;
>>  	struct net_device *dev;
>>  	int idx;
>>  
> 
> I've read some code today greping 'init_net.loopback_dev' and found
> interesting non-trivial for me issue.
> 
> Network namespace is extracted from the packet in two different ways in
> TCP. This is a socket for outgoing path and a device for incoming.
> Though, there are some places called uniformly both from incoming and
> outgoing path.
> 
> Typical example is netfilters. They are called uniformly all around the
> code. The prototype is the following:
> 
> static unsigned int reject6_target(struct sk_buff **pskb,
>                            const struct net_device *in,
>                            const struct net_device *out,
>                            unsigned int hooknum,
>                            const struct xt_target *target,
>                            const void *targinfo);
> 
Thanks Denis for auditing the code.
As far as I see, struct net_device *in is NULL for outgoing traffic and 
struct net_device *out is NULL for ingress traffic. Except for the 
FORWARD rules where both are filled. If we are following network 
namespace semantic, we should not have two network devices belonging to 
two differents namespaces, right ?
In this case, the following line of code should be sufficient to 
retrieve the network namespace, no ?
struct net *net = in?in->nd_net:out->nd_net;
> So, we are bound to the following options:
> - perform additional non-uniform hacks around to place 'struct net' into
>   other and other structures like xt_target
> - add 7th parameter here and over
> - introduce an skb_net field in the 'struct sk_buff' making all code
>   uniform, at least when we have an skb
> 
> I think that this is not the last place with such a parameter list and
> we should make a decision at this point when the code in not mainline yet.
> 
> As far as I understand, netfilters are not touched by the Eric and we
> can face some non-trivial problems there.
In Eric's git tree:
http://git.kernel.org/?p=linux/kernel/git/ebiederm/linux-2.6-netns.git
There are some modifications concerning 
net/ipv4/netfiler/iptable_filter.c and at the ipt_hook function, there is:
struct net *net = (in?in:out)->nd_net;
> So, if my point about uniformity is valid, this patchset looks wrong and
> should be re-worked :(
As Eric said, we want to build the network namespace step by step, 
taking care of not breaking the init network namespace.
If you want to make iptables per namespace or catch problems before the 
code goes to Dave's tree, IMHO it will be more convenient to post to 
containers@ the patches against netns49, where the modifications will be 
in a network namespace big picture.
Regards.
   -- Daniel
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers |  
	|  |  |  
	| 
		
			| Re:  [PATCH 1/5] net: Modify all rtnetlink methods to only work	in the initial namespace [message #21532 is a reply to message #21529] | Wed, 10 October 2007 14:29   |  
			| 
				
				
					|  den Messages: 494
 Registered: December 2005
 | Senior Member |  |  |  
	| Daniel Lezcano wrote:
> struct net *net = in?in->nd_net:out->nd_net;
> 
>> So, we are bound to the following options:
>> - perform additional non-uniform hacks around to place 'struct net' into
>>   other and other structures like xt_target
>> - add 7th parameter here and over
>> - introduce an skb_net field in the 'struct sk_buff' making all code
>>   uniform, at least when we have an skb
>>
>> I think that this is not the last place with such a parameter list and
>> we should make a decision at this point when the code in not mainline
>> yet.
>>
>> As far as I understand, netfilters are not touched by the Eric and we
>> can face some non-trivial problems there.
> 
> In Eric's git tree:
> http://git.kernel.org/?p=linux/kernel/git/ebiederm/linux-2.6-netns.git
> 
> There are some modifications concerning
> net/ipv4/netfiler/iptable_filter.c and at the ipt_hook function, there is:
> 
> struct net *net = (in?in:out)->nd_net;
> 
>> So, if my point about uniformity is valid, this patchset looks wrong and
>> should be re-worked :(
> 
> As Eric said, we want to build the network namespace step by step,
> taking care of not breaking the init network namespace.
> 
> If you want to make iptables per namespace or catch problems before the
> code goes to Dave's tree, IMHO it will be more convenient to post to
> containers@ the patches against netns49, where the modifications will be
> in a network namespace big picture.
> 
my point is somewhat another. Yes, this is enough for that place. If so,
I must scatter these checks all around in the netfilters code. Brr.
In forward chain the situation is different for Layer3 switching. Let's
assume that we have an OpenVZ scheme, where the packet flows from socket
to device and after that from device to device via forwarding path. You
can't call skb_orphan on namespace switching as this breaks UDP flow
regulation. Virtual network device is fast while real Ethernet is slow,
packets will be dropped on queue in real device. So, the situation with
packet on send path with a socket from other namespace is possible :(
I just pray for uniformity to concentrate on the code rather than on
guesses on which path we are :(
Regards,
	Den
_______________________________________________
Containers mailing list
Containers@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers |  
	|  |  |  
	| 
		
			| Re:  [PATCH 1/5] net: Modify all rtnetlink methods to only work	in the initial namespace [message #21548 is a reply to message #21524] | Wed, 10 October 2007 19:37   |  
			| 
				
				
					|  ebiederm Messages: 1354
 Registered: February 2006
 | Senior Member |  |  |  
	| "Denis V. Lunev" <den@sw.ru> writes:
> Eric W. Biederman wrote:
>> Before I can enable rtnetlink to work in all network namespaces
>> I need to be certain that something won't break.  So this
>> patch deliberately disables all of the rtnletlink methods in everything
>> except the initial network namespace.  After the methods have been
>> audited this extra check can be disabled.
>>
> [...]
>>  static int br_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
>>  {
>> +	struct net *net = skb->sk->sk_net;
>>  	struct net_device *dev;
>>  	int idx;
>>  
>
> I've read some code today greping 'init_net.loopback_dev' and found
> interesting non-trivial for me issue.
>
> Network namespace is extracted from the packet in two different ways in
> TCP. This is a socket for outgoing path and a device for incoming.
> Though, there are some places called uniformly both from incoming and
> outgoing path.
>
> Typical example is netfilters. They are called uniformly all around the
> code. The prototype is the following:
>
> static unsigned int reject6_target(struct sk_buff **pskb,
>                            const struct net_device *in,
>                            const struct net_device *out,
>                            unsigned int hooknum,
>                            const struct xt_target *target,
>                            const void *targinfo);
>
> So, we are bound to the following options:
> - perform additional non-uniform hacks around to place 'struct net' into
>   other and other structures like xt_target
> - add 7th parameter here and over
> - introduce an skb_net field in the 'struct sk_buff' making all code
>   uniform, at least when we have an skb
No.  That bloats a sk_buff, changes the semantics of moving a skb
around, and decreases performance (because we have to maintain the
field on a fast path).  
There will not be a skb_net field.
The entire concept of skb_net is a maintenance disaster.
> I think that this is not the last place with such a parameter list and
> we should make a decision at this point when the code in not mainline yet.
Certainly that is what I have a proof of concept tree for.  So we can
see how these things look before we merge them. 
> As far as I understand, netfilters are not touched by the Eric and we
> can face some non-trivial problems there.
No.  In my proof of concept tree I should have working per network
namespace netfilter code.  My intention was to just do enough to see
what the impact would be so most of the netfilter code (in my tree)
insists on running in the initial network namespace.  But there are
a few pieces that are fully converted.  Please take a look.
> So, if my point about uniformity is valid, this patchset looks wrong and
> should be re-worked :(
This patchset does need to get rebased on top of net-2.6.25 when it
opens and hopefully your patchset to remove the unnecessary work in
rtnl_unlock, and to really process netlink requests in process
context.  I see a need for the more fundamental change you seem to
be advocating.
Differentiating between the incoming and the outgoing code paths is
something we already do permission checking, for locking, for
sleeping, etc.  Modifying the code requires reading and understanding
it in context.  That is the nature of code.
This does make large patches going across the entire networking stack
making something a network namespace parameter difficult, but it
should not cause any problem for maintenance or other work on
the code.  As shown by the fact that even outside the tree rebasing my
network namespace patches has not been all that difficult.
So no I don't think uniformity, or beauty or elegance is what we
are after right now.  Trying to hard in that direction ultimately
obfuscates the code.
What we want is something that is simple, straight forward, and
doesn't require you to be an expert in network namespaces to
understand the code or the patches.
In the particular case of the netfilter hooks we don't have a
network namespace parameter laying around before we call NF_HOOK,
and the idiom "net = (in?in:out)->nd_net" seems perfectly accurate
so it seems reasonable to me to derive the network namespace that
way in generic code.  Although thinking about this.  We know which
hooks we are being called from so we may in fact actually know
if which of in or out must be valid when we get to the netfilter
hook.
Eric |  
	|  |  |  
	| 
		
			| Re:  [PATCH 1/5] net: Modify all rtnetlink methods to only work	in the initial namespace [message #21572 is a reply to message #21548] | Thu, 11 October 2007 06:35   |  
			| 
				
				
					|  den Messages: 494
 Registered: December 2005
 | Senior Member |  |  |  
	| Eric W. Biederman wrote:
[..]
>> - introduce an skb_net field in the 'struct sk_buff' making all code
>>   uniform, at least when we have an skb
> 
> No.  That bloats a sk_buff, changes the semantics of moving a skb
> around, and decreases performance (because we have to maintain the
> field on a fast path).  
> 
> There will not be a skb_net field.
> 
> The entire concept of skb_net is a maintenance disaster.
> 
>> I think that this is not the last place with such a parameter list and
>> we should make a decision at this point when the code in not mainline yet.
> 
> Certainly that is what I have a proof of concept tree for.  So we can
> see how these things look before we merge them. 
> 
>> As far as I understand, netfilters are not touched by the Eric and we
>> can face some non-trivial problems there.
> 
> No.  In my proof of concept tree I should have working per network
> namespace netfilter code.  My intention was to just do enough to see
> what the impact would be so most of the netfilter code (in my tree)
> insists on running in the initial network namespace.  But there are
> a few pieces that are fully converted.  Please take a look.
> 
>> So, if my point about uniformity is valid, this patchset looks wrong and
>> should be re-worked :(
> 
> This patchset does need to get rebased on top of net-2.6.25 when it
> opens and hopefully your patchset to remove the unnecessary work in
> rtnl_unlock, and to really process netlink requests in process
> context.  I see a need for the more fundamental change you seem to
> be advocating.
I see that current patchset of RTNL code is attached. I'll start the
next piece of work next week after some reaction from people.
[..]
> In the particular case of the netfilter hooks we don't have a
> network namespace parameter laying around before we call NF_HOOK,
> and the idiom "net = (in?in:out)->nd_net" seems perfectly accurate
> so it seems reasonable to me to derive the network namespace that
> way in generic code.  Although thinking about this.  We know which
> hooks we are being called from so we may in fact actually know
> if which of in or out must be valid when we get to the netfilter
> hook.
I understand your position. But still have some points :)
First. A real-life usecase we have recently fixed in the OpenVZ. I have
described it in the previous post, but (may be) not in great details.
UDP buffers management for outgoing traffic.
The packet carries over a destructor, which is called in skb_orphan in
the real networking device. This destructor allows to queue more
packets. There is no problem in the current implementation. But there is
one after namespace introduction in the following configuration:
    [NS1] <-> [NS2] <-> [world]
There are two namespaces on the host. One of them is connected to the
outside world via another and the packet follows usual forwarding path
in NS2.
The problem: if we call skb_orphan in [NS1] outgoing device, there is a
great possibility to have a really huge packet drop in [NS2] on queuing
operation.
In OpenVZ we have added skb_orphan into receive path (tcp_v4_rcv,
__udp4_lib_rcv etc) and removed one from our virtual devices. As
unfortunate side effect we have packet in NS2 with a socket from NS1.
So, we should have an architectural solution for this from a beginning.
It will be too late to hack around and change namespace lookup scheme.
I see that we should adopt a generic approach:
- each concrete packet belongs to a concrete namespace
- if function has a packet as a parameter, we should get namespace from
packet
- if skb->dev is present we should get namespace as skb->dev->nd_net
- otherwise we should get skb->sk->sk_net. If skb->sk is NULL -> this is
 a bug
So, for the case of all netfilter calls we'll have a pskb->dev->nd_net
defined correctly.
Regards,
	Den |  
	|  |  |  
	| 
		
			| Re:  [PATCH 1/5] net: Modify all rtnetlink methods to only work	in the initial namespace [message #21573 is a reply to message #21572] | Thu, 11 October 2007 07:57   |  
			| 
				
				
					|  ebiederm Messages: 1354
 Registered: February 2006
 | Senior Member |  |  |  
	| "Denis V. Lunev" <den@sw.ru> writes:
>> This patchset does need to get rebased on top of net-2.6.25 when it
>> opens and hopefully your patchset to remove the unnecessary work in
>> rtnl_unlock, and to really process netlink requests in process
>> context.  I see a need for the more fundamental change you seem to
>> be advocating.
Grr.  That last sentence should have been I do not see a need for the more
fundamental change you seem to be advocating.
> I see that current patchset of RTNL code is attached. I'll start the
> next piece of work next week after some reaction from people.
> I understand your position. But still have some points :)
>
> First. A real-life usecase we have recently fixed in the OpenVZ. I have
> described it in the previous post, but (may be) not in great details.
>
> UDP buffers management for outgoing traffic.
> The packet carries over a destructor, which is called in skb_orphan in
> the real networking device. This destructor allows to queue more
> packets. There is no problem in the current implementation. But there is
> one after namespace introduction in the following configuration:
>     [NS1] <-> [NS2] <-> [world]
> There are two namespaces on the host. One of them is connected to the
> outside world via another and the packet follows usual forwarding path
> in NS2.
Yes.  We can't quite do that today because of the missing ipv4 support,
but the basic infrastructure exists to describe this is already
merged.
> The problem: if we call skb_orphan in [NS1] outgoing device, there is a
> great possibility to have a really huge packet drop in [NS2] on queuing
> operation.
Yes.  Removing skb_orphan from veth to solve this looks like a
pretty substantial hack.  A clean solution is more likely to resemble
ethernet pause frames so we temporarily plug the virtual device.
Although there are issues with that as currently virtual devices
don't have queues.
> In OpenVZ we have added skb_orphan into receive path (tcp_v4_rcv,
> __udp4_lib_rcv etc) and removed one from our virtual devices. As
> unfortunate side effect we have packet in NS2 with a socket from NS1.
That also does some really nasty things to accounting.  Using
the macvlan devices solve all of this rather neatly and with
higher performance.
> So, we should have an architectural solution for this from a beginning.
> It will be too late to hack around and change namespace lookup scheme.
However what you are specifically concerned about seems to be
using skb->sk->sk_net.  Currently the only place I use this
is in the rtnetlink code because the functions that process packets
are not also passed a socket.  Those functions we could easily
pass in an explicit namespace or a socket parameter, and so those
functions would not care.
> I see that we should adopt a generic approach:
> - each concrete packet belongs to a concrete namespace
> - if function has a packet as a parameter, we should get namespace from
> packet
This approach when suggested in earlier review was pretty
substantially shot down.  Shrinking the size of struct sk_buff is
currently an ongoing todo for the linux networking stack.
> - if skb->dev is present we should get namespace as skb->dev->nd_net
> - otherwise we should get skb->sk->sk_net. If skb->sk is NULL -> this is
>  a bug
Currently killing skb->dev is a todo list item, so using it more is
probably the wrong approach.
I do agree the duplication information between fields on the skb
and fields passed to functions is a pain.  Moving more onto the
skb seems contrary to the how the rest of the networking stack would
like to go, so it is likely better to simply remove skb->dev and pass
an explicit dev parameter instead.  Please note that parameters passed
explicitly will live in registers and will be quite fast.
> So, for the case of all netfilter calls we'll have a pskb->dev->nd_net
> defined correctly.
That would certainly be a nice extra, but that really seems the wrong
way to go.  I would rather add an explicit struct net *net parameter
to nf_hookfn. 
Eric |  
	|  |  |  
	| 
		
			| Re:  [PATCH 1/5] net: Modify all rtnetlink methods to only work	in the initial namespace [message #21582 is a reply to message #21573] | Thu, 11 October 2007 12:17   |  
			| 
				
				
					|  den Messages: 494
 Registered: December 2005
 | Senior Member |  |  |  
	| Eric W. Biederman wrote:
> "Denis V. Lunev" <den@sw.ru> writes:
> 
>>> This patchset does need to get rebased on top of net-2.6.25 when it
>>> opens and hopefully your patchset to remove the unnecessary work in
>>> rtnl_unlock, and to really process netlink requests in process
>>> context.  I see a need for the more fundamental change you seem to
>>> be advocating.
> 
> Grr.  That last sentence should have been I do not see a need for the more
> fundamental change you seem to be advocating.
why?
>> First. A real-life usecase we have recently fixed in the OpenVZ. I have
>> described it in the previous post, but (may be) not in great details.
>>
>> UDP buffers management for outgoing traffic.
>> The packet carries over a destructor, which is called in skb_orphan in
>> the real networking device. This destructor allows to queue more
>> packets. There is no problem in the current implementation. But there is
>> one after namespace introduction in the following configuration:
>>     [NS1] <-> [NS2] <-> [world]
>> There are two namespaces on the host. One of them is connected to the
>> outside world via another and the packet follows usual forwarding path
>> in NS2.
> 
> Yes.  We can't quite do that today because of the missing ipv4 support,
> but the basic infrastructure exists to describe this is already
> merged.
> 
>> The problem: if we call skb_orphan in [NS1] outgoing device, there is a
>> great possibility to have a really huge packet drop in [NS2] on queuing
>> operation.
> 
> Yes.  Removing skb_orphan from veth to solve this looks like a
> pretty substantial hack.  A clean solution is more likely to resemble
> ethernet pause frames so we temporarily plug the virtual device.
> Although there are issues with that as currently virtual devices
> don't have queues.
> 
>> In OpenVZ we have added skb_orphan into receive path (tcp_v4_rcv,
>> __udp4_lib_rcv etc) and removed one from our virtual devices. As
>> unfortunate side effect we have packet in NS2 with a socket from NS1.
> 
> That also does some really nasty things to accounting.  Using
> the macvlan devices solve all of this rather neatly and with
> higher performance.
veth is not mainly affected, if it is connected to a bridge there will
be no problem. You are talking about decision to switch/choose VE on
level 2, I am told about layer 3, i.e. at the moment of routing/based on
IP. So,
I do not understand how this will help me in my usecase. The macvlan
device transmit the packet into real device. OK. But it does not help if
I want to setup router in one namespace for another.
Basically, queue stop will not help, as routing namespace can have two
interfaces, one fast and one slow. In your case we should stop accepting
the message based on the slowest one?
>> So, we should have an architectural solution for this from a beginning.
>> It will be too late to hack around and change namespace lookup scheme.
> 
> However what you are specifically concerned about seems to be
> using skb->sk->sk_net.  Currently the only place I use this
> is in the rtnetlink code because the functions that process packets
> are not also passed a socket.  Those functions we could easily
> pass in an explicit namespace or a socket parameter, and so those
> functions would not care.
> 
>> I see that we should adopt a generic approach:
>> - each concrete packet belongs to a concrete namespace
>> - if function has a packet as a parameter, we should get namespace from
>> packet
> 
> This approach when suggested in earlier review was pretty
> substantially shot down.  Shrinking the size of struct sk_buff is
> currently an ongoing todo for the linux networking stack.
> 
>> - if skb->dev is present we should get namespace as skb->dev->nd_net
>> - otherwise we should get skb->sk->sk_net. If skb->sk is NULL -> this is
>>  a bug
> 
> Currently killing skb->dev is a todo list item, so using it more is
> probably the wrong approach.
> 
> I do agree the duplication information between fields on the skb
> and fields passed to functions is a pain.  Moving more onto the
> skb seems contrary to the how the rest of the networking stack would
> like to go, so it is likely better to simply remove skb->dev and pass
> an explicit dev parameter instead.  Please note that parameters passed
> explicitly will live in registers and will be quite fast.
how much registers do you have on i386 for this? the call found in my
original letter already has 6. Additionally, we can be shot by embedded
people arguing about 1 more argument and additional not needed for them
code.... And namespace code becomes unremovable one by usual ifdef way.
Regards,
	Den |  
	|  |  |  
	| 
		
			| Re:  [PATCH 1/5] net: Modify all rtnetlink methods to only work	in the initial namespace [message #21589 is a reply to message #21582] | Thu, 11 October 2007 17:08   |  
			| 
				
				
					|  ebiederm Messages: 1354
 Registered: February 2006
 | Senior Member |  |  |  
	| "Denis V. Lunev" <den@sw.ru> writes:
>> Grr.  That last sentence should have been I do not see a need for the more
>> fundamental change you seem to be advocating.
>
> why?
Because I do not see the need.  There are certainly details that can be
improved, but you seem to be talking about ripping everything out, ignoring
the reviews and the acceptance that has happened and starting from
scratch all over again.
I don't see the need to start again from scratch.
> veth is not mainly affected, if it is connected to a bridge there will
> be no problem. You are talking about decision to switch/choose VE on
> level 2, I am told about layer 3, i.e. at the moment of routing/based on
> IP. So,
>
> I do not understand how this will help me in my usecase. The macvlan
> device transmit the packet into real device. OK. But it does not help if
> I want to setup router in one namespace for another.
>
> Basically, queue stop will not help, as routing namespace can have two
> interfaces, one fast and one slow. In your case we should stop accepting
> the message based on the slowest one?
You are proposing a solution that only works for namespaces, while the
problem continues to exist in all other routing cases.  Solving this
in a namespace specific way seems to reduce the value of using
namespaces for testing weird aspects of the networking stack.  And
except for OpenVZ this is not a common case for anyone yet, so it
seems a seriously premature optimization.  Given that it could only
be one path having something like ECN for UDP to slow the transmitter
down would be nice.
The point of the macvlan example is that I believe that will be our
common case when things network namespaces are deployed in the real
world.  Both because macvlans are easier to setup then routing or
bridging, and because they perform better.
Further this problem feels like an application bug to me.  Going
from a fast to a slow network when you are saturating the fast network
with UDP packets will cause packets to be dropped.  If this is a
problem the application should slow down it's packet transmission rate
or at least wait for an ACK periodically.
> how much registers do you have on i386 for this? the call found in my
> original letter already has 6. Additionally, we can be shot by embedded
> people arguing about 1 more argument and additional not needed for them
> code.... And namespace code becomes unremovable one by usual ifdef
> way.
Mucking with data on the skb has all kinds of potential for slowing
down the fast path of the networking stack, and if the networking
stack is slow the embedded people won't even use it.  So as much
as I am sympathetic to preventing code size growth, maintenance and
performance are higher priorities.  Especially when you are advocating
growth in the fast path, while I am only advocating growth in the slow
path.
As for i386 stack accesses are optimized almost as well as registers,
so function parameters are still not a bad deal.
>From the first couple of rounds of review the clear message was:
Maintainable and comprehensible code and don't touch the fast
path.
If to achieve the above I have to use a solution that distresses
the embedded people my apologies.
Eric |  
	|  |  |  
	|  | 
 
 
 Current Time: Sun Oct 26 20:14:21 GMT 2025 
 Total time taken to generate the page: 0.08441 seconds |