| Home » Mailing lists » Devel » [patch 1/4] Network namespaces: cleanup of dev_base list use Goto Forum:
	| 
		
			| [patch 1/4] Network namespaces: cleanup of dev_base list use [message #3967] | Mon, 26 June 2006 09:49  |  
			| 
				
				
					|  Andrey Savochkin Messages: 47
 Registered: December 2005
 | Member |  |  |  
	| Cleanup of dev_base list use, with the aim to make device list per-namespace. In almost every occasion, use of dev_base variable and dev->next pointer
 could be easily replaced by for_each_netdev loop.
 A few most complicated places were converted to using
 first_netdev()/next_netdev().
 
 Signed-off-by: Andrey Savochkin <saw@swsoft.com>
 ---
 arch/s390/appldata/appldata_net_sum.c |    2
 arch/sparc64/solaris/ioctl.c          |    2
 drivers/block/aoe/aoecmd.c            |    8 ++-
 drivers/net/wireless/strip.c          |    4 -
 drivers/parisc/led.c                  |    2
 include/linux/netdevice.h             |   28 +++++++++++--
 net/8021q/vlan.c                      |    4 -
 net/8021q/vlanproc.c                  |   10 ++--
 net/bridge/br_if.c                    |    4 -
 net/bridge/br_ioctl.c                 |    4 +
 net/bridge/br_netlink.c               |    3 -
 net/core/dev.c                        |   70 ++++++++++++----------------------
 net/core/dev_mcast.c                  |    4 -
 net/core/rtnetlink.c                  |   18 ++++----
 net/decnet/af_decnet.c                |   11 +++--
 net/decnet/dn_dev.c                   |   17 ++++----
 net/decnet/dn_fib.c                   |    2
 net/decnet/dn_route.c                 |   12 ++---
 net/ipv4/devinet.c                    |   15 ++++---
 net/ipv4/igmp.c                       |   25 +++++++-----
 net/ipv6/addrconf.c                   |   28 ++++++++-----
 net/ipv6/anycast.c                    |   22 ++++++----
 net/ipv6/mcast.c                      |   20 +++++----
 net/llc/llc_core.c                    |    7 ++-
 net/netrom/nr_route.c                 |    4 -
 net/rose/rose_route.c                 |    8 ++-
 net/sched/sch_api.c                   |    8 ++-
 net/sctp/protocol.c                   |    2
 net/tipc/eth_media.c                  |   12 +++--
 29 files changed, 200 insertions, 156 deletions
 
 --- ./arch/s390/appldata/appldata_net_sum.c.vedevbase	Mon Mar 20 08:53:29 2006
 +++ ./arch/s390/appldata/appldata_net_sum.c	Thu Jun 22 12:03:07 2006
 @@ -108,7 +108,7 @@ static void appldata_get_net_sum_data(vo
 tx_dropped = 0;
 collisions = 0;
 read_lock(&dev_base_lock);
 -	for (dev = dev_base; dev != NULL; dev = dev->next) {
 +	for_each_netdev(dev) {
 if (dev->get_stats == NULL) {
 continue;
 }
 --- ./arch/sparc64/solaris/ioctl.c.vedevbase	Mon Mar 20 08:53:29 2006
 +++ ./arch/sparc64/solaris/ioctl.c	Thu Jun 22 12:03:07 2006
 @@ -686,7 +686,7 @@ static inline int solaris_i(unsigned int
 int i = 0;
 
 read_lock_bh(&dev_base_lock);
 -			for (d = dev_base; d; d = d->next) i++;
 +			for_each_netdev(d) i++;
 read_unlock_bh(&dev_base_lock);
 
 if (put_user (i, (int __user *)A(arg)))
 --- ./drivers/block/aoe/aoecmd.c.vedevbase	Wed Jun 21 18:50:28 2006
 +++ ./drivers/block/aoe/aoecmd.c	Thu Jun 22 12:03:07 2006
 @@ -204,14 +204,17 @@ aoecmd_cfg_pkts(ushort aoemajor, unsigne
 sl = sl_tail = NULL;
 
 read_lock(&dev_base_lock);
 -	for (ifp = dev_base; ifp; dev_put(ifp), ifp = ifp->next) {
 +	for_each_netdev(dev) {
 dev_hold(ifp);
 -		if (!is_aoe_netif(ifp))
 +		if (!is_aoe_netif(ifp)) {
 +			dev_put(ifp);
 continue;
 +		}
 
 skb = new_skb(ifp, sizeof *h + sizeof *ch);
 if (skb == NULL) {
 printk(KERN_INFO "aoe: aoecmd_cfg: skb alloc failure\n");
 +			dev_put(ifp);
 continue;
 }
 if (sl_tail == NULL)
 @@ -229,6 +232,7 @@ aoecmd_cfg_pkts(ushort aoemajor, unsigne
 
 skb->next = sl;
 sl = skb;
 +		dev_put(ifp);
 }
 read_unlock(&dev_base_lock);
 
 --- ./drivers/net/wireless/strip.c.vedevbase	Wed Jun 21 18:50:43 2006
 +++ ./drivers/net/wireless/strip.c	Thu Jun 22 12:03:07 2006
 @@ -1970,8 +1970,7 @@ static struct net_device *get_strip_dev(
 sizeof(zero_address))) {
 struct net_device *dev;
 read_lock_bh(&dev_base_lock);
 -		dev = dev_base;
 -		while (dev) {
 +		for_each_netdev(dev) {
 if (dev->type == strip_info->dev->type &&
 !memcmp(dev->dev_addr,
 &strip_info->true_dev_addr,
 @@ -1982,7 +1981,6 @@ static struct net_device *get_strip_dev(
 read_unlock_bh(&dev_base_lock);
 return (dev);
 }
 -			dev = dev->next;
 }
 read_unlock_bh(&dev_base_lock);
 }
 --- ./drivers/parisc/led.c.vedevbase	Wed Jun 21 18:52:58 2006
 +++ ./drivers/parisc/led.c	Thu Jun 22 12:03:07 2006
 @@ -368,7 +368,7 @@ static __inline__ int led_get_net_activi
 * for reading should be OK */
 read_lock(&dev_base_lock);
 rcu_read_lock();
 -	for (dev = dev_base; dev; dev = dev->next) {
 +	for_each_netdev(dev) {
 struct net_device_stats *stats;
 struct in_device *in_dev = __in_dev_get_rcu(dev);
 if (!in_dev || !in_dev->ifa_list)
 --- ./include/linux/netdevice.h.vedevbase	Wed Jun 21 18:53:17 2006
 +++ ./include/linux/netdevice.h	Thu Jun 22 18:57:50 2006
 @@ -289,8 +289,8 @@ struct net_device
 
 unsigned long		state;
 
 -	struct net_device	*next;
 -
 +	struct list_head	dev_list;
 +
 /* The device initialization function. Called only once. */
 int			(*init)(struct net_device *dev);
 
 @@ -543,9 +543,27 @@ struct packet_type {
 #include <linux/interrupt.h>
 #include <linux/notifier.h>
 
 -extern struct net_device		loopback_dev;		/* The loopback */
 -extern struct net_device		*dev_base;		/* All devices */
 -extern rwlock_t				dev_base_lock;		/* Device list lock */
 +extern struct net_device	loopback_dev;		/* The loopback */
 +extern struct list_head		dev_base_head;		/* All devices */
 +extern rwlock_t			dev_base_lock;		/* Device list lock */
 +
 +#define for_each_netdev(p)	list_for_each_entry(p, &dev_base_head, dev_list)
 +
 +/* DO NOT USE first_netdev/next_netdev, use loop defined above */
 +#define first_netdev()		({ \
 +					list_empty(&dev_base_head) ? NULL : \
 +						list_entry(dev_base_head.next, \
 +							struct net_device, \
 +							dev_list); \
 +				 })
 +#define next_netdev(dev)	({ \
 +					struct list_head *__next; \
 +					__next = (dev)->dev_list.next; \
 +					__next == &dev_base_head ? NULL : \
 +						list_entry(__next, \
 +							struct net_device, \
 +							dev_list); \
 +				 })
 
 extern int 			netdev_boot_setup_check(struct net_device *dev);
 extern unsigned long		netdev_boot_base(const char *prefix, int unit);
 --- ./net/8021q/vlan.c.vedevbase	Wed Jun 21 18:51:08 2006
 +++ ./net/8021q/vlan.c	Thu Jun 22 12:03:07 2006
 @@ -121,8 +121,8 @@ static void __exit vlan_cleanup_devices(
 struct net_device *dev, *nxt;
 
 rtnl_lock();
 -	for (dev = dev_base; dev; dev = nxt) {
 -		nxt = dev->next;
 +	for (dev = first_netdev(); dev; dev = nxt) {
 +		nxt = next_netdev(dev);
 if (dev->priv_flags & IFF_802_1Q_VLAN) {
 unregister_vlan_dev(VLAN_DEV_INFO(dev)->real_dev,
 VLAN_DEV_INFO(dev)->vlan_id);
 --- ./net/8021q/vlanproc.c.vedevbase	Mon Mar 20 08:53:29 2006
 +++ ./net/8021q/vlanproc.c	Thu Jun 22 12:03:07 2006
 @@ -242,7 +242,7 @@ int vlan_proc_rem_dev(struct net_device
 static struct net_device *vlan_skip(struct net_device *dev)
 {
 while (dev && !(dev->priv_flags & IFF_802_1Q_VLAN))
 -		dev = dev->next;
 +		dev = next_netdev(dev);
 
 return dev;
 }
 @@ -258,8 +258,8 @@ static void *vlan_seq_start(struct seq_f
 if (*pos == 0)
 return SEQ_START_TOKEN;
 
 -	for (dev = vlan_skip(dev_base); dev && i < *pos;
 -	     dev = vlan_skip(dev->next), ++i);
 +	for (dev = vlan_skip(first_netdev()); dev && i < *pos;
 +	     dev = vlan_skip(next_netdev(dev)), ++i);
 
 return  (i == *pos) ? dev : NULL;
 }
 @@ -269,8 +269,8 @@ static void *vlan_seq_next(struct seq_fi
 ++*pos;
 
 return vlan_skip((v == SEQ_START_TOKEN)
 -			    ? dev_base
 -			    : ((struct net_device *)v)->next);
 +			    ? first_netdev()
 +			    : next_netdev((struct net_device *)v));
 }
 
 static void vlan_seq_stop(struct seq_file *seq, void *v)
 --- ./net/bridge/br_if.c.vedevbase	Wed Jun 21 18:53:18 2006
 +++ ./net/bridge/br_if.c	Thu Jun 22 12:03:07 2006
 @@ -468,8 +468,8 @@ void __exit br_cleanup_bridges(void)
 struct net_device *dev, *nxt;
 
 rtnl_lock();
 -	for (dev = dev_base; dev; dev = nxt) {
 -		nxt = dev->next;
 +	for (dev = first_netdev(); dev; dev = nxt) {
 +		nxt = next_netdev(dev);
 if (dev->priv_flags & IFF_EBRIDGE)
 del_br(dev->priv);
 }
 --- ./net/bridge/br_ioctl.c.vedevbase	Mon Mar 20 08:53:29 2006
 +++ ./net/bridge/br_ioctl.c	Thu Jun 22 12:03:07 2006
 @@ -27,7 +27,9 @@ static int get_bridge_ifindices(int *ind
 struct net_device *dev;
 int i = 0;
 
 -	for (dev = dev_base; dev && i < num; dev = dev->next) {
 +	for_each_netdev(dev) {
 +		if (i >= num)
 +			break;
 if (dev->priv_flags & IFF_EBRIDGE)
 indices[i++] = dev->ifindex;
 }
 --- ./net/bridge/br_netlink.c.vedevbase	Wed Jun 21 18:53:18 2006
 +++ ./net/bridge/br_netlink.c	Thu Jun 22 12:03:07 2006
 @@ -109,7 +109,8 @@ static int br_dump_ifinfo(struct sk_buff
 int err = 0;
 
 read_lock(&dev_base_lock);
 -	for (dev = dev_base, idx = 0; dev; dev = dev->next) {
 +	idx = 0;
 +	for_each_netdev(dev) {
 struct net_bridge_port *p = dev->br_port;
 
 /* not a bridge port */
 --- ./net/core/dev.c.vedevbase	Wed Jun 21 18:53:18 2006
 +++ ./net/core/dev.c	Thu Jun 22 17:40:13 2006
 @@ -174,13 +174,12 @@ static spinlock_t net_dma_event_lock;
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
 -struct net_device *dev_base;
 -static struct net_device **dev_tail = &dev_base;
 DEFINE_RWLOCK(dev_base_lock);
 -
 -EXPORT_SYMBOL(dev_base);
 EXPORT_SYMBOL(dev_base_lock);
 
 +LIST_HEAD(dev_base_head);
 +EXPORT_SYMBOL(dev_base_head);
 +
 #define NETDEV_HASHBITS	8
 static struct hlist_head dev_name_head[1<<NETDEV_HASHBITS];
 static struct hlist_head dev_index_head[1<<NETDEV_HASHBITS];
 @@ -575,7 +574,7 @@ struct net_device *dev_getbyhwaddr(unsig
 
 ASSERT_RTNL();
 
 -	for (dev = dev_base; dev; dev = dev->next)
 +	for_each_netdev(dev)
 if (dev->type == type &&
 !memcmp(dev->dev_addr, ha, dev->addr_len))
 break;
 @@ 
...
 
 
 |  
	|  |  |  
	| 
		
			| [patch 2/4] Network namespaces: cleanup of dev_base list use [message #3969 is a reply to message #3967] | Mon, 26 June 2006 09:52   |  
			| 
				
				
					|  Andrey Savochkin Messages: 47
 Registered: December 2005
 | Member |  |  |  
	| CONFIG_NET_NS and net_namespace structure are introduced. List of network devices is made per-namespace.
 Each namespace gets its own loopback device.
 
 Task's net_namespace pointer is not incorporated into nsproxy structure,
 since current namespace changes temporarily for processing of packets
 in softirq.
 
 Signed-off-by: Andrey Savochkin <saw@swsoft.com>
 ---
 drivers/net/loopback.c    |   70 +++++++++++--------
 include/linux/init_task.h |    9 ++
 include/linux/net_ns.h    |   88 ++++++++++++++++++++++++
 include/linux/netdevice.h |   20 ++++-
 include/linux/nsproxy.h   |    3
 include/linux/sched.h     |    3
 kernel/nsproxy.c          |   14 +++
 net/Kconfig               |    7 +
 net/core/dev.c            |  162 +++++++++++++++++++++++++++++++++++++++++++++-
 net/core/net-sysfs.c      |   24 ++++++
 net/ipv4/devinet.c        |    2
 net/ipv6/addrconf.c       |    2
 net/ipv6/route.c          |    3
 13 files changed, 371 insertions, 36 deletions
 
 --- ./drivers/net/loopback.c.venshd	Wed Jun 21 18:50:39 2006
 +++ ./drivers/net/loopback.c	Fri Jun 23 11:48:09 2006
 @@ -196,42 +196,56 @@ static struct ethtool_ops loopback_ethto
 .set_tso		= ethtool_op_set_tso,
 };
 
 -struct net_device loopback_dev = {
 -	.name	 		= "lo",
 -	.mtu			= (16 * 1024) + 20 + 20 + 12,
 -	.hard_start_xmit	= loopback_xmit,
 -	.hard_header		= eth_header,
 -	.hard_header_cache	= eth_header_cache,
 -	.header_cache_update	= eth_header_cache_update,
 -	.hard_header_len	= ETH_HLEN,	/* 14	*/
 -	.addr_len		= ETH_ALEN,	/* 6	*/
 -	.tx_queue_len		= 0,
 -	.type			= ARPHRD_LOOPBACK,	/* 0x0001*/
 -	.rebuild_header		= eth_rebuild_header,
 -	.flags			= IFF_LOOPBACK,
 -	.features 		= NETIF_F_SG | NETIF_F_FRAGLIST
 +struct net_device loopback_dev_static;
 +EXPORT_SYMBOL(loopback_dev_static);
 +
 +void loopback_dev_dtor(struct net_device *dev)
 +{
 +	if (dev->priv) {
 +		kfree(dev->priv);
 +		dev->priv = NULL;
 +	}
 +	free_netdev(dev);
 +}
 +
 +void loopback_dev_ctor(struct net_device *dev)
 +{
 +	struct net_device_stats *stats;
 +
 +	memset(dev, 0, sizeof(*dev));
 +	strcpy(dev->name, "lo");
 +	dev->mtu		= (16 * 1024) + 20 + 20 + 12;
 +	dev->hard_start_xmit	= loopback_xmit;
 +	dev->hard_header	= eth_header;
 +	dev->hard_header_cache	= eth_header_cache;
 +	dev->header_cache_update = eth_header_cache_update;
 +	dev->hard_header_len	= ETH_HLEN;	/* 14	*/
 +	dev->addr_len		= ETH_ALEN;	/* 6	*/
 +	dev->tx_queue_len	= 0;
 +	dev->type		= ARPHRD_LOOPBACK;	/* 0x0001*/
 +	dev->rebuild_header	= eth_rebuild_header;
 +	dev->flags		= IFF_LOOPBACK;
 +	dev->features 		= NETIF_F_SG | NETIF_F_FRAGLIST
 #ifdef LOOPBACK_TSO
 | NETIF_F_TSO
 #endif
 | NETIF_F_NO_CSUM | NETIF_F_HIGHDMA
 -				  | NETIF_F_LLTX,
 -	.ethtool_ops		= &loopback_ethtool_ops,
 -};
 -
 -/* Setup and register the loopback device. */
 -int __init loopback_init(void)
 -{
 -	struct net_device_stats *stats;
 +				  | NETIF_F_LLTX
 +				  | NETIF_F_NSOK;
 +	dev->ethtool_ops	= &loopback_ethtool_ops;
 
 /* Can survive without statistics */
 stats = kmalloc(sizeof(struct net_device_stats), GFP_KERNEL);
 if (stats) {
 memset(stats, 0, sizeof(struct net_device_stats));
 -		loopback_dev.priv = stats;
 -		loopback_dev.get_stats = &get_stats;
 +		dev->priv = stats;
 +		dev->get_stats = &get_stats;
 }
 -
 -	return register_netdev(&loopback_dev);
 -};
 +}
 
 -EXPORT_SYMBOL(loopback_dev);
 +/* Setup and register the loopback device. */
 +int __init loopback_init(void)
 +{
 +	loopback_dev_ctor(&loopback_dev_static);
 +	return register_netdev(&loopback_dev_static);
 +};
 --- ./include/linux/init_task.h.venshd	Wed Jun 21 18:53:16 2006
 +++ ./include/linux/init_task.h	Fri Jun 23 11:48:09 2006
 @@ -87,6 +87,14 @@ extern struct nsproxy init_nsproxy;
 
 extern struct group_info init_groups;
 
 +#ifdef CONFIG_NET_NS
 +extern struct net_namespace init_net_ns;
 +#define INIT_NET_NS \
 +	.net_context	= &init_net_ns,
 +#else
 +#define INIT_NET_NS
 +#endif
 +
 /*
 *  INIT_TASK is used to set up the first task table, touch at
 * your own risk!. Base=0, limit=0x1fffff (=2MB)
 @@ -129,6 +137,7 @@ extern struct group_info init_groups;
 .signal		= &init_signals,				\
 .sighand	= &init_sighand,				\
 .nsproxy	= &init_nsproxy,				\
 +	INIT_NET_NS							\
 .pending	= {						\
 .list = LIST_HEAD_INIT(tsk.pending.list),		\
 .signal = {{0}}},					\
 --- ./include/linux/net_ns.h.venshd	Thu Jun 22 12:10:13 2006
 +++ ./include/linux/net_ns.h	Fri Jun 23 11:49:42 2006
 @@ -0,0 +1,88 @@
 +/*
 + * Copyright (C) 2006  SWsoft
 + */
 +#ifndef __LINUX_NET_NS__
 +#define __LINUX_NET_NS__
 +
 +#ifdef CONFIG_NET_NS
 +
 +#include <asm/atomic.h>
 +#include <linux/list.h>
 +#include <linux/workqueue.h>
 +
 +struct net_namespace {
 +	atomic_t		active_ref, use_ref;
 +	struct list_head	dev_base;
 +	struct net_device	*loopback;
 +	unsigned int		hash;
 +	struct execute_work	destroy_work;
 +};
 +
 +static inline struct net_namespace *get_net_ns(struct net_namespace *ns)
 +{
 +	atomic_inc(&ns->active_ref);
 +	return ns;
 +}
 +
 +extern void net_ns_stop(struct net_namespace *ns);
 +static inline void put_net_ns(struct net_namespace *ns)
 +{
 +	if (atomic_dec_and_test(&ns->active_ref))
 +		net_ns_stop(ns);
 +}
 +
 +static inline struct net_namespace *ref_net_ns(struct net_namespace *ns)
 +{
 +	atomic_inc(&ns->use_ref);
 +	return ns;
 +}
 +
 +extern void net_ns_free(struct net_namespace *ns);
 +static inline void unref_net_ns(struct net_namespace *ns)
 +{
 +	if (atomic_dec_and_test(&ns->use_ref))
 +		net_ns_free(ns);
 +}
 +
 +extern struct net_namespace init_net_ns;
 +#define current_net_ns		(current->net_context)
 +
 +#define push_net_ns(to, orig)	do { \
 +					task_t *__cur; \
 +					__cur = current; \
 +					orig = __cur->net_context; \
 +					__cur->net_context = ref_net_ns(to); \
 +				} while (0)
 +#define pop_net_ns(orig)	do { \
 +					task_t *__cur; \
 +					struct net_namespace *__cur_ns; \
 +					__cur = current; \
 +					__cur_ns = __cur->net_context; \
 +					__cur->net_context = orig; \
 +					unref_net_ns(__cur_ns); \
 +				} while (0)
 +#define switch_net_ns(to)	do { \
 +					task_t *__cur; \
 +					struct net_namespace *__cur_ns; \
 +					__cur = current; \
 +					__cur_ns = __cur->net_context; \
 +					__cur->net_context = ref_net_ns(to); \
 +					unref_net_ns(__cur_ns); \
 +				} while (0)
 +
 +#define net_ns_same(target, context) ((target) == (context))
 +
 +#else /* CONFIG_NET_NS */
 +
 +struct net_namespace;
 +
 +#define get_net_ns(x)		NULL
 +#define put_net_ns(x)		((void)0)
 +
 +#define current_net_ns		NULL
 +
 +#define net_ns_same(target, context) 1
 +
 +#endif /* CONFIG_NET_NS */
 +
 +#endif /* __LINUX_NET_NS__ */
 --- ./include/linux/netdevice.h.venshd	Thu Jun 22 18:57:50 2006
 +++ ./include/linux/netdevice.h	Fri Jun 23 11:48:15 2006
 @@ -311,6 +311,7 @@ struct net_device
 #define NETIF_F_TSO		2048	/* Can offload TCP/IP segmentation */
 #define NETIF_F_LLTX		4096	/* LockLess TX */
 #define NETIF_F_UFO             8192    /* Can offload UDP Large Send*/
 +#define NETIF_F_NSOK		16384	/* OK for namespaces */
 
 #define NETIF_F_GEN_CSUM	(NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
 #define NETIF_F_ALL_CSUM	(NETIF_F_IP_CSUM | NETIF_F_GEN_CSUM)
 @@ -366,6 +367,10 @@ struct net_device
 int			promiscuity;
 int			allmulti;
 
 +#ifdef CONFIG_NET_NS
 +	struct net_namespace	*net_ns;
 +#endif
 +
 
 /* Protocol specific pointers */
 
 @@ -542,17 +547,26 @@ struct packet_type {
 
 #include <linux/interrupt.h>
 #include <linux/notifier.h>
 +#include <linux/net_ns.h>
 
 -extern struct net_device	loopback_dev;		/* The loopback */
 +extern struct net_device	loopback_dev_static;
 +#ifndef CONFIG_NET_NS
 +#define loopback_dev		loopback_dev_static	/* The loopback */
 extern struct list_head		dev_base_head;		/* All devices */
 +#else
 +#define loopback_dev		(*current_net_ns->loopback)
 +#define dev_base_head		(current_net_ns->dev_base)
 +#endif
 extern rwlock_t			dev_base_lock;		/* Device list lock */
 
 #define for_each_netdev(p)	list_for_each_entry(p, &dev_base_head, dev_list)
 
 /* DO NOT USE first_netdev/next_netdev, use loop defined above */
 #define first_netdev()		({ \
 -					list_empty(&dev_base_head) ? NULL : \
 -						list_entry(dev_base_head.next, \
 +					struct list_head *__base; \
 +					__base = &dev_base_head; \
 +					list_empty(__base) ? NULL : \
 +						list_entry(__base->next, \
 struct net_device, \
 dev_list); \
 })
 --- ./include/linux/nsproxy.h.venshd	Wed Jun 21 18:53:17 2006
 +++ ./include/linux/nsproxy.h	Fri Jun 23 11:48:15 2006
 @@ -33,6 +33,7 @@ struct nsproxy *dup_namespaces(struct ns
 int copy_namespaces(int flags, struct task_struct *tsk);
 void get_task_namespaces(struct task_struct *tsk);
 void free_nsproxy(struct nsproxy *ns);
 +void release_net_context(struct task_struct *tsk);
 
 static inline void put_nsproxy(struct nsproxy *ns)
 {
 @@ -48,5 +49,7 @@ static inline void exit_task_namespaces(
 put_nsproxy(ns);
 p->nsproxy = NULL;
 }
 +	release_net_context(p);
 }
 +
 #endif
 --- ./include/linux/sched.h.venshd	Wed Jun 21 18:53:17 2006
 +++ ./include/linux/sched.h	Fri Jun 23 11:48:15 2006
 @@ -887,6 +887,9 @@ struct task_struct {
 struct files_struct *files;
 /* namespaces */
 struct nsproxy *nsproxy;
 +#ifdef CONFIG_NET_NS
 +	struct net_namespace *net_context;
 +#endif
 /* signal handlers */
 struct signal_struct *signal;
 struct sighand_struct *sighand;
 --- ./kernel/nsproxy.c.venshd	Wed Jun 21 18:53:17 2006
 +++ ./kernel/nsproxy.c	Fri Jun 23 11:48:15 2006
 @@ -16,6 +16,7 @@
 #include <linux/module.h>
 #include <linux/version.h>
 #include <linux/nsproxy.h>
 +#include <linux/net_ns.h>
 #include <linux/namespace.h>
 #include <linux/utsname.h>
 
 @@ -84,6 +85,7 @@ int copy_namespaces(int flags, struct ta
 return 0;
 
 get_nsproxy(old_ns);
 +	(void) get_net_ns(tsk->net_context); /* for pointer copied by memcpy */
 
 if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC)))
 return 0;
 @@ -134,3 +136,15 @@ void free
...
 
 
 |  
	|  |  |  
	| 
		
			| [patch 3/4] Network namespaces: IPv4 FIB/routing in namespaces [message #3970 is a reply to message #3969] | Mon, 26 June 2006 09:54   |  
			| 
				
				
					|  Andrey Savochkin Messages: 47
 Registered: December 2005
 | Member |  |  |  
	| Structures related to IPv4 rounting (FIB and routing cache) are made per-namespace.
 
 Signed-off-by: Andrey Savochkin <saw@swsoft.com>
 ---
 include/linux/net_ns.h   |    9 +++
 include/net/flow.h       |    3 +
 include/net/ip_fib.h     |   62 ++++++++++++++++++++-----
 net/core/dev.c           |    7 ++
 net/ipv4/Kconfig         |    4 -
 net/ipv4/fib_frontend.c  |   87 +++++++++++++++++++++++++++++------
 net/ipv4/fib_hash.c      |   13 ++++-
 net/ipv4/fib_rules.c     |  114 +++++++++++++++++++++++++++++++++++++++++------
 net/ipv4/fib_semantics.c |  104 +++++++++++++++++++++++++++++-------------
 net/ipv4/route.c         |   26 ++++++++++
 10 files changed, 348 insertions, 81 deletions
 
 --- ./include/linux/net_ns.h.vensrt	Fri Jun 23 11:49:42 2006
 +++ ./include/linux/net_ns.h	Fri Jun 23 11:50:16 2006
 @@ -14,7 +14,16 @@ struct net_namespace {
 atomic_t		active_ref, use_ref;
 struct list_head	dev_base;
 struct net_device	*loopback;
 +#ifndef CONFIG_IP_MULTIPLE_TABLES
 +	struct fib_table	*fib4_local_table, *fib4_main_table;
 +#else
 +	struct fib_table	**fib4_tables;
 +	struct hlist_head	fib4_rules;
 +#endif
 +	struct hlist_head	*fib4_hash, *fib4_laddrhash;
 +	unsigned		fib4_hash_size, fib4_info_cnt;
 unsigned int		hash;
 +	char			destroying;
 struct execute_work	destroy_work;
 };
 
 --- ./include/net/flow.h.vensrt	Wed Jun 21 18:51:08 2006
 +++ ./include/net/flow.h	Fri Jun 23 11:50:16 2006
 @@ -78,6 +78,9 @@ struct flowi {
 #define fl_icmp_type	uli_u.icmpt.type
 #define fl_icmp_code	uli_u.icmpt.code
 #define fl_ipsec_spi	uli_u.spi
 +#ifdef CONFIG_NET_NS
 +	struct net_namespace *net_ns;
 +#endif
 } __attribute__((__aligned__(BITS_PER_LONG/8)));
 
 #define FLOW_DIR_IN	0
 --- ./include/net/ip_fib.h.vensrt	Wed Jun 21 18:53:17 2006
 +++ ./include/net/ip_fib.h	Fri Jun 23 11:50:16 2006
 @@ -18,6 +18,7 @@
 
 #include <net/flow.h>
 #include <linux/seq_file.h>
 +#include <linux/net_ns.h>
 
 /* WARNING: The ordering of these elements must match ordering
 *          of RTA_* rtnetlink attribute numbers.
 @@ -169,14 +170,21 @@ struct fib_table {
 
 #ifndef CONFIG_IP_MULTIPLE_TABLES
 
 -extern struct fib_table *ip_fib_local_table;
 -extern struct fib_table *ip_fib_main_table;
 +#ifndef CONFIG_NET_NS
 +extern struct fib_table *ip_fib_local_table_static;
 +extern struct fib_table *ip_fib_main_table_static;
 +#define ip_fib_local_table_ns()		ip_fib_local_table_static
 +#define ip_fib_main_table_ns()		ip_fib_main_table_static
 +#else
 +#define ip_fib_local_table_ns()		(current_net_ns->fib4_local_table)
 +#define ip_fib_main_table_ns()		(current_net_ns->fib4_main_table)
 +#endif
 
 static inline struct fib_table *fib_get_table(int id)
 {
 if (id != RT_TABLE_LOCAL)
 -		return ip_fib_main_table;
 -	return ip_fib_local_table;
 +		return ip_fib_main_table_ns();
 +	return ip_fib_local_table_ns();
 }
 
 static inline struct fib_table *fib_new_table(int id)
 @@ -186,23 +194,36 @@ static inline struct fib_table *fib_new_
 
 static inline int fib_lookup(const struct flowi *flp, struct fib_result *res)
 {
 -	if (ip_fib_local_table->tb_lookup(ip_fib_local_table, flp, res) &&
 -	    ip_fib_main_table->tb_lookup(ip_fib_main_table, flp, res))
 +	struct fib_table *tb;
 +
 +	tb = ip_fib_local_table_ns();
 +	if (!tb->tb_lookup(tb, flp, res))
 +		return 0;
 +	tb = ip_fib_main_table_ns();
 +	if (tb->tb_lookup(tb, flp, res))
 return -ENETUNREACH;
 return 0;
 }
 
 static inline void fib_select_default(const struct flowi *flp, struct fib_result *res)
 {
 +	struct fib_table *tb;
 +
 +	tb = ip_fib_main_table_ns();
 if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
 -		ip_fib_main_table->tb_select_default(ip_fib_main_table, flp, res);
 +		tb->tb_select_default(main_table, flp, res);
 }
 
 #else /* CONFIG_IP_MULTIPLE_TABLES */
 -#define ip_fib_local_table (fib_tables[RT_TABLE_LOCAL])
 -#define ip_fib_main_table (fib_tables[RT_TABLE_MAIN])
 +#define ip_fib_local_table_ns() (fib_tables_ns()[RT_TABLE_LOCAL])
 +#define ip_fib_main_table_ns() (fib_tables_ns()[RT_TABLE_MAIN])
 
 -extern struct fib_table * fib_tables[RT_TABLE_MAX+1];
 +#ifndef CONFIG_NET_NS
 +extern struct fib_table * fib_tables_static[RT_TABLE_MAX+1];
 +#define fib_tables_ns() fib_tables_static
 +#else
 +#define fib_tables_ns() (current_net_ns->fib4_tables)
 +#endif
 extern int fib_lookup(const struct flowi *flp, struct fib_result *res);
 extern struct fib_table *__fib_new_table(int id);
 extern void fib_rule_put(struct fib_rule *r);
 @@ -212,7 +233,7 @@ static inline struct fib_table *fib_get_
 if (id == 0)
 id = RT_TABLE_MAIN;
 
 -	return fib_tables[id];
 +	return fib_tables_ns()[id];
 }
 
 static inline struct fib_table *fib_new_table(int id)
 @@ -220,7 +241,7 @@ static inline struct fib_table *fib_new_
 if (id == 0)
 id = RT_TABLE_MAIN;
 
 -	return fib_tables[id] ? : __fib_new_table(id);
 +	return fib_tables_ns()[id] ? : __fib_new_table(id);
 }
 
 extern void fib_select_default(const struct flowi *flp, struct fib_result *res);
 @@ -229,6 +250,10 @@ extern void fib_select_default(const str
 
 /* Exported by fib_frontend.c */
 extern void		ip_fib_init(void);
 +#ifdef CONFIG_NET_NS
 +extern int ip_fib_struct_init(void);
 +extern void ip_fib_struct_fini(void);
 +#endif
 extern int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg);
 extern int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg);
 extern int inet_rtm_getroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg);
 @@ -246,9 +271,16 @@ extern int fib_sync_up(struct net_device
 extern int fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
 struct kern_rta *rta, struct rtentry *r);
 extern u32  __fib_res_prefsrc(struct fib_result *res);
 +#ifdef CONFIG_NET_NS
 +extern void fib_hashtable_destroy(void);
 +#endif
 
 /* Exported by fib_hash.c */
 extern struct fib_table *fib_hash_init(int id);
 +#ifdef CONFIG_NET_NS
 +extern void fib_hash_fini(struct fib_table *tb);
 +extern void fib_hash_destroy_hash(void);
 +#endif
 
 #ifdef CONFIG_IP_MULTIPLE_TABLES
 /* Exported by fib_rules.c */
 @@ -259,7 +291,11 @@ extern int inet_dump_rules(struct sk_buf
 #ifdef CONFIG_NET_CLS_ROUTE
 extern u32 fib_rules_tclass(struct fib_result *res);
 #endif
 -extern void fib_rules_init(void);
 +extern int fib_rules_struct_init(void);
 +extern void fib_rules_notif_init(void);
 +#ifdef CONFIG_NET_NS
 +extern void fib_rules_struct_fini(void);
 +#endif
 #endif
 
 static inline void fib_combine_itag(u32 *itag, struct fib_result *res)
 --- ./net/core/dev.c.vensrt	Fri Jun 23 11:48:15 2006
 +++ ./net/core/dev.c	Fri Jun 23 11:50:16 2006
 @@ -3398,6 +3398,8 @@ static int __init netdev_dma_register(vo
 #endif /* CONFIG_NET_DMA */
 
 #ifdef CONFIG_NET_NS
 +#include <net/ip_fib.h>
 +
 struct net_namespace init_net_ns = {
 .active_ref	= ATOMIC_INIT(2),
 /* one for init_task->net_context,
 @@ -3436,6 +3438,8 @@ int net_ns_start(void)
 task = current;
 orig_ns = task->net_context;
 task->net_context = ns;
 +	if (ip_fib_struct_init())
 +		goto out_fib4;
 err = register_netdev(dev);
 if (err)
 goto out_register;
 @@ -3443,6 +3447,8 @@ int net_ns_start(void)
 return 0;
 
 out_register:
 +	ip_fib_struct_fini();
 +out_fib4:
 dev->destructor(dev);
 task->net_context = orig_ns;
 BUG_ON(atomic_read(&ns->active_ref) != 1);
 @@ -3467,6 +3473,7 @@ static void net_ns_destroy(void *data)
 ns = data;
 push_net_ns(ns, orig_ns);
 unregister_netdev(ns->loopback);
 +	ip_fib_struct_fini();
 BUG_ON(!list_empty(&ns->dev_base));
 pop_net_ns(orig_ns);
 
 --- ./net/ipv4/Kconfig.vensrt	Wed Jun 21 18:53:19 2006
 +++ ./net/ipv4/Kconfig	Fri Jun 23 11:50:16 2006
 @@ -53,7 +53,7 @@ config IP_ADVANCED_ROUTER
 
 choice
 prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)"
 -	depends on IP_ADVANCED_ROUTER
 +	depends on IP_ADVANCED_ROUTER && !NET_NS
 default ASK_IP_FIB_HASH
 
 config ASK_IP_FIB_HASH
 @@ -83,7 +83,7 @@ config IP_FIB_TRIE
 endchoice
 
 config IP_FIB_HASH
 -	def_bool ASK_IP_FIB_HASH || !IP_ADVANCED_ROUTER
 +	def_bool ASK_IP_FIB_HASH || !IP_ADVANCED_ROUTER || NET_NS
 
 config IP_MULTIPLE_TABLES
 bool "IP: policy routing"
 --- ./net/ipv4/fib_frontend.c.vensrt	Wed Jun 21 18:53:19 2006
 +++ ./net/ipv4/fib_frontend.c	Fri Jun 23 11:50:16 2006
 @@ -53,14 +53,18 @@
 
 #define RT_TABLE_MIN RT_TABLE_MAIN
 
 -struct fib_table *ip_fib_local_table;
 -struct fib_table *ip_fib_main_table;
 +#ifndef CONFIG_NET_NS
 +struct fib_table *ip_fib_local_table_static;
 +struct fib_table *ip_fib_main_table_static;
 +#endif
 
 #else
 
 #define RT_TABLE_MIN 1
 
 -struct fib_table *fib_tables[RT_TABLE_MAX+1];
 +#ifndef CONFIG_NET_NS
 +struct fib_table *fib_tables_static[RT_TABLE_MAX+1];
 +#endif
 
 struct fib_table *__fib_new_table(int id)
 {
 @@ -69,7 +73,7 @@ struct fib_table *__fib_new_table(int id
 tb = fib_hash_init(id);
 if (!tb)
 return NULL;
 -	fib_tables[id] = tb;
 +	fib_tables_ns()[id] = tb;
 return tb;
 }
 
 @@ -80,8 +84,8 @@ struct fib_table *__fib_new_table(int id
 static void fib_flush(void)
 {
 int flushed = 0;
 -#ifdef CONFIG_IP_MULTIPLE_TABLES
 struct fib_table *tb;
 +#ifdef CONFIG_IP_MULTIPLE_TABLES
 int id;
 
 for (id = RT_TABLE_MAX; id>0; id--) {
 @@ -90,8 +94,10 @@ static void fib_flush(void)
 flushed += tb->tb_flush(tb);
 }
 #else /* CONFIG_IP_MULTIPLE_TABLES */
 -	flushed += ip_fib_main_table->tb_flush(ip_fib_main_table);
 -	flushed += ip_fib_local_table->tb_flush(ip_fib_local_table);
 +	tb = ip_fib_main_table_ns();
 +	flushed += tb->tb_flush(tb);
 +	tb = ip_fib_local_table_ns();
 +	flushed += tb->tb_flush(tb);
 #endif /* CONFIG_IP_MULTIPLE_TABLES */
 
 if (flushed)
 @@ -106,14 +112,15 @@ struct net_device * ip_dev_find(u32 addr
 {
 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
 struct fib_result res;
 +	struct fib_table *tb;
 struct net_device *dev = NULL;
 
 #ifdef CONFIG_IP_MULTIPLE_TABLES
 res.r = NULL;
 #endif
 
 -	if (!ip_fib_local_table ||
 -	    ip_fib_local_table->tb_lookup(ip_fib_local_table, &am
...
 
 
 |  
	|  |  |  
	| 
		
			| Re: [RFC][patch 1/4] Network namespaces: cleanup of dev_base list use [message #3984 is a reply to message #3967] | Mon, 26 June 2006 15:13   |  
			| 
				
				
					|  ebiederm Messages: 1354
 Registered: February 2006
 | Senior Member |  |  |  
	| Andrey Savochkin <saw@swsoft.com> writes: 
 > Cleanup of dev_base list use, with the aim to make device list per-namespace.
 > In almost every occasion, use of dev_base variable and dev->next pointer
 > could be easily replaced by for_each_netdev loop.
 > A few most complicated places were converted to using
 > first_netdev()/next_netdev().
 
 As a proof of concept patch this is ok.
 
 As a real world patch this is much too big, which prevents review.
 Plus it takes a few actions that are more than replace just
 iterators through the device list.
 
 In addition I suspect several if not all of these iterators
 can be replaced with the an appropriate helper function.
 
 The normal structure for a patch like this would be to
 introduce the new helper function.  for_each_netdev.
 And then to replace all of the users while cc'ing the
 maintainers of those drivers.  With each different
 driver being a different patch.
 
 There is another topic for discussion in this patch as well.
 How much of the context should be implicit and how much
 should be explicit.
 
 If the changes from netchannels had already been implemented, and all of
 the network processing was happening in a process context then I would
 trivially agree that implicit would be the way to go.
 
 However short of always having code always execute in the proper
 context I'm not comfortable with implicit parameters to functions.
 Not that this the contents of this patch should address this but the
 later patches should.
 
 When I went through this, my patchset just added an explicit
 continue if the devices was not in the appropriate namespace.
 I actually prefer the multiple list implementation but at
 the same time I think it is harder to get a clean implementation
 out of it.
 
 Eric
 
 
 > --- ./drivers/block/aoe/aoecmd.c.vedevbase	Wed Jun 21 18:50:28 2006
 > +++ ./drivers/block/aoe/aoecmd.c	Thu Jun 22 12:03:07 2006
 > @@ -204,14 +204,17 @@ aoecmd_cfg_pkts(ushort aoemajor, unsigne
 >  	sl = sl_tail = NULL;
 >
 >  	read_lock(&dev_base_lock);
 > -	for (ifp = dev_base; ifp; dev_put(ifp), ifp = ifp->next) {
 > +	for_each_netdev(dev) {
 >  		dev_hold(ifp);
 > -		if (!is_aoe_netif(ifp))
 > +		if (!is_aoe_netif(ifp)) {
 > +			dev_put(ifp);
 >  			continue;
 > +		}
 >
 >  		skb = new_skb(ifp, sizeof *h + sizeof *ch);
 >  		if (skb == NULL) {
 >  			printk(KERN_INFO "aoe: aoecmd_cfg: skb alloc
 > failure\n");
 > +			dev_put(ifp);
 >  			continue;
 >  		}
 >  		if (sl_tail == NULL)
 > @@ -229,6 +232,7 @@ aoecmd_cfg_pkts(ushort aoemajor, unsigne
 >
 >  		skb->next = sl;
 >  		sl = skb;
 > +		dev_put(ifp);
 >  	}
 >  	read_unlock(&dev_base_lock);
 
 These hunks should use for_each_netdev(ifp);
 
 > --- ./include/linux/netdevice.h.vedevbase	Wed Jun 21 18:53:17 2006
 > +++ ./include/linux/netdevice.h	Thu Jun 22 18:57:50 2006
 > @@ -289,8 +289,8 @@ struct net_device
 >
 >  	unsigned long		state;
 >
 > -	struct net_device	*next;
 > -
 > +	struct list_head	dev_list;
 > +
 >  	/* The device initialization function. Called only once. */
 >  	int			(*init)(struct net_device *dev);
 >
 > @@ -543,9 +543,27 @@ struct packet_type {
 >  #include <linux/interrupt.h>
 >  #include <linux/notifier.h>
 >
 > -extern struct net_device loopback_dev; /* The loopback */
 > -extern struct net_device *dev_base; /* All devices */
 > -extern rwlock_t dev_base_lock; /* Device list lock */
 > +extern struct net_device	loopback_dev;		/* The loopback */
 > +extern struct list_head dev_base_head; /* All devices */
 > +extern rwlock_t dev_base_lock; /* Device list lock */
 > +
 
 No need to change the loopback_dev and dev_base_lock here.
 
 What is the advantage of changing the type of dev_base?
 I can guess but there should be an explanation of it.
 
 > +#define for_each_netdev(p) list_for_each_entry(p, &dev_base_head, dev_list)
 > +
 > +/* DO NOT USE first_netdev/next_netdev, use loop defined above */
 > +#define first_netdev()		({ \
 > +					list_empty(&dev_base_head) ? NULL : \
 > + list_entry(dev_base_head.next, \
 > +							struct net_device, \
 > +							dev_list); \
 > +				 })
 > +#define next_netdev(dev)	({ \
 > +					struct list_head *__next; \
 > +					__next = (dev)->dev_list.next; \
 > +					__next == &dev_base_head ? NULL : \
 > +						list_entry(__next, \
 > +							struct net_device, \
 > +							dev_list); \
 > +				 })
 >
 >  extern int netdev_boot_setup_check(struct net_device *dev);
 >  extern unsigned long		netdev_boot_base(const char *prefix, int unit);
 
 
 > @@ -1903,7 +1902,7 @@ static int dev_ifconf(char __user *arg)
 >  	 */
 >
 >  	total = 0;
 > -	for (dev = dev_base; dev; dev = dev->next) {
 > +	for_each_netdev(dev) {
 >  		for (i = 0; i < NPROTO; i++) {
 >  			if (gifconf_list[i]) {
 >  				int done;
 > @@ -1935,26 +1934,25 @@ static int dev_ifconf(char __user *arg)
 
 Hmm.  The proc code here appears to be more than non-trivial
 restructuring.  I'm not certain it is but it looks that way
 which make review harder.
 
 >   *	This is invoked by the /proc filesystem handler to display a device
 >   *	in detail.
 >   */
 > -static __inline__ struct net_device *dev_get_idx(loff_t pos)
 > -{
 > -	struct net_device *dev;
 > -	loff_t i;
 > -
 > -	for (i = 0, dev = dev_base; dev && i < pos; ++i, dev = dev->next);
 > -
 > -	return i == pos ? dev : NULL;
 > -}
 > -
 >  void *dev_seq_start(struct seq_file *seq, loff_t *pos)
 >  {
 > +	struct net_device *dev;
 > +	loff_t off = 1;
 >  	read_lock(&dev_base_lock);
 > -	return *pos ? dev_get_idx(*pos - 1) : SEQ_START_TOKEN;
 > +	if (!*pos)
 > +		return SEQ_START_TOKEN;
 > +	for_each_netdev(dev) {
 > +		if (off++ == *pos)
 > +			return dev;
 > +	}
 > +	return NULL;
 >  }
 >
 >  void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 >  {
 > +	struct net_device *dev = v;
 >  	++*pos;
 > - return v == SEQ_START_TOKEN ? dev_base : ((struct net_device *)v)->next;
 > +	return v == SEQ_START_TOKEN ? first_netdev() : next_netdev(dev);
 >  }
 >
 >  void dev_seq_stop(struct seq_file *seq, void *v)
 |  
	|  |  |  
	| 
		
			| Re: [patch 1/4] Network namespaces: cleanup of dev_base list use [message #3988 is a reply to message #3984] | Mon, 26 June 2006 15:42   |  
			| 
				
				
					|  Andrey Savochkin Messages: 47
 Registered: December 2005
 | Member |  |  |  
	| Hi Eric, 
 On Mon, Jun 26, 2006 at 09:13:52AM -0600, Eric W. Biederman wrote:
 > Andrey Savochkin <saw@swsoft.com> writes:
 >
 > > Cleanup of dev_base list use, with the aim to make device list per-namespace.
 > > In almost every occasion, use of dev_base variable and dev->next pointer
 > > could be easily replaced by for_each_netdev loop.
 > > A few most complicated places were converted to using
 > > first_netdev()/next_netdev().
 >
 > As a proof of concept patch this is ok.
 >
 > As a real world patch this is much too big, which prevents review.
 > Plus it takes a few actions that are more than replace just
 > iterators through the device list.
 
 dev_base list is historically not the cleanest part of Linux networking.
 I've still spotted a place where the first device in dev_base list is assumed
 to be loopback.  In early days we had more, now only one place or two...
 
 >
 > In addition I suspect several if not all of these iterators
 > can be replaced with the an appropriate helper function.
 >
 > The normal structure for a patch like this would be to
 > introduce the new helper function.  for_each_netdev.
 > And then to replace all of the users while cc'ing the
 > maintainers of those drivers.  With each different
 > driver being a different patch.
 >
 > There is another topic for discussion in this patch as well.
 > How much of the context should be implicit and how much
 > should be explicit.
 >
 > If the changes from netchannels had already been implemented, and all of
 > the network processing was happening in a process context then I would
 > trivially agree that implicit would be the way to go.
 
 Why would we want all network processing happen in a process context?
 
 >
 > However short of always having code always execute in the proper
 > context I'm not comfortable with implicit parameters to functions.
 > Not that this the contents of this patch should address this but the
 > later patches should.
 
 We just have too many layers in networking code, and FIB/routing
 illustrates it well.
 
 >
 > When I went through this, my patchset just added an explicit
 > continue if the devices was not in the appropriate namespace.
 > I actually prefer the multiple list implementation but at
 > the same time I think it is harder to get a clean implementation
 > out of it.
 
 Certainly, dev_base list reorganization is not the crucial point in network
 namespaces.  But it has to be done some way or other.
 If people vote for a single list with skipping devices from a wrong
 namespace, it's fine with me, I can re-make this patch.
 
 I personally prefer per-namespace device list since we have too many places
 in the kernel where this list is walked in a linear fashion,
 and with many namespaces this list may become quite long.
 
 Regards
 
 Andrey
 |  
	|  |  |  
	|  |  
	| 
		
			| Re: [patch 1/4] Network namespaces: cleanup of dev_base list use [message #3991 is a reply to message #3988] | Mon, 26 June 2006 16:26   |  
			| 
				
				
					|  ebiederm Messages: 1354
 Registered: February 2006
 | Senior Member |  |  |  
	| Andrey Savochkin <saw@swsoft.com> writes: 
 > Hi Eric,
 >
 > On Mon, Jun 26, 2006 at 09:13:52AM -0600, Eric W. Biederman wrote:
 >> Andrey Savochkin <saw@swsoft.com> writes:
 >>
 >> > Cleanup of dev_base list use, with the aim to make device list
 > per-namespace.
 >> > In almost every occasion, use of dev_base variable and dev->next pointer
 >> > could be easily replaced by for_each_netdev loop.
 >> > A few most complicated places were converted to using
 >> > first_netdev()/next_netdev().
 >>
 >> As a proof of concept patch this is ok.
 >>
 >> As a real world patch this is much too big, which prevents review.
 >> Plus it takes a few actions that are more than replace just
 >> iterators through the device list.
 >
 > dev_base list is historically not the cleanest part of Linux networking.
 > I've still spotted a place where the first device in dev_base list is assumed
 > to be loopback.  In early days we had more, now only one place or two...
 
 I agree. I'm just saying this should be several patches in a patchset
 not just one big one.
 
 >> In addition I suspect several if not all of these iterators
 >> can be replaced with the an appropriate helper function.
 >>
 >> The normal structure for a patch like this would be to
 >> introduce the new helper function.  for_each_netdev.
 >> And then to replace all of the users while cc'ing the
 >> maintainers of those drivers.  With each different
 >> driver being a different patch.
 >>
 >> There is another topic for discussion in this patch as well.
 >> How much of the context should be implicit and how much
 >> should be explicit.
 >>
 >> If the changes from netchannels had already been implemented, and all of
 >> the network processing was happening in a process context then I would
 >> trivially agree that implicit would be the way to go.
 >
 > Why would we want all network processing happen in a process context?
 
 The basic idea is that an interrupt comes in.  A light weigh classifier
 looks at the packet and throws it into the appropriate socket packet queue.
 
 Beyond that everything can happen in the socket packet queue in process
 context which reduces the number of locks you need, and increases cache
 locality.
 
 Van Jacobson's slides showed some impressive system load reductions by
 doing that.
 
 The increased locality aids the kind of work we are doing as well,
 by meaning we don't have to guess.
 
 It is a big enough problem that I don't think we want to gate on
 that development but we need to be ready to take advantage of it when
 it happens.
 
 >> However short of always having code always execute in the proper
 >> context I'm not comfortable with implicit parameters to functions.
 >> Not that this the contents of this patch should address this but the
 >> later patches should.
 >
 > We just have too many layers in networking code, and FIB/routing
 > illustrates it well.
 
 I don't follow this comment.  How does a lot of layers affect
 the choice of implicit or explicit parameters?  If you are maintaining
 a patch outside the kernel I could see how there could be a win for
 touching the least amount of code possible but for merged code that
 you only have to go through once I don't see how the number of layers
 affects things.
 
 As I recall for most of the FIB/routing code once you have removed
 the global variable accesses and introduce namespace checks in
 the hash table (because allocating hash tables at runtime isn't sane)
 the rest of the code was agnostic about what was going on.  So I think
 you have touched everything that needs touching.  So I don't see
 a code size or complexity argument there.
 
 I do agree that we do have a lot of code there.
 
 >> When I went through this, my patchset just added an explicit
 >> continue if the devices was not in the appropriate namespace.
 >> I actually prefer the multiple list implementation but at
 >> the same time I think it is harder to get a clean implementation
 >> out of it.
 >
 > Certainly, dev_base list reorganization is not the crucial point in network
 > namespaces.  But it has to be done some way or other.
 > If people vote for a single list with skipping devices from a wrong
 > namespace, it's fine with me, I can re-make this patch.
 >
 > I personally prefer per-namespace device list since we have too many places
 > in the kernel where this list is walked in a linear fashion,
 > and with many namespaces this list may become quite long.
 
 I completely agree that cleaning up the list is a good thing to do,
 regardless of how we implement things.  So that can easily be a pre cursor
 to the work.  My only practical question is if we can remove the list
 walks be calling some of the generic helper functions.
 
 I would say just separate out the list walk cleanup code and submit it.
 The important point is that the patch needs to stand on it's as a
 cleanup rather than depending on network namespaces as a justification.
 
 Eric
 |  
	|  |  |  
	|  |  
	|  |  
	|  |  
	|  |  
	| 
		
			| Re: [patch 1/4] Network namespaces: cleanup of dev_base list use [message #4009 is a reply to message #3991] | Mon, 26 June 2006 20:14   |  
			| 
				
				
					|  Andrey Savochkin Messages: 47
 Registered: December 2005
 | Member |  |  |  
	| Eric, 
 On Mon, Jun 26, 2006 at 10:26:23AM -0600, Eric W. Biederman wrote:
 > Andrey Savochkin <saw@swsoft.com> writes:
 >
 > > On Mon, Jun 26, 2006 at 09:13:52AM -0600, Eric W. Biederman wrote:
 > >>
 > >> There is another topic for discussion in this patch as well.
 > >> How much of the context should be implicit and how much
 > >> should be explicit.
 > >>
 > >> If the changes from netchannels had already been implemented, and all of
 > >> the network processing was happening in a process context then I would
 > >> trivially agree that implicit would be the way to go.
 > >
 >
 [snip]
 > It is a big enough problem that I don't think we want to gate on
 > that development but we need to be ready to take advantage of it when
 > it happens.
 
 Well, ok, implicit namespace reference will take advantage of it
 if it happens.
 
 >
 > >> However short of always having code always execute in the proper
 > >> context I'm not comfortable with implicit parameters to functions.
 > >> Not that this the contents of this patch should address this but the
 > >> later patches should.
 > >
 > > We just have too many layers in networking code, and FIB/routing
 > > illustrates it well.
 >
 > I don't follow this comment.  How does a lot of layers affect
 > the choice of implicit or explicit parameters?  If you are maintaining
 > a patch outside the kernel I could see how there could be a win for
 > touching the least amount of code possible but for merged code that
 > you only have to go through once I don't see how the number of layers
 > affects things.
 
 I agree that implicit vs explicit parameters is a topic for discussion.
 >From what you see from my patch, I vote for implicit ones in this case :)
 
 I was talking about layers because they imply changing more code,
 and usually imply adding more parameters to functions and passing these
 additional parameters to next layers.
 In "routing" code it goes from routing entry points, to routing cache, to
 general FIB functions, to table-specific code (FIB hash).
 
 These additional parameters bloat the code to some extent.
 Sometimes it's possible to save here and there by fetching the parameter
 (namespace pointer) indirectly from structures you already have at hand,
 but it can't be done universally.
 
 One of the properties of implicit argument which I especially like
 is that both input and output paths are absolutely symmetric in how
 the namespace pointer is extracted.
 
 >
 > As I recall for most of the FIB/routing code once you have removed
 > the global variable accesses and introduce namespace checks in
 > the hash table (because allocating hash tables at runtime isn't sane)
 > the rest of the code was agnostic about what was going on.  So I think
 > you have touched everything that needs touching.  So I don't see
 > a code size or complexity argument there.
 
 Andrey
 |  
	|  |  |  
	| 
		
			| Re: [patch 1/4] Network namespaces: cleanup of dev_base list use [message #4012 is a reply to message #4009] | Mon, 26 June 2006 21:02   |  
			| 
				
				
					|  ebiederm Messages: 1354
 Registered: February 2006
 | Senior Member |  |  |  
	| Andrey Savochkin <saw@swsoft.com> writes: 
 > Eric,
 >
 > On Mon, Jun 26, 2006 at 10:26:23AM -0600, Eric W. Biederman wrote:
 >>
 >>
 > [snip]
 >> It is a big enough problem that I don't think we want to gate on
 >> that development but we need to be ready to take advantage of it when
 >> it happens.
 >
 > Well, ok, implicit namespace reference will take advantage of it
 > if it happens.
 
 And if fact in that case we don't have to do anything special because
 the process pointer will always be correct.
 
 >> >> However short of always having code always execute in the proper
 >> >> context I'm not comfortable with implicit parameters to functions.
 >> >> Not that this the contents of this patch should address this but the
 >> >> later patches should.
 >> >
 >> > We just have too many layers in networking code, and FIB/routing
 >> > illustrates it well.
 >>
 >> I don't follow this comment.  How does a lot of layers affect
 >> the choice of implicit or explicit parameters?  If you are maintaining
 >> a patch outside the kernel I could see how there could be a win for
 >> touching the least amount of code possible but for merged code that
 >> you only have to go through once I don't see how the number of layers
 >> affects things.
 >
 > I agree that implicit vs explicit parameters is a topic for discussion.
 > From what you see from my patch, I vote for implicit ones in this case :)
 
 Yes.  I tend to be against implicit namespaces references mostly because
 the explicit ones tend to make the code clearer.
 
 > I was talking about layers because they imply changing more code,
 > and usually imply adding more parameters to functions and passing these
 > additional parameters to next layers.
 > In "routing" code it goes from routing entry points, to routing cache, to
 > general FIB functions, to table-specific code (FIB hash).
 
 Yes.  Although as I recall you don't have to pass anything down very far.
 Because most functions once you have done the table lookup operate
 on just a subset of the table, when they are getting the real work done.
 
 > These additional parameters bloat the code to some extent.
 > Sometimes it's possible to save here and there by fetching the parameter
 > (namespace pointer) indirectly from structures you already have at hand,
 > but it can't be done universally.
 >
 > One of the properties of implicit argument which I especially like
 > is that both input and output paths are absolutely symmetric in how
 > the namespace pointer is extracted.
 
 There is an element of that.  In the output path for the most part everything
 works implicitly because you are in the proper context.
 
 I need to dig out my code and start comparing to what you have done.
 
 Eric
 |  
	|  |  |  
	|  |  
	| 
		
			| Re: [patch 3/4] Network namespaces: IPv4 FIB/routing in namespaces [message #4033 is a reply to message #4007] | Tue, 27 June 2006 09:25   |  
			| 
				
				
					|  Andrey Savochkin Messages: 47
 Registered: December 2005
 | Member |  |  |  
	| On Mon, Jun 26, 2006 at 10:05:14PM +0200, Herbert Poetzl wrote: > On Mon, Jun 26, 2006 at 04:56:46PM +0200, Daniel Lezcano wrote:
 > > Andrey Savochkin wrote:
 > > >Structures related to IPv4 rounting (FIB and routing cache)
 > > >are made per-namespace.
 > >
 > > How do you handle ICMP_REDIRECT ?
 >
 > and btw. how do you handle the beloved 'ping'
 > (i.e. ICMP_ECHO_REQUEST/REPLY for and from
 > guests?
 
 I don't need to do anything special.  They are just IP packets.
 If packets are local in the current net namespace, they are delivered to
 socket or handled by icmp_rcv.
 
 Certainly, packet/raw sockets shouldn't see packets they aren't supposed to
 see.  For raw sockets, it implies making socket lookup aware of namespaces,
 exactly like for TCP or UDP.
 
 Andrey
 |  
	|  |  |  
	|  |  
	|  |  
	|  |  
	|  |  
	| 
		
			| Re: Network namespaces a path to mergable code. [message #4060 is a reply to message #4058] | Tue, 27 June 2006 17:58   |  
			| 
				
				
					|  Andrey Savochkin Messages: 47
 Registered: December 2005
 | Member |  |  |  
	| Eric, 
 On Tue, Jun 27, 2006 at 11:20:40AM -0600, Eric W. Biederman wrote:
 >
 > Thinking about this I am going to suggest a slightly different direction
 > for get a patchset we can merge.
 >
 > First we concentrate on the fundamentals.
 > - How we mark a device as belonging to a specific network namespace.
 > - How we mark a socket as belonging to a specific network namespace.
 
 I agree with the direction of your thoughts.
 I was trying to do a similar thing, define clear steps in network
 namespace merging.
 
 My first patchset covers devices but not sockets.
 The only difference from what you're suggesting is ipv4 routing.
 For me, it is not less important than devices and sockets.  May be even
 more important, since routing exposes design deficiencies less obvious at
 socket level.
 
 >
 > As part of the fundamentals we add a patch to the generic socket code
 > that by default will disable it for protocol families that do not indicate
 > support for handling network namespaces, on a non-default network namespace.
 
 Fine
 
 Can you summarize you objections against my way of handling devices, please?
 And what was the typo you referred to in your letter to Kirill Korotaev?
 
 Regards
 Andrey
 |  
	|  |  |  
	| 
		
			| Re: Network namespaces a path to mergable code. [message #4061 is a reply to message #4060] | Tue, 27 June 2006 22:20   |  
			| 
				
				
					|  Sam Vilain Messages: 73
 Registered: February 2006
 | Member |  |  |  
	| Andrey Savochkin wrote: > On Tue, Jun 27, 2006 at 11:20:40AM -0600, Eric W. Biederman wrote:
 >
 >> Thinking about this I am going to suggest a slightly different direction
 >> for get a patchset we can merge.
 >>
 >> First we concentrate on the fundamentals.
 >> - How we mark a device as belonging to a specific network namespace.
 >> - How we mark a socket as belonging to a specific network namespace.
 >>
 >
 > I agree with the direction of your thoughts.
 > I was trying to do a similar thing, define clear steps in network
 > namespace merging.
 >
 > My first patchset covers devices but not sockets.
 > The only difference from what you're suggesting is ipv4 routing.
 > For me, it is not less important than devices and sockets.  May be even
 > more important, since routing exposes design deficiencies less obvious at
 > socket level.
 >
 
 It sounds then like it would be a good start to have general socket
 namespaces, if it would merge more easily - perhaps then network device
 namespaces would fall into place more easily.
 
 AIUI socket namespaces are also necessary for situations where you want
 containers to share IP addresses. AIUI, PlanetLab do something like this
 with a module atop of VServer already (but read
 http://openvz.org/pipermail/devel/2006-June/000666.html for a proper
 explanation from Mark Huang)
 
 >> As part of the fundamentals we add a patch to the generic socket code
 >> that by default will disable it for protocol families that do not indicate
 >> support for handling network namespaces, on a non-default network namespace.
 >>
 >
 > Fine
 >
 > Can you summarize you objections against my way of handling devices, please?
 >
 There were many objections, the major one being the patch was too large for certainty of adequate review.
 
 Quoting what I perceived as a summary from Eric:
 > When I went through this, my patchset just added an explicit
 > continue if the devices was not in the appropriate namespace.
 > I actually prefer the multiple list implementation but at
 > the same time I think it is harder to get a clean implementation
 > out of it.
 
 
 You offered to re-do the patch without separate lists - I suggest that
 this go ahead. No-one should really care; splitting it out into separate
 lists can then be considered a performance optimization for later.
 
 > And what was the typo you referred to in your letter to Kirill Korotaev?
 >
 I think this is the comment he refers to:
 > These hunks should use for_each_netdev(ifp);
 
 
 Both quotes are from http://lkml.org/lkml/2006/6/26/147
 
 Though, in Kirill's defense, it seems a bit strange to expect him to
 raise a fault that was just raised by Eric, in a reply to the message
 where he raised it.
 
 Sam.
 |  
	|  |  |  
	| 
		
			| Re: Network namespaces a path to mergable code. [message #4069 is a reply to message #4060] | Wed, 28 June 2006 04:20   |  
			| 
				
				
					|  ebiederm Messages: 1354
 Registered: February 2006
 | Senior Member |  |  |  
	| Andrey Savochkin <saw@swsoft.com> writes: 
 > Eric,
 >
 > On Tue, Jun 27, 2006 at 11:20:40AM -0600, Eric W. Biederman wrote:
 >>
 >> Thinking about this I am going to suggest a slightly different direction
 >> for get a patchset we can merge.
 >>
 >> First we concentrate on the fundamentals.
 >> - How we mark a device as belonging to a specific network namespace.
 >> - How we mark a socket as belonging to a specific network namespace.
 >
 > I agree with the direction of your thoughts.
 > I was trying to do a similar thing, define clear steps in network
 > namespace merging.
 >
 > My first patchset covers devices but not sockets.
 > The only difference from what you're suggesting is ipv4 routing.
 > For me, it is not less important than devices and sockets.  May be even
 > more important, since routing exposes design deficiencies less obvious at
 > socket level.
 
 I agree we need to do it.  I mostly want a base that allows us to
 not need to convert the whole network stack at once and still be able
 to merge code all the way to the stable kernel.
 
 The routing code is important for understanding design choices.  It
 isn't important for merging if that makes sense.
 
 For everyone looking at routing choices the IPv6 routing table is
 interesting because it does not use a hash table, and seems quite
 possibly to be an equally fast structure that scales better.
 
 There is something to think about there.
 
 >> As part of the fundamentals we add a patch to the generic socket code
 >> that by default will disable it for protocol families that do not indicate
 >> support for handling network namespaces, on a non-default network namespace.
 >
 > Fine
 >
 > Can you summarize you objections against my way of handling devices, please?
 > And what was the typo you referred to in your letter to Kirill Korotaev?
 
 I have no fundamental objects to the content I have seen so far.
 
 Please read the first email Kirill responded too.  I quoted a couple
 of sections of code and described the bugs I saw with the patch.
 
 All minor things.  The typo I was referring to was a section where the
 original iteration was on an ifp variable and you called it dev
 without changing the rest of the code in that section.
 
 The only big issue was that the patch too big, and should be split
 into a patchset for better review.  One patch for the new functions,
 and the an additional patch for each driver/subsystem hunk describing
 why that chunk needed to be changed.
 
 I'm still curious why many of those chunks can't use existing helper
 functions, to be cleaned up.
 
 Eric
 |  
	|  |  |  
	|  |  
	|  |  
	|  |  
	|  |  
	|  |  
	|  |  
	| 
		
			| Re: Network namespaces a path to mergable code. [message #4082 is a reply to message #4069] | Wed, 28 June 2006 11:06   |  
			| 
				
				
					|  Andrey Savochkin Messages: 47
 Registered: December 2005
 | Member |  |  |  
	| Hi Eric, 
 On Tue, Jun 27, 2006 at 10:20:32PM -0600, Eric W. Biederman wrote:
 > Andrey Savochkin <saw@swsoft.com> writes:
 [snip]
 > > My first patchset covers devices but not sockets.
 > > The only difference from what you're suggesting is ipv4 routing.
 > > For me, it is not less important than devices and sockets.  May be even
 > > more important, since routing exposes design deficiencies less obvious at
 > > socket level.
 >
 > I agree we need to do it.  I mostly want a base that allows us to
 > not need to convert the whole network stack at once and still be able
 > to merge code all the way to the stable kernel.
 >
 > The routing code is important for understanding design choices.  It
 > isn't important for merging if that makes sense.
 
 Ok, fine.
 Now I'm working on socket code.
 
 We still have a question about implicit vs explicit function parameters.
 This question becomes more important for sockets: if we want to allow to use
 sockets belonging to namespaces other than the current one, we need to do
 something about it.
 
 One possible option to resolve this question is to show 2 relatively short
 patches just introducing namespaces for sockets in 2 ways: with explicit
 function parameters and using implicit current context.
 Then people can compare them and vote.
 Do you think it's worth the effort?
 
 >
 > For everyone looking at routing choices the IPv6 routing table is
 > interesting because it does not use a hash table, and seems quite
 > possibly to be an equally fast structure that scales better.
 >
 > There is something to think about there.
 
 Sure
 
 [snip]
 > >
 > > Can you summarize you objections against my way of handling devices, please?
 > > And what was the typo you referred to in your letter to Kirill Korotaev?
 >
 > I have no fundamental objects to the content I have seen so far.
 >
 > Please read the first email Kirill responded too.  I quoted a couple
 > of sections of code and described the bugs I saw with the patch.
 
 I found your comments, thank you!
 
 >
 > All minor things.  The typo I was referring to was a section where the
 > original iteration was on an ifp variable and you called it dev
 > without changing the rest of the code in that section.
 >
 > The only big issue was that the patch too big, and should be split
 > into a patchset for better review.  One patch for the new functions,
 > and the an additional patch for each driver/subsystem hunk describing
 > why that chunk needed to be changed.
 
 I'll split the patch.
 
 > I'm still curious why many of those chunks can't use existing helper
 > functions, to be cleaned up.
 
 What helper functions are you referring to?
 
 Best regards
 
 Andrey
 |  
	|  |  |  
	|  |  
	|  |  
	|  |  
	|  |  
	|  |  
	|  |  
	|  |  
	|  |  
	|  |  
	|  |  
	|  | 
 
 
 Current Time: Sat Oct 25 01:45:46 GMT 2025 
 Total time taken to generate the page: 0.09353 seconds |