OpenVZ Forum


Home » Mailing lists » Devel » [PATCH 0/4] Fix race between sk_filter reassign and sk_clone()
[PATCH 0/4] Fix race between sk_filter reassign and sk_clone() [message #21864] Wed, 17 October 2007 09:45 Go to next message
Pavel Emelianov is currently offline  Pavel Emelianov
Messages: 1149
Registered: September 2006
Senior Member
The race can result in that some sock will get an sk_filter
pointer set to kfree-d memory. Look

CPU1:                            CPU2:
sk_clone():                      sk_attach_filter():
  new_sk = sk_alloc(...);
  sock_copy(new_sk, sk); 
  /* copies the filter ptr */
  ...
  filter = new_sk->sk_filter;
  if (filter)
                                     old_fp = sk->sk_filter;
                                     ...
                                     sk_filter_release(old_fp);
                                       if (atomic_dec_and_test(&old_fp->refcnt))
     atomic_inc(&filter->refcnt);
                                          /* true */
                                          call_rcu(&fp->rcu, kfree);

that's it - after a quiescent state pass the new_sk will have 
a pointer on kfree-d filter.

The same problem exists for detaching filter (SO_DETACH_FILTER).

The proposed fix consists of 3 preparation patches and the fix itself.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
[PATCH 1/4] Introduce the sk_detach_filter() call [message #21865 is a reply to message #21864] Wed, 17 October 2007 09:47 Go to previous messageGo to next message
Pavel Emelianov is currently offline  Pavel Emelianov
Messages: 1149
Registered: September 2006
Senior Member
Filter is attached in a separate function, so do the
same for filter detaching.

This also removes one variable sock_setsockopt().

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>

---

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 91b2e3b..ddfa037 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -146,6 +146,7 @@ struct sock;
 
 extern unsigned int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen);
 extern int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
+extern int sk_detach_filter(struct sock *sk);
 extern int sk_chk_filter(struct sock_filter *filter, int flen);
 #endif /* __KERNEL__ */
 
diff --git a/net/core/filter.c b/net/core/filter.c
index bd903aa..fd60758 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -433,5 +433,21 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
 	return err;
 }
 
+int sk_detach_filter(struct sock *sk)
+{
+	int ret = -ENOENT;
+	struct sk_filter *filter;
+
+	rcu_read_lock_bh();
+	filter = rcu_dereference(sk->sk_filter);
+	if (filter) {
+		rcu_assign_pointer(sk->sk_filter, NULL);
+		sk_filter_release(sk, filter);
+		ret = 0;
+	}
+	rcu_read_unlock_bh();
+	return ret;
+}
+
 EXPORT_SYMBOL(sk_chk_filter);
 EXPORT_SYMBOL(sk_run_filter);
diff --git a/net/core/sock.c b/net/core/sock.c
index d45ecdc..0710138 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -428,7 +428,6 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 		    char __user *optval, int optlen)
 {
 	struct sock *sk=sock->sk;
-	struct sk_filter *filter;
 	int val;
 	int valbool;
 	struct linger ling;
@@ -652,16 +651,7 @@ set_rcvbuf:
 		break;
 
 	case SO_DETACH_FILTER:
-		rcu_read_lock_bh();
-		filter = rcu_dereference(sk->sk_filter);
-		if (filter) {
-			rcu_assign_pointer(sk->sk_filter, NULL);
-			sk_filter_release(sk, filter);
-			rcu_read_unlock_bh();
-			break;
-		}
-		rcu_read_unlock_bh();
-		ret = -ENONET;
+		ret = sk_detach_filter(sk);
 		break;
 
 	case SO_PASSSEC:
-- 
1.5.3.4
[PATCH 2/4] Move the filter releasing into a separate call [message #21866 is a reply to message #21864] Wed, 17 October 2007 09:49 Go to previous messageGo to next message
Pavel Emelianov is currently offline  Pavel Emelianov
Messages: 1149
Registered: September 2006
Senior Member
This is done merely as a preparation for the fix.

The sk_filter_uncharge() unaccounts the filter memory and calls
the sk_filter_release(), which in turn decrements the refcount
anf frees the filter.

The latter function will be required separately.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>

---

diff --git a/include/net/sock.h b/include/net/sock.h
index 453c79d..b9cfe12 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -922,14 +922,18 @@ static inline void sk_filter_rcu_free(struct rcu_head *rcu)
  *	Remove a filter from a socket and release its resources.
  */
 
-static inline void sk_filter_release(struct sock *sk, struct sk_filter *fp)
+static inline void sk_filter_release(struct sk_filter *fp)
+{
+	if (atomic_dec_and_test(&fp->refcnt))
+		call_rcu_bh(&fp->rcu, sk_filter_rcu_free);
+}
+
+static inline void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
 {
 	unsigned int size = sk_filter_len(fp);
 
 	atomic_sub(size, &sk->sk_omem_alloc);
-
-	if (atomic_dec_and_test(&fp->refcnt))
-		call_rcu_bh(&fp->rcu, sk_filter_rcu_free);
+	sk_filter_release(fp);
 }
 
 static inline void sk_filter_charge(struct sock *sk, struct sk_filter *fp)
diff --git a/net/core/filter.c b/net/core/filter.c
index fd60758..2be1830 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -429,7 +429,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
 	}
 
 	if (fp)
-		sk_filter_release(sk, fp);
+		sk_filter_uncharge(sk, fp);
 	return err;
 }
 
@@ -442,7 +442,7 @@ int sk_detach_filter(struct sock *sk)
 	filter = rcu_dereference(sk->sk_filter);
 	if (filter) {
 		rcu_assign_pointer(sk->sk_filter, NULL);
-		sk_filter_release(sk, filter);
+		sk_filter_uncharge(sk, filter);
 		ret = 0;
 	}
 	rcu_read_unlock_bh();
diff --git a/net/core/sock.c b/net/core/sock.c
index 0710138..d292b41 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -915,7 +915,7 @@ void sk_free(struct sock *sk)
 
 	filter = rcu_dereference(sk->sk_filter);
 	if (filter) {
-		sk_filter_release(sk, filter);
+		sk_filter_uncharge(sk, filter);
 		rcu_assign_pointer(sk->sk_filter, NULL);
 	}
 
-- 
1.5.3.4
[PATCH 3/4] Cleanup the error path in sk_attach_filter [message #21867 is a reply to message #21864] Wed, 17 October 2007 09:51 Go to previous messageGo to next message
Pavel Emelianov is currently offline  Pavel Emelianov
Messages: 1149
Registered: September 2006
Senior Member
The sk_filter_uncharge is called for error handling and
for releasing the former filter, but this will have to
be done in a bit different manner, so cleanup the error
path a bit.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>

---

diff --git a/net/core/filter.c b/net/core/filter.c
index 2be1830..54dddc9 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -398,7 +398,7 @@ int sk_chk_filter(struct sock_filter *filter, int flen)
  */
 int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
 {
-	struct sk_filter *fp;
+	struct sk_filter *fp, *old_fp;
 	unsigned int fsize = sizeof(struct sock_filter) * fprog->len;
 	int err;
 
@@ -418,19 +418,18 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
 	fp->len = fprog->len;
 
 	err = sk_chk_filter(fp->insns, fp->len);
-	if (!err) {
-		struct sk_filter *old_fp;
-
-		rcu_read_lock_bh();
-		old_fp = rcu_dereference(sk->sk_filter);
-		rcu_assign_pointer(sk->sk_filter, fp);
-		rcu_read_unlock_bh();
-		fp = old_fp;
+	if (err) {
+		sk_filter_uncharge(sk, fp);
+		return err;
 	}
 
-	if (fp)
-		sk_filter_uncharge(sk, fp);
-	return err;
+	rcu_read_lock_bh();
+	old_fp = rcu_dereference(sk->sk_filter);
+	rcu_assign_pointer(sk->sk_filter, fp);
+	rcu_read_unlock_bh();
+
+	sk_filter_uncharge(sk, old_fp);
+	return 0;
 }
 
 int sk_detach_filter(struct sock *sk)
-- 
1.5.3.4
[PATCH 4/4] Fix the race between sk_filter_(de|at)tach and sk_clone() [message #21868 is a reply to message #21864] Wed, 17 October 2007 09:53 Go to previous messageGo to next message
Pavel Emelianov is currently offline  Pavel Emelianov
Messages: 1149
Registered: September 2006
Senior Member
The proposed fix is to delay the reference counter decrement
until the quiescent state pass. This will give sk_clone() a
chance to get the reference on the cloned filter.

Regular sk_filter_uncharge can happen from the sk_free() only
and there's no need in delaying the put - the socket is dead
anyway and is to be release itself.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>

---

diff --git a/include/net/sock.h b/include/net/sock.h
index b9cfe12..43fc3fa 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -905,16 +905,6 @@ static inline int sk_filter(struct sock *sk, struct sk_buff *skb)
 }
 
 /**
- * 	sk_filter_rcu_free: Free a socket filter
- *	@rcu: rcu_head that contains the sk_filter to free
- */
-static inline void sk_filter_rcu_free(struct rcu_head *rcu)
-{
-	struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);
-	kfree(fp);
-}
-
-/**
  *	sk_filter_release: Release a socket filter
  *	@sk: socket
  *	@fp: filter to remove
@@ -925,7 +915,7 @@ static inline void sk_filter_rcu_free(struct rcu_head *rcu)
 static inline void sk_filter_release(struct sk_filter *fp)
 {
 	if (atomic_dec_and_test(&fp->refcnt))
-		call_rcu_bh(&fp->rcu, sk_filter_rcu_free);
+		kfree(fp);
 }
 
 static inline void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp)
diff --git a/net/core/filter.c b/net/core/filter.c
index 54dddc9..b8bc7d3 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -387,6 +387,25 @@ int sk_chk_filter(struct sock_filter *filter, int flen)
 }
 
 /**
+ * 	sk_filter_rcu_release: Release a socket filter by rcu_head
+ *	@rcu: rcu_head that contains the sk_filter to free
+ */
+static void sk_filter_rcu_release(struct rcu_head *rcu)
+{
+	struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu);
+
+	sk_filter_release(fp);
+}
+
+static void sk_filter_delayed_uncharge(struct sock *sk, struct sk_filter *fp)
+{
+	unsigned int size = sk_filter_len(fp);
+
+	atomic_sub(size, &sk->sk_omem_alloc);
+	call_rcu_bh(&fp->rcu, sk_filter_rcu_release);
+}
+
+/**
  *	sk_attach_filter - attach a socket filter
  *	@fprog: the filter program
  *	@sk: the socket to use
@@ -428,7 +447,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
 	rcu_assign_pointer(sk->sk_filter, fp);
 	rcu_read_unlock_bh();
 
-	sk_filter_uncharge(sk, old_fp);
+	sk_filter_delayed_uncharge(sk, old_fp);
 	return 0;
 }
 
@@ -441,7 +460,7 @@ int sk_detach_filter(struct sock *sk)
 	filter = rcu_dereference(sk->sk_filter);
 	if (filter) {
 		rcu_assign_pointer(sk->sk_filter, NULL);
-		sk_filter_uncharge(sk, filter);
+		sk_filter_delayed_uncharge(sk, filter);
 		ret = 0;
 	}
 	rcu_read_unlock_bh();
-- 
1.5.3.4
Re: [PATCH 0/4] Fix race between sk_filter reassign and sk_clone() [message #21938 is a reply to message #21864] Thu, 18 October 2007 04:23 Go to previous messageGo to next message
davem is currently offline  davem
Messages: 463
Registered: February 2006
Senior Member
From: Pavel Emelyanov <xemul@openvz.org>
Date: Wed, 17 Oct 2007 13:45:54 +0400

> The race can result in that some sock will get an sk_filter
> pointer set to kfree-d memory. Look
> 
> CPU1:                            CPU2:
> sk_clone():                      sk_attach_filter():
>   new_sk = sk_alloc(...);
>   sock_copy(new_sk, sk); 
>   /* copies the filter ptr */
>   ...
>   filter = new_sk->sk_filter;
>   if (filter)
>                                      old_fp = sk->sk_filter;
>                                      ...
>                                      sk_filter_release(old_fp);
>                                        if (atomic_dec_and_test(&old_fp->refcnt))
>      atomic_inc(&filter->refcnt);
>                                           /* true */
>                                           call_rcu(&fp->rcu, kfree);
> 
> that's it - after a quiescent state pass the new_sk will have 
> a pointer on kfree-d filter.
> 
> The same problem exists for detaching filter (SO_DETACH_FILTER).
> 
> The proposed fix consists of 3 preparation patches and the fix itself.
> 
> Signed-off-by: Pavel Emelyanov <xemul@openvz.org>

Looks good, applied.

Thanks for fixing this bug Pavel!
Re: [PATCH 0/4] Fix race between sk_filter reassign and sk_clone() [message #22033 is a reply to message #21864] Fri, 19 October 2007 04:55 Go to previous messageGo to next message
davem is currently offline  davem
Messages: 463
Registered: February 2006
Senior Member
From: Olof Johansson <olof@lixom.net>
Date: Thu, 18 Oct 2007 21:29:47 -0500

> So, looks like rcu_dereference() returned NULL. I don't know the
> filter code at all, but it seems like it might be a valid case?
> sk_detach_filter() seems to handle a NULL sk_filter, at least.
> 
> 
> So, this needs review by someone who knows the filter, but it fixes the
> problem for me:
> 
> 
> Signed-off-by: Olof Johansson <olof@lixom.net>

I've applied this for now to my net-2.6 tree, thanks Olof
for tracking this down.

Pavel please take a look at this and let me know if it should
fixed in some other way.

Thanks!
Re: [PATCH 0/4] Fix race between sk_filter reassign and sk_clone() [message #22037 is a reply to message #22033] Fri, 19 October 2007 07:37 Go to previous messageGo to next message
Pavel Emelianov is currently offline  Pavel Emelianov
Messages: 1149
Registered: September 2006
Senior Member
David Miller wrote:
> From: Olof Johansson <olof@lixom.net>
> Date: Thu, 18 Oct 2007 21:29:47 -0500
> 
>> So, looks like rcu_dereference() returned NULL. I don't know the
>> filter code at all, but it seems like it might be a valid case?
>> sk_detach_filter() seems to handle a NULL sk_filter, at least.
>>
>>
>> So, this needs review by someone who knows the filter, but it fixes the
>> problem for me:

Yes. The NULL filter is a valid case, when there are no
filters attached at all. So this fix is correct.

Thanks, Olof. Sorry, Dave :(

>>
>> Signed-off-by: Olof Johansson <olof@lixom.net>

Acked-by: Pavel Emelyanov <xemul@openvz.org>

> I've applied this for now to my net-2.6 tree, thanks Olof
> for tracking this down.
> 
> Pavel please take a look at this and let me know if it should
> fixed in some other way.
> 
> Thanks!
>
Re: [PATCH 0/4] Fix race between sk_filter reassign and sk_clone() [message #22039 is a reply to message #22037] Fri, 19 October 2007 07:52 Go to previous message
davem is currently offline  davem
Messages: 463
Registered: February 2006
Senior Member
From: Pavel Emelyanov <xemul@openvz.org>
Date: Fri, 19 Oct 2007 11:37:02 +0400

> David Miller wrote:
> > From: Olof Johansson <olof@lixom.net>
> > Date: Thu, 18 Oct 2007 21:29:47 -0500
> > 
> >> So, looks like rcu_dereference() returned NULL. I don't know the
> >> filter code at all, but it seems like it might be a valid case?
> >> sk_detach_filter() seems to handle a NULL sk_filter, at least.
> >>
> >>
> >> So, this needs review by someone who knows the filter, but it fixes the
> >> problem for me:
> 
> Yes. The NULL filter is a valid case, when there are no
> filters attached at all. So this fix is correct.
> 
> Thanks, Olof. Sorry, Dave :(

No worries, thanks for reviewing.
Re: [PATCH 0/4] Fix race between sk_filter reassign and sk_clone() [message #22204 is a reply to message #21938] Fri, 19 October 2007 02:29 Go to previous message
Olof Johansson is currently offline  Olof Johansson
Messages: 1
Registered: October 2007
Junior Member
On Wed, Oct 17, 2007 at 09:23:02PM -0700, David Miller wrote:
[...]
> > The same problem exists for detaching filter (SO_DETACH_FILTER).
> > 
> > The proposed fix consists of 3 preparation patches and the fix itself.
> > 
> > Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
> 
> Looks good, applied.
> 
> Thanks for fixing this bug Pavel!

Looks like this might be causing problems, at least for me on ppc. This
happened during a normal boot, right around first interface config/dhcp
run..

cpu 0x0: Vector: 300 (Data Access) at [c00000000147b820]
    pc: c000000000435e5c: .sk_filter_delayed_uncharge+0x1c/0x60
    lr: c0000000004360d0: .sk_attach_filter+0x170/0x180
    sp: c00000000147baa0
   msr: 9000000000009032
   dar: 4
 dsisr: 40000000
  current = 0xc000000004780fa0
  paca    = 0xc000000000650480
    pid   = 1295, comm = dhclient3
0:mon> t
[c00000000147bb20] c0000000004360d0 .sk_attach_filter+0x170/0x180
[c00000000147bbd0] c000000000418988 .sock_setsockopt+0x788/0x7f0
[c00000000147bcb0] c000000000438a74 .compat_sys_setsockopt+0x4e4/0x5a0
[c00000000147bd90] c00000000043955c .compat_sys_socketcall+0x25c/0x2b0
[c00000000147be30] c000000000007508 syscall_exit+0x0/0x40
--- Exception: c01 (System Call) at 000000000ff618d8
SP (fffdf040) is in userspace
0:mon> 

I.e. null pointer deref at sk_filter_delayed_uncharge+0x1c:

0:mon> di $.sk_filter_delayed_uncharge
c000000000435e40  7c0802a6      mflr    r0
c000000000435e44  fbc1fff0      std     r30,-16(r1)
c000000000435e48  7c8b2378      mr      r11,r4
c000000000435e4c  ebc2cdd0      ld      r30,-12848(r2)
c000000000435e50  f8010010      std     r0,16(r1)
c000000000435e54  f821ff81      stdu    r1,-128(r1)
c000000000435e58  380300a4      addi    r0,r3,164
c000000000435e5c  81240004      lwz     r9,4(r4)

That's the deref of fp:

static void sk_filter_delayed_uncharge(struct sock *sk, struct sk_filter *fp)
{
        unsigned int size = sk_filter_len(fp);
...

That is called from sk_attach_filter():

...
        rcu_read_lock_bh();
        old_fp = rcu_dereference(sk->sk_filter);
        rcu_assign_pointer(sk->sk_filter, fp);
        rcu_read_unlock_bh();

        sk_filter_delayed_uncharge(sk, old_fp);
        return 0;
...

So, looks like rcu_dereference() returned NULL. I don't know the
filter code at all, but it seems like it might be a valid case?
sk_detach_filter() seems to handle a NULL sk_filter, at least.


So, this needs review by someone who knows the filter, but it fixes the
problem for me:


Signed-off-by: Olof Johansson <olof@lixom.net>

diff --git a/net/core/filter.c b/net/core/filter.c
index 1f0068e..e0a0694 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -447,7 +447,8 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
 	rcu_assign_pointer(sk->sk_filter, fp);
 	rcu_read_unlock_bh();
 
-	sk_filter_delayed_uncharge(sk, old_fp);
+	if (old_fp)
+		sk_filter_delayed_uncharge(sk, old_fp);
 	return 0;
 }
Previous Topic: How Inactive may be much greather than cached?
Next Topic: Re: 2.6.23-mm1 s390 driver problem
Goto Forum:
  


Current Time: Thu Jun 01 14:52:09 GMT 2023

Total time taken to generate the page: 0.01509 seconds