Diffstat (limited to 'libgo/go/sync')
-rw-r--r-- | libgo/go/sync/atomic/atomic_test.go |   7
-rw-r--r-- | libgo/go/sync/mutex_test.go         |  72
-rw-r--r-- | libgo/go/sync/once_test.go          |  25
-rw-r--r-- | libgo/go/sync/pool.go               | 217
-rw-r--r-- | libgo/go/sync/pool_test.go          |  56
-rw-r--r-- | libgo/go/sync/runtime_sema_test.go  |  85
-rw-r--r-- | libgo/go/sync/rwmutex_test.go       |  79
-rw-r--r-- | libgo/go/sync/waitgroup.go          |  10
-rw-r--r-- | libgo/go/sync/waitgroup_test.go     | 125
9 files changed, 274 insertions, 402 deletions
diff --git a/libgo/go/sync/atomic/atomic_test.go b/libgo/go/sync/atomic/atomic_test.go
index 06dd5f7..d2af4f4 100644
--- a/libgo/go/sync/atomic/atomic_test.go
+++ b/libgo/go/sync/atomic/atomic_test.go
@@ -813,7 +813,7 @@ func hammerSwapUintptr32(uaddr *uint32, count int) {
 		new := uintptr(seed+i)<<16 | uintptr(seed+i)<<16>>16
 		old := SwapUintptr(addr, new)
 		if old>>16 != old<<16>>16 {
-			panic(fmt.Sprintf("SwapUintptr is not atomic: %v", old))
+			panic(fmt.Sprintf("SwapUintptr is not atomic: %#08x", old))
 		}
 	}
 }
@@ -827,7 +827,7 @@ func hammerSwapPointer32(uaddr *uint32, count int) {
 		new := uintptr(seed+i)<<16 | uintptr(seed+i)<<16>>16
 		old := uintptr(SwapPointer(addr, unsafe.Pointer(new)))
 		if old>>16 != old<<16>>16 {
-			panic(fmt.Sprintf("SwapPointer is not atomic: %v", old))
+			panic(fmt.Sprintf("SwapPointer is not atomic: %#08x", old))
 		}
 	}
 }
@@ -1465,6 +1465,9 @@ func TestUnaligned64(t *testing.T) {
 }
 
 func TestNilDeref(t *testing.T) {
+	if p := runtime.GOOS + "/" + runtime.GOARCH; p == "freebsd/arm" || p == "netbsd/arm" {
+		t.Skipf("issue 7338: skipping test on %q", p)
+	}
 	funcs := [...]func(){
 		func() { CompareAndSwapInt32(nil, 0, 0) },
 		func() { CompareAndSwapInt64(nil, 0, 0) },
diff --git a/libgo/go/sync/mutex_test.go b/libgo/go/sync/mutex_test.go
index bf78c6f..151b25c 100644
--- a/libgo/go/sync/mutex_test.go
+++ b/libgo/go/sync/mutex_test.go
@@ -9,7 +9,6 @@ package sync_test
 
 import (
 	"runtime"
 	. "sync"
-	"sync/atomic"
 	"testing"
 )
@@ -90,63 +89,34 @@ func BenchmarkMutexUncontended(b *testing.B) {
 		Mutex
 		pad [128]uint8
 	}
-	const CallsPerSched = 1000
-	procs := runtime.GOMAXPROCS(-1)
-	N := int32(b.N / CallsPerSched)
-	c := make(chan bool, procs)
-	for p := 0; p < procs; p++ {
-		go func() {
-			var mu PaddedMutex
-			for atomic.AddInt32(&N, -1) >= 0 {
-				runtime.Gosched()
-				for g := 0; g < CallsPerSched; g++ {
-					mu.Lock()
-					mu.Unlock()
-				}
-			}
-			c <- true
-		}()
-	}
-	for p := 0; p < procs; p++ {
-		<-c
-	}
+	b.RunParallel(func(pb *testing.PB) {
+		var mu PaddedMutex
+		for pb.Next() {
+			mu.Lock()
+			mu.Unlock()
+		}
+	})
 }
 
 func benchmarkMutex(b *testing.B, slack, work bool) {
-	const (
-		CallsPerSched  = 1000
-		LocalWork      = 100
-		GoroutineSlack = 10
-	)
-	procs := runtime.GOMAXPROCS(-1)
+	var mu Mutex
 	if slack {
-		procs *= GoroutineSlack
+		b.SetParallelism(10)
 	}
-	N := int32(b.N / CallsPerSched)
-	c := make(chan bool, procs)
-	var mu Mutex
-	for p := 0; p < procs; p++ {
-		go func() {
-			foo := 0
-			for atomic.AddInt32(&N, -1) >= 0 {
-				runtime.Gosched()
-				for g := 0; g < CallsPerSched; g++ {
-					mu.Lock()
-					mu.Unlock()
-					if work {
-						for i := 0; i < LocalWork; i++ {
-							foo *= 2
-							foo /= 2
-						}
-					}
+	b.RunParallel(func(pb *testing.PB) {
+		foo := 0
+		for pb.Next() {
+			mu.Lock()
+			mu.Unlock()
+			if work {
+				for i := 0; i < 100; i++ {
+					foo *= 2
+					foo /= 2
 				}
 			}
-			c <- foo == 42
-		}()
-	}
-	for p := 0; p < procs; p++ {
-		<-c
-	}
+		}
+		_ = foo
+	})
 }
 
 func BenchmarkMutex(b *testing.B) {
diff --git a/libgo/go/sync/once_test.go b/libgo/go/sync/once_test.go
index 183069a..8afda82f 100644
--- a/libgo/go/sync/once_test.go
+++ b/libgo/go/sync/once_test.go
@@ -5,9 +5,7 @@
 package sync_test
 
 import (
-	"runtime"
 	. "sync"
-	"sync/atomic"
 	"testing"
 )
 
@@ -62,24 +60,11 @@ func TestOncePanic(t *testing.T) {
 }
 
 func BenchmarkOnce(b *testing.B) {
-	const CallsPerSched = 1000
-	procs := runtime.GOMAXPROCS(-1)
-	N := int32(b.N / CallsPerSched)
 	var once Once
 	f := func() {}
-	c := make(chan bool, procs)
-	for p := 0; p < procs; p++ {
-		go func() {
-			for atomic.AddInt32(&N, -1) >= 0 {
-				runtime.Gosched()
-				for g := 0; g < CallsPerSched; g++ {
-					once.Do(f)
-				}
-			}
-			c <- true
-		}()
-	}
-	for p := 0; p < procs; p++ {
-		<-c
-	}
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			once.Do(f)
+		}
+	})
 }
diff --git a/libgo/go/sync/pool.go b/libgo/go/sync/pool.go
index ca49d21..1f08707 100644
--- a/libgo/go/sync/pool.go
+++ b/libgo/go/sync/pool.go
@@ -10,64 +10,51 @@ import (
 	"unsafe"
 )
 
-const (
-	cacheLineSize = 128
-	poolLocalSize = 2 * cacheLineSize
-	poolLocalCap  = poolLocalSize/unsafe.Sizeof(*(*interface{})(nil)) - 1
-)
-
-// A Pool is a set of temporary objects that may be individually saved
-// and retrieved.
+// A Pool is a set of temporary objects that may be individually saved and
+// retrieved.
 //
-// Any item stored in the Pool may be removed automatically by the
-// implementation at any time without notification.
-// If the Pool holds the only reference when this happens, the item
-// might be deallocated.
+// Any item stored in the Pool may be removed automatically at any time without
+// notification. If the Pool holds the only reference when this happens, the
+// item might be deallocated.
 //
 // A Pool is safe for use by multiple goroutines simultaneously.
 //
-// Pool's intended use is for free lists maintained in global variables,
-// typically accessed by multiple goroutines simultaneously. Using a
-// Pool instead of a custom free list allows the runtime to reclaim
-// entries from the pool when it makes sense to do so. An
-// appropriate use of sync.Pool is to create a pool of temporary buffers
-// shared between independent clients of a global resource. On the
-// other hand, if a free list is maintained as part of an object used
-// only by a single client and freed when the client completes,
-// implementing that free list as a Pool is not appropriate.
+// Pool's purpose is to cache allocated but unused items for later reuse,
+// relieving pressure on the garbage collector. That is, it makes it easy to
+// build efficient, thread-safe free lists. However, it is not suitable for all
+// free lists.
+//
+// An appropriate use of a Pool is to manage a group of temporary items
+// silently shared among and potentially reused by concurrent independent
+// clients of a package. Pool provides a way to amortize allocation overhead
+// across many clients.
+//
+// An example of good use of a Pool is in the fmt package, which maintains a
+// dynamically-sized store of temporary output buffers. The store scales under
+// load (when many goroutines are actively printing) and shrinks when
+// quiescent.
+//
+// On the other hand, a free list maintained as part of a short-lived object is
+// not a suitable use for a Pool, since the overhead does not amortize well in
+// that scenario. It is more efficient to have such objects implement their own
+// free list.
 //
-// This is an experimental type and might not be released.
 type Pool struct {
-	// The following fields are known to runtime.
-	next         *Pool      // for use by runtime
-	local        *poolLocal // local fixed-size per-P pool, actually an array
-	localSize    uintptr    // size of the local array
-	globalOffset uintptr    // offset of global
-	// The rest is not known to runtime.
+	local     unsafe.Pointer // local fixed-size per-P pool, actual type is [P]poolLocal
+	localSize uintptr        // size of the local array
 
 	// New optionally specifies a function to generate
 	// a value when Get would otherwise return nil.
 	// It may not be changed concurrently with calls to Get.
 	New func() interface{}
-
-	pad [cacheLineSize]byte
-	// Read-mostly date above this point, mutable data follows.
-	mu     Mutex
-	global []interface{} // global fallback pool
 }
 
 // Local per-P Pool appendix.
 type poolLocal struct {
-	tail   int
-	unused int
-	buf    [poolLocalCap]interface{}
-}
-
-func init() {
-	var v poolLocal
-	if unsafe.Sizeof(v) != poolLocalSize {
-		panic("sync: incorrect pool size")
-	}
+	private interface{}   // Can be used only by the respective P.
+	shared  []interface{} // Can be used by any P.
+	Mutex                 // Protects shared.
+	pad     [128]byte     // Prevents false sharing.
 }
 
 // Put adds x to the pool.
@@ -82,14 +69,17 @@ func (p *Pool) Put(x interface{}) {
 		return
 	}
 	l := p.pin()
-	t := l.tail
-	if t < int(poolLocalCap) {
-		l.buf[t] = x
-		l.tail = t + 1
-		runtime_procUnpin()
+	if l.private == nil {
+		l.private = x
+		x = nil
+	}
+	runtime_procUnpin()
+	if x == nil {
 		return
 	}
-	p.putSlow(l, x)
+	l.Lock()
+	l.shared = append(l.shared, x)
+	l.Unlock()
 }
 
 // Get selects an arbitrary item from the Pool, removes it from the
@@ -108,72 +98,52 @@ func (p *Pool) Get() interface{} {
 		return nil
 	}
 	l := p.pin()
-	t := l.tail
-	if t > 0 {
-		t -= 1
-		x := l.buf[t]
-		l.tail = t
-		runtime_procUnpin()
+	x := l.private
+	l.private = nil
+	runtime_procUnpin()
+	if x != nil {
 		return x
 	}
-	return p.getSlow()
-}
-
-func (p *Pool) putSlow(l *poolLocal, x interface{}) {
-	// Grab half of items from local pool and put to global pool.
-	// Can not lock the mutex while pinned.
-	const N = int(poolLocalCap/2 + 1)
-	var buf [N]interface{}
-	buf[0] = x
-	for i := 1; i < N; i++ {
-		l.tail--
-		buf[i] = l.buf[l.tail]
+	l.Lock()
+	last := len(l.shared) - 1
+	if last >= 0 {
+		x = l.shared[last]
+		l.shared = l.shared[:last]
 	}
-	runtime_procUnpin()
-
-	p.mu.Lock()
-	p.global = append(p.global, buf[:]...)
-	p.mu.Unlock()
+	l.Unlock()
+	if x != nil {
+		return x
+	}
+	return p.getSlow()
 }
 
 func (p *Pool) getSlow() (x interface{}) {
-	// Grab a batch of items from global pool and put to local pool.
-	// Can not lock the mutex while pinned.
-	runtime_procUnpin()
-	p.mu.Lock()
+	// See the comment in pin regarding ordering of the loads.
+	size := atomic.LoadUintptr(&p.localSize) // load-acquire
+	local := p.local                         // load-consume
+	// Try to steal one element from other procs.
 	pid := runtime_procPin()
-	s := p.localSize
-	l := p.local
-	if uintptr(pid) < s {
-		l = indexLocal(l, pid)
-		// Get the item to return.
-		last := len(p.global) - 1
+	runtime_procUnpin()
+	for i := 0; i < int(size); i++ {
+		l := indexLocal(local, (pid+i+1)%int(size))
+		l.Lock()
+		last := len(l.shared) - 1
 		if last >= 0 {
-			x = p.global[last]
-			p.global = p.global[:last]
-		}
-		// Try to refill local pool, we may have been rescheduled to another P.
-		if last > 0 && l.tail == 0 {
-			n := int(poolLocalCap / 2)
-			gl := len(p.global)
-			if n > gl {
-				n = gl
-			}
-			copy(l.buf[:], p.global[gl-n:])
-			p.global = p.global[:gl-n]
-			l.tail = n
+			x = l.shared[last]
+			l.shared = l.shared[:last]
+			l.Unlock()
+			break
 		}
+		l.Unlock()
 	}
-	runtime_procUnpin()
-	p.mu.Unlock()
 
 	if x == nil && p.New != nil {
 		x = p.New()
 	}
-	return
+	return x
 }
 
-// pin pins current goroutine to P, disables preemption and returns poolLocal pool for the P.
+// pin pins the current goroutine to P, disables preemption and returns poolLocal pool for the P.
 // Caller must call runtime_procUnpin() when done with the pool.
 func (p *Pool) pin() *poolLocal {
 	pid := runtime_procPin()
@@ -191,32 +161,63 @@ func (p *Pool) pin() *poolLocal {
 
 func (p *Pool) pinSlow() *poolLocal {
 	// Retry under the mutex.
+	// Can not lock the mutex while pinned.
 	runtime_procUnpin()
-	p.mu.Lock()
-	defer p.mu.Unlock()
+	allPoolsMu.Lock()
+	defer allPoolsMu.Unlock()
 	pid := runtime_procPin()
+	// poolCleanup won't be called while we are pinned.
 	s := p.localSize
 	l := p.local
 	if uintptr(pid) < s {
 		return indexLocal(l, pid)
 	}
 	if p.local == nil {
-		p.globalOffset = unsafe.Offsetof(p.global)
-		runtime_registerPool(p)
+		allPools = append(allPools, p)
 	}
 	// If GOMAXPROCS changes between GCs, we re-allocate the array and lose the old one.
 	size := runtime.GOMAXPROCS(0)
 	local := make([]poolLocal, size)
-	atomic.StorePointer((*unsafe.Pointer)(unsafe.Pointer(&p.local)), unsafe.Pointer(&local[0])) // store-release
-	atomic.StoreUintptr(&p.localSize, uintptr(size))                                            // store-release
+	atomic.StorePointer((*unsafe.Pointer)(&p.local), unsafe.Pointer(&local[0])) // store-release
+	atomic.StoreUintptr(&p.localSize, uintptr(size))                            // store-release
 	return &local[pid]
 }
 
-func indexLocal(l *poolLocal, i int) *poolLocal {
-	return (*poolLocal)(unsafe.Pointer(uintptr(unsafe.Pointer(l)) + unsafe.Sizeof(*l)*uintptr(i))) // uh...
+func poolCleanup() {
+	// This function is called with the world stopped, at the beginning of a garbage collection.
+	// It must not allocate and probably should not call any runtime functions.
+	// Defensively zero out everything, 2 reasons:
+	// 1. To prevent false retention of whole Pools.
+	// 2. If GC happens while a goroutine works with l.shared in Put/Get,
+	//    it will retain whole Pool. So next cycle memory consumption would be doubled.
+	for i, p := range allPools {
+		allPools[i] = nil
+		for i := 0; i < int(p.localSize); i++ {
+			l := indexLocal(p.local, i)
+			l.private = nil
+			for j := range l.shared {
+				l.shared[j] = nil
+			}
+			l.shared = nil
+		}
+	}
+	allPools = []*Pool{}
+}
+
+var (
+	allPoolsMu Mutex
+	allPools   []*Pool
+)
+
+func init() {
+	runtime_registerPoolCleanup(poolCleanup)
+}
+
+func indexLocal(l unsafe.Pointer, i int) *poolLocal {
+	return &(*[1000000]poolLocal)(l)[i]
 }
 
 // Implemented in runtime.
-func runtime_registerPool(*Pool)
+func runtime_registerPoolCleanup(cleanup func())
 func runtime_procPin() int
 func runtime_procUnpin()
diff --git a/libgo/go/sync/pool_test.go b/libgo/go/sync/pool_test.go
index 39ba7a9..c13477d 100644
--- a/libgo/go/sync/pool_test.go
+++ b/libgo/go/sync/pool_test.go
@@ -25,12 +25,12 @@ func TestPool(t *testing.T) {
 	}
 	p.Put("a")
 	p.Put("b")
-	if g := p.Get(); g != "b" {
-		t.Fatalf("got %#v; want b", g)
-	}
 	if g := p.Get(); g != "a" {
 		t.Fatalf("got %#v; want a", g)
 	}
+	if g := p.Get(); g != "b" {
+		t.Fatalf("got %#v; want b", g)
+	}
 	if g := p.Get(); g != nil {
 		t.Fatalf("got %#v; want nil", g)
 	}
@@ -87,7 +87,7 @@ func TestPoolGC(t *testing.T) {
 	}
 	for i := 0; i < 5; i++ {
 		runtime.GC()
-		time.Sleep(time.Millisecond)
+		time.Sleep(time.Duration(i*100+10) * time.Millisecond)
 		// 1 pointer can remain on stack or elsewhere
 		if atomic.LoadUint32(&fin) >= N-1 {
 			return
@@ -133,42 +133,24 @@ func TestPoolStress(t *testing.T) {
 
 func BenchmarkPool(b *testing.B) {
 	var p Pool
-	var wg WaitGroup
-	n0 := uintptr(b.N)
-	n := n0
-	for i := 0; i < runtime.GOMAXPROCS(0); i++ {
-		wg.Add(1)
-		go func() {
-			defer wg.Done()
-			for atomic.AddUintptr(&n, ^uintptr(0)) < n0 {
-				for b := 0; b < 100; b++ {
-					p.Put(1)
-					p.Get()
-				}
-			}
-		}()
-	}
-	wg.Wait()
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			p.Put(1)
+			p.Get()
+		}
+	})
 }
 
 func BenchmarkPoolOverlflow(b *testing.B) {
 	var p Pool
-	var wg WaitGroup
-	n0 := uintptr(b.N)
-	n := n0
-	for i := 0; i < runtime.GOMAXPROCS(0); i++ {
-		wg.Add(1)
-		go func() {
-			defer wg.Done()
-			for atomic.AddUintptr(&n, ^uintptr(0)) < n0 {
-				for b := 0; b < 100; b++ {
-					p.Put(1)
-				}
-				for b := 0; b < 100; b++ {
-					p.Get()
-				}
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			for b := 0; b < 100; b++ {
+				p.Put(1)
 			}
-		}()
-	}
-	wg.Wait()
+			for b := 0; b < 100; b++ {
+				p.Get()
+			}
+		}
+	})
 }
diff --git a/libgo/go/sync/runtime_sema_test.go b/libgo/go/sync/runtime_sema_test.go
index 57a8dbe..5b7dd3d 100644
--- a/libgo/go/sync/runtime_sema_test.go
+++ b/libgo/go/sync/runtime_sema_test.go
@@ -7,7 +7,6 @@ package sync_test
 
 import (
 	"runtime"
 	. "sync"
-	"sync/atomic"
 	"testing"
 )
 
@@ -16,72 +15,44 @@ func BenchmarkSemaUncontended(b *testing.B) {
 		sem uint32
 		pad [32]uint32
 	}
-	const CallsPerSched = 1000
-	procs := runtime.GOMAXPROCS(-1)
-	N := int32(b.N / CallsPerSched)
-	c := make(chan bool, procs)
-	for p := 0; p < procs; p++ {
-		go func() {
-			sem := new(PaddedSem)
-			for atomic.AddInt32(&N, -1) >= 0 {
-				runtime.Gosched()
-				for g := 0; g < CallsPerSched; g++ {
-					Runtime_Semrelease(&sem.sem)
-					Runtime_Semacquire(&sem.sem)
-				}
-			}
-			c <- true
-		}()
-	}
-	for p := 0; p < procs; p++ {
-		<-c
-	}
+	b.RunParallel(func(pb *testing.PB) {
+		sem := new(PaddedSem)
+		for pb.Next() {
+			Runtime_Semrelease(&sem.sem)
+			Runtime_Semacquire(&sem.sem)
+		}
+	})
 }
 
 func benchmarkSema(b *testing.B, block, work bool) {
-	const CallsPerSched = 1000
-	const LocalWork = 100
-	procs := runtime.GOMAXPROCS(-1)
-	N := int32(b.N / CallsPerSched)
-	c := make(chan bool, procs)
-	c2 := make(chan bool, procs/2)
 	sem := uint32(0)
 	if block {
-		for p := 0; p < procs/2; p++ {
-			go func() {
-				Runtime_Semacquire(&sem)
-				c2 <- true
-			}()
-		}
-	}
-	for p := 0; p < procs; p++ {
+		done := make(chan bool)
 		go func() {
-			foo := 0
-			for atomic.AddInt32(&N, -1) >= 0 {
-				runtime.Gosched()
-				for g := 0; g < CallsPerSched; g++ {
-					Runtime_Semrelease(&sem)
-					if work {
-						for i := 0; i < LocalWork; i++ {
-							foo *= 2
-							foo /= 2
-						}
-					}
-					Runtime_Semacquire(&sem)
-				}
+			for p := 0; p < runtime.GOMAXPROCS(0)/2; p++ {
+				Runtime_Semacquire(&sem)
 			}
-			c <- foo == 42
-			Runtime_Semrelease(&sem)
+			done <- true
+		}()
+		defer func() {
+			<-done
 		}()
 	}
-	if block {
-		for p := 0; p < procs/2; p++ {
-			<-c2
+	b.RunParallel(func(pb *testing.PB) {
+		foo := 0
+		for pb.Next() {
+			Runtime_Semrelease(&sem)
+			if work {
+				for i := 0; i < 100; i++ {
+					foo *= 2
+					foo /= 2
+				}
+			}
+			Runtime_Semacquire(&sem)
 		}
-	}
-	for p := 0; p < procs; p++ {
-		<-c
-	}
+		_ = foo
+		Runtime_Semrelease(&sem)
+	})
 }
 
 func BenchmarkSemaSyntNonblock(b *testing.B) {
diff --git a/libgo/go/sync/rwmutex_test.go b/libgo/go/sync/rwmutex_test.go
index 39d5d65..0436f97 100644
--- a/libgo/go/sync/rwmutex_test.go
+++ b/libgo/go/sync/rwmutex_test.go
@@ -160,64 +160,39 @@ func BenchmarkRWMutexUncontended(b *testing.B) {
 		RWMutex
 		pad [32]uint32
 	}
-	const CallsPerSched = 1000
-	procs := runtime.GOMAXPROCS(-1)
-	N := int32(b.N / CallsPerSched)
-	c := make(chan bool, procs)
-	for p := 0; p < procs; p++ {
-		go func() {
-			var rwm PaddedRWMutex
-			for atomic.AddInt32(&N, -1) >= 0 {
-				runtime.Gosched()
-				for g := 0; g < CallsPerSched; g++ {
-					rwm.RLock()
-					rwm.RLock()
-					rwm.RUnlock()
-					rwm.RUnlock()
-					rwm.Lock()
-					rwm.Unlock()
-				}
-			}
-			c <- true
-		}()
-	}
-	for p := 0; p < procs; p++ {
-		<-c
-	}
+	b.RunParallel(func(pb *testing.PB) {
+		var rwm PaddedRWMutex
+		for pb.Next() {
+			rwm.RLock()
+			rwm.RLock()
+			rwm.RUnlock()
+			rwm.RUnlock()
+			rwm.Lock()
+			rwm.Unlock()
+		}
+	})
 }
 
 func benchmarkRWMutex(b *testing.B, localWork, writeRatio int) {
-	const CallsPerSched = 1000
-	procs := runtime.GOMAXPROCS(-1)
-	N := int32(b.N / CallsPerSched)
-	c := make(chan bool, procs)
 	var rwm RWMutex
-	for p := 0; p < procs; p++ {
-		go func() {
-			foo := 0
-			for atomic.AddInt32(&N, -1) >= 0 {
-				runtime.Gosched()
-				for g := 0; g < CallsPerSched; g++ {
-					foo++
-					if foo%writeRatio == 0 {
-						rwm.Lock()
-						rwm.Unlock()
-					} else {
-						rwm.RLock()
-						for i := 0; i != localWork; i += 1 {
-							foo *= 2
-							foo /= 2
-						}
-						rwm.RUnlock()
-					}
+	b.RunParallel(func(pb *testing.PB) {
+		foo := 0
+		for pb.Next() {
+			foo++
+			if foo%writeRatio == 0 {
+				rwm.Lock()
+				rwm.Unlock()
+			} else {
+				rwm.RLock()
+				for i := 0; i != localWork; i += 1 {
+					foo *= 2
+					foo /= 2
 				}
+				rwm.RUnlock()
 			}
-			c <- foo == 42
-		}()
-	}
-	for p := 0; p < procs; p++ {
-		<-c
-	}
+		}
+		_ = foo
+	})
 }
 
 func BenchmarkRWMutexWrite100(b *testing.B) {
diff --git a/libgo/go/sync/waitgroup.go b/libgo/go/sync/waitgroup.go
index 2268111..4c64dca 100644
--- a/libgo/go/sync/waitgroup.go
+++ b/libgo/go/sync/waitgroup.go
@@ -67,11 +67,13 @@ func (wg *WaitGroup) Add(delta int) {
 		return
 	}
 	wg.m.Lock()
-	for i := int32(0); i < wg.waiters; i++ {
-		runtime_Semrelease(wg.sema)
+	if atomic.LoadInt32(&wg.counter) == 0 {
+		for i := int32(0); i < wg.waiters; i++ {
+			runtime_Semrelease(wg.sema)
+		}
+		wg.waiters = 0
+		wg.sema = nil
 	}
-	wg.waiters = 0
-	wg.sema = nil
 	wg.m.Unlock()
 }
diff --git a/libgo/go/sync/waitgroup_test.go b/libgo/go/sync/waitgroup_test.go
index 84c4cfc..4c0a043 100644
--- a/libgo/go/sync/waitgroup_test.go
+++ b/libgo/go/sync/waitgroup_test.go
@@ -5,7 +5,6 @@
 package sync_test
 
 import (
-	"runtime"
 	. "sync"
 	"sync/atomic"
 	"testing"
 )
@@ -61,60 +60,60 @@ func TestWaitGroupMisuse(t *testing.T) {
 	t.Fatal("Should panic")
 }
 
+func TestWaitGroupRace(t *testing.T) {
+	// Run this test for about 1ms.
+	for i := 0; i < 1000; i++ {
+		wg := &WaitGroup{}
+		n := new(int32)
+		// spawn goroutine 1
+		wg.Add(1)
+		go func() {
+			atomic.AddInt32(n, 1)
+			wg.Done()
+		}()
+		// spawn goroutine 2
+		wg.Add(1)
+		go func() {
+			atomic.AddInt32(n, 1)
+			wg.Done()
+		}()
+		// Wait for goroutine 1 and 2
+		wg.Wait()
+		if atomic.LoadInt32(n) != 2 {
+			t.Fatal("Spurious wakeup from Wait")
+		}
+	}
+}
+
 func BenchmarkWaitGroupUncontended(b *testing.B) {
 	type PaddedWaitGroup struct {
 		WaitGroup
 		pad [128]uint8
 	}
-	const CallsPerSched = 1000
-	procs := runtime.GOMAXPROCS(-1)
-	N := int32(b.N / CallsPerSched)
-	c := make(chan bool, procs)
-	for p := 0; p < procs; p++ {
-		go func() {
-			var wg PaddedWaitGroup
-			for atomic.AddInt32(&N, -1) >= 0 {
-				runtime.Gosched()
-				for g := 0; g < CallsPerSched; g++ {
-					wg.Add(1)
-					wg.Done()
-					wg.Wait()
-				}
-			}
-			c <- true
-		}()
-	}
-	for p := 0; p < procs; p++ {
-		<-c
-	}
+	b.RunParallel(func(pb *testing.PB) {
+		var wg PaddedWaitGroup
+		for pb.Next() {
+			wg.Add(1)
+			wg.Done()
+			wg.Wait()
+		}
+	})
 }
 
 func benchmarkWaitGroupAddDone(b *testing.B, localWork int) {
-	const CallsPerSched = 1000
-	procs := runtime.GOMAXPROCS(-1)
-	N := int32(b.N / CallsPerSched)
-	c := make(chan bool, procs)
 	var wg WaitGroup
-	for p := 0; p < procs; p++ {
-		go func() {
-			foo := 0
-			for atomic.AddInt32(&N, -1) >= 0 {
-				runtime.Gosched()
-				for g := 0; g < CallsPerSched; g++ {
-					wg.Add(1)
-					for i := 0; i < localWork; i++ {
-						foo *= 2
-						foo /= 2
-					}
-					wg.Done()
-				}
+	b.RunParallel(func(pb *testing.PB) {
+		foo := 0
+		for pb.Next() {
+			wg.Add(1)
+			for i := 0; i < localWork; i++ {
+				foo *= 2
+				foo /= 2
 			}
-			c <- foo == 42
-		}()
-	}
-	for p := 0; p < procs; p++ {
-		<-c
-	}
+			wg.Done()
+		}
+		_ = foo
+	})
 }
 
 func BenchmarkWaitGroupAddDone(b *testing.B) {
@@ -126,34 +125,18 @@ func BenchmarkWaitGroupAddDoneWork(b *testing.B) {
 }
 
 func benchmarkWaitGroupWait(b *testing.B, localWork int) {
-	const CallsPerSched = 1000
-	procs := runtime.GOMAXPROCS(-1)
-	N := int32(b.N / CallsPerSched)
-	c := make(chan bool, procs)
 	var wg WaitGroup
-	wg.Add(procs)
-	for p := 0; p < procs; p++ {
-		go wg.Done()
-	}
-	for p := 0; p < procs; p++ {
-		go func() {
-			foo := 0
-			for atomic.AddInt32(&N, -1) >= 0 {
-				runtime.Gosched()
-				for g := 0; g < CallsPerSched; g++ {
-					wg.Wait()
-					for i := 0; i < localWork; i++ {
-						foo *= 2
-						foo /= 2
-					}
-				}
+	b.RunParallel(func(pb *testing.PB) {
+		foo := 0
+		for pb.Next() {
+			wg.Wait()
+			for i := 0; i < localWork; i++ {
+				foo *= 2
+				foo /= 2
 			}
-			c <- foo == 42
-		}()
-	}
-	for p := 0; p < procs; p++ {
-		<-c
-	}
+		}
+		_ = foo
+	})
 }
 
 func BenchmarkWaitGroupWait(b *testing.B) {
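
Editor's note: the new pool.go doc comment above describes the intended usage pattern (an fmt-style store of temporary output buffers) without showing it. A minimal sketch of that pattern follows; the names bufferPool and format are hypothetical and not part of this change:

package main

import (
	"bytes"
	"fmt"
	"sync"
)

// bufferPool caches temporary buffers shared among concurrent callers,
// the "appropriate use" described in the new Pool doc comment.
var bufferPool = sync.Pool{
	// New is called by Get when the pool has no cached item.
	New: func() interface{} { return new(bytes.Buffer) },
}

func format(s string) string {
	buf := bufferPool.Get().(*bytes.Buffer)
	buf.Reset() // a reused buffer may still hold old contents
	fmt.Fprintf(buf, "hello, %s", s)
	out := buf.String()
	bufferPool.Put(buf) // return the buffer for reuse by any goroutine
	return out
}

func main() {
	fmt.Println(format("world"))
}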
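
The poolCleanup function introduced above runs at the start of each garbage collection and zeroes every private and shared slot, so a Pool must never hold the only copy of state that cannot be regenerated. A hedged sketch of the observable effect under this implementation (output is not guaranteed; a GC may or may not have other reasons to retain the item):

package main

import (
	"fmt"
	"runtime"
	"sync"
)

func main() {
	p := sync.Pool{New: func() interface{} { return "fresh" }}
	p.Put("cached")
	runtime.GC() // triggers poolCleanup, which drops cached items
	fmt.Println(p.Get()) // likely prints "fresh": the cached item was reclaimed
}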
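
All of the benchmark rewrites in this change follow one pattern: the hand-rolled goroutine/channel/CallsPerSched scaffolding is replaced by testing.B.RunParallel, which distributes the b.N iterations across GOMAXPROCS goroutines (scaled by SetParallelism, when called) via a testing.PB iterator. A minimal sketch of the pattern, with a hypothetical benchmark name:

package sync_test

import (
	"sync"
	"testing"
)

// BenchmarkLockUnlock illustrates the RunParallel conversion used
// throughout this change; it is not part of the commit itself.
func BenchmarkLockUnlock(b *testing.B) {
	var mu sync.Mutex
	b.RunParallel(func(pb *testing.PB) {
		// pb.Next reports whether more of the b.N total iterations
		// remain for this goroutine to execute.
		for pb.Next() {
			mu.Lock()
			mu.Unlock()
		}
	})
}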